diff options
Diffstat (limited to 'media/ffvpx/libavcodec')
335 files changed, 117431 insertions, 0 deletions
diff --git a/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c new file mode 100644 index 0000000000..77f5607960 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" + +#include "libavcodec/fft.h" + +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); + +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +/* Install the NEON FFT/MDCT function pointers into *s when av_get_cpu_flags() reports NEON. fft_permute and fft_calc are replaced only for s->nbits < 17; the imdct/mdct pointers and FF_MDCT_PERM_INTERLEAVE are set only when CONFIG_MDCT is nonzero. Without NEON, *s is left untouched. */ +av_cold void ff_fft_init_aarch64(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + if (s->nbits < 17) { + s->fft_permute = ff_fft_permute_neon; + s->fft_calc = ff_fft_calc_neon; + } +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->mdct_permutation = 
FF_MDCT_PERM_INTERLEAVE; +#endif + } +} diff --git a/media/ffvpx/libavcodec/aarch64/fft_neon.S b/media/ffvpx/libavcodec/aarch64/fft_neon.S new file mode 100644 index 0000000000..d7225511dd --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/fft_neon.S @@ -0,0 +1,447 @@ +/* + * ARM NEON optimised FFT + * + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2009 Naotoshi Nojiri + * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> + * + * This algorithm (though not any of the implementation details) is + * based on libdjbfft by D. J. Bernstein. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#define M_SQRT1_2 0.70710678118654752440 +/* transpose: d0 = trn1(s0, s1), d1 = trn2(s0, s1). */ +.macro transpose d0, d1, s0, s1 + trn1 \d0, \s0, \s1 + trn2 \d1, \s0, \s1 +.endm + +/* fft4_neon: transforms in place the four .2s vectors (32 bytes) at x0. */ +function fft4_neon + AARCH64_VALID_JUMP_TARGET + ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] + + fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1 + fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1 + + ext v16.8b, v2.8b, v3.8b, #4 + ext v17.8b, v3.8b, v2.8b, #4 + + fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3 + fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3 + + fadd v0.2s, v4.2s, v5.2s + fsub v2.2s, v4.2s, v5.2s + fadd v1.2s, v6.2s, v7.2s + fsub v3.2s, v6.2s, v7.2s + + st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] + + ret +endfunc +/* fft8_neon: transforms in place the 64 bytes at x0 (x1 keeps the start address for the final store). Reads v28, which ff_fft_calc_neon loads from mppm. */ +function fft8_neon + AARCH64_VALID_JUMP_TARGET + mov x1, x0 + ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 + ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] + ext v22.8b, v2.8b, v3.8b, #4 + ext v23.8b, v3.8b, v2.8b, #4 + fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 + fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 + fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 + fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 + rev64 v27.2s, v28.2s // ??? 
+ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 + fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 + fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w + ext v6.8b, v4.8b, v5.8b, #4 + ext v7.8b, v5.8b, v4.8b, #4 + fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w + fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 + fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 + fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w + fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w + fadd v0.2s, v20.2s, v21.2s + fsub v2.2s, v20.2s, v21.2s + fadd v1.2s, v22.2s, v23.2s + rev64 v26.2s, v26.2s + rev64 v27.2s, v27.2s + fsub v3.2s, v22.2s, v23.2s + fsub v6.2s, v6.2s, v7.2s + fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 + fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 + fadd v7.2s, v4.2s, v5.2s + fsub v18.2s, v2.2s, v6.2s + ext v26.8b, v24.8b, v25.8b, #4 + ext v27.8b, v25.8b, v24.8b, #4 + fadd v2.2s, v2.2s, v6.2s + fsub v16.2s, v0.2s, v7.2s + fadd v5.2s, v25.2s, v24.2s + fsub v4.2s, v26.2s, v27.2s + fadd v0.2s, v0.2s, v7.2s + fsub v17.2s, v1.2s, v5.2s + fsub v19.2s, v3.2s, v4.2s + fadd v3.2s, v3.2s, v4.2s + fadd v1.2s, v1.2s, v5.2s + + st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1] + + ret +endfunc +/* fft16_neon: transforms in place the 128 bytes at x0. Reads the table at x14 and the constants in v28-v31, all set up by ff_fft_calc_neon. */ +function fft16_neon + AARCH64_VALID_JUMP_TARGET + mov x1, x0 + ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 + ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32 + ext v22.8b, v2.8b, v3.8b, #4 + ext v23.8b, v3.8b, v2.8b, #4 + fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 + fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 + fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 + fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 + rev64 v27.2s, v28.2s // ??? 
+ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 + fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 + fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w + ext v6.8b, v4.8b, v5.8b, #4 + ext v7.8b, v5.8b, v4.8b, #4 + fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w + fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 + fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 + fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w + fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w + fadd v0.2s, v20.2s, v21.2s + fsub v2.2s, v20.2s, v21.2s + fadd v1.2s, v22.2s, v23.2s + rev64 v26.2s, v26.2s + rev64 v27.2s, v27.2s + fsub v3.2s, v22.2s, v23.2s + fsub v6.2s, v6.2s, v7.2s + fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 + fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 + fadd v7.2s, v4.2s, v5.2s + fsub v18.2s, v2.2s, v6.2s + ld1 {v20.4s,v21.4s}, [x0], #32 + ld1 {v22.4s,v23.4s}, [x0], #32 + ext v26.8b, v24.8b, v25.8b, #4 + ext v27.8b, v25.8b, v24.8b, #4 + fadd v2.2s, v2.2s, v6.2s + fsub v16.2s, v0.2s, v7.2s + fadd v5.2s, v25.2s, v24.2s + fsub v4.2s, v26.2s, v27.2s + transpose v24.2d, v25.2d, v20.2d, v22.2d + transpose v26.2d, v27.2d, v21.2d, v23.2d + fadd v0.2s, v0.2s, v7.2s + fsub v17.2s, v1.2s, v5.2s + fsub v19.2s, v3.2s, v4.2s + fadd v3.2s, v3.2s, v4.2s + fadd v1.2s, v1.2s, v5.2s + ext v20.16b, v21.16b, v21.16b, #4 + ext v21.16b, v23.16b, v23.16b, #4 + + zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]} + zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]} + zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]} + zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]} + + // 2 x fft4 + transpose v22.2d, v23.2d, v20.2d, v21.2d + + fadd v4.4s, v24.4s, v25.4s + fadd v5.4s, v26.4s, v27.4s + fsub v6.4s, v24.4s, v25.4s + fsub v7.4s, v22.4s, v23.4s + + ld1 {v23.4s}, [x14] + + fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]} + fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]} + fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]} + fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]} + + //fft_pass_neon_16 + rev64 v7.4s, v25.4s + fmul v25.4s, v25.4s, v23.s[1] + fmul v7.4s, v7.4s, 
v29.4s + fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a} + + zip1 v20.4s, v24.4s, v25.4s + zip2 v21.4s, v24.4s, v25.4s + fneg v22.4s, v20.4s + fadd v4.4s, v21.4s, v20.4s + fsub v6.4s, v20.4s, v21.4s // just the second half + fadd v5.4s, v21.4s, v22.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]} + fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]} + fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]} + fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]} + +//second half + rev64 v6.4s, v26.4s + fmul v26.4s, v26.4s, v23.s[2] + rev64 v7.4s, v27.4s + fmul v27.4s, v27.4s, v23.s[3] + fmul v6.4s, v6.4s, v29.4s + fmul v7.4s, v7.4s, v29.4s + fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6} + fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a} + + zip1 v24.4s, v26.4s, v27.4s + zip2 v25.4s, v26.4s, v27.4s + fneg v26.4s, v24.4s + fadd v4.4s, v25.4s, v24.4s + fsub v6.4s, v24.4s, v25.4s // just the second half + fadd v5.4s, v25.4s, v26.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]} + fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]} + fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]} + fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]} + + st1 {v16.4s,v17.4s}, [x1], #32 + st1 {v18.4s,v19.4s}, [x1], #32 + st1 {v20.4s,v21.4s}, [x1], #32 + st1 {v22.4s,v23.4s}, [x1], #32 + + ret +endfunc + +/* tbl byte-index tables; ff_fft_calc_neon loads them into v30 (trans4_float) and v31 (trans8_float). */ +const trans4_float, align=4 + .byte 0, 1, 2, 3 + .byte 8, 9, 10, 11 + .byte 4, 5, 6, 7 + .byte 12, 13, 14, 15 +endconst + +const trans8_float, align=4 + .byte 24, 25, 26, 27 + .byte 0, 1, 2, 3 + .byte 28, 29, 30, 31 + .byte 4, 5, 6, 7 +endconst +/* fft_pass_neon: x0 = z, x2 = n, x4 = start of the twiddle table (read forward as wre, and backward from x4 + 8*n as wim using the post-index x7, which ff_fft_calc_neon sets to -8). Uses v29-v31 from ff_fft_calc_neon. Advances x0 and overwrites x1-x6. */ +function fft_pass_neon + sub x6, x2, #1 // n - 1, loop counter + lsl x5, x2, #3 // 2 * n * sizeof FFTSample + lsl x1, x2, #4 // 2 * n * sizeof FFTComplex + add x5, x4, x5 // wim + add x3, x1, x2, lsl #5 // 4 * n * sizeof 
FFTComplex + add x2, x0, x2, lsl #5 // &z[o2] + add x3, x0, x3 // &z[o3] + add x1, x0, x1 // &z[o1] + ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} + ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} + ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} + trn2 v25.2d, v20.2d, v22.2d + sub x5, x5, #4 // wim-- + trn1 v24.2d, v20.2d, v22.2d + ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1] + rev64 v7.4s, v25.4s + fmul v25.4s, v25.4s, v4.s[1] + ld1 {v16.4s}, [x0] // {z[0],z[1]} + fmul v7.4s, v7.4s, v29.4s + ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]} + prfm pldl1keep, [x2, #16] + prfm pldl1keep, [x3, #16] + fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} + prfm pldl1keep, [x0, #16] + prfm pldl1keep, [x1, #16] + + zip1 v20.4s, v24.4s, v25.4s + zip2 v21.4s, v24.4s, v25.4s + fneg v22.4s, v20.4s + fadd v4.4s, v21.4s, v20.4s + fsub v6.4s, v20.4s, v21.4s // just the second half + fadd v5.4s, v21.4s, v22.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fadd v20.4s, v16.4s, v4.4s + fsub v22.4s, v16.4s, v4.4s + fadd v21.4s, v17.4s, v5.4s + st1 {v20.4s}, [x0], #16 // {z[0], z[1]} + fsub v23.4s, v17.4s, v5.4s + + st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} + st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} + st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} +1: + ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} + ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} + ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} + transpose v26.2d, v27.2d, v20.2d, v22.2d + ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]} + rev64 v6.4s, v26.4s + fmul v26.4s, v26.4s, v4.s[0] + rev64 v7.4s, v27.4s + fmul v27.4s, v27.4s, v4.s[1] + fmul v6.4s, v6.4s, v29.4s + fmul v7.4s, v7.4s, v29.4s + ld1 {v16.4s},[x0] // {z[0],z[1]} + fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6} + fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} + ld1 {v17.4s},[x1] // {z[o1],z[o1+1]} + + subs x6, x6, #1 // n-- + + zip1 v20.4s, v26.4s, v27.4s + zip2 v21.4s, v26.4s, v27.4s + fneg v22.4s, v20.4s + fadd v4.4s, v21.4s, v20.4s + fsub v6.4s, v20.4s, 
v21.4s // just the second half + fadd v5.4s, v21.4s, v22.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fadd v20.4s, v16.4s, v4.4s + fsub v22.4s, v16.4s, v4.4s + fadd v21.4s, v17.4s, v5.4s + st1 {v20.4s}, [x0], #16 // {z[0], z[1]} + fsub v23.4s, v17.4s, v5.4s + + st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} + st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} + st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} + b.ne 1b + + ret +endfunc +/* def_fft n, n2, n4: defines fft<n>_neon, which runs fft<n2>_neon on x0, then fft<n4>_neon at byte offsets n4*2*8 and n4*3*8, restores x0, and tail-branches to fft_pass_neon with x4 = ff_cos_<n> and x2 = n4>>1. */ +.macro def_fft n, n2, n4 +function fft\n\()_neon, align=6 + AARCH64_VALID_JUMP_TARGET + AARCH64_SIGN_LINK_REGISTER + stp x28, x30, [sp, #-16]! + add x28, x0, #\n4*2*8 + bl fft\n2\()_neon + mov x0, x28 + bl fft\n4\()_neon + add x0, x28, #\n4*1*8 + bl fft\n4\()_neon + sub x0, x28, #\n4*2*8 + ldp x28, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + movrel x4, X(ff_cos_\n) + mov x2, #\n4>>1 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 +/* ff_fft_calc_neon(FFTContext *s, FFTComplex *z): loads v30/v31 (trans4_float/trans8_float), v29 (pmmp), v28 (mppm), x7 = -8 and x14 = ff_cos_16, moves z into x0, and branches to fft_tab_neon[w2 - 2], where w2 is the 32-bit word at offset 0 of s (the permute routine labels the same load nbits). */ +function ff_fft_calc_neon, export=1 + prfm pldl1keep, [x1] + movrel x10, trans4_float + ldr w2, [x0] + movrel x11, trans8_float + sub w2, w2, #2 + movrel x3, fft_tab_neon + ld1 {v30.16b}, [x10] + mov x7, #-8 + movrel x12, pmmp + ldr x3, [x3, x2, lsl #3] + movrel x13, mppm + movrel x14, X(ff_cos_16) + ld1 {v31.16b}, [x11] + mov x0, x1 + ld1 {v29.4s}, [x12] // pmmp + ld1 {v28.4s}, [x13] + br x3 +endfunc +/* ff_fft_permute_neon(FFTContext *s, FFTComplex *z): scatters z into tmp_buf (pointer at offset 16 of s) using pairs of 16-bit indices read from revtab (pointer at offset 8 of s), then copies the 1 << nbits elements of tmp_buf back over z. */ +function ff_fft_permute_neon, export=1 + mov x6, #1 + ldr w2, [x0] // nbits + ldr x3, [x0, #16] // tmp_buf + ldr x0, [x0, #8] // revtab + lsl x6, x6, x2 + mov x2, x6 +1: + ld1 {v0.2s,v1.2s}, [x1], #16 + ldr w4, [x0], #4 + uxth w5, w4 + lsr w4, w4, #16 + add x5, x3, x5, lsl #3 + add x4, x3, x4, lsl #3 + st1 {v0.2s}, [x5] + st1 {v1.2s}, [x4] + 
subs x6, x6, #2 + b.gt 1b + + sub x1, x1, x2, lsl #3 +1: + ld1 {v0.4s,v1.4s}, [x3], #32 + st1 {v0.4s,v1.4s}, [x1], #32 + subs x2, x2, #4 + b.gt 1b + + ret +endfunc +/* Entry i holds the routine for nbits = i + 2 (indexed by ff_fft_calc_neon). */ +const fft_tab_neon, relocate=1 + .quad fft4_neon + .quad fft8_neon + .quad fft16_neon + .quad fft32_neon + .quad fft64_neon + .quad fft128_neon + .quad fft256_neon + .quad fft512_neon + .quad fft1024_neon + .quad fft2048_neon + .quad fft4096_neon + .quad fft8192_neon + .quad fft16384_neon + .quad fft32768_neon + .quad fft65536_neon +endconst +/* Loaded into v29 by ff_fft_calc_neon. */ +const pmmp, align=4 + .float +1.0, -1.0, -1.0, +1.0 +endconst +/* Loaded into v28 by ff_fft_calc_neon. */ +const mppm, align=4 + .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +endconst diff --git a/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c new file mode 100644 index 0000000000..00fc7b20f1 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c @@ -0,0 +1,59 @@ +/* + * ARM NEON optimised H.264 chroma functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/h264chroma.h" + +#include "config.h" + +void ff_put_h264_chroma_mc8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +void ff_put_h264_chroma_mc4_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +void ff_put_h264_chroma_mc2_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + +void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +/* Install the NEON chroma MC routines into *c: slots 0, 1 and 2 of the put and avg tables get the mc8, mc4 and mc2 functions. Nothing is changed when NEON is unavailable or bit_depth > 8. */ +av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth) +{ + const int high_bit_depth = bit_depth > 8; + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags) && !high_bit_depth) { + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; + c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; + + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; + c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon; + } +} diff --git a/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S new file mode 100644 index 0000000000..88ccd727d0 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * 
Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "libavutil/aarch64/asm.S" + +/* chroma_mc8(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y). Weights computed below: w4 = 64 - 8x - 8y + xy, w12 = 8x - xy, w6 = 8y - xy, w7 = xy. Label 1 handles xy != 0; otherwise label 5 handles x == y == 0, label 4 handles y == 0 and label 3 the remaining case. Every loop produces two rows per iteration. h264 rounds with rshrn #6; the other codecs add the bias in v22 (looked up in rv40bias for rv40, 28 for vc1) and use shrn #6. */ +.macro h264_chroma_mc8 type, codec=h264 +function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 + .ifc \type,avg + mov x8, x0 + .endif + prfm pldl1strm, [x1] + prfm pldl1strm, [x1, x2] + .ifc \codec,rv40 + movrel x6, rv40bias + lsr w9, w5, #1 + lsr w10, w4, #1 + lsl w9, w9, #3 + lsl w10, w10, #1 + add w9, w9, w10 + add x6, x6, w9, UXTW + ld1r {v22.8H}, [x6] + .endif + .ifc \codec,vc1 + movi v22.8H, #28 + .endif + mul w7, w4, w5 + lsl w14, w5, #3 + lsl w13, w4, #3 + cmp w7, #0 + sub w6, w14, w7 + sub w12, w13, w7 + sub w4, w7, w13 + sub w4, w4, w14 + add w4, w4, #64 + b.eq 2f + + dup v0.8B, w4 + dup v1.8B, w12 + ld1 {v4.8B, v5.8B}, [x1], x2 + dup v2.8B, w6 + dup v3.8B, w7 + ext v5.8B, v4.8B, v5.8B, #1 +1: ld1 {v6.8B, v7.8B}, [x1], x2 + umull v16.8H, v4.8B, v0.8B + umlal v16.8H, v5.8B, v1.8B + ext v7.8B, v6.8B, v7.8B, #1 + ld1 {v4.8B, v5.8B}, [x1], x2 + umlal v16.8H, v6.8B, v2.8B + prfm pldl1strm, [x1] + ext v5.8B, v4.8B, v5.8B, #1 + umlal v16.8H, v7.8B, v3.8B + umull v17.8H, v6.8B, 
v0.8B + subs w3, w3, #2 + umlal v17.8H, v7.8B, v1.8B + umlal v17.8H, v4.8B, v2.8B + umlal v17.8H, v5.8B, v3.8B + prfm pldl1strm, [x1, x2] + .ifc \codec,h264 + rshrn v16.8B, v16.8H, #6 + rshrn v17.8B, v17.8H, #6 + .else + add v16.8H, v16.8H, v22.8H + add v17.8H, v17.8H, v22.8H + shrn v16.8B, v16.8H, #6 + shrn v17.8B, v17.8H, #6 + .endif + .ifc \type,avg + ld1 {v20.8B}, [x8], x2 + ld1 {v21.8B}, [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + urhadd v17.8B, v17.8B, v21.8B + .endif + st1 {v16.8B}, [x0], x2 + st1 {v17.8B}, [x0], x2 + b.gt 1b + ret + +2: adds w12, w12, w6 + dup v0.8B, w4 + b.eq 5f + tst w6, w6 + dup v1.8B, w12 + b.eq 4f + + ld1 {v4.8B}, [x1], x2 +3: ld1 {v6.8B}, [x1], x2 + umull v16.8H, v4.8B, v0.8B + umlal v16.8H, v6.8B, v1.8B + ld1 {v4.8B}, [x1], x2 + umull v17.8H, v6.8B, v0.8B + umlal v17.8H, v4.8B, v1.8B + prfm pldl1strm, [x1] + .ifc \codec,h264 + rshrn v16.8B, v16.8H, #6 + rshrn v17.8B, v17.8H, #6 + .else + add v16.8H, v16.8H, v22.8H + add v17.8H, v17.8H, v22.8H + shrn v16.8B, v16.8H, #6 + shrn v17.8B, v17.8H, #6 + .endif + prfm pldl1strm, [x1, x2] + .ifc \type,avg + ld1 {v20.8B}, [x8], x2 + ld1 {v21.8B}, [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + urhadd v17.8B, v17.8B, v21.8B + .endif + subs w3, w3, #2 + st1 {v16.8B}, [x0], x2 + st1 {v17.8B}, [x0], x2 + b.gt 3b + ret + +4: ld1 {v4.8B, v5.8B}, [x1], x2 + ld1 {v6.8B, v7.8B}, [x1], x2 + ext v5.8B, v4.8B, v5.8B, #1 + ext v7.8B, v6.8B, v7.8B, #1 + prfm pldl1strm, [x1] + subs w3, w3, #2 + umull v16.8H, v4.8B, v0.8B + umlal v16.8H, v5.8B, v1.8B + umull v17.8H, v6.8B, v0.8B + umlal v17.8H, v7.8B, v1.8B + prfm pldl1strm, [x1, x2] + .ifc \codec,h264 + rshrn v16.8B, v16.8H, #6 + rshrn v17.8B, v17.8H, #6 + .else + add v16.8H, v16.8H, v22.8H + add v17.8H, v17.8H, v22.8H + shrn v16.8B, v16.8H, #6 + shrn v17.8B, v17.8H, #6 + .endif + .ifc \type,avg + ld1 {v20.8B}, [x8], x2 + ld1 {v21.8B}, [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + urhadd v17.8B, v17.8B, v21.8B + .endif + st1 {v16.8B}, [x0], x2 + st1 {v17.8B}, [x0], x2 
+ b.gt 4b + ret + +5: ld1 {v4.8B}, [x1], x2 + ld1 {v5.8B}, [x1], x2 + prfm pldl1strm, [x1] + subs w3, w3, #2 + umull v16.8H, v4.8B, v0.8B + umull v17.8H, v5.8B, v0.8B + prfm pldl1strm, [x1, x2] + .ifc \codec,h264 + rshrn v16.8B, v16.8H, #6 + rshrn v17.8B, v17.8H, #6 + .else + add v16.8H, v16.8H, v22.8H + add v17.8H, v17.8H, v22.8H + shrn v16.8B, v16.8H, #6 + shrn v17.8B, v17.8H, #6 + .endif + .ifc \type,avg + ld1 {v20.8B}, [x8], x2 + ld1 {v21.8B}, [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + urhadd v17.8B, v17.8B, v21.8B + .endif + st1 {v16.8B}, [x0], x2 + st1 {v17.8B}, [x0], x2 + b.gt 5b + ret +endfunc +.endm + +/* chroma_mc4(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y). Computes the same four weights and uses the same label layout (1: xy != 0, 5: x == y == 0, 4: y == 0, 3: otherwise) as chroma_mc8, but stores 4-byte rows; two rows are packed into one 8-byte result per iteration. */ +.macro h264_chroma_mc4 type, codec=h264 +function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 + .ifc \type,avg + mov x8, x0 + .endif + prfm pldl1strm, [x1] + prfm pldl1strm, [x1, x2] + .ifc \codec,rv40 + movrel x6, rv40bias + lsr w9, w5, #1 + lsr w10, w4, #1 + lsl w9, w9, #3 + lsl w10, w10, #1 + add w9, w9, w10 + add x6, x6, w9, UXTW + ld1r {v22.8H}, [x6] + .endif + .ifc \codec,vc1 + movi v22.8H, #28 + .endif + mul w7, w4, w5 + lsl w14, w5, #3 + lsl w13, w4, #3 + cmp w7, #0 + sub w6, w14, w7 + sub w12, w13, w7 + sub w4, w7, w13 + sub w4, w4, w14 + add w4, w4, #64 + b.eq 2f + + dup v24.8B, w4 + dup v25.8B, w12 + ld1 {v4.8B}, [x1], x2 + dup v26.8B, w6 + dup v27.8B, w7 + ext v5.8B, v4.8B, v5.8B, #1 + trn1 v0.2S, v24.2S, v25.2S + trn1 v2.2S, v26.2S, v27.2S + trn1 v4.2S, v4.2S, v5.2S +1: ld1 {v6.8B}, [x1], x2 + ext v7.8B, v6.8B, v7.8B, #1 + trn1 v6.2S, v6.2S, v7.2S + umull v18.8H, v4.8B, v0.8B + umlal v18.8H, v6.8B, v2.8B + ld1 {v4.8B}, [x1], x2 + ext v5.8B, v4.8B, v5.8B, #1 + trn1 v4.2S, v4.2S, v5.2S + prfm pldl1strm, [x1] + umull v19.8H, v6.8B, v0.8B + umlal v19.8H, v4.8B, v2.8B + trn1 v30.2D, v18.2D, v19.2D + trn2 v31.2D, v18.2D, v19.2D + add v18.8H, v30.8H, v31.8H + .ifc \codec,h264 + rshrn v16.8B, v18.8H, #6 + .else + add v18.8H, v18.8H, v22.8H + shrn v16.8B, 
v18.8H, #6 + .endif + subs w3, w3, #2 + prfm pldl1strm, [x1, x2] + .ifc \type,avg + ld1 {v20.S}[0], [x8], x2 + ld1 {v20.S}[1], [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + .endif + st1 {v16.S}[0], [x0], x2 + st1 {v16.S}[1], [x0], x2 + b.gt 1b + ret + +2: adds w12, w12, w6 + dup v30.8B, w4 + b.eq 5f + tst w6, w6 + dup v31.8B, w12 + trn1 v0.2S, v30.2S, v31.2S + trn2 v1.2S, v30.2S, v31.2S + b.eq 4f + + ext v1.8B, v0.8B, v1.8B, #4 + ld1 {v4.S}[0], [x1], x2 +3: ld1 {v4.S}[1], [x1], x2 + umull v18.8H, v4.8B, v0.8B + ld1 {v4.S}[0], [x1], x2 + umull v19.8H, v4.8B, v1.8B + trn1 v30.2D, v18.2D, v19.2D + trn2 v31.2D, v18.2D, v19.2D + add v18.8H, v30.8H, v31.8H + prfm pldl1strm, [x1] + .ifc \codec,h264 + rshrn v16.8B, v18.8H, #6 + .else + add v18.8H, v18.8H, v22.8H + shrn v16.8B, v18.8H, #6 + .endif + .ifc \type,avg + ld1 {v20.S}[0], [x8], x2 + ld1 {v20.S}[1], [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + .endif + subs w3, w3, #2 + prfm pldl1strm, [x1, x2] + st1 {v16.S}[0], [x0], x2 + st1 {v16.S}[1], [x0], x2 + b.gt 3b + ret + +4: ld1 {v4.8B}, [x1], x2 + ld1 {v6.8B}, [x1], x2 + ext v5.8B, v4.8B, v5.8B, #1 + ext v7.8B, v6.8B, v7.8B, #1 + trn1 v4.2S, v4.2S, v5.2S + trn1 v6.2S, v6.2S, v7.2S + umull v18.8H, v4.8B, v0.8B + umull v19.8H, v6.8B, v0.8B + subs w3, w3, #2 + trn1 v30.2D, v18.2D, v19.2D + trn2 v31.2D, v18.2D, v19.2D + add v18.8H, v30.8H, v31.8H + prfm pldl1strm, [x1] + .ifc \codec,h264 + rshrn v16.8B, v18.8H, #6 + .else + add v18.8H, v18.8H, v22.8H + shrn v16.8B, v18.8H, #6 + .endif + .ifc \type,avg + ld1 {v20.S}[0], [x8], x2 + ld1 {v20.S}[1], [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + .endif + prfm pldl1strm, [x1] + st1 {v16.S}[0], [x0], x2 + st1 {v16.S}[1], [x0], x2 + b.gt 4b + ret + +5: ld1 {v4.S}[0], [x1], x2 + ld1 {v4.S}[1], [x1], x2 + umull v18.8H, v4.8B, v30.8B + subs w3, w3, #2 + prfm pldl1strm, [x1] + .ifc \codec,h264 + rshrn v16.8B, v18.8H, #6 + .else + add v18.8H, v18.8H, v22.8H + shrn v16.8B, v18.8H, #6 + .endif + .ifc \type,avg + ld1 {v20.S}[0], [x8], x2 + ld1 
{v20.S}[1], [x8], x2 + urhadd v16.8B, v16.8B, v20.8B + .endif + prfm pldl1strm, [x1] + st1 {v16.S}[0], [x0], x2 + st1 {v16.S}[1], [x0], x2 + b.gt 5b + ret +endfunc +.endm +/* h264_chroma_mc2: x0 = dst, x1 = src, x2 = stride, w3 = h, w4 = x, w5 = y. When (x | y) == 0 the loop at label 2 just copies (put) or averages with dst (avg) 2-byte rows; otherwise the loop at label 1 applies the four weights and rounds with rshrn #6. Two rows per iteration. */ +.macro h264_chroma_mc2 type +function ff_\type\()_h264_chroma_mc2_neon, export=1 + prfm pldl1strm, [x1] + prfm pldl1strm, [x1, x2] + orr w7, w4, w5 + cbz w7, 2f + + mul w7, w4, w5 + lsl w14, w5, #3 + lsl w13, w4, #3 + sub w6, w14, w7 + sub w12, w13, w7 + sub w4, w7, w13 + sub w4, w4, w14 + add w4, w4, #64 + dup v0.8B, w4 + dup v2.8B, w12 + dup v1.8B, w6 + dup v3.8B, w7 + trn1 v0.4H, v0.4H, v2.4H + trn1 v1.4H, v1.4H, v3.4H +1: + ld1 {v4.S}[0], [x1], x2 + ld1 {v4.S}[1], [x1], x2 + rev64 v5.2S, v4.2S + ld1 {v5.S}[1], [x1] + ext v6.8B, v4.8B, v5.8B, #1 + ext v7.8B, v5.8B, v4.8B, #1 + trn1 v4.4H, v4.4H, v6.4H + trn1 v5.4H, v5.4H, v7.4H + umull v16.8H, v4.8B, v0.8B + umlal v16.8H, v5.8B, v1.8B + .ifc \type,avg + ld1 {v18.H}[0], [x0], x2 + ld1 {v18.H}[2], [x0] + sub x0, x0, x2 + .endif + rev64 v17.4S, v16.4S + add v16.8H, v16.8H, v17.8H + rshrn v16.8B, v16.8H, #6 + .ifc \type,avg + urhadd v16.8B, v16.8B, v18.8B + .endif + st1 {v16.H}[0], [x0], x2 + st1 {v16.H}[2], [x0], x2 + subs w3, w3, #2 + b.gt 1b + ret + +2: + ld1 {v16.H}[0], [x1], x2 + ld1 {v16.H}[1], [x1], x2 + .ifc \type,avg + ld1 {v18.H}[0], [x0], x2 + ld1 {v18.H}[1], [x0] + sub x0, x0, x2 + urhadd v16.8B, v16.8B, v18.8B + .endif + st1 {v16.H}[0], [x0], x2 + st1 {v16.H}[1], [x0], x2 + subs w3, w3, #2 + b.gt 2b + ret +endfunc +.endm + + h264_chroma_mc8 put + h264_chroma_mc8 avg + h264_chroma_mc4 put + h264_chroma_mc4 avg + h264_chroma_mc2 put + h264_chroma_mc2 avg + +#if CONFIG_RV40_DECODER +const rv40bias + .short 0, 16, 32, 16 + .short 32, 28, 32, 28 + .short 0, 32, 16, 32 + .short 32, 28, 32, 28 +endconst + + h264_chroma_mc8 put, rv40 + h264_chroma_mc8 avg, rv40 + h264_chroma_mc4 put, rv40 + h264_chroma_mc4 avg, rv40 +#endif + +#if CONFIG_VC1DSP + h264_chroma_mc8 put, vc1 + h264_chroma_mc8 avg, vc1 + h264_chroma_mc4 put, vc1 + 
h264_chroma_mc4 avg, vc1 +#endif diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c new file mode 100644 index 0000000000..6bf3ecb8a1 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/h264dsp.h" + +void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); +void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); +void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, ptrdiff_t 
stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma422_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); + +void ff_weight_h264_pixels_16_neon(uint8_t *dst, ptrdiff_t stride, int height, + int log2_den, int weight, int offset); +void ff_weight_h264_pixels_8_neon(uint8_t *dst, ptrdiff_t stride, int height, + int log2_den, int weight, int offset); +void ff_weight_h264_pixels_4_neon(uint8_t *dst, ptrdiff_t stride, int height, + int log2_den, int weight, int offset); + +void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int height, int log2_den, int weightd, + int weights, int offset); +void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int height, int log2_den, int weightd, + int weights, int offset); + +void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[5 * 8]); +void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[5 * 8]); +void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[15 * 8]); + +void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride); +void 
ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[5 * 8]); + +void ff_h264_v_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); +void ff_h264_h_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta); +void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +/* Install NEON implementations into *c when av_get_cpu_flags() reports NEON. For bit_depth == 8: luma and chroma loop filters, weight/biweight tables and the IDCT entry points (h264_idct_add8 only when chroma_format_idc <= 1). For bit_depth == 10: only the chroma loop filters; the luma *_neon_10 prototypes declared above are not installed here. In both cases chroma_format_idc <= 1 selects the plain horizontal chroma filters, otherwise the chroma422 variants are used and h264_h_loop_filter_chroma_mbaff is also set. Any other bit depth leaves *c untouched. */ +av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags) && bit_depth == 8) { + c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; + c->h264_v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; + c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; + + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; + + if (chroma_format_idc <= 1) { + 
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; + } else { + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon; + c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon; + } + + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; + + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; + + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + c->h264_idct_add16 = ff_h264_idct_add16_neon; + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; + if (chroma_format_idc <= 1) + c->h264_idct_add8 = ff_h264_idct_add8_neon; + c->h264_idct8_add = ff_h264_idct8_add_neon; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; + c->h264_idct8_add4 = ff_h264_idct8_add4_neon; + } else if (have_neon(cpu_flags) && bit_depth == 10) { + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10; + c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10; + + if (chroma_format_idc <= 1) { + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10; + } else { + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10; + 
c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10; + c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10; + c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10; + } + } +} diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S new file mode 100644 index 0000000000..ea221e6862 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S @@ -0,0 +1,1076 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> + * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +.macro h264_loop_filter_start + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + mov v24.S[0], w6 + and w8, w6, w6, lsl #16 + b.eq 1f + ands w8, w8, w8, lsl #8 + b.ge 2f +1: + ret +2: +.endm + +.macro h264_loop_filter_luma + dup v22.16B, w2 // alpha + uxtl v24.8H, v24.8B + uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0) + uxtl v24.4S, v24.4H + uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0) + sli v24.8H, v24.8H, #8 + uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0) + sli v24.4S, v24.4S, #16 + cmhi v21.16B, v22.16B, v21.16B // < alpha + dup v22.16B, w3 // beta + cmlt v23.16B, v24.16B, #0 + cmhi v28.16B, v22.16B, v28.16B // < beta + cmhi v30.16B, v22.16B, v30.16B // < beta + bic v21.16B, v21.16B, v23.16B + uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0) + and v21.16B, v21.16B, v28.16B + uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0) + and v21.16B, v21.16B, v30.16B // < beta + shrn v30.8b, v21.8h, #4 + mov x7, v30.d[0] + cmhi v17.16B, v22.16B, v17.16B // < beta + cmhi v19.16B, v22.16B, v19.16B // < beta + cbz x7, 9f + and v17.16B, v17.16B, v21.16B + and v19.16B, v19.16B, v21.16B + and v24.16B, v24.16B, v21.16B + urhadd v28.16B, v16.16B, v0.16B + sub v21.16B, v24.16B, v17.16B + uqadd v23.16B, v18.16B, v24.16B + uhadd v20.16B, v20.16B, v28.16B + sub v21.16B, v21.16B, v19.16B + uhadd v28.16B, v4.16B, v28.16B + umin v23.16B, v23.16B, v20.16B + uqsub v22.16B, v18.16B, v24.16B + uqadd v4.16B, v2.16B, v24.16B + umax v23.16B, v23.16B, v22.16B + uqsub v22.16B, v2.16B, v24.16B + umin v28.16B, v4.16B, v28.16B + uxtl v4.8H, v0.8B + umax v28.16B, v28.16B, v22.16B + uxtl2 v20.8H, v0.16B + usubw v4.8H, v4.8H, v16.8B + usubw2 v20.8H, v20.8H, v16.16B + shl v4.8H, v4.8H, #2 + shl v20.8H, v20.8H, #2 + uaddw v4.8H, 
v4.8H, v18.8B + uaddw2 v20.8H, v20.8H, v18.16B + usubw v4.8H, v4.8H, v2.8B + usubw2 v20.8H, v20.8H, v2.16B + rshrn v4.8B, v4.8H, #3 + rshrn2 v4.16B, v20.8H, #3 + bsl v17.16B, v23.16B, v18.16B + bsl v19.16B, v28.16B, v2.16B + neg v23.16B, v21.16B + uxtl v28.8H, v16.8B + smin v4.16B, v4.16B, v21.16B + uxtl2 v21.8H, v16.16B + smax v4.16B, v4.16B, v23.16B + uxtl v22.8H, v0.8B + uxtl2 v24.8H, v0.16B + saddw v28.8H, v28.8H, v4.8B + saddw2 v21.8H, v21.8H, v4.16B + ssubw v22.8H, v22.8H, v4.8B + ssubw2 v24.8H, v24.8H, v4.16B + sqxtun v16.8B, v28.8H + sqxtun2 v16.16B, v21.8H + sqxtun v0.8B, v22.8H + sqxtun2 v0.16B, v24.8H +.endm + +function ff_h264_v_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + ld1 {v0.16B}, [x0], x1 + ld1 {v2.16B}, [x0], x1 + ld1 {v4.16B}, [x0], x1 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + ld1 {v20.16B}, [x0], x1 + ld1 {v18.16B}, [x0], x1 + ld1 {v16.16B}, [x0], x1 + + h264_loop_filter_luma + + sub x0, x0, x1, lsl #1 + st1 {v17.16B}, [x0], x1 + st1 {v16.16B}, [x0], x1 + st1 {v0.16B}, [x0], x1 + st1 {v19.16B}, [x0] +9: + ret +endfunc + +function ff_h264_h_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + sub x0, x0, #4 + ld1 {v6.8B}, [x0], x1 + ld1 {v20.8B}, [x0], x1 + ld1 {v18.8B}, [x0], x1 + ld1 {v16.8B}, [x0], x1 + ld1 {v0.8B}, [x0], x1 + ld1 {v2.8B}, [x0], x1 + ld1 {v4.8B}, [x0], x1 + ld1 {v26.8B}, [x0], x1 + ld1 {v6.D}[1], [x0], x1 + ld1 {v20.D}[1], [x0], x1 + ld1 {v18.D}[1], [x0], x1 + ld1 {v16.D}[1], [x0], x1 + ld1 {v0.D}[1], [x0], x1 + ld1 {v2.D}[1], [x0], x1 + ld1 {v4.D}[1], [x0], x1 + ld1 {v26.D}[1], [x0], x1 + + transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23 + + h264_loop_filter_luma + + transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27 + + sub x0, x0, x1, lsl #4 + add x0, x0, #2 + st1 {v17.S}[0], [x0], x1 + st1 {v16.S}[0], [x0], x1 + st1 {v0.S}[0], [x0], x1 + st1 {v19.S}[0], [x0], x1 + st1 {v17.S}[1], [x0], x1 + st1 {v16.S}[1], [x0], x1 + st1 {v0.S}[1], [x0], x1 + st1 {v19.S}[1], [x0], x1 + 
st1 {v17.S}[2], [x0], x1 + st1 {v16.S}[2], [x0], x1 + st1 {v0.S}[2], [x0], x1 + st1 {v19.S}[2], [x0], x1 + st1 {v17.S}[3], [x0], x1 + st1 {v16.S}[3], [x0], x1 + st1 {v0.S}[3], [x0], x1 + st1 {v19.S}[3], [x0], x1 +9: + ret +endfunc + + +.macro h264_loop_filter_start_intra + orr w4, w2, w3 + cbnz w4, 1f + ret +1: + dup v30.16b, w2 // alpha + dup v31.16b, w3 // beta +.endm + +.macro h264_loop_filter_luma_intra + uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) + uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) + uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) + cmhi v19.16b, v30.16b, v16.16b // < alpha + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + movi v29.16b, #2 + ushr v30.16b, v30.16b, #2 // alpha >> 2 + add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 + cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 + + and v19.16b, v19.16b, v17.16b + and v19.16b, v19.16b, v18.16b + shrn v20.8b, v19.8h, #4 + mov x4, v20.d[0] + cbz x4, 9f + + ushll v20.8h, v6.8b, #1 + ushll v22.8h, v1.8b, #1 + ushll2 v21.8h, v6.16b, #1 + ushll2 v23.8h, v1.16b, #1 + uaddw v20.8h, v20.8h, v7.8b + uaddw v22.8h, v22.8h, v0.8b + uaddw2 v21.8h, v21.8h, v7.16b + uaddw2 v23.8h, v23.8h, v0.16b + uaddw v20.8h, v20.8h, v1.8b + uaddw v22.8h, v22.8h, v6.8b + uaddw2 v21.8h, v21.8h, v1.16b + uaddw2 v23.8h, v23.8h, v6.16b + + rshrn v24.8b, v20.8h, #2 // p0'_1 + rshrn v25.8b, v22.8h, #2 // q0'_1 + rshrn2 v24.16b, v21.8h, #2 // p0'_1 + rshrn2 v25.16b, v23.8h, #2 // q0'_1 + + uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0) + uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + and v17.16b, v16.16b, v17.16b // if_2 && if_3 + and v18.16b, v16.16b, v18.16b // if_2 && if_4 + + not v30.16b, v17.16b + not v31.16b, v18.16b + + and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3) + and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4) + + and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3 + and 
v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4 + + //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4 + uaddl v26.8h, v5.8b, v7.8b + uaddl2 v27.8h, v5.16b, v7.16b + uaddw v26.8h, v26.8h, v0.8b + uaddw2 v27.8h, v27.8h, v0.16b + add v20.8h, v20.8h, v26.8h + add v21.8h, v21.8h, v27.8h + uaddw v20.8h, v20.8h, v0.8b + uaddw2 v21.8h, v21.8h, v0.16b + rshrn v20.8b, v20.8h, #3 // p0'_2 + rshrn2 v20.16b, v21.8h, #3 // p0'_2 + uaddw v26.8h, v26.8h, v6.8b + uaddw2 v27.8h, v27.8h, v6.16b + rshrn v21.8b, v26.8h, #2 // p1'_2 + rshrn2 v21.16b, v27.8h, #2 // p1'_2 + uaddl v28.8h, v4.8b, v5.8b + uaddl2 v29.8h, v4.16b, v5.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v19.8b, v28.8h, #3 // p2'_2 + rshrn2 v19.16b, v29.8h, #3 // p2'_2 + + //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3 + uaddl v26.8h, v2.8b, v0.8b + uaddl2 v27.8h, v2.16b, v0.16b + uaddw v26.8h, v26.8h, v7.8b + uaddw2 v27.8h, v27.8h, v7.16b + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + uaddw v22.8h, v22.8h, v7.8b + uaddw2 v23.8h, v23.8h, v7.16b + rshrn v22.8b, v22.8h, #3 // q0'_2 + rshrn2 v22.16b, v23.8h, #3 // q0'_2 + uaddw v26.8h, v26.8h, v1.8b + uaddw2 v27.8h, v27.8h, v1.16b + rshrn v23.8b, v26.8h, #2 // q1'_2 + rshrn2 v23.16b, v27.8h, #2 // q1'_2 + uaddl v28.8h, v2.8b, v3.8b + uaddl2 v29.8h, v2.16b, v3.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v26.8b, v28.8h, #3 // q2'_2 + rshrn2 v26.16b, v29.8h, #3 // q2'_2 + + bit v7.16b, v24.16b, v30.16b // p0'_1 + bit v0.16b, v25.16b, v31.16b // q0'_1 + bit v7.16b, v20.16b, v17.16b // p0'_2 + bit v6.16b, v21.16b, v17.16b // p1'_2 + bit v5.16b, v19.16b, v17.16b // p2'_2 + bit v0.16b, v22.16b, v18.16b // q0'_2 + bit v1.16b, v23.16b, v18.16b // q1'_2 + bit v2.16b, v26.16b, v18.16b // q2'_2 +.endm + +function ff_h264_v_loop_filter_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + ld1 {v0.16b}, [x0], x1 // q0 + 
ld1 {v1.16b}, [x0], x1 // q1 + ld1 {v2.16b}, [x0], x1 // q2 + ld1 {v3.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #3 + ld1 {v4.16b}, [x0], x1 // p3 + ld1 {v5.16b}, [x0], x1 // p2 + ld1 {v6.16b}, [x0], x1 // p1 + ld1 {v7.16b}, [x0] // p0 + + h264_loop_filter_luma_intra + + sub x0, x0, x1, lsl #1 + st1 {v5.16b}, [x0], x1 // p2 + st1 {v6.16b}, [x0], x1 // p1 + st1 {v7.16b}, [x0], x1 // p0 + st1 {v0.16b}, [x0], x1 // q0 + st1 {v1.16b}, [x0], x1 // q1 + st1 {v2.16b}, [x0] // q2 +9: + ret +endfunc + +function ff_h264_h_loop_filter_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x0, x0, #4 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 + + transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + h264_loop_filter_luma_intra + + transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + sub x0, x0, x1, lsl #4 + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + st1 {v7.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v7.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x0], x1 +9: + ret +endfunc + +.macro h264_loop_filter_chroma + dup v22.8B, w2 // alpha + dup v23.8B, w3 // beta + uxtl v24.8H, v24.8B + uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0) + uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0) + uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0) + cmhi v26.8B, v22.8B, v26.8B // < alpha + cmhi v28.8B, v23.8B, v28.8B // < beta + cmhi v30.8B, v23.8B, v30.8B // < beta + uxtl v4.8H, v0.8B + and 
v26.8B, v26.8B, v28.8B + usubw v4.8H, v4.8H, v16.8B + and v26.8B, v26.8B, v30.8B + shl v4.8H, v4.8H, #2 + mov x8, v26.d[0] + sli v24.8H, v24.8H, #8 + uaddw v4.8H, v4.8H, v18.8B + cbz x8, 9f + usubw v4.8H, v4.8H, v2.8B + rshrn v4.8B, v4.8H, #3 + smin v4.8B, v4.8B, v24.8B + neg v25.8B, v24.8B + smax v4.8B, v4.8B, v25.8B + uxtl v22.8H, v0.8B + and v4.8B, v4.8B, v26.8B + uxtl v28.8H, v16.8B + saddw v28.8H, v28.8H, v4.8B + ssubw v22.8H, v22.8H, v4.8B + sqxtun v16.8B, v28.8H + sqxtun v0.8B, v22.8H +.endm + +function ff_h264_v_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub x0, x0, x1, lsl #1 + ld1 {v18.8B}, [x0], x1 + ld1 {v16.8B}, [x0], x1 + ld1 {v0.8B}, [x0], x1 + ld1 {v2.8B}, [x0] + + h264_loop_filter_chroma + + sub x0, x0, x1, lsl #1 + st1 {v16.8B}, [x0], x1 + st1 {v0.8B}, [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub x0, x0, #2 +h_loop_filter_chroma420: + ld1 {v18.S}[0], [x0], x1 + ld1 {v16.S}[0], [x0], x1 + ld1 {v0.S}[0], [x0], x1 + ld1 {v2.S}[0], [x0], x1 + ld1 {v18.S}[1], [x0], x1 + ld1 {v16.S}[1], [x0], x1 + ld1 {v0.S}[1], [x0], x1 + ld1 {v2.S}[1], [x0], x1 + + transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 + + h264_loop_filter_chroma + + transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 + + sub x0, x0, x1, lsl #3 + st1 {v18.S}[0], [x0], x1 + st1 {v16.S}[0], [x0], x1 + st1 {v0.S}[0], [x0], x1 + st1 {v2.S}[0], [x0], x1 + st1 {v18.S}[1], [x0], x1 + st1 {v16.S}[1], [x0], x1 + st1 {v0.S}[1], [x0], x1 + st1 {v2.S}[1], [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_neon, export=1 + h264_loop_filter_start + add x5, x0, x1 + sub x0, x0, #2 + add x1, x1, x1 + mov x7, x30 + bl h_loop_filter_chroma420 + mov x30, x7 + sub x0, x5, #2 + mov v24.s[0], w6 + b h_loop_filter_chroma420 +endfunc + +.macro h264_loop_filter_chroma_intra + uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0) + uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0) + uabd v28.8b, v19.8b, v17.8b // 
abs(q1 - q0) + cmhi v26.8b, v30.8b, v26.8b // < alpha + cmhi v27.8b, v31.8b, v27.8b // < beta + cmhi v28.8b, v31.8b, v28.8b // < beta + and v26.8b, v26.8b, v27.8b + and v26.8b, v26.8b, v28.8b + mov x2, v26.d[0] + + ushll v4.8h, v18.8b, #1 + ushll v6.8h, v19.8b, #1 + cbz x2, 9f + uaddl v20.8h, v16.8b, v19.8b + uaddl v22.8h, v17.8b, v18.8b + add v20.8h, v20.8h, v4.8h + add v22.8h, v22.8h, v6.8h + uqrshrn v24.8b, v20.8h, #2 + uqrshrn v25.8b, v22.8h, #2 + bit v16.8b, v24.8b, v26.8b + bit v17.8b, v25.8b, v26.8b +.endm + +function ff_h264_v_loop_filter_chroma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x0, x0, x1, lsl #1 + ld1 {v18.8b}, [x0], x1 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x0], x1 + ld1 {v19.8b}, [x0] + + h264_loop_filter_chroma_intra + + sub x0, x0, x1, lsl #1 + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #2 + sub x0, x0, #1 + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4], x1 + + transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.b,v17.b}[0], [x0], x1 + st2 {v16.b,v17.b}[1], [x0], x1 + st2 {v16.b,v17.b}[2], [x0], x1 + st2 {v16.b,v17.b}[3], [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #2 + sub x0, x0, #1 +h_loop_filter_chroma420_intra: + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4], x1 + ld1 {v18.s}[1], [x4], x1 + ld1 {v16.s}[1], [x4], x1 + ld1 {v17.s}[1], [x4], x1 + ld1 {v19.s}[1], [x4], x1 + + transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.b,v17.b}[0], [x0], x1 + st2 {v16.b,v17.b}[1], [x0], x1 + st2 {v16.b,v17.b}[2], [x0], x1 + st2 {v16.b,v17.b}[3], [x0], x1 + st2 {v16.b,v17.b}[4], [x0], x1 + st2 
{v16.b,v17.b}[5], [x0], x1 + st2 {v16.b,v17.b}[6], [x0], x1 + st2 {v16.b,v17.b}[7], [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_intra_neon, export=1 + h264_loop_filter_start_intra + sub x4, x0, #2 + add x5, x0, x1, lsl #3 + sub x0, x0, #1 + mov x7, x30 + bl h_loop_filter_chroma420_intra + sub x0, x5, #1 + mov x30, x7 + b h_loop_filter_chroma420_intra +endfunc + +.macro biweight_16 macs, macd + dup v0.16B, w5 + dup v1.16B, w6 + mov v4.16B, v16.16B + mov v6.16B, v16.16B +1: subs w3, w3, #2 + ld1 {v20.16B}, [x0], x2 + \macd v4.8H, v0.8B, v20.8B + \macd\()2 v6.8H, v0.16B, v20.16B + ld1 {v22.16B}, [x1], x2 + \macs v4.8H, v1.8B, v22.8B + \macs\()2 v6.8H, v1.16B, v22.16B + mov v24.16B, v16.16B + ld1 {v28.16B}, [x0], x2 + mov v26.16B, v16.16B + \macd v24.8H, v0.8B, v28.8B + \macd\()2 v26.8H, v0.16B, v28.16B + ld1 {v30.16B}, [x1], x2 + \macs v24.8H, v1.8B, v30.8B + \macs\()2 v26.8H, v1.16B, v30.16B + sshl v4.8H, v4.8H, v18.8H + sshl v6.8H, v6.8H, v18.8H + sqxtun v4.8B, v4.8H + sqxtun2 v4.16B, v6.8H + sshl v24.8H, v24.8H, v18.8H + sshl v26.8H, v26.8H, v18.8H + sqxtun v24.8B, v24.8H + sqxtun2 v24.16B, v26.8H + mov v6.16B, v16.16B + st1 {v4.16B}, [x7], x2 + mov v4.16B, v16.16B + st1 {v24.16B}, [x7], x2 + b.ne 1b + ret +.endm + +.macro biweight_8 macs, macd + dup v0.8B, w5 + dup v1.8B, w6 + mov v2.16B, v16.16B + mov v20.16B, v16.16B +1: subs w3, w3, #2 + ld1 {v4.8B}, [x0], x2 + \macd v2.8H, v0.8B, v4.8B + ld1 {v5.8B}, [x1], x2 + \macs v2.8H, v1.8B, v5.8B + ld1 {v6.8B}, [x0], x2 + \macd v20.8H, v0.8B, v6.8B + ld1 {v7.8B}, [x1], x2 + \macs v20.8H, v1.8B, v7.8B + sshl v2.8H, v2.8H, v18.8H + sqxtun v2.8B, v2.8H + sshl v20.8H, v20.8H, v18.8H + sqxtun v4.8B, v20.8H + mov v20.16B, v16.16B + st1 {v2.8B}, [x7], x2 + mov v2.16B, v16.16B + st1 {v4.8B}, [x7], x2 + b.ne 1b + ret +.endm + +.macro biweight_4 macs, macd + dup v0.8B, w5 + dup v1.8B, w6 + mov v2.16B, v16.16B + mov v20.16B,v16.16B +1: subs w3, w3, #4 + ld1 {v4.S}[0], [x0], x2 + ld1 {v4.S}[1], [x0], 
x2 + \macd v2.8H, v0.8B, v4.8B + ld1 {v5.S}[0], [x1], x2 + ld1 {v5.S}[1], [x1], x2 + \macs v2.8H, v1.8B, v5.8B + b.lt 2f + ld1 {v6.S}[0], [x0], x2 + ld1 {v6.S}[1], [x0], x2 + \macd v20.8H, v0.8B, v6.8B + ld1 {v7.S}[0], [x1], x2 + ld1 {v7.S}[1], [x1], x2 + \macs v20.8H, v1.8B, v7.8B + sshl v2.8H, v2.8H, v18.8H + sqxtun v2.8B, v2.8H + sshl v20.8H, v20.8H, v18.8H + sqxtun v4.8B, v20.8H + mov v20.16B, v16.16B + st1 {v2.S}[0], [x7], x2 + st1 {v2.S}[1], [x7], x2 + mov v2.16B, v16.16B + st1 {v4.S}[0], [x7], x2 + st1 {v4.S}[1], [x7], x2 + b.ne 1b + ret +2: sshl v2.8H, v2.8H, v18.8H + sqxtun v2.8B, v2.8H + st1 {v2.S}[0], [x7], x2 + st1 {v2.S}[1], [x7], x2 + ret +.endm + +.macro biweight_func w +function ff_biweight_h264_pixels_\w\()_neon, export=1 + lsr w8, w5, #31 + add w7, w7, #1 + eor w8, w8, w6, lsr #30 + orr w7, w7, #1 + dup v18.8H, w4 + lsl w7, w7, w4 + not v18.16B, v18.16B + dup v16.8H, w7 + mov x7, x0 + cbz w8, 10f + subs w8, w8, #1 + b.eq 20f + subs w8, w8, #1 + b.eq 30f + b 40f +10: biweight_\w umlal, umlal +20: neg w5, w5 + biweight_\w umlal, umlsl +30: neg w5, w5 + neg w6, w6 + biweight_\w umlsl, umlsl +40: neg w6, w6 + biweight_\w umlsl, umlal +endfunc +.endm + + biweight_func 16 + biweight_func 8 + biweight_func 4 + +.macro weight_16 add + dup v0.16B, w4 +1: subs w2, w2, #2 + ld1 {v20.16B}, [x0], x1 + umull v4.8H, v0.8B, v20.8B + umull2 v6.8H, v0.16B, v20.16B + ld1 {v28.16B}, [x0], x1 + umull v24.8H, v0.8B, v28.8B + umull2 v26.8H, v0.16B, v28.16B + \add v4.8H, v16.8H, v4.8H + srshl v4.8H, v4.8H, v18.8H + \add v6.8H, v16.8H, v6.8H + srshl v6.8H, v6.8H, v18.8H + sqxtun v4.8B, v4.8H + sqxtun2 v4.16B, v6.8H + \add v24.8H, v16.8H, v24.8H + srshl v24.8H, v24.8H, v18.8H + \add v26.8H, v16.8H, v26.8H + srshl v26.8H, v26.8H, v18.8H + sqxtun v24.8B, v24.8H + sqxtun2 v24.16B, v26.8H + st1 {v4.16B}, [x5], x1 + st1 {v24.16B}, [x5], x1 + b.ne 1b + ret +.endm + +.macro weight_8 add + dup v0.8B, w4 +1: subs w2, w2, #2 + ld1 {v4.8B}, [x0], x1 + umull v2.8H, v0.8B, v4.8B + ld1 
{v6.8B}, [x0], x1 + umull v20.8H, v0.8B, v6.8B + \add v2.8H, v16.8H, v2.8H + srshl v2.8H, v2.8H, v18.8H + sqxtun v2.8B, v2.8H + \add v20.8H, v16.8H, v20.8H + srshl v20.8H, v20.8H, v18.8H + sqxtun v4.8B, v20.8H + st1 {v2.8B}, [x5], x1 + st1 {v4.8B}, [x5], x1 + b.ne 1b + ret +.endm + +.macro weight_4 add + dup v0.8B, w4 +1: subs w2, w2, #4 + ld1 {v4.S}[0], [x0], x1 + ld1 {v4.S}[1], [x0], x1 + umull v2.8H, v0.8B, v4.8B + b.lt 2f + ld1 {v6.S}[0], [x0], x1 + ld1 {v6.S}[1], [x0], x1 + umull v20.8H, v0.8B, v6.8B + \add v2.8H, v16.8H, v2.8H + srshl v2.8H, v2.8H, v18.8H + sqxtun v2.8B, v2.8H + \add v20.8H, v16.8H, v20.8H + srshl v20.8H, v20.8h, v18.8H + sqxtun v4.8B, v20.8H + st1 {v2.S}[0], [x5], x1 + st1 {v2.S}[1], [x5], x1 + st1 {v4.S}[0], [x5], x1 + st1 {v4.S}[1], [x5], x1 + b.ne 1b + ret +2: \add v2.8H, v16.8H, v2.8H + srshl v2.8H, v2.8H, v18.8H + sqxtun v2.8B, v2.8H + st1 {v2.S}[0], [x5], x1 + st1 {v2.S}[1], [x5], x1 + ret +.endm + +.macro weight_func w +function ff_weight_h264_pixels_\w\()_neon, export=1 + cmp w3, #1 + mov w6, #1 + lsl w5, w5, w3 + dup v16.8H, w5 + mov x5, x0 + b.le 20f + sub w6, w6, w3 + dup v18.8H, w6 + cmp w4, #0 + b.lt 10f + weight_\w shadd +10: neg w4, w4 + weight_\w shsub +20: neg w6, w3 + dup v18.8H, w6 + cmp w4, #0 + b.lt 10f + weight_\w add +10: neg w4, w4 + weight_\w sub +endfunc +.endm + + weight_func 16 + weight_func 8 + weight_func 4 + +.macro h264_loop_filter_start_10 + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + lsl w2, w2, #2 + mov v24.S[0], w6 + lsl w3, w3, #2 + and w8, w6, w6, lsl #16 + b.eq 1f + ands w8, w8, w8, lsl #8 + b.ge 2f +1: + ret +2: +.endm + +.macro h264_loop_filter_start_intra_10 + orr w4, w2, w3 + cbnz w4, 1f + ret +1: + lsl w2, w2, #2 + lsl w3, w3, #2 + dup v30.8h, w2 // alpha + dup v31.8h, w3 // beta +.endm + +.macro h264_loop_filter_chroma_10 + dup v22.8h, w2 // alpha + dup v23.8h, w3 // beta + uxtl v24.8h, v24.8b // tc0 + + uabd v26.8h, v16.8h, v0.8h // abs(p0 - q0) + uabd v28.8h, v18.8h, v16.8h // abs(p1 - p0) 
+ uabd v30.8h, v2.8h, v0.8h // abs(q1 - q0) + cmhi v26.8h, v22.8h, v26.8h // < alpha + cmhi v28.8h, v23.8h, v28.8h // < beta + cmhi v30.8h, v23.8h, v30.8h // < beta + + and v26.16b, v26.16b, v28.16b + mov v4.16b, v0.16b + sub v4.8h, v4.8h, v16.8h + and v26.16b, v26.16b, v30.16b + shl v4.8h, v4.8h, #2 + mov x8, v26.d[0] + mov x9, v26.d[1] + sli v24.8h, v24.8h, #8 + uxtl v24.8h, v24.8b + add v4.8h, v4.8h, v18.8h + adds x8, x8, x9 + shl v24.8h, v24.8h, #2 + + b.eq 9f + + movi v31.8h, #3 // (tc0 - 1) << (BIT_DEPTH - 8)) + 1 + uqsub v24.8h, v24.8h, v31.8h + sub v4.8h, v4.8h, v2.8h + srshr v4.8h, v4.8h, #3 + smin v4.8h, v4.8h, v24.8h + neg v25.8h, v24.8h + smax v4.8h, v4.8h, v25.8h + and v4.16b, v4.16b, v26.16b + add v16.8h, v16.8h, v4.8h + sub v0.8h, v0.8h, v4.8h + + mvni v4.8h, #0xFC, lsl #8 // 1023 for clipping + movi v5.8h, #0 + smin v0.8h, v0.8h, v4.8h + smin v16.8h, v16.8h, v4.8h + smax v0.8h, v0.8h, v5.8h + smax v16.8h, v16.8h, v5.8h +.endm + +function ff_h264_v_loop_filter_chroma_neon_10, export=1 + h264_loop_filter_start_10 + + mov x10, x0 + sub x0, x0, x1, lsl #1 + ld1 {v18.8h}, [x0 ], x1 + ld1 {v0.8h}, [x10], x1 + ld1 {v16.8h}, [x0 ], x1 + ld1 {v2.8h}, [x10] + + h264_loop_filter_chroma_10 + + sub x0, x10, x1, lsl #1 + st1 {v16.8h}, [x0], x1 + st1 {v0.8h}, [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_neon_10, export=1 + h264_loop_filter_start_10 + + sub x0, x0, #4 // access the 2nd left pixel +h_loop_filter_chroma420_10: + add x10, x0, x1, lsl #2 + ld1 {v18.d}[0], [x0 ], x1 + ld1 {v18.d}[1], [x10], x1 + ld1 {v16.d}[0], [x0 ], x1 + ld1 {v16.d}[1], [x10], x1 + ld1 {v0.d}[0], [x0 ], x1 + ld1 {v0.d}[1], [x10], x1 + ld1 {v2.d}[0], [x0 ], x1 + ld1 {v2.d}[1], [x10], x1 + + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 + + h264_loop_filter_chroma_10 + + transpose_4x8H v18, v16, v0, v2, v28, v29, v30, v31 + + sub x0, x10, x1, lsl #3 + st1 {v18.d}[0], [x0], x1 + st1 {v16.d}[0], [x0], x1 + st1 {v0.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + 
st1 {v18.d}[1], [x0], x1 + st1 {v16.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_neon_10, export=1 + h264_loop_filter_start_10 + add x5, x0, x1 + sub x0, x0, #4 + add x1, x1, x1 + mov x7, x30 + bl h_loop_filter_chroma420_10 + mov x30, x7 + sub x0, x5, #4 + mov v24.s[0], w6 + b h_loop_filter_chroma420_10 +endfunc + +.macro h264_loop_filter_chroma_intra_10 + uabd v26.8h, v16.8h, v17.8h // abs(p0 - q0) + uabd v27.8h, v18.8h, v16.8h // abs(p1 - p0) + uabd v28.8h, v19.8h, v17.8h // abs(q1 - q0) + cmhi v26.8h, v30.8h, v26.8h // < alpha + cmhi v27.8h, v31.8h, v27.8h // < beta + cmhi v28.8h, v31.8h, v28.8h // < beta + and v26.16b, v26.16b, v27.16b + and v26.16b, v26.16b, v28.16b + mov x2, v26.d[0] + mov x3, v26.d[1] + + shl v4.8h, v18.8h, #1 + shl v6.8h, v19.8h, #1 + + adds x2, x2, x3 + b.eq 9f + + add v20.8h, v16.8h, v19.8h + add v22.8h, v17.8h, v18.8h + add v20.8h, v20.8h, v4.8h + add v22.8h, v22.8h, v6.8h + urshr v24.8h, v20.8h, #2 + urshr v25.8h, v22.8h, #2 + bit v16.16b, v24.16b, v26.16b + bit v17.16b, v25.16b, v26.16b +.endm + +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + mov x9, x0 + sub x0, x0, x1, lsl #1 + ld1 {v18.8h}, [x0], x1 + ld1 {v17.8h}, [x9], x1 + ld1 {v16.8h}, [x0], x1 + ld1 {v19.8h}, [x9] + + h264_loop_filter_chroma_intra_10 + + sub x0, x9, x1, lsl #1 + st1 {v16.8h}, [x0], x1 + st1 {v17.8h}, [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + + sub x4, x0, #4 + sub x0, x0, #2 + add x9, x4, x1, lsl #1 + ld1 {v18.8h}, [x4], x1 + ld1 {v17.8h}, [x9], x1 + ld1 {v16.8h}, [x4], x1 + ld1 {v19.8h}, [x9], x1 + + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra_10 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + +9: + ret 
+endfunc + +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + sub x4, x0, #4 + sub x0, x0, #2 +h_loop_filter_chroma420_intra_10: + add x9, x4, x1, lsl #2 + ld1 {v18.4h}, [x4], x1 + ld1 {v18.d}[1], [x9], x1 + ld1 {v16.4h}, [x4], x1 + ld1 {v16.d}[1], [x9], x1 + ld1 {v17.4h}, [x4], x1 + ld1 {v17.d}[1], [x9], x1 + ld1 {v19.4h}, [x4], x1 + ld1 {v19.d}[1], [x9], x1 + + transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra_10 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + +9: + ret +endfunc + +function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 + h264_loop_filter_start_intra_10 + sub x4, x0, #4 + add x5, x0, x1, lsl #3 + sub x0, x0, #2 + mov x7, x30 + bl h_loop_filter_chroma420_intra_10 + mov x4, x9 + sub x0, x5, #2 + mov x30, x7 + b h_loop_filter_chroma420_intra_10 +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/h264idct_neon.S b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S new file mode 100644 index 0000000000..375da31d65 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+// 4x4 inverse transform and add.
+//   x0 = destination pixels (bytes), x1 = 4x4 block of 16-bit coefficients,
+//   w2 = destination stride (sign-extended into x2 here).
+// The 32-byte coefficient block is zeroed while it is consumed, and x1 is
+// rewound to its entry value before returning.
+// .L_ff_h264_idct_add_neon is the local label the dispatch loops further
+// down load with movrel and call through blr.
+function ff_h264_idct_add_neon, export=1
+.L_ff_h264_idct_add_neon:
+        AARCH64_VALID_CALL_TARGET
+        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]
+        sxtw            x2,  w2
+        movi            v30.8H, #0              // zero vector used to clear the block
+
+        // first pass; the two st1 of v30 clear all 32 bytes of the block
+        add             v4.4H,  v0.4H,  v2.4H
+        sshr            v16.4H, v1.4H,  #1
+        st1             {v30.8H},    [x1], #16
+        sshr            v17.4H, v3.4H,  #1
+        st1             {v30.8H},    [x1], #16
+        sub             v5.4H,  v0.4H,  v2.4H
+        sub             v6.4H,  v16.4H, v3.4H
+        add             v7.4H,  v1.4H,  v17.4H
+        add             v0.4H,  v4.4H,  v7.4H
+        add             v1.4H,  v5.4H,  v6.4H
+        sub             v2.4H,  v5.4H,  v6.4H
+        sub             v3.4H,  v4.4H,  v7.4H
+
+        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
+
+        // second pass, interleaved with loading the four destination rows.
+        // The third and fourth rows are loaded into v19.S[1] and v19.S[0]
+        // (swapped) and stored back from v1 in the same swapped lane order.
+        add             v4.4H,  v0.4H,  v2.4H
+        ld1             {v18.S}[0], [x0], x2
+        sshr            v16.4H,  v3.4H,  #1
+        sshr            v17.4H,  v1.4H,  #1
+        ld1             {v18.S}[1], [x0], x2
+        sub             v5.4H,  v0.4H,  v2.4H
+        ld1             {v19.S}[1], [x0], x2
+        add             v6.4H,  v16.4H, v1.4H
+        ins             v4.D[1],  v5.D[0]
+        sub             v7.4H,  v17.4H, v3.4H
+        ld1             {v19.S}[0], [x0], x2
+        ins             v6.D[1],  v7.D[0]
+        sub             x0,  x0,  x2, lsl #2    // back to the first row for the stores
+        add             v0.8H,  v4.8H,  v6.8H
+        sub             v1.8H,  v4.8H,  v6.8H
+
+        srshr           v0.8H,  v0.8H,  #6      // rounding shift: (x + 32) >> 6
+        srshr           v1.8H,  v1.8H,  #6
+
+        uaddw           v0.8H,  v0.8H,  v18.8B  // add the destination pixels
+        uaddw           v1.8H,  v1.8H,  v19.8B
+
+        sqxtun          v0.8B, v0.8H            // saturate to unsigned 8 bit
+        sqxtun          v1.8B, v1.8H
+
+        st1             {v0.S}[0],  [x0], x2
+        st1             {v0.S}[1],  [x0], x2
+        st1             {v1.S}[1],  [x0], x2
+        st1             {v1.S}[0],  [x0], x2
+
+        sub             x1,  x1,  #32           // undo the two post-incremented st1
+        ret
+endfunc
+
+// DC-only 4x4 variant, same arguments as ff_h264_idct_add_neon.
+// Broadcasts the first coefficient of the block, clears that coefficient in
+// memory, applies the rounding shift by 6 and adds the result to every pixel
+// of the 4x4 destination area with unsigned saturation. x1 is not modified.
+function ff_h264_idct_dc_add_neon, export=1
+.L_ff_h264_idct_dc_add_neon:
+        AARCH64_VALID_CALL_TARGET
+        sxtw            x2,  w2
+        mov             w3,       #0
+        ld1r            {v2.8H},  [x1]          // block[0] in every lane
+        strh            w3,       [x1]          // block[0] = 0
+        srshr           v2.8H,  v2.8H,  #6
+        ld1             {v0.S}[0],  [x0], x2
+        ld1             {v0.S}[1],  [x0], x2
+        uaddw           v3.8H,  v2.8H,  v0.8B
+        ld1             {v1.S}[0],  [x0], x2
+        ld1             {v1.S}[1],  [x0], x2
+        uaddw           v4.8H,  v2.8H,  v1.8B
+        sqxtun          v0.8B,  v3.8H
+        sqxtun          v1.8B,  v4.8H
+        sub             x0,  x0,  x2, lsl #2
+        st1             {v0.S}[0],  [x0], x2
+        st1             {v0.S}[1],  [x0], x2
+        st1             {v1.S}[0],  [x0], x2
+        st1             {v1.S}[1],  [x0], x2
+        ret
+endfunc
+
+// Dispatch loop over 16 4x4 blocks.
+//   x0 = dest, x1 = block_offset (32-bit entries), x2 = block (32 bytes per
+//   4x4 block), w3 = stride, x4 = byte table indexed with scan8[i] (called
+//   nnzc in the comments of ff_h264_idct_add8_neon below).
+// Per block: nnzc == 0 skips it; nnzc == 1 with block[0] != 0 calls the
+// DC-only routine; everything else calls the full transform.
+// The return address is kept in x12 because blr overwrites x30.
+function ff_h264_idct_add16_neon, export=1
+        mov             x12, x30
+        mov             x6,  x0         // dest
+        mov             x5,  x1         // block_offset
+        mov             x1,  x2         // block
+        mov             w9,  w3         // stride
+        movrel          x7,  scan8
+        mov             x10, #16
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
+1:      mov             w2,  w9                 // stride argument for the callee
+        ldrb            w3,  [x7], #1           // scan8[i]
+        ldrsw           x0,  [x5], #4           // block_offset[i]
+        ldrb            w3,  [x4,  w3,  uxtw]   // nnzc[ scan8[i] ]
+        subs            w3,  w3,  #1
+        b.lt            2f                      // nnzc == 0: nothing to add
+        ldrsh           w3,  [x1]               // block[0]
+        add             x0,  x0,  x6
+        ccmp            w3,  #0,  #4,  eq       // nnzc == 1 ? compare block[0] with 0 : force Z
+        csel            x15, x13, x14, ne       // dc_add only if nnzc == 1 and block[0] != 0
+        blr             x15
+2:      subs            x10, x10, #1
+        add             x1,  x1,  #32           // next 4x4 coefficient block
+        b.ne            1b
+        ret             x12
+endfunc
+
+// Same arguments and loop as ff_h264_idct_add16_neon, different dispatch:
+// nnzc != 0 calls the full transform; nnzc == 0 with block[0] != 0 calls the
+// DC-only routine; nnzc == 0 with block[0] == 0 skips the block.
+function ff_h264_idct_add16intra_neon, export=1
+        mov             x12, x30
+        mov             x6,  x0         // dest
+        mov             x5,  x1         // block_offset
+        mov             x1,  x2         // block
+        mov             w9,  w3         // stride
+        movrel          x7,  scan8
+        mov             x10, #16
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
+1:      mov             w2,  w9
+        ldrb            w3,  [x7], #1
+        ldrsw           x0,  [x5], #4
+        ldrb            w3,  [x4,  w3,  uxtw]
+        add             x0,  x0,  x6
+        cmp             w3,  #0
+        ldrsh           w3,  [x1]               // block[0]
+        csel            x15, x13, x14, eq       // nnzc == 0: dc_add, else idct_add
+        ccmp            w3,  #0,  #0,  eq       // nnzc == 0 ? compare block[0] with 0 : clear Z
+        b.eq            2f                      // skip only if nnzc == 0 and block[0] == 0
+        blr             x15
+2:      subs            x10, x10, #1
+        add             x1,  x1,  #32
+        b.ne            1b
+        ret             x12
+endfunc
+
+// Dispatch loop over two groups of four 4x4 blocks.
+//   x0 = pointer to two destination pointers, x1 = block_offset,
+//   x2 = block, w3 = stride, x4 = nnzc table.
+// The index x10 runs 0..3 with dest[0], then jumps to 16..19 with dest[1];
+// it is applied on top of scan8 + 16, block_offset + 16 entries and
+// block + 16 * 32 bytes. Dispatch is the same as in
+// ff_h264_idct_add16intra_neon.
+// x19 and x20 are saved on the stack because they are used for the stride
+// and the call target.
+function ff_h264_idct_add8_neon, export=1
+        stp             x19, x20, [sp, #-0x40]!
+        mov             x12, x30
+        ldp             x6,  x15, [x0]          // dest[0], dest[1]
+        add             x5,  x1,  #16*4         // block_offset
+        add             x9,  x2,  #16*32        // block
+        mov             w19, w3                 // stride
+        movrel          x13, .L_ff_h264_idct_dc_add_neon
+        movrel          x14, .L_ff_h264_idct_add_neon
+        movrel          x7,  scan8, 16
+        mov             x10, #0
+        mov             x11, #16
+1:      mov             w2,  w19
+        ldrb            w3,  [x7, x10]          // scan8[i]
+        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
+        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
+        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
+        add             x1,  x9,  x10, lsl #5   // block + i * 16
+        cmp             w3,  #0
+        ldrsh           w3,  [x1]               // block[i*16]
+        csel            x20, x13, x14, eq
+        ccmp            w3,  #0,  #0,  eq
+        b.eq            2f
+        blr             x20
+2:      add             x10, x10, #1
+        cmp             x10, #4
+        csel            x10, x11, x10, eq     // mov x10, #16
+        csel            x6,  x15, x6,  eq       // switch to dest[1] at the same time
+        cmp             x10, #20
+        b.lt            1b
+        ldp             x19, x20, [sp], #0x40
+        ret             x12
+endfunc
+
+// One pass of the 8x8 transform over eight 8H vectors.
+// pass 0: v24..v29 are already loaded by the caller; the last two vectors
+//         are loaded here into v30/v31 from [x1] and cleared in memory with
+//         v19, which the caller has set to zero. The result for vector 6 is
+//         left in v18 instead of v30.
+// pass 1: expects vector 6 in v18 and leaves all results in v24..v31.
+// va/vb are temporary aliases for v18/v30 that swap between the passes.
+// v16, v17 and v19 are used as scratch registers.
+.macro  idct8x8_cols    pass
+  .if \pass == 0
+        va      .req    v18
+        vb      .req    v30
+        sshr            v18.8H, v26.8H, #1
+        add             v16.8H, v24.8H, v28.8H
+        ld1             {v30.8H, v31.8H}, [x1]
+        st1             {v19.8H}, [x1], #16
+        st1             {v19.8H}, [x1], #16
+        sub             v17.8H,  v24.8H, v28.8H
+        sshr            v19.8H,  v30.8H, #1
+        sub             v18.8H,  v18.8H,  v30.8H
+        add             v19.8H,  v19.8H,  v26.8H
+  .else
+        va      .req    v30
+        vb      .req    v18
+        sshr            v30.8H, v26.8H, #1
+        sshr            v19.8H, v18.8H, #1
+        add             v16.8H, v24.8H, v28.8H
+        sub             v17.8H, v24.8H, v28.8H
+        sub             v30.8H, v30.8H, v18.8H
+        add             v19.8H, v19.8H, v26.8H
+  .endif
+        add             v26.8H, v17.8H, va.8H
+        sub             v28.8H, v17.8H, va.8H
+        add             v24.8H, v16.8H, v19.8H
+        sub             vb.8H,  v16.8H, v19.8H
+        sub             v16.8H, v29.8H, v27.8H
+        add             v17.8H, v31.8H, v25.8H
+        sub             va.8H,  v31.8H, v25.8H
+        add             v19.8H, v29.8H, v27.8H
+        sub             v16.8H, v16.8H, v31.8H
+        sub             v17.8H, v17.8H, v27.8H
+        add             va.8H,  va.8H,  v29.8H
+        add             v19.8H, v19.8H, v25.8H
+        sshr            v25.8H, v25.8H, #1
+        sshr            v27.8H, v27.8H, #1
+        sshr            v29.8H, v29.8H, #1
+        sshr            v31.8H, v31.8H, #1
+        sub             v16.8H, v16.8H, v31.8H
+        sub             v17.8H, v17.8H, v27.8H
+        add             va.8H,  va.8H,  v29.8H
+        add             v19.8H, v19.8H, v25.8H
+        sshr            v25.8H, v16.8H, #2
+        sshr            v27.8H, v17.8H, #2
+        sshr            v29.8H, va.8H,  #2
+        sshr            v31.8H, v19.8H, #2
+        sub             v19.8H, v19.8H, v25.8H
+        sub             va.8H,  v27.8H, va.8H
+        add             v17.8H, v17.8H, v29.8H
+        add             v16.8H, v16.8H, v31.8H
+  .if \pass == 0
+        sub             v31.8H, v24.8H, v19.8H
+        add             v24.8H, v24.8H, v19.8H
+        add             v25.8H, v26.8H, v18.8H
+        sub             v18.8H, v26.8H, v18.8H
+        add             v26.8H, v28.8H, v17.8H
+        add             v27.8H, v30.8H, v16.8H
+        sub             v29.8H, v28.8H, v17.8H
+        sub             v28.8H, v30.8H, v16.8H
+  .else
+        sub             v31.8H, v24.8H, v19.8H
+        add             v24.8H, v24.8H, v19.8H
+        add             v25.8H, v26.8H, v30.8H
+        sub             v30.8H, v26.8H, v30.8H
+        add             v26.8H, v28.8H, v17.8H
+        sub             v29.8H, v28.8H, v17.8H
+        add             v27.8H, v18.8H, v16.8H
+        sub             v28.8H, v18.8H, v16.8H
+  .endif
+        .unreq          va
+        .unreq          vb
+.endm
+
+// 8x8 inverse transform and add.
+//   x0 = destination pixels (bytes), x1 = 8x8 block of 16-bit coefficients,
+//   w2 = destination stride.
+// Six coefficient vectors are loaded (and cleared) here, the last two inside
+// idct8x8_cols pass 0, so the whole 128-byte block ends up zeroed; x1 is
+// rewound by 128 before returning. x3 is a copy of x0 used for the stores.
+function ff_h264_idct8_add_neon, export=1
+.L_ff_h264_idct8_add_neon:
+        AARCH64_VALID_CALL_TARGET
+        movi            v19.8H,   #0            // zero vector used to clear the block
+        sxtw            x2,       w2
+        ld1             {v24.8H, v25.8H}, [x1]
+        st1             {v19.8H},  [x1], #16
+        st1             {v19.8H},  [x1], #16
+        ld1             {v26.8H, v27.8H}, [x1]
+        st1             {v19.8H},  [x1], #16
+        st1             {v19.8H},  [x1], #16
+        ld1             {v28.8H, v29.8H}, [x1]
+        st1             {v19.8H},  [x1], #16
+        st1             {v19.8H},  [x1], #16
+
+        idct8x8_cols    0
+        // pass 0 leaves vector 6 in v18, hence v18 in place of v30 here
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
+        idct8x8_cols    1
+
+        mov             x3,  x0
+        srshr           v24.8H, v24.8H, #6
+        ld1             {v0.8B},     [x0], x2
+        srshr           v25.8H, v25.8H, #6
+        ld1             {v1.8B},     [x0], x2
+        srshr           v26.8H, v26.8H, #6
+        ld1             {v2.8B},     [x0], x2
+        srshr           v27.8H, v27.8H, #6
+        ld1             {v3.8B},     [x0], x2
+        srshr           v28.8H, v28.8H, #6
+        ld1             {v4.8B},     [x0], x2
+        srshr           v29.8H, v29.8H, #6
+        ld1             {v5.8B},     [x0], x2
+        srshr           v30.8H, v30.8H, #6
+        ld1             {v6.8B},     [x0], x2
+        srshr           v31.8H, v31.8H, #6
+        ld1             {v7.8B},     [x0], x2
+        uaddw           v24.8H, v24.8H, v0.8B
+        uaddw           v25.8H, v25.8H, v1.8B
+        uaddw           v26.8H, v26.8H, v2.8B
+        sqxtun          v0.8B,  v24.8H
+        uaddw           v27.8H, v27.8H, v3.8B
+        sqxtun          v1.8B,  v25.8H
+        uaddw           v28.8H, v28.8H, v4.8B
+        sqxtun          v2.8B,  v26.8H
+        st1             {v0.8B},     [x3], x2
+        uaddw           v29.8H, v29.8H, v5.8B
+        sqxtun          v3.8B,  v27.8H
+        st1             {v1.8B},     [x3], x2
+        uaddw           v30.8H, v30.8H, v6.8B
+        sqxtun          v4.8B,  v28.8H
+        st1             {v2.8B},     [x3], x2
+        uaddw           v31.8H, v31.8H, v7.8B
+        sqxtun          v5.8B,  v29.8H
+        st1             {v3.8B},     [x3], x2
+        sqxtun          v6.8B,  v30.8H
+        sqxtun          v7.8B,  v31.8H
+        st1             {v4.8B},     [x3], x2
+        st1             {v5.8B},     [x3], x2
+        st1             {v6.8B},     [x3], x2
+        st1             {v7.8B},     [x3], x2
+
+        sub             x1,  x1,  #128          // undo the eight post-incremented st1
+        ret
+endfunc
+
+// DC-only 8x8 variant, same arguments as ff_h264_idct8_add_neon.
+// Broadcasts the first coefficient, clears it in memory, applies the rounding
+// shift by 6 and adds the result to all 8x8 destination pixels with unsigned
+// saturation. x1 is not modified.
+function ff_h264_idct8_dc_add_neon, export=1
+.L_ff_h264_idct8_dc_add_neon:
+        AARCH64_VALID_CALL_TARGET
+        mov             w3,       #0
+        sxtw            x2,       w2
+        ld1r            {v31.8H}, [x1]          // block[0] in every lane
+        strh            w3,       [x1]          // block[0] = 0
+        ld1             {v0.8B},  [x0], x2
+        srshr           v31.8H, v31.8H, #6
+        ld1             {v1.8B},     [x0], x2
+        ld1             {v2.8B},     [x0], x2
+        uaddw           v24.8H, v31.8H, v0.8B
+        ld1             {v3.8B},     [x0], x2
+        uaddw           v25.8H, v31.8H, v1.8B
+        ld1             {v4.8B},     [x0], x2
+        uaddw           v26.8H, v31.8H, v2.8B
+        ld1             {v5.8B},     [x0], x2
+        uaddw           v27.8H, v31.8H, v3.8B
+        ld1             {v6.8B},     [x0], x2
+        uaddw           v28.8H, v31.8H, v4.8B
+        ld1             {v7.8B},     [x0], x2
+        uaddw           v29.8H, v31.8H, v5.8B
+        uaddw           v30.8H, v31.8H, v6.8B
+        uaddw           v31.8H, v31.8H, v7.8B
+        sqxtun          v0.8B,  v24.8H
+        sqxtun          v1.8B,  v25.8H
+        sqxtun          v2.8B,  v26.8H
+        sqxtun          v3.8B,  v27.8H
+        sub             x0,  x0,  x2, lsl #3    // back to the first row for the stores
+        st1             {v0.8B},     [x0], x2
+        sqxtun          v4.8B,  v28.8H
+        st1             {v1.8B},     [x0], x2
+        sqxtun          v5.8B,  v29.8H
+        st1             {v2.8B},     [x0], x2
+        sqxtun          v6.8B,  v30.8H
+        st1             {v3.8B},     [x0], x2
+        sqxtun          v7.8B,  v31.8H
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2
+        st1             {v6.8B},     [x0], x2
+        st1             {v7.8B},     [x0], x2
+        ret
+endfunc
+
+// Dispatch loop over four 8x8 blocks; arguments as in
+// ff_h264_idct_add16_neon. Each iteration advances scan8 by 4 entries,
+// block_offset by 4 entries (16 bytes) and the coefficients by 128 bytes.
+// nnzc == 0 skips the block; nnzc == 1 with block[0] != 0 calls the DC-only
+// routine; everything else calls the full 8x8 transform.
+function ff_h264_idct8_add4_neon, export=1
+        mov             x12, x30
+        mov             x6,  x0
+        mov             x5,  x1
+        mov             x1,  x2
+        mov             w2,  w3
+        movrel          x7,  scan8
+        mov             w10, #16
+        movrel          x13, .L_ff_h264_idct8_dc_add_neon
+        movrel          x14, .L_ff_h264_idct8_add_neon
+1:      ldrb            w9,  [x7], #4
+        ldrsw           x0,  [x5], #16
+        ldrb            w9,  [x4, w9, UXTW]
+        subs            w9,  w9,  #1
+        b.lt            2f
+        ldrsh           w11, [x1]
+        add             x0,  x6,  x0
+        ccmp            w11, #0,  #4,  eq
+        csel            x15, x13, x14, ne
+        blr             x15
+2:      subs            w10, w10, #4
+        add             x1,  x1,  #128
+        b.ne            1b
+        ret             x12
+endfunc
+
+// Lookup table of 48 byte indices, each written as column + row * 8.
+// The dispatch loops above use an entry as the offset into the table passed
+// in x4.
+const   scan8
+        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
+        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
+        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
+        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
+endconst
diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_init.c b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
new file mode 100644
index 0000000000..0ae8f70d23
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+
+/* 16x16 predictors installed for bit_depth == 8 */
+void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+/* 8x8 predictors installed for bit_depth == 8 */
+void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+/* 16x16 predictors installed for bit_depth == 10 (no left_dc / 128_dc variant) */
+void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+
+/* 8x8 predictors installed for bit_depth == 10 */
+void ff_pred8x8_vert_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_hor_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_plane_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_128_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_left_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l0t_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0lt_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l00_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0l0_dc_neon_10(uint8_t *src, ptrdiff_t stride);
+
+/**
+ * Point the pred8x8 and pred16x16 function tables of @p h at the NEON
+ * routines declared above.
+ *
+ * Nothing is changed unless bit_depth is 8 or 10. The pred8x8 table is only
+ * touched when chroma_format_idc <= 1, and some entries are left alone for
+ * the codecs tested below (VP7/VP8, RV40, SVQ3).
+ * For bit_depth == 10, pred16x16[LEFT_DC_PRED8x8] and
+ * pred16x16[DC_128_PRED8x8] are not replaced; no _10 routines for them are
+ * declared in this file.
+ *
+ * @param h                 context whose function tables are updated
+ * @param codec_id          AV_CODEC_ID_* value used to skip entries
+ * @param bit_depth         sample bit depth; only 8 and 10 are handled
+ * @param chroma_format_idc gates the pred8x8 assignments
+ */
+static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
+                                        const int bit_depth,
+                                        const int chroma_format_idc)
+{
+    if (bit_depth == 8) {
+        if (chroma_format_idc <= 1) {
+            h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
+            h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
+            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
+            h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
+            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+                codec_id != AV_CODEC_ID_VP8) {
+                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
+                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+            }
+        }
+
+        /* the 16x16 table is filled regardless of chroma_format_idc */
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
+        h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+        h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+            h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+    }
+    if (bit_depth == 10) {
+        if (chroma_format_idc <= 1) {
+            h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon_10;
+            h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon_10;
+            if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+                h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon_10;
+            h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon_10;
+            if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+                codec_id != AV_CODEC_ID_VP8) {
+                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon_10;
+                h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon_10;
+                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon_10;
+                h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon_10;
+            }
+        }
+
+        /* LEFT_DC_PRED8x8 and DC_128_PRED8x8 keep whatever was set before */
+        h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon_10;
+        h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon_10;
+        h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon_10;
+        h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10;
+        if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+            codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+            h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon_10;
+    }
+}
+
+/**
+ * AArch64 entry point: installs the NEON predictors through
+ * h264_pred_init_neon() when have_neon() accepts the flags returned by
+ * av_get_cpu_flags(); otherwise @p h is left untouched.
+ */
+av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
+                                       int bit_depth, const int chroma_format_idc)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags))
+        h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
+}
diff --git a/media/ffvpx/libavcodec/aarch64/h264pred_neon.S b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
new file mode 100644
index 0000000000..ea37689f34
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is
part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// Load a column of bytes into the byte lanes of \rd, one byte per load,
+// advancing \rs by \rt after each load.
+//   n=8 (default): lanes 0-7      n=16: lanes 0-15
+//   n=4, hi=0:     lanes 0-3      n=4, hi=1: lanes 4-7
+// Lanes that are not loaded keep their previous contents.
+.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
+.if \n >= 8 || \hi == 0
+        ld1             {\rd\().b}[0],  [\rs], \rt
+        ld1             {\rd\().b}[1],  [\rs], \rt
+        ld1             {\rd\().b}[2],  [\rs], \rt
+        ld1             {\rd\().b}[3],  [\rs], \rt
+.endif
+.if \n >= 8 || \hi == 1
+        ld1             {\rd\().b}[4],  [\rs], \rt
+        ld1             {\rd\().b}[5],  [\rs], \rt
+        ld1             {\rd\().b}[6],  [\rs], \rt
+        ld1             {\rd\().b}[7],  [\rs], \rt
+.endif
+.if \n == 16
+        ld1             {\rd\().b}[8],  [\rs], \rt
+        ld1             {\rd\().b}[9],  [\rs], \rt
+        ld1             {\rd\().b}[10], [\rs], \rt
+        ld1             {\rd\().b}[11], [\rs], \rt
+        ld1             {\rd\().b}[12], [\rs], \rt
+        ld1             {\rd\().b}[13], [\rs], \rt
+        ld1             {\rd\().b}[14], [\rs], \rt
+        ld1             {\rd\().b}[15], [\rs], \rt
+.endif
+.endm
+
+// All routines in this file take x0 = pointer to the block to fill and
+// x1 = stride in bytes. Neighbouring pixels are read from the row at
+// x0 - x1 and from the column just to the left of x0.
+
+// 16x16: fill with the constant 128.
+function ff_pred16x16_128_dc_neon, export=1
+        movi            v0.16b,  #128
+        b               .L_pred16x16_dc_end
+endfunc
+
+// 16x16: fill with the rounded average of the 16 pixels in the row above.
+function ff_pred16x16_top_dc_neon, export=1
+        sub             x2,  x0,  x1
+        ld1             {v0.16b}, [x2]
+        uaddlv          h0,  v0.16b
+        rshrn           v0.8b,  v0.8h,  #4
+        dup             v0.16b, v0.b[0]
+        b               .L_pred16x16_dc_end
+endfunc
+
+// 16x16: fill with the rounded average of the 16 pixels in the left column.
+function ff_pred16x16_left_dc_neon, export=1
+        sub             x2,  x0,  #1
+        ldcol.8         v0,  x2,  x1, 16
+        uaddlv          h0,  v0.16b
+        rshrn           v0.8b,  v0.8h,  #4
+        dup             v0.16b, v0.b[0]
+        b               .L_pred16x16_dc_end
+endfunc
+
+// 16x16: fill with the rounded average of the 16 top and 16 left pixels.
+// .L_pred16x16_dc_end is the shared tail that stores v0 to 16 rows; the
+// three routines above branch into it.
+function ff_pred16x16_dc_neon, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #1
+        ld1             {v0.16b}, [x2]
+        ldcol.8         v1,  x3,  x1, 16
+        uaddlv          h0,  v0.16b
+        uaddlv          h1,  v1.16b
+        add             v0.4h,  v0.4h,  v1.4h
+        rshrn           v0.8b,  v0.8h,  #5
+        dup             v0.16b, v0.b[0]
+.L_pred16x16_dc_end:
+        mov             w3,  #8                 // two rows are stored per iteration
+6:      st1             {v0.16b}, [x0], x1
+        subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
+        b.ne            6b
+        ret
+endfunc
+
+// 16x16: every row is filled with the pixel to its left.
+function ff_pred16x16_hor_neon, export=1
+        sub             x2,  x0,  #1
+        mov             w3,  #16
+1:      ld1r            {v0.16b}, [x2], x1
+        subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// 16x16: every row is a copy of the row above the block.
+// The stride is doubled so that x0 writes the even rows and x2 the odd rows.
+function ff_pred16x16_vert_neon, export=1
+        sub             x2,  x0,  x1
+        add             x1,  x1,  x1
+        ld1             {v0.16b}, [x2], x1
+        mov             w3,  #8
+1:      subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x2], x1
+        b.ne            1b
+        ret
+endfunc
+
+// 16x16 plane prediction.
+// Differences between mirrored halves of the row above and of the left
+// column are weighted with p16weight, and the resulting terms are stepped
+// across each row and from row to row. Each output row is narrowed with an
+// unsigned saturating shift right by 5.
+function ff_pred16x16_plane_neon, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p16weight
+        add             x2,  x3,  #8
+        sub             x3,  x3,  #1
+        ld1             {v0.8b},  [x3]
+        ld1             {v2.8b},  [x2], x1
+        ldcol.8         v1,  x3,  x1
+        add             x3,  x3,  x1            // one row of the left column is skipped here
+        ldcol.8         v3,  x3,  x1
+        rev64           v0.8b,  v0.8b
+        rev64           v1.8b,  v1.8b
+        uaddl           v7.8h,  v2.8b,  v3.8b
+        usubl           v2.8h,  v2.8b,  v0.8b
+        usubl           v3.8h,  v3.8b,  v1.8b
+        ld1             {v0.8h},     [x4]
+        mul             v2.8h,  v2.8h,  v0.8h
+        mul             v3.8h,  v3.8h,  v0.8h
+        addp            v2.8h,  v2.8h,  v3.8h
+        addp            v2.8h,  v2.8h,  v2.8h
+        addp            v2.4h,  v2.4h,  v2.4h
+        sshll           v3.4s,  v2.4h,  #2
+        saddw           v2.4s,  v3.4s,  v2.4h
+        rshrn           v4.4h,  v2.4s,  #6
+        trn2            v5.4h,  v4.4h,  v4.4h
+        add             v2.4h,  v4.4h,  v5.4h
+        shl             v3.4h,  v2.4h,  #3
+        ext             v7.16b, v7.16b, v7.16b, #14
+        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
+        add             v7.4h,  v7.4h,  v0.4h
+        shl             v2.4h,  v7.4h,  #4
+        sub             v2.4h,  v2.4h,  v3.4h
+        shl             v3.4h,  v4.4h,  #4
+        ext             v0.16b, v0.16b, v0.16b, #14
+        sub             v6.4h,  v5.4h,  v3.4h
+        mov             v0.h[0],  wzr
+        mul             v0.8h,  v0.8h,  v4.h[0]
+        dup             v1.8h,  v2.h[0]
+        dup             v2.8h,  v4.h[0]
+        dup             v3.8h,  v6.h[0]
+        shl             v2.8h,  v2.8h,  #3
+        add             v1.8h,  v1.8h,  v0.8h
+        add             v3.8h,  v3.8h,  v2.8h
+        mov             w3,  #16
+1:
+        sqshrun         v0.8b,  v1.8h,  #5      // left 8 pixels of the row
+        add             v1.8h,  v1.8h,  v2.8h
+        sqshrun2        v0.16b, v1.8h,  #5      // right 8 pixels of the row
+        add             v1.8h,  v1.8h,  v3.8h
+        subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// Weights used by the plane predictors.
+const   p16weight, align=4
+        .short          1,2,3,4,5,6,7,8
+endconst
+const   p8weight, align=4
+        .short          1,2,3,4,1,2,3,4
+endconst
+
+// 8x8: every row is filled with the pixel to its left.
+function ff_pred8x8_hor_neon, export=1
+        sub             x2,  x0,  #1
+        mov             w3,  #8
+1:      ld1r            {v0.8b},  [x2], x1
+        subs            w3,  w3,  #1
+        st1             {v0.8b},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// 8x8: every row is a copy of the row above the block (even rows through
+// x0, odd rows through x2, with the stride doubled).
+function ff_pred8x8_vert_neon, export=1
+        sub             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+        ld1             {v0.8b},  [x2], x1
+        mov             w3,  #4
+1:      subs            w3,  w3,  #1
+        st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x2], x1
+        b.ne            1b
+        ret
+endfunc
+
+// 8x8 plane prediction; same structure as ff_pred16x16_plane_neon with the
+// top row and left column handled together using p8weight.
+function ff_pred8x8_plane_neon, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p8weight
+        movrel          x5,  p16weight
+        add             x2,  x3,  #4
+        sub             x3,  x3,  #1
+        ld1             {v0.s}[0],  [x3]
+        ld1             {v2.s}[0],  [x2], x1
+        ldcol.8         v0,  x3,  x1,  4,  hi=1
+        add             x3,  x3,  x1
+        ldcol.8         v3,  x3,  x1,  4
+        uaddl           v7.8h,  v2.8b,  v3.8b
+        rev32           v0.8b,  v0.8b
+        trn1            v2.2s,  v2.2s,  v3.2s
+        usubl           v2.8h,  v2.8b,  v0.8b
+        ld1             {v6.8h},  [x4]
+        mul             v2.8h,  v2.8h,  v6.8h
+        ld1             {v0.8h},  [x5]
+        saddlp          v2.4s,  v2.8h
+        addp            v2.4s,  v2.4s,  v2.4s
+        shl             v3.4s,  v2.4s,  #4
+        add             v2.4s,  v3.4s,  v2.4s
+        rshrn           v5.4h,  v2.4s,  #5
+        addp            v2.4h,  v5.4h,  v5.4h
+        shl             v3.4h,  v2.4h,  #1
+        add             v3.4h,  v3.4h,  v2.4h
+        rev64           v7.4h,  v7.4h
+        add             v7.4h,  v7.4h,  v0.4h
+        shl             v2.4h,  v7.4h,  #4
+        sub             v2.4h,  v2.4h,  v3.4h
+        ext             v0.16b, v0.16b, v0.16b, #14
+        mov             v0.h[0],  wzr
+        mul             v0.8h,  v0.8h,  v5.h[0]
+        dup             v1.8h,  v2.h[0]
+        dup             v2.8h,  v5.h[1]
+        add             v1.8h,  v1.8h,  v0.8h
+        mov             w3,  #8
+1:
+        sqshrun         v0.8b,  v1.8h,  #5
+        subs            w3,  w3,  #1
+        add             v1.8h,  v1.8h,  v2.8h
+        st1             {v0.8b},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// The 8x8 DC routines below all end in .L_pred8x8_dc_end, which stores v0 to
+// rows 0-3 and v1 to rows 4-7. Within each register the low four bytes cover
+// the left 4 columns and the high four bytes the right 4 columns.
+
+// 8x8: fill with the constant 128.
+function ff_pred8x8_128_dc_neon, export=1
+        movi            v0.8b,  #128
+        movi            v1.8b,  #128
+        b               .L_pred8x8_dc_end
+endfunc
+
+// 8x8: each 4-column half is filled, for all 8 rows, with the rounded
+// average of the four pixels above it.
+function ff_pred8x8_top_dc_neon, export=1
+        sub             x2,  x0,  x1
+        ld1             {v0.8b},  [x2]
+        uaddlp          v0.4h,  v0.8b
+        addp            v0.4h,  v0.4h,  v0.4h
+        zip1            v0.8h,  v0.8h,  v0.8h
+        rshrn           v2.8b,  v0.8h,  #2
+        zip1            v0.8b,  v2.8b,  v2.8b
+        zip1            v1.8b,  v2.8b,  v2.8b
+        b               .L_pred8x8_dc_end
+endfunc
+
+// 8x8: rows 0-3 are filled with the rounded average of left pixels 0-3,
+// rows 4-7 with the rounded average of left pixels 4-7.
+function ff_pred8x8_left_dc_neon, export=1
+        sub             x2,  x0,  #1
+        ldcol.8         v0,  x2,  x1
+        uaddlp          v0.4h,  v0.8b
+        addp            v0.4h,  v0.4h,  v0.4h
+        rshrn           v2.8b,  v0.8h,  #2
+        dup             v1.8b,  v2.b[1]
+        dup             v0.8b,  v2.b[0]
+        b               .L_pred8x8_dc_end
+endfunc
+
+// 8x8 DC with both neighbours, computed per 4x4 quadrant:
+//   top-left:     top pixels 0-3 and left pixels 0-3 (8 values, shift by 3)
+//   top-right:    top pixels 4-7 only (shift by 2)
+//   bottom-left:  left pixels 4-7 only (shift by 2)
+//   bottom-right: top pixels 4-7 and left pixels 4-7 (shift by 3)
+function ff_pred8x8_dc_neon, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #1
+        ld1             {v0.8b},  [x2]
+        ldcol.8         v1,  x3,  x1
+        uaddlp          v0.4h,  v0.8b
+        uaddlp          v1.4h,  v1.8b
+        trn1            v2.2s,  v0.2s,  v1.2s
+        trn2            v3.2s,  v0.2s,  v1.2s
+        addp            v4.4h,  v2.4h,  v3.4h
+        addp            v5.4h,  v4.4h,  v4.4h
+        rshrn           v6.8b,  v5.8h,  #3
+        rshrn           v7.8b,  v4.8h,  #2
+        dup             v0.8b,  v6.b[0]
+        dup             v2.8b,  v7.b[2]
+        dup             v1.8b,  v7.b[3]
+        dup             v3.8b,  v6.b[1]
+        zip1            v0.2s,  v0.2s,  v2.2s
+        zip1            v1.2s,  v1.2s,  v3.2s
+.L_pred8x8_dc_end:
+        mov             w3,  #4
+        add             x2,  x0,  x1,  lsl #2   // x2 = row 4
+6:      subs            w3,  w3,  #1
+        st1             {v0.8b},  [x0], x1
+        st1             {v1.8b},  [x2], x1
+        b.ne            6b
+        ret
+endfunc
+
+// 8x8 DC using the top row and only left pixels 0-3:
+//   top-left: top 0-3 and left 0-3; top-right and bottom-right: top 4-7;
+//   bottom-left: top 0-3.
+function ff_pred8x8_l0t_dc_neon, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #1
+        ld1             {v0.8b},  [x2]
+        ldcol.8         v1,  x3,  x1,  4
+        zip1            v0.4s,  v0.4s,  v1.4s
+        uaddlp          v0.8h,  v0.16b
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v1.4h,  v0.4h,  v0.4h
+        rshrn           v2.8b,  v0.8h,  #2
+        rshrn           v3.8b,  v1.8h,  #3
+        dup             v4.8b,  v3.b[0]
+        dup             v6.8b,  v2.b[2]
+        dup             v5.8b,  v2.b[0]
+        zip1            v0.2s,  v4.2s,  v6.2s
+        zip1            v1.2s,  v5.2s,  v6.2s
+        b               .L_pred8x8_dc_end
+endfunc
+
+// 8x8 DC using only left pixels 0-3: rows 0-3 get their rounded average,
+// rows 4-7 are filled with 128.
+function ff_pred8x8_l00_dc_neon, export=1
+        sub             x2,  x0,  #1
+        ldcol.8         v0,  x2,  x1,  4
+        uaddlp          v0.4h,  v0.8b
+        addp            v0.4h,  v0.4h,  v0.4h
+        rshrn           v0.8b,  v0.8h,  #2
+        movi            v1.8b,  #128
+        dup             v0.8b,  v0.b[0]
+        b               .L_pred8x8_dc_end
+endfunc
+
+// 8x8 DC using the top row and only left pixels 4-7:
+//   top-left: top 0-3; top-right: top 4-7; bottom-left: left 4-7;
+//   bottom-right: top 4-7 and left 4-7.
+function ff_pred8x8_0lt_dc_neon, export=1
+        add             x3,  x0,  x1,  lsl #2
+        sub             x2,  x0,  x1
+        sub             x3,  x3,  #1
+        ld1             {v0.8b},  [x2]
+        ldcol.8         v1,  x3,  x1,  4, hi=1
+        zip1            v0.4s,  v0.4s,  v1.4s
+        uaddlp          v0.8h,  v0.16b
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v1.4h,  v0.4h,  v0.4h
+        rshrn           v2.8b,  v0.8h,  #2
+        rshrn           v3.8b,  v1.8h,  #3
+        dup             v4.8b,  v2.b[0]
+        dup             v5.8b,  v2.b[3]
+        dup             v6.8b,  v2.b[2]
+        dup             v7.8b,  v3.b[1]
+        zip1            v0.2s,  v4.2s,  v6.2s
+        zip1            v1.2s,  v5.2s,  v7.2s
+        b               .L_pred8x8_dc_end
+endfunc
+
+// 8x8 DC using only left pixels 4-7: rows 0-3 are filled with 128,
+// rows 4-7 get the rounded average of those four pixels.
+function ff_pred8x8_0l0_dc_neon, export=1
+        add             x2,  x0,  x1,  lsl #2
+        sub             x2,  x2,  #1
+        ldcol.8         v1,  x2,  x1,  4
+        uaddlp          v2.4h,  v1.8b
+        addp            v2.4h,  v2.4h,  v2.4h
+        rshrn           v1.8b,  v2.8h,  #2
+        movi            v0.8b,  #128
+        dup             v1.8b,  v1.b[0]
+        b               .L_pred8x8_dc_end
+endfunc
+
+// 16-bit counterpart of ldcol.8: loads one halfword per row into the
+// halfword lanes of \rd, advancing \rs by \rt after each load.
+//   n=4 (default), hi=0: lanes 0-3      n=8 (hi=0): lanes 0-7
+//   hi=1 (n=4):          lanes 4-7
+.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
+.if \n >= 4 && \hi == 0
+        ld1             {\rd\().h}[0],  [\rs], \rt
+        ld1             {\rd\().h}[1],  [\rs], \rt
+        ld1             {\rd\().h}[2],  [\rs], \rt
+        ld1             {\rd\().h}[3],  [\rs], \rt
+.endif
+.if \n == 8 || \hi == 1
+        ld1             {\rd\().h}[4],  [\rs], \rt
+        ld1             {\rd\().h}[5],  [\rs], \rt
+        ld1             {\rd\().h}[6],  [\rs], \rt
+        ld1             {\rd\().h}[7],  [\rs], \rt
+.endif
+.endm
+
+// The _10 routines below work on 16-bit samples: the left neighbour is at
+// x0 - 2 and a row of 16 samples takes two q registers. x1 is still used
+// as a byte stride.
+
+// slower than C
+/*
+function ff_pred16x16_128_dc_neon_10, export=1
+        movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
+
+        b               .L_pred16x16_dc_10_end
+endfunc
+*/
+
+// 16x16, 16-bit samples: fill with the rounded average of the 16 samples in
+// the row above.
+function ff_pred16x16_top_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+
+        ld1             {v0.8h, v1.8h},  [x2]
+
+        add             v0.8h,  v0.8h,  v1.8h
+        addv            h0,  v0.8h
+
+        urshr           v0.4h,  v0.4h,  #4
+        dup             v0.8h,  v0.h[0]
+        b               .L_pred16x16_dc_10_end
+endfunc
+
+// slower than C
+/*
+function ff_pred16x16_left_dc_neon_10, export=1
+        sub             x2,  x0,  #2 // access to the "left" column
+        ldcol.16        v0,  x2,  x1,  8
+        ldcol.16        v1,  x2,  x1,  8 // load "left" column
+
+        add             v0.8h,  v0.8h,  v1.8h
+        addv            h0,  v0.8h
+
+        urshr           v0.4h,  v0.4h,  #4
+        dup             v0.8h,  v0.h[0]
+        b               .L_pred16x16_dc_10_end
+endfunc
+*/
+
+// 16x16, 16-bit samples: fill with the rounded average of the 16 top and 16
+// left samples. .L_pred16x16_dc_10_end is the shared tail that stores 16
+// rows of the value in v0.
+function ff_pred16x16_dc_neon_10, export=1
+        sub             x2,  x0,  x1 // access to the "top" row
+        sub             x3,  x0,  #2 // access to the "left" column
+
+        ld1             {v0.8h, v1.8h}, [x2]
+        ldcol.16        v2,  x3,  x1,  8
+        ldcol.16        v3,  x3,  x1,  8 // load pixels in "top" row and "left" col
+
+        add             v0.8h,  v0.8h,  v1.8h
+        add             v2.8h,  v2.8h,  v3.8h
+        add             v0.8h,  v0.8h,  v2.8h
+        addv            h0,  v0.8h
+
+        urshr           v0.4h,  v0.4h,  #5
+        dup             v0.8h,  v0.h[0]
+.L_pred16x16_dc_10_end:
+        mov             v1.16b,  v0.16b
+        mov             w3,  #8
+6:      st1             {v0.8h, v1.8h}, [x0], x1
+        subs            w3,  w3,  #1
+        st1             {v0.8h, v1.8h}, [x0], x1
+        b.ne            6b
+        ret
+endfunc
+
+// 16x16, 16-bit samples: every row is filled with the sample to its left.
+// x3 = x0 + 16 bytes writes the right half of each row.
+function ff_pred16x16_hor_neon_10, export=1
+        sub             x2,  x0,  #2
+        add             x3,  x0,  #16
+
+        mov             w4,  #16
+1:      ld1r            {v0.8h},  [x2],  x1
+        subs            w4,  w4,  #1
+        st1             {v0.8h},  [x0],  x1
+        st1             {v0.8h},  [x3],  x1
+        b.ne            1b
+        ret
+endfunc
+
+// 16x16, 16-bit samples: every row is a copy of the row above the block
+// (even rows through x0, odd rows through x2, with the stride doubled).
+function ff_pred16x16_vert_neon_10, export=1
+        sub             x2,  x0,  x1
+        add             x1,  x1,  x1
+
+        ld1             {v0.8h, v1.8h},  [x2],  x1
+
+        mov             w3,  #8
+1:      subs            w3,  w3,  #1
+        st1             {v0.8h, v1.8h},  [x0],  x1
+        st1             {v0.8h, v1.8h},  [x2],  x1
+
+        b.ne            1b
+        ret
+endfunc
+
+// 16x16 plane prediction for 16-bit samples. Same structure as
+// ff_pred16x16_plane_neon, but the per-row accumulators are widened to
+// 32 bit (v16/v17) and the results are clamped to 1023 after the unsigned
+// saturating narrow.
+function ff_pred16x16_plane_neon_10, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p16weight
+        add             x2,  x3,  #16
+        sub             x3,  x3,  #2
+        ld1             {v0.8h},  [x3]
+        ld1             {v2.8h},  [x2],  x1
+        ldcol.16        v1,  x3,  x1,  8
+        add             x3,  x3,  x1
+        ldcol.16        v3,  x3,  x1,  8
+
+        rev64           v16.8h,  v0.8h
+        rev64           v17.8h,  v1.8h
+        ext             v0.16b, v16.16b, v16.16b, #8
+        ext             v1.16b, v17.16b, v17.16b, #8
+
+        add             v7.8h,  v2.8h,  v3.8h
+        sub             v2.8h,  v2.8h,  v0.8h
+        sub             v3.8h,  v3.8h,  v1.8h
+        ld1             {v0.8h},     [x4]
+        mul             v2.8h,  v2.8h,  v0.8h
+        mul             v3.8h,  v3.8h,  v0.8h
+        addp            v2.8h,  v2.8h,  v3.8h
+        addp            v2.8h,  v2.8h,  v2.8h
+        addp            v2.4h,  v2.4h,  v2.4h
+        sshll           v3.4s,  v2.4h,  #2
+        saddw           v2.4s,  v3.4s,  v2.4h
+        rshrn           v4.4h,  v2.4s,  #6
+        trn2            v5.4h,  v4.4h,  v4.4h
+        add             v2.4h,  v4.4h,  v5.4h
+        shl             v3.4h,  v2.4h,  #3
+        ext             v7.16b, v7.16b, v7.16b, #14
+        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
+        add             v7.4h,  v7.4h,  v0.4h
+        shl             v2.4h,  v7.4h,  #4
+        ssubl           v2.4s,  v2.4h,  v3.4h
+        shl             v3.4h,  v4.4h,  #4
+        ext             v0.16b, v0.16b, v0.16b, #14
+        ssubl           v6.4s,  v5.4h,  v3.4h
+
+        mov             v0.h[0],  wzr
+        mul             v0.8h,  v0.8h,  v4.h[0]
+        dup             v16.4s, v2.s[0]
+        dup             v17.4s, v2.s[0]
+        dup             v2.8h,  v4.h[0]
+        dup             v3.4s,  v6.s[0]
+        shl             v2.8h,  v2.8h,  #3
+        saddw           v16.4s, v16.4s, v0.4h
+        saddw2          v17.4s, v17.4s, v0.8h
+        saddw           v3.4s,  v3.4s,  v2.4h
+
+        mov             w3,  #16
+        mvni            v4.8h,  #0xFC,  lsl #8 // 1023 for clipping
+1:
+        sqshrun         v0.4h,  v16.4s, #5
+        sqshrun2        v0.8h,  v17.4s, #5
+        saddw           v16.4s, v16.4s, v2.4h
+        saddw           v17.4s, v17.4s, v2.4h
+        sqshrun         v1.4h,  v16.4s, #5
+        sqshrun2        v1.8h,  v17.4s, #5
+        add             v16.4s, v16.4s, v3.4s
+        add             v17.4s, v17.4s, v3.4s
+
+        subs            w3,  w3,  #1
+
+        smin            v0.8h,  v0.8h,  v4.8h
+        smin            v1.8h,  v1.8h,  v4.8h
+
+        st1             {v0.8h, v1.8h}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+// 8x8, 16-bit samples: every row is filled with the sample to its left.
+function ff_pred8x8_hor_neon_10, export=1
+        sub             x2,  x0,  #2
+        mov             w3,  #8
+
+1:      ld1r            {v0.8h},  [x2],  x1
+        subs            w3,  w3,  #1
+        st1             {v0.8h},  [x0],  x1
+        b.ne            1b
+        ret
+endfunc
+
+// 8x8, 16-bit samples: every row is a copy of the row above the block.
+function ff_pred8x8_vert_neon_10, export=1
+        sub             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+
+        ld1             {v0.8h},  [x2],  x1
+        mov             w3,  #4
+1:      subs            w3,  w3,  #1
+        st1             {v0.8h},  [x0],  x1
+        st1             {v0.8h},  [x2],  x1
+        b.ne            1b
+        ret
+endfunc
+
+// 8x8 plane prediction for 16-bit samples; 32-bit row accumulators (v1/v2)
+// and a final clamp to 1023, otherwise structured like ff_pred8x8_plane_neon.
+function ff_pred8x8_plane_neon_10, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p8weight
+        movrel          x5,  p16weight
+        add             x2,  x3,  #8
+        sub             x3,  x3,  #2
+        ld1             {v0.d}[0],  [x3]
+        ld1             {v2.d}[0],  [x2],  x1
+        ldcol.16        v0,  x3,  x1,  hi=1
+        add             x3,  x3,  x1
+        ldcol.16        v3,  x3,  x1,  4
+        add             v7.8h,  v2.8h,  v3.8h
+        rev64           v0.8h,  v0.8h
+        trn1            v2.2d,  v2.2d,  v3.2d
+        sub             v2.8h,  v2.8h,  v0.8h
+        ld1             {v6.8h},  [x4]
+        mul             v2.8h,  v2.8h,  v6.8h
+        ld1             {v0.8h},  [x5]
+        saddlp          v2.4s,  v2.8h
+        addp            v2.4s,  v2.4s,  v2.4s
+        shl             v3.4s,  v2.4s,  #4
+        add             v2.4s,  v3.4s,  v2.4s
+        rshrn           v5.4h,  v2.4s,  #5
+        addp            v2.4h,  v5.4h,  v5.4h
+        shl             v3.4h,  v2.4h,  #1
+        add             v3.4h,  v3.4h,  v2.4h
+        rev64           v7.4h,  v7.4h
+        add             v7.4h,  v7.4h,  v0.4h
+        shl             v2.4h,  v7.4h,  #4
+        ssubl           v2.4s,  v2.4h,  v3.4h
+        ext             v0.16b, v0.16b, v0.16b, #14
+        mov             v0.h[0],  wzr
+        mul             v0.8h,  v0.8h,  v5.h[0]
+        dup             v1.4s,  v2.s[0]
+        dup             v2.4s,  v2.s[0]
+        dup             v3.8h,  v5.h[1]
+        saddw           v1.4s,  v1.4s,  v0.4h
+        saddw2          v2.4s,  v2.4s,  v0.8h
+        mov             w3,  #8
+        mvni            v4.8h,  #0xFC,  lsl #8 // 1023 for clipping
+1:
+        sqshrun         v0.4h,  v1.4s,  #5
+        sqshrun2        v0.8h,  v2.4s,  #5
+
+        saddw           v1.4s,  v1.4s,  v3.4h
+        saddw           v2.4s,  v2.4s,  v3.4h
+
+        subs            w3,  w3,  #1
+
+        smin            v0.8h,  v0.8h,  v4.8h
+
+        st1             {v0.8h},  [x0],  x1
+        b.ne            1b
+        ret
+endfunc
+
+// The 8x8 DC routines for 16-bit samples all end in .L_pred8x8_dc_10_end,
+// which stores v0 to rows 0-3 and v1 to rows 4-7. The low 64 bits of each
+// register cover the left 4 columns, the high 64 bits the right 4 columns.
+
+// 8x8, 16-bit samples: fill with the constant 512.
+function ff_pred8x8_128_dc_neon_10, export=1
+        movi            v0.8h,  #2,  lsl #8      // 512, 1 << (bit_depth - 1)
+        movi            v1.8h,  #2,  lsl #8
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+// 8x8, 16-bit samples: each 4-column half is filled, for all 8 rows, with
+// the rounded average of the four samples above it.
+function ff_pred8x8_top_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+        ld1             {v0.8h},  [x2]
+
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v0.4h,  v0.4h,  v0.4h
+        zip1            v0.4h,  v0.4h,  v0.4h
+        urshr           v2.4h,  v0.4h,  #2
+        zip1            v0.8h,  v2.8h,  v2.8h
+        zip1            v1.8h,  v2.8h,  v2.8h
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+// 8x8, 16-bit samples: rows 0-3 are filled with the rounded average of left
+// samples 0-3, rows 4-7 with the rounded average of left samples 4-7.
+function ff_pred8x8_left_dc_neon_10, export=1
+        sub             x2,  x0,  #2
+        ldcol.16        v0,  x2,  x1,  8
+
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v0.4h,  v0.4h,  v0.4h
+        urshr           v2.4h,  v0.4h,  #2
+        dup             v1.8h,  v2.h[1]
+        dup             v0.8h,  v2.h[0]
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+// 8x8 DC for 16-bit samples with both neighbours, per 4x4 quadrant, using
+// the same combination of top and left sums as ff_pred8x8_dc_neon.
+function ff_pred8x8_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #2
+
+        ld1             {v0.8h}, [x2]
+        ldcol.16        v1,  x3,  x1,  8
+
+        addp            v0.8h,  v0.8h, v0.8h
+        addp            v1.8h,  v1.8h, v1.8h
+        trn1            v2.2s,  v0.2s,  v1.2s
+        trn2            v3.2s,  v0.2s,  v1.2s
+        addp            v4.4h,  v2.4h,  v3.4h
+        addp            v5.4h,  v4.4h,  v4.4h
+        urshr           v6.4h,  v5.4h,  #3
+        urshr           v7.4h,  v4.4h,  #2
+        dup             v0.8h,  v6.h[0]
+        dup             v2.8h,  v7.h[2]
+        dup             v1.8h,  v7.h[3]
+        dup             v3.8h,  v6.h[1]
+        zip1            v0.2d,  v0.2d,  v2.2d
+        zip1            v1.2d,  v1.2d,  v3.2d
+.L_pred8x8_dc_10_end:
+        mov             w3,  #4
+        add             x2,  x0,  x1,  lsl #2   // x2 = row 4
+
+6:      st1             {v0.8h},  [x0],  x1
+        subs            w3,  w3,  #1
+        st1             {v1.8h},  [x2],  x1
+        b.ne            6b
+        ret
+endfunc
+
+// 8x8 DC for 16-bit samples using the top row and only left samples 0-3.
+function ff_pred8x8_l0t_dc_neon_10, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #2
+
+        ld1             {v0.8h},  [x2]
+        ldcol.16        v1,  x3,  x1,  4
+
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v1.4h,  v1.4h,  v1.4h
+        addp            v0.4h,  v0.4h,  v0.4h
+        addp            v1.4h,  v1.4h,  v1.4h
+        add             v1.4h,  v1.4h,  v0.4h
+
+        urshr           v2.4h,  v0.4h,  #2
+        urshr           v3.4h,  v1.4h,  #3      // the pred4x4 part
+
+        dup             v4.4h,  v3.h[0]
+        dup             v5.4h,  v2.h[0]
+        dup             v6.4h,  v2.h[1]
+
+        zip1            v0.2d,  v4.2d,  v6.2d
+        zip1            v1.2d,  v5.2d,  v6.2d
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+// 8x8 DC for 16-bit samples using only left samples 0-3: rows 0-3 get their
+// rounded average, rows 4-7 are filled with 512.
+function ff_pred8x8_l00_dc_neon_10, export=1
+        sub             x2,  x0,  #2
+
+        ldcol.16        v0,  x2,  x1,  4
+
+        addp            v0.4h,  v0.4h,  v0.4h
+        addp            v0.4h,  v0.4h,  v0.4h
+        urshr           v0.4h,  v0.4h,  #2
+
+        movi            v1.8h,  #2,  lsl #8      // 512
+        dup             v0.8h,  v0.h[0]
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+// 8x8 DC for 16-bit samples using the top row and only left samples 4-7.
+function ff_pred8x8_0lt_dc_neon_10, export=1
+        add             x3,  x0,  x1,  lsl #2
+        sub             x2,  x0,  x1
+        sub             x3,  x3,  #2
+
+        ld1             {v0.8h},  [x2]
+        ldcol.16        v1,  x3,  x1,  hi=1
+
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v1.8h,  v1.8h,  v1.8h
+        addp            v0.4h,  v0.4h,  v0.4h
+        addp            v1.4h,  v1.4h,  v1.4h
+        zip1            v0.2s,  v0.2s,  v1.2s
+        add             v1.4h,  v0.4h,  v1.4h
+
+        urshr           v2.4h,  v0.4h,  #2
+        urshr           v3.4h,  v1.4h,  #3
+
+        dup             v4.4h,  v2.h[0]
+        dup             v5.4h,  v2.h[3]
+        dup             v6.4h,  v2.h[1]
+        dup             v7.4h,  v3.h[1]
+
+        zip1            v0.2d,  v4.2d,  v6.2d
+        zip1            v1.2d,  v5.2d,  v7.2d
+        b               .L_pred8x8_dc_10_end
+endfunc
+
+// 8x8 DC for 16-bit samples using only left samples 4-7: rows 0-3 are
+// filled with 512, rows 4-7 get the rounded average of those four samples.
+function ff_pred8x8_0l0_dc_neon_10, export=1
+        add             x2,  x0,  x1,  lsl #2
+        sub             x2,  x2,  #2
+
+        ldcol.16        v1,  x2,  x1,  4
+
+        addp            v2.8h,  v1.8h,  v1.8h
+        addp            v2.4h,  v2.4h,  v2.4h
+        urshr           v1.4h,  v2.4h,  #2
+
+        movi            v0.8h,  #2,  lsl #8      // 512
+        dup             v1.8h,  v1.h[0]
+        b               .L_pred8x8_dc_10_end
+endfunc
diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
new file mode 100644
index 0000000000..144ae2bcc4
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -0,0 +1,123 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hpeldsp.h"
+
+/* put variants, installed in put_pixels_tab */
+void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
+                          ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+
+/* put no_rnd variants, installed in put_no_rnd_pixels_tab */
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                   ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
+
+/* avg variants, installed in avg_pixels_tab */
+void ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
+                          ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
+                         ptrdiff_t line_size, int h);
+void ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
+void ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+
+/* avg no_rnd variants (16 wide only), installed in avg_no_rnd_pixels_tab */
+void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+
+/**
+ * Install the NEON routines declared above into the function tables of
+ * @p c when have_neon() accepts the flags returned by av_get_cpu_flags();
+ * otherwise @p c is left untouched.
+ *
+ * In the two-dimensional tables the first index selects the 16-wide ([0])
+ * or 8-wide ([1]) routines and the second index selects the plain, x2, y2
+ * and xy2 variant in that order. Index 0 of the no_rnd tables reuses the
+ * regular put/avg routine; no separate no_rnd version is declared for it.
+ *
+ * @param c     context whose function tables are updated
+ * @param flags not used by this function
+ */
+av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
+        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
+        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
+        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
+        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
+
+        /* this table has a single index: plain, x2, y2, xy2 (16 wide) */
+        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
+        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
+        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
+        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
+    }
+}
diff --git a/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
new file mode 100644
index 0000000000..a491c173bb
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
@@ -0,0 +1,397 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +.macro pixels16 rnd=1, avg=0 + .if \avg + mov x12, x0 + .endif +1: ld1 {v0.16B}, [x1], x2 + ld1 {v1.16B}, [x1], x2 + ld1 {v2.16B}, [x1], x2 + ld1 {v3.16B}, [x1], x2 + .if \avg + ld1 {v4.16B}, [x12], x2 + urhadd v0.16B, v0.16B, v4.16B + ld1 {v5.16B}, [x12], x2 + urhadd v1.16B, v1.16B, v5.16B + ld1 {v6.16B}, [x12], x2 + urhadd v2.16B, v2.16B, v6.16B + ld1 {v7.16B}, [x12], x2 + urhadd v3.16B, v3.16B, v7.16B + .endif + subs w3, w3, #4 + st1 {v0.16B}, [x0], x2 + st1 {v1.16B}, [x0], x2 + st1 {v2.16B}, [x0], x2 + st1 {v3.16B}, [x0], x2 + b.ne 1b + ret +.endm + +.macro pixels16_x2 rnd=1, avg=0 +1: ld1 {v0.16B, v1.16B}, [x1], x2 + ld1 {v2.16B, v3.16B}, [x1], x2 + subs w3, w3, #2 + ext v1.16B, v0.16B, v1.16B, #1 + avg v0.16B, v0.16B, v1.16B + ext v3.16B, v2.16B, v3.16B, #1 + avg v2.16B, v2.16B, v3.16B + .if \avg + ld1 {v1.16B}, [x0], x2 + ld1 {v3.16B}, [x0] + urhadd v0.16B, v0.16B, v1.16B + urhadd v2.16B, v2.16B, v3.16B + sub x0, x0, x2 + .endif + st1 {v0.16B}, [x0], x2 + st1 {v2.16B}, [x0], x2 + b.ne 1b + ret +.endm + +.macro pixels16_y2 rnd=1, avg=0 + sub w3, w3, #2 + ld1 {v0.16B}, [x1], x2 + ld1 {v1.16B}, [x1], x2 +1: subs w3, w3, #2 + avg v2.16B, v0.16B, v1.16B + ld1 {v0.16B}, [x1], x2 + avg v3.16B, v0.16B, v1.16B + ld1 {v1.16B}, [x1], x2 + .if \avg + ld1 {v4.16B}, [x0], x2 + ld1 {v5.16B}, [x0] + urhadd v2.16B, v2.16B, v4.16B + urhadd v3.16B, v3.16B, v5.16B + sub x0, x0, x2 + .endif + st1 {v2.16B}, [x0], x2 + st1 {v3.16B}, [x0], x2 + b.ne 1b + + avg v2.16B, v0.16B, v1.16B + ld1 {v0.16B}, [x1], x2 + avg v3.16B, v0.16B, v1.16B + .if \avg + ld1 {v4.16B}, [x0], x2 + ld1 {v5.16B}, [x0] + urhadd v2.16B, v2.16B, v4.16B + urhadd v3.16B, v3.16B, v5.16B + sub x0, x0, x2 + .endif + st1 {v2.16B}, [x0], x2 + st1 {v3.16B}, 
[x0], x2 + + ret +.endm + +.macro pixels16_xy2 rnd=1, avg=0 + sub w3, w3, #2 + ld1 {v0.16B, v1.16B}, [x1], x2 + ld1 {v4.16B, v5.16B}, [x1], x2 +NRND movi v26.8H, #1 + ext v1.16B, v0.16B, v1.16B, #1 + ext v5.16B, v4.16B, v5.16B, #1 + uaddl v16.8H, v0.8B, v1.8B + uaddl2 v20.8H, v0.16B, v1.16B + uaddl v18.8H, v4.8B, v5.8B + uaddl2 v22.8H, v4.16B, v5.16B +1: subs w3, w3, #2 + ld1 {v0.16B, v1.16B}, [x1], x2 + add v24.8H, v16.8H, v18.8H +NRND add v24.8H, v24.8H, v26.8H + ext v30.16B, v0.16B, v1.16B, #1 + add v1.8H, v20.8H, v22.8H + mshrn v28.8B, v24.8H, #2 +NRND add v1.8H, v1.8H, v26.8H + mshrn2 v28.16B, v1.8H, #2 + .if \avg + ld1 {v16.16B}, [x0] + urhadd v28.16B, v28.16B, v16.16B + .endif + uaddl v16.8H, v0.8B, v30.8B + ld1 {v2.16B, v3.16B}, [x1], x2 + uaddl2 v20.8H, v0.16B, v30.16B + st1 {v28.16B}, [x0], x2 + add v24.8H, v16.8H, v18.8H +NRND add v24.8H, v24.8H, v26.8H + ext v3.16B, v2.16B, v3.16B, #1 + add v0.8H, v20.8H, v22.8H + mshrn v30.8B, v24.8H, #2 +NRND add v0.8H, v0.8H, v26.8H + mshrn2 v30.16B, v0.8H, #2 + .if \avg + ld1 {v18.16B}, [x0] + urhadd v30.16B, v30.16B, v18.16B + .endif + uaddl v18.8H, v2.8B, v3.8B + uaddl2 v22.8H, v2.16B, v3.16B + st1 {v30.16B}, [x0], x2 + b.gt 1b + + ld1 {v0.16B, v1.16B}, [x1], x2 + add v24.8H, v16.8H, v18.8H +NRND add v24.8H, v24.8H, v26.8H + ext v30.16B, v0.16B, v1.16B, #1 + add v1.8H, v20.8H, v22.8H + mshrn v28.8B, v24.8H, #2 +NRND add v1.8H, v1.8H, v26.8H + mshrn2 v28.16B, v1.8H, #2 + .if \avg + ld1 {v16.16B}, [x0] + urhadd v28.16B, v28.16B, v16.16B + .endif + uaddl v16.8H, v0.8B, v30.8B + uaddl2 v20.8H, v0.16B, v30.16B + st1 {v28.16B}, [x0], x2 + add v24.8H, v16.8H, v18.8H +NRND add v24.8H, v24.8H, v26.8H + add v0.8H, v20.8H, v22.8H + mshrn v30.8B, v24.8H, #2 +NRND add v0.8H, v0.8H, v26.8H + mshrn2 v30.16B, v0.8H, #2 + .if \avg + ld1 {v18.16B}, [x0] + urhadd v30.16B, v30.16B, v18.16B + .endif + st1 {v30.16B}, [x0], x2 + + ret +.endm + +.macro pixels8 rnd=1, avg=0 +1: ld1 {v0.8B}, [x1], x2 + ld1 {v1.8B}, [x1], x2 + ld1 {v2.8B}, 
[x1], x2 + ld1 {v3.8B}, [x1], x2 + .if \avg + ld1 {v4.8B}, [x0], x2 + urhadd v0.8B, v0.8B, v4.8B + ld1 {v5.8B}, [x0], x2 + urhadd v1.8B, v1.8B, v5.8B + ld1 {v6.8B}, [x0], x2 + urhadd v2.8B, v2.8B, v6.8B + ld1 {v7.8B}, [x0], x2 + urhadd v3.8B, v3.8B, v7.8B + sub x0, x0, x2, lsl #2 + .endif + subs w3, w3, #4 + st1 {v0.8B}, [x0], x2 + st1 {v1.8B}, [x0], x2 + st1 {v2.8B}, [x0], x2 + st1 {v3.8B}, [x0], x2 + b.ne 1b + ret +.endm + +.macro pixels8_x2 rnd=1, avg=0 +1: ld1 {v0.8B, v1.8B}, [x1], x2 + ext v1.8B, v0.8B, v1.8B, #1 + ld1 {v2.8B, v3.8B}, [x1], x2 + ext v3.8B, v2.8B, v3.8B, #1 + subs w3, w3, #2 + avg v0.8B, v0.8B, v1.8B + avg v2.8B, v2.8B, v3.8B + .if \avg + ld1 {v4.8B}, [x0], x2 + ld1 {v5.8B}, [x0] + urhadd v0.8B, v0.8B, v4.8B + urhadd v2.8B, v2.8B, v5.8B + sub x0, x0, x2 + .endif + st1 {v0.8B}, [x0], x2 + st1 {v2.8B}, [x0], x2 + b.ne 1b + ret +.endm + +.macro pixels8_y2 rnd=1, avg=0 + sub w3, w3, #2 + ld1 {v0.8B}, [x1], x2 + ld1 {v1.8B}, [x1], x2 +1: subs w3, w3, #2 + avg v4.8B, v0.8B, v1.8B + ld1 {v0.8B}, [x1], x2 + avg v5.8B, v0.8B, v1.8B + ld1 {v1.8B}, [x1], x2 + .if \avg + ld1 {v2.8B}, [x0], x2 + ld1 {v3.8B}, [x0] + urhadd v4.8B, v4.8B, v2.8B + urhadd v5.8B, v5.8B, v3.8B + sub x0, x0, x2 + .endif + st1 {v4.8B}, [x0], x2 + st1 {v5.8B}, [x0], x2 + b.ne 1b + + avg v4.8B, v0.8B, v1.8B + ld1 {v0.8B}, [x1], x2 + avg v5.8B, v0.8B, v1.8B + .if \avg + ld1 {v2.8B}, [x0], x2 + ld1 {v3.8B}, [x0] + urhadd v4.8B, v4.8B, v2.8B + urhadd v5.8B, v5.8B, v3.8B + sub x0, x0, x2 + .endif + st1 {v4.8B}, [x0], x2 + st1 {v5.8B}, [x0], x2 + + ret +.endm + +.macro pixels8_xy2 rnd=1, avg=0 + sub w3, w3, #2 + ld1 {v0.16B}, [x1], x2 + ld1 {v1.16B}, [x1], x2 +NRND movi v19.8H, #1 + ext v4.16B, v0.16B, v4.16B, #1 + ext v6.16B, v1.16B, v6.16B, #1 + uaddl v16.8H, v0.8B, v4.8B + uaddl v17.8H, v1.8B, v6.8B +1: subs w3, w3, #2 + ld1 {v0.16B}, [x1], x2 + add v18.8H, v16.8H, v17.8H + ext v4.16B, v0.16B, v4.16B, #1 +NRND add v18.8H, v18.8H, v19.8H + uaddl v16.8H, v0.8B, v4.8B + mshrn v5.8B, 
v18.8H, #2 + ld1 {v1.16B}, [x1], x2 + add v18.8H, v16.8H, v17.8H + .if \avg + ld1 {v7.8B}, [x0] + urhadd v5.8B, v5.8B, v7.8B + .endif +NRND add v18.8H, v18.8H, v19.8H + st1 {v5.8B}, [x0], x2 + mshrn v7.8B, v18.8H, #2 + .if \avg + ld1 {v5.8B}, [x0] + urhadd v7.8B, v7.8B, v5.8B + .endif + ext v6.16B, v1.16B, v6.16B, #1 + uaddl v17.8H, v1.8B, v6.8B + st1 {v7.8B}, [x0], x2 + b.gt 1b + + ld1 {v0.16B}, [x1], x2 + add v18.8H, v16.8H, v17.8H + ext v4.16B, v0.16B, v4.16B, #1 +NRND add v18.8H, v18.8H, v19.8H + uaddl v16.8H, v0.8B, v4.8B + mshrn v5.8B, v18.8H, #2 + add v18.8H, v16.8H, v17.8H + .if \avg + ld1 {v7.8B}, [x0] + urhadd v5.8B, v5.8B, v7.8B + .endif +NRND add v18.8H, v18.8H, v19.8H + st1 {v5.8B}, [x0], x2 + mshrn v7.8B, v18.8H, #2 + .if \avg + ld1 {v5.8B}, [x0] + urhadd v7.8B, v7.8B, v5.8B + .endif + st1 {v7.8B}, [x0], x2 + + ret +.endm + +.macro pixfunc pfx, name, suf, rnd=1, avg=0 + .if \rnd + .macro avg rd, rn, rm + urhadd \rd, \rn, \rm + .endm + .macro mshrn rd, rn, rm + rshrn \rd, \rn, \rm + .endm + .macro mshrn2 rd, rn, rm + rshrn2 \rd, \rn, \rm + .endm + .macro NRND insn:vararg + .endm + .else + .macro avg rd, rn, rm + uhadd \rd, \rn, \rm + .endm + .macro mshrn rd, rn, rm + shrn \rd, \rn, \rm + .endm + .macro mshrn2 rd, rn, rm + shrn2 \rd, \rn, \rm + .endm + .macro NRND insn:vararg + \insn + .endm + .endif +function ff_\pfx\name\suf\()_neon, export=1 + \name \rnd, \avg +endfunc + .purgem avg + .purgem mshrn + .purgem mshrn2 + .purgem NRND +.endm + +.macro pixfunc2 pfx, name, avg=0 + pixfunc \pfx, \name, rnd=1, avg=\avg + pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg +.endm + +function ff_put_h264_qpel16_mc00_neon, export=1 + mov w3, #16 +endfunc + + pixfunc put_, pixels16, avg=0 + pixfunc2 put_, pixels16_x2, avg=0 + pixfunc2 put_, pixels16_y2, avg=0 + pixfunc2 put_, pixels16_xy2, avg=0 + +function ff_avg_h264_qpel16_mc00_neon, export=1 + mov w3, #16 +endfunc + + pixfunc avg_, pixels16, avg=1 + pixfunc2 avg_, pixels16_x2, avg=1 + pixfunc2 avg_, pixels16_y2, 
avg=1 + pixfunc2 avg_, pixels16_xy2, avg=1 + +function ff_put_h264_qpel8_mc00_neon, export=1 + mov w3, #8 +endfunc + + pixfunc put_, pixels8, avg=0 + pixfunc2 put_, pixels8_x2, avg=0 + pixfunc2 put_, pixels8_y2, avg=0 + pixfunc2 put_, pixels8_xy2, avg=0 + +function ff_avg_h264_qpel8_mc00_neon, export=1 + mov w3, #8 +endfunc + + pixfunc avg_, pixels8, avg=1 + pixfunc avg_, pixels8_x2, avg=1 + pixfunc avg_, pixels8_y2, avg=1 + pixfunc avg_, pixels8_xy2, avg=1 diff --git a/media/ffvpx/libavcodec/aarch64/idct.h b/media/ffvpx/libavcodec/aarch64/idct.h new file mode 100644 index 0000000000..97ee0a64af --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/idct.h @@ -0,0 +1,29 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AARCH64_IDCT_H +#define AVCODEC_AARCH64_IDCT_H + +#include <stddef.h> +#include <stdint.h> +/* Prototypes only; the routines are defined outside this header. NOTE(review): judging by the names, the first presumably transforms data in place and the put/add variants write to / accumulate into dest with a row stride of line_size bytes - confirm against the assembly implementation. */ +void ff_simple_idct_neon(int16_t *data); +void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +#endif /* AVCODEC_AARCH64_IDCT_H */ diff --git a/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c new file mode 100644 index 0000000000..eec21aa5a2 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c @@ -0,0 +1,55 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" + +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +/* Install the NEON IDCT and clamped-pixel routines into c when the CPU reports NEON; otherwise c is not written. The simple IDCT (and its PARTTRANS permutation) is selected only when lowres is off, high_bit_depth is zero and idct_algo is AUTO, SIMPLEAUTO or SIMPLENEON; the three *_pixels_clamped pointers are set for every NEON-capable call. */ +av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); /* runtime CPU feature bits */ + + if (have_neon(cpu_flags)) { + if (!avctx->lowres && !high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->perm_type = FF_IDCT_PERM_PARTTRANS; /* coefficient order set together with the NEON IDCT */ + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + } +} diff --git a/media/ffvpx/libavcodec/aarch64/idctdsp_neon.S b/media/ffvpx/libavcodec/aarch64/idctdsp_neon.S new file mode 100644 index 0000000000..7f47611206 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/idctdsp_neon.S @@ -0,0 +1,130 @@ +/* + * IDCT AArch64 NEON optimisations + * + * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// Clamp 16-bit signed block coefficients to unsigned 8-bit +// On entry: +// x0 -> array of 64x 16-bit coefficients +// x1 -> 8-bit results +// x2 = row stride for results, bytes +function ff_put_pixels_clamped_neon, export=1 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + sqxtun v4.8b, v4.8h + st1 {v0.8b}, [x1], x2 + sqxtun v0.8b, v5.8h + st1 {v1.8b}, [x1], x2 + sqxtun v1.8b, v6.8h + st1 {v2.8b}, [x1], x2 + sqxtun v2.8b, v7.8h + st1 {v3.8b}, [x1], x2 + st1 {v4.8b}, [x1], x2 + st1 {v0.8b}, [x1], x2 + st1 {v1.8b}, [x1], x2 + st1 {v2.8b}, [x1] + ret +endfunc + +// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128) +// On entry: +// x0 -> array of 64x 16-bit coefficients +// x1 -> 8-bit results +// x2 = row stride for results, bytes +function ff_put_signed_pixels_clamped_neon, export=1 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + movi v4.8b, #128 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] + sqxtn v0.8b, v0.8h + sqxtn v1.8b, v1.8h + sqxtn v2.8b, v2.8h + sqxtn v3.8b, v3.8h + sqxtn v5.8b, v16.8h + add v0.8b, v0.8b, v4.8b + sqxtn v6.8b, v17.8h + add v1.8b, v1.8b, v4.8b + sqxtn v7.8b, v18.8h + add v2.8b, v2.8b, v4.8b + sqxtn v16.8b, v19.8h + add v3.8b, v3.8b, v4.8b + st1 {v0.8b}, [x1], x2 + add v0.8b, v5.8b, v4.8b + st1 {v1.8b}, [x1], x2 + add v1.8b, v6.8b, v4.8b + st1 {v2.8b}, [x1], x2 + add 
v2.8b, v7.8b, v4.8b + st1 {v3.8b}, [x1], x2 + add v3.8b, v16.8b, v4.8b + st1 {v0.8b}, [x1], x2 + st1 {v1.8b}, [x1], x2 + st1 {v2.8b}, [x1], x2 + st1 {v3.8b}, [x1] + ret +endfunc + +// Add 16-bit signed block coefficients to unsigned 8-bit +// On entry: +// x0 -> array of 64x 16-bit coefficients +// x1 -> 8-bit input and results +// x2 = row stride for 8-bit input and results, bytes +function ff_add_pixels_clamped_neon, export=1 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + mov x3, x1 + ld1 {v4.8b}, [x1], x2 + ld1 {v5.8b}, [x1], x2 + ld1 {v6.8b}, [x1], x2 + ld1 {v7.8b}, [x1], x2 + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] + uaddw v0.8h, v0.8h, v4.8b + uaddw v1.8h, v1.8h, v5.8b + uaddw v2.8h, v2.8h, v6.8b + ld1 {v4.8b}, [x1], x2 + uaddw v3.8h, v3.8h, v7.8b + ld1 {v5.8b}, [x1], x2 + sqxtun v0.8b, v0.8h + ld1 {v6.8b}, [x1], x2 + sqxtun v1.8b, v1.8h + ld1 {v7.8b}, [x1] + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + uaddw v4.8h, v16.8h, v4.8b + st1 {v0.8b}, [x3], x2 + uaddw v0.8h, v17.8h, v5.8b + st1 {v1.8b}, [x3], x2 + uaddw v1.8h, v18.8h, v6.8b + st1 {v2.8b}, [x3], x2 + uaddw v2.8h, v19.8h, v7.8b + sqxtun v4.8b, v4.8h + sqxtun v0.8b, v0.8h + st1 {v3.8b}, [x3], x2 + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + st1 {v4.8b}, [x3], x2 + st1 {v0.8b}, [x3], x2 + st1 {v1.8b}, [x3], x2 + st1 {v2.8b}, [x3] + ret +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/mdct_neon.S b/media/ffvpx/libavcodec/aarch64/mdct_neon.S new file mode 100644 index 0000000000..98b09bf1ab --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/mdct_neon.S @@ -0,0 +1,326 @@ +/* + * AArch64 NEON optimised MDCT + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +function ff_imdct_half_neon, export=1 + stp x19, x20, [sp, #-32]! + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #16] + mov x12, #1 + ldr w14, [x0, #28] // mdct_bits + ldr x4, [x0, #32] // tcos + ldr x3, [x0, #8] // revtab + lsl x12, x12, x14 // n = 1 << nbits + lsr x14, x12, #2 // n4 = n >> 2 + add x7, x2, x12, lsl #1 + mov x12, #-16 + sub x7, x7, #16 + + ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0 + ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x + rev64 v17.2s, v17.2s + ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2 + fmul v6.2s, v17.2s, v2.2s + fmul v7.2s, v0.2s, v2.2s +1: + subs x14, x14, #2 + ldr w6, [x3], #4 + fmul v4.2s, v0.2s, v3.2s + fmul v5.2s, v17.2s, v3.2s + fsub v4.2s, v6.2s, v4.2s + fadd v5.2s, v5.2s, v7.2s + ubfm x8, x6, #16, #31 + ubfm x6, x6, #0, #15 + add x8, x1, x8, lsl #3 + add x6, x1, x6, lsl #3 + b.eq 2f + ld2 {v16.2s,v17.2s}, [x7], x12 + ld2 {v0.2s,v1.2s}, [x2], #16 + rev64 v17.2s, v17.2s + ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2 + fmul v6.2s, v17.2s, v2.2s + fmul v7.2s, v0.2s, v2.2s + st2 {v4.s,v5.s}[0], [x6] + st2 {v4.s,v5.s}[1], [x8] + b 1b +2: + st2 {v4.s,v5.s}[0], [x6] + st2 {v4.s,v5.s}[1], [x8] + + mov x19, x0 + mov x20, x1 + bl 
X(ff_fft_calc_neon) + + mov x12, #1 + ldr w14, [x19, #28] // mdct_bits + ldr x4, [x19, #32] // tcos + lsl x12, x12, x14 // n = 1 << nbits + lsr x14, x12, #3 // n8 = n >> 3 + + add x4, x4, x14, lsl #3 + add x6, x20, x14, lsl #3 + sub x1, x4, #16 + sub x3, x6, #16 + + mov x7, #-16 + mov x8, x6 + mov x0, x3 + + ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0 + ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3 + ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0 +3: + subs x14, x14, #2 + fmul v7.2s, v0.2s, v17.2s + ld2 {v18.2s,v19.2s},[x4], #16 // d17=c2,c3 d19=s2,s3 + fmul v4.2s, v1.2s, v17.2s + fmul v6.2s, v21.2s, v19.2s + fmul v5.2s, v20.2s, v19.2s + fmul v22.2s, v1.2s, v16.2s + fmul v23.2s, v21.2s, v18.2s + fmul v24.2s, v0.2s, v16.2s + fmul v25.2s, v20.2s, v18.2s + fadd v7.2s, v7.2s, v22.2s + fadd v5.2s, v5.2s, v23.2s + fsub v4.2s, v4.2s, v24.2s + fsub v6.2s, v6.2s, v25.2s + b.eq 4f + ld2 {v0.2s,v1.2s}, [x3], x7 + ld2 {v20.2s,v21.2s},[x6], #16 + ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0 + rev64 v5.2s, v5.2s + rev64 v7.2s, v7.2s + st2 {v4.2s,v5.2s}, [x0], x7 + st2 {v6.2s,v7.2s}, [x8], #16 + b 3b +4: + rev64 v5.2s, v5.2s + rev64 v7.2s, v7.2s + st2 {v4.2s,v5.2s}, [x0] + st2 {v6.2s,v7.2s}, [x8] + + ldr x30, [sp, #16] + AARCH64_VALIDATE_LINK_REGISTER + ldp x19, x20, [sp], #32 + + ret +endfunc + +function ff_imdct_calc_neon, export=1 + stp x19, x20, [sp, #-32]! 
+ AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #16] + ldr w3, [x0, #28] // mdct_bits + mov x19, #1 + mov x20, x1 + lsl x19, x19, x3 + add x1, x1, x19 + + bl X(ff_imdct_half_neon) + + add x0, x20, x19, lsl #2 + add x1, x20, x19, lsl #1 + sub x0, x0, #8 + sub x2, x1, #16 + mov x3, #-16 + mov x6, #-8 +1: + ld1 {v0.4s}, [x2], x3 + prfum pldl1keep, [x0, #-16] + rev64 v0.4s, v0.4s + ld1 {v2.2s,v3.2s}, [x1], #16 + fneg v4.4s, v0.4s + prfum pldl1keep, [x2, #-16] + rev64 v2.2s, v2.2s + rev64 v3.2s, v3.2s + ext v4.16b, v4.16b, v4.16b, #8 + st1 {v2.2s}, [x0], x6 + st1 {v3.2s}, [x0], x6 + st1 {v4.4s}, [x20], #16 + subs x19, x19, #16 + b.gt 1b + + ldr x30, [sp, #16] + AARCH64_VALIDATE_LINK_REGISTER + ldp x19, x20, [sp], #32 + + ret +endfunc + + +function ff_mdct_calc_neon, export=1 + stp x19, x20, [sp, #-32]! + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #16] + + mov x12, #1 + ldr w14, [x0, #28] // mdct_bits + ldr x4, [x0, #32] // tcos + ldr x3, [x0, #8] // revtab + lsl x14, x12, x14 // n = 1 << nbits + add x7, x2, x14 // in4u + sub x9, x7, #16 // in4d + add x2, x7, x14, lsl #1 // in3u + add x8, x9, x14, lsl #1 // in3d + add x5, x4, x14, lsl #1 + sub x5, x5, #16 + sub x3, x3, #4 + mov x12, #-16 + lsr x13, x14, #1 + + ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0 + ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0 + ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0 + rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1 + rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1 + ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0 + fsub v0.2s, v17.2s, v0.2s // in4d-in4u I + ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1 + rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1 + rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1 + ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3 + fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R + fsub v16.2s, v16.2s, v1.2s // in0u-in2d R + fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I +1: + fmul v7.2s, v0.2s, v21.2s // I*s + ldr w10, [x3, x13] + fmul v6.2s, 
v2.2s, v20.2s // -R*c + ldr w6, [x3, #4]! + fmul v4.2s, v2.2s, v21.2s // -R*s + fmul v5.2s, v0.2s, v20.2s // I*c + fmul v24.2s, v16.2s, v30.2s // R*c + fmul v25.2s, v18.2s, v31.2s // -I*s + fmul v22.2s, v16.2s, v31.2s // R*s + fmul v23.2s, v18.2s, v30.2s // I*c + subs x14, x14, #16 + subs x13, x13, #8 + fsub v6.2s, v6.2s, v7.2s // -R*c-I*s + fadd v7.2s, v4.2s, v5.2s // -R*s+I*c + fsub v24.2s, v25.2s, v24.2s // I*s-R*c + fadd v25.2s, v22.2s, v23.2s // R*s-I*c + b.eq 1f + mov x12, #-16 + ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0 + ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0 + fneg v7.2s, v7.2s // R*s-I*c + ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0 + rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1 + rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1 + ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0 + fsub v0.2s, v17.2s, v0.2s // in4d-in4u I + ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1 + rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1 + rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1 + ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3 + fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R + fsub v16.2s, v16.2s, v1.2s // in0u-in2d R + fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I + ubfm x12, x6, #16, #31 + ubfm x6, x6, #0, #15 + add x12, x1, x12, lsl #3 + add x6, x1, x6, lsl #3 + st2 {v6.s,v7.s}[0], [x6] + st2 {v6.s,v7.s}[1], [x12] + ubfm x6, x10, #16, #31 + ubfm x10, x10, #0, #15 + add x6 , x1, x6, lsl #3 + add x10, x1, x10, lsl #3 + st2 {v24.s,v25.s}[0], [x10] + st2 {v24.s,v25.s}[1], [x6] + b 1b +1: + fneg v7.2s, v7.2s // R*s-I*c + ubfm x12, x6, #16, #31 + ubfm x6, x6, #0, #15 + add x12, x1, x12, lsl #3 + add x6, x1, x6, lsl #3 + st2 {v6.s,v7.s}[0], [x6] + st2 {v6.s,v7.s}[1], [x12] + ubfm x6, x10, #16, #31 + ubfm x10, x10, #0, #15 + add x6 , x1, x6, lsl #3 + add x10, x1, x10, lsl #3 + st2 {v24.s,v25.s}[0], [x10] + st2 {v24.s,v25.s}[1], [x6] + + mov x19, x0 + mov x20, x1 + bl X(ff_fft_calc_neon) + + mov x12, #1 + ldr w14, [x19, #28] // 
mdct_bits + ldr x4, [x19, #32] // tcos + lsl x12, x12, x14 // n = 1 << nbits + lsr x14, x12, #3 // n8 = n >> 3 + + add x4, x4, x14, lsl #3 + add x6, x20, x14, lsl #3 + sub x1, x4, #16 + sub x3, x6, #16 + + mov x7, #-16 + mov x8, x6 + mov x0, x3 + + ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0 + ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3 + ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0 +1: + subs x14, x14, #2 + fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0 + ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3 + fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0 + fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3 + fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3 + fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0 + fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3 + fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3 + fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0 + fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0 + fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3 + fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3 + fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0 + fneg v4.2s, v4.2s + fneg v6.2s, v6.2s + b.eq 1f + ld2 {v0.2s, v1.2s}, [x3], x7 + ld2 {v20.2s,v21.2s}, [x6], #16 + ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0 + rev64 v5.2s, v5.2s + rev64 v7.2s, v7.2s + st2 {v4.2s,v5.2s}, [x0], x7 + st2 {v6.2s,v7.2s}, [x8], #16 + b 1b +1: + rev64 v5.2s, v5.2s + rev64 v7.2s, v7.2s + st2 {v4.2s,v5.2s}, [x0] + st2 {v6.2s,v7.2s}, [x8] + + ldr x30, [sp, #16] + AARCH64_VALIDATE_LINK_REGISTER + ldp x19, x20, [sp], #32 + + ret +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/moz.build b/media/ffvpx/libavcodec/aarch64/moz.build new file mode 100644 index 0000000000..7126a39648 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/moz.build @@ -0,0 +1,59 @@ +## -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +## vim: set filetype=python: +## This Source Code Form is subject to the terms of the Mozilla Public +## License, v. 2.0. 
If a copy of the MPL was not distributed with this +## file, You can obtain one at http://mozilla.org/MPL/2.0/. + +if not CONFIG['MOZ_FFVPX_AUDIOONLY']: + SOURCES += [ + 'fft_init_aarch64.c', + 'fft_neon.S', + 'h264chroma_init_aarch64.c', + 'h264cmc_neon.S', + 'h264dsp_init_aarch64.c', + 'h264dsp_neon.S', + 'h264idct_neon.S', + 'h264pred_init.c', + 'h264pred_neon.S', + 'hpeldsp_init_aarch64.c', + 'hpeldsp_neon.S', + 'idctdsp_init_aarch64.c', + 'idctdsp_neon.S', + 'mdct_neon.S', + 'mpegaudiodsp_init.c', + 'mpegaudiodsp_neon.S', + 'neon.S', + 'simple_idct_neon.S', + 'videodsp.S', + 'videodsp_init.c', + 'vp8dsp_init_aarch64.c', + 'vp8dsp_neon.S', + 'vp9dsp_init_10bpp_aarch64.c', + 'vp9dsp_init_12bpp_aarch64.c', + 'vp9dsp_init_aarch64.c', + 'vp9itxfm_16bpp_neon.S', + 'vp9itxfm_neon.S', + 'vp9lpf_16bpp_neon.S', + 'vp9lpf_neon.S', + 'vp9mc_16bpp_neon.S', + 'vp9mc_aarch64.S', + 'vp9mc_neon.S', + ] +else: + SOURCES += [ + 'fft_init_aarch64.c', + 'fft_neon.S', + 'idctdsp_init_aarch64.c', + 'idctdsp_neon.S', + 'mpegaudiodsp_init.c', + 'mpegaudiodsp_neon.S', + 'simple_idct_neon.S', + ] + +if CONFIG['OS_ARCH'] == 'WINNT': + USE_INTEGRATED_CLANGCL_AS = True + DEFINES['EXTERN_ASM'] = '' + +FINAL_LIBRARY = 'mozavcodec' + +include('/media/ffvpx/ffvpxcommon.mozbuild') diff --git a/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_init.c b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_init.c new file mode 100644 index 0000000000..5d966af5f4 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_init.c @@ -0,0 +1,40 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/mpegaudiodsp.h" +#include "config.h" + +void ff_mpadsp_apply_window_fixed_neon(int32_t *synth_buf, int32_t *window, + int *dither, int16_t *samples, ptrdiff_t incr); +void ff_mpadsp_apply_window_float_neon(float *synth_buf, float *window, + int *dither, float *samples, ptrdiff_t incr); +/* Point both apply_window callbacks of s at the NEON routines declared above when the CPU reports NEON; otherwise s is not written. */ +av_cold void ff_mpadsp_init_aarch64(MPADSPContext *s) +{ + int cpu = av_get_cpu_flags(); /* runtime CPU feature bits */ + + if (!have_neon(cpu)) + return; /* no NEON: keep whatever callbacks s already holds */ + s->apply_window_fixed = ff_mpadsp_apply_window_fixed_neon; + s->apply_window_float = ff_mpadsp_apply_window_float_neon; +} diff --git a/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_neon.S b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_neon.S new file mode 100644 index 0000000000..b6ef131228 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/mpegaudiodsp_neon.S @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#define FRAC_BITS 23 // fractional bits for sb_samples and dct +#define WFRAC_BITS 16 // fractional bits for window +#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15) + +const tbl_rev128_s, align=4 + .byte 12, 13, 14, 15 + .byte 8, 9, 10, 11 + .byte 4, 5, 6, 7 + .byte 0, 1, 2, 3 +endconst + +.macro apply_window type, st +function ff_mpadsp_apply_window_\type\()_neon, export=1 + mov x7, x0 + add x8, x0, #512<<2 + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64 + st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64 + movrel x15, tbl_rev128_s + ld1 {v27.4s}, [x15] +.ifc \type, fixed + lsl x4, x4, #1 +.else + lsl x4, x4, #2 +.endif + add x10, x0, #45<<2 + add x0, x0, #16<<2 + add x1, x1, #16<<2 + add x5, x3, x4, lsl #5 + sub x5, x5, x4 // samples2 + neg x13, x4 // -incr + mov x9, #64<<2 +.ifc \type, fixed + ld1r {v16.2s}, [x2] // dither_state + sxtl v16.2d, v16.2s + movi v29.2d, #0 + movi v30.2d, #(1<<OUT_SHIFT)-1 + trn1 v31.2d, v29.2d, v30.2d + trn2 v30.2d, v30.2d, v29.2d + trn1 v16.2d, v16.2d, v29.2d +.else + movi v16.4s, #0 + movi v28.4s, #0 +.endif + mov x14, #4 +1: + mov x8, x0 + sub x7, x1, #3<<2 + sub x6, x1, x14, lsl #4 + add x7, x7, x14, lsl #4 + add x11, x6, #(32)<<2 // w + 32 + add x12, x7, #(32)<<2 // w2 + 32 + mov x15, #8 + movi v17.2d, #0 + movi v18.2d, #0 + movi v19.2d, #0 +2: + subs x15, x15, #1 + ld1 {v0.4s}, [x8], x9 + ld1 {v1.4s}, [x10], x9 + ld1 {v2.4s}, [x6], x9 + ld1 {v3.4s}, [x7], x9 + tbl v6.16b, {v0.16b}, v27.16b + tbl v7.16b, {v1.16b}, v27.16b + ld1 {v4.4s}, [x11], x9 + ld1 {v5.4s}, [x12], x9 + MLA v16, v2, v0 + MLA2 v17, v2, v0 + MLS v18, v3, v6 + MLS2 
v19, v3, v6 + MLS v16, v4, v7 + MLS2 v17, v4, v7 + MLS v18, v5, v1 + MLS2 v19, v5, v1 + b.gt 2b + + cmp x14, #4 + sub x10, x10, #64<<5 // 64 * 8 * sizeof(int32_t) + +.ifc \type, fixed + and v28.16b, v16.16b, v30.16b + ext v28.16b, v29.16b, v28.16b, #8 + + b.eq 4f + round_sample v19, 1, 1 +4: + round_sample v16, 1, 0 + shrn v16.2s, v16.2d, #OUT_SHIFT + round_sample v19, 0, 0 + shrn v19.2s, v19.2d, #OUT_SHIFT + round_sample v17, 0, 1 + round_sample v18, 1, 1 + round_sample v17, 1, 0 + shrn2 v16.4s, v17.2d, #OUT_SHIFT + round_sample v18, 0, 0 + shrn2 v19.4s, v18.2d, #OUT_SHIFT + sqxtn v16.4h, v16.4s + sqxtn v18.4h, v19.4s +.else + ext v18.16b, v18.16b, v18.16b, #8 +.endif + + st1 {v16.\st\()}[0], [x3], x4 + b.eq 4f + st1 {v18.\st\()}[1], [x5], x13 +4: + st1 {v16.\st\()}[1], [x3], x4 + st1 {v18.\st\()}[0], [x5], x13 + st1 {v16.\st\()}[2], [x3], x4 + st1 {v18.\st\()}[3], [x5], x13 + st1 {v16.\st\()}[3], [x3], x4 + st1 {v18.\st\()}[2], [x5], x13 + + mov v16.16b, v28.16b + + subs x14, x14, #1 + add x0, x0, #4<<2 + sub x10, x10, #4<<2 + b.gt 1b + +// computing samples[16] + add x6, x1, #32<<2 + ld1 {v0.2s}, [x6], x9 + ld1 {v1.2s}, [x0], x9 +.rept 3 + ld1 {v2.2s}, [x6], x9 + ld1 {v3.2s}, [x0], x9 + MLS v16, v0, v1 + ld1 {v0.2s}, [x6], x9 + ld1 {v1.2s}, [x0], x9 + MLS v16, v2, v3 +.endr + ld1 {v2.2s}, [x6], x9 + ld1 {v3.2s}, [x0], x9 + MLS v16, v0, v1 + MLS v16, v2, v3 + +.ifc \type, fixed + and v28.16b, v16.16b, v30.16b + shrn v20.2s, v16.2d, #OUT_SHIFT + xtn v28.2s, v28.2d + sqxtn v20.4h, v20.4s + st1 {v28.s}[0], [x2] // save dither_state + st1 {v20.h}[0], [x3] +.else + st1 {v16.s}[0], [x3] +.endif + + ret +endfunc +.purgem round_sample +.purgem MLA +.purgem MLA2 +.purgem MLS +.purgem MLS2 +.endm + + +.macro round_sample r, idx, next + add \r\().2d, \r\().2d, v28.2d +.if \idx == 0 + and v28.16b, \r\().16b, v30.16b +.else // \idx == 1 + and v28.16b, \r\().16b, v31.16b +.endif +.if \idx != \next + .if \next == 0 + ext v28.16b, v28.16b, v29.16b, #8 + .else + ext v28.16b, 
v29.16b, v28.16b, #8 + .endif +.endif +.endm +.macro MLA d, s1, s2 + smlal \d\().2d, \s1\().2s, \s2\().2s +.endm +.macro MLA2 d, s1, s2 + smlal2 \d\().2d, \s1\().4s, \s2\().4s +.endm +.macro MLS d, s1, s2 + smlsl \d\().2d, \s1\().2s, \s2\().2s +.endm +.macro MLS2 d, s1, s2 + smlsl2 \d\().2d, \s1\().4s, \s2\().4s +.endm +apply_window fixed, h + + +// nothing to do for round_sample and ML{A,S}2 +.macro round_sample r, idx, next +.endm +.macro MLA2 d, s1, s2 +.endm +.macro MLS2 d, s1, s2 +.endm +.macro MLA d, s1, s2 + fmla \d\().4s, \s1\().4s, \s2\().4s +.endm +.macro MLS d, s1, s2 + fmls \d\().4s, \s1\().4s, \s2\().4s +.endm +apply_window float, s diff --git a/media/ffvpx/libavcodec/aarch64/neon.S b/media/ffvpx/libavcodec/aarch64/neon.S new file mode 100644 index 0000000000..1ad32c359d --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/neon.S @@ -0,0 +1,162 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 + trn1 \r8\().8B, \r0\().8B, \r1\().8B + trn2 \r9\().8B, \r0\().8B, \r1\().8B + trn1 \r1\().8B, \r2\().8B, \r3\().8B + trn2 \r3\().8B, \r2\().8B, \r3\().8B + trn1 \r0\().8B, \r4\().8B, \r5\().8B + trn2 \r5\().8B, \r4\().8B, \r5\().8B + trn1 \r2\().8B, \r6\().8B, \r7\().8B + trn2 \r7\().8B, \r6\().8B, \r7\().8B + + trn1 \r4\().4H, \r0\().4H, \r2\().4H + trn2 \r2\().4H, \r0\().4H, \r2\().4H + trn1 \r6\().4H, \r5\().4H, \r7\().4H + trn2 \r7\().4H, \r5\().4H, \r7\().4H + trn1 \r5\().4H, \r9\().4H, \r3\().4H + trn2 \r9\().4H, \r9\().4H, \r3\().4H + trn1 \r3\().4H, \r8\().4H, \r1\().4H + trn2 \r8\().4H, \r8\().4H, \r1\().4H + + trn1 \r0\().2S, \r3\().2S, \r4\().2S + trn2 \r4\().2S, \r3\().2S, \r4\().2S + + trn1 \r1\().2S, \r5\().2S, \r6\().2S + trn2 \r5\().2S, \r5\().2S, \r6\().2S + + trn2 \r6\().2S, \r8\().2S, \r2\().2S + trn1 \r2\().2S, \r8\().2S, \r2\().2S + + trn1 \r3\().2S, \r9\().2S, \r7\().2S + trn2 \r7\().2S, \r9\().2S, \r7\().2S +.endm + +.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 + trn1 \t0\().16B, \r0\().16B, \r1\().16B + trn2 \t1\().16B, \r0\().16B, \r1\().16B + trn1 \r1\().16B, \r2\().16B, \r3\().16B + trn2 \r3\().16B, \r2\().16B, \r3\().16B + trn1 \r0\().16B, \r4\().16B, \r5\().16B + trn2 \r5\().16B, \r4\().16B, \r5\().16B + trn1 \r2\().16B, \r6\().16B, \r7\().16B + trn2 \r7\().16B, \r6\().16B, \r7\().16B + + trn1 \r4\().8H, \r0\().8H, \r2\().8H + trn2 \r2\().8H, \r0\().8H, \r2\().8H + trn1 \r6\().8H, \r5\().8H, \r7\().8H + trn2 \r7\().8H, \r5\().8H, \r7\().8H + trn1 \r5\().8H, \t1\().8H, \r3\().8H + trn2 \t1\().8H, \t1\().8H, \r3\().8H + trn1 \r3\().8H, \t0\().8H, \r1\().8H + trn2 \t0\().8H, \t0\().8H, \r1\().8H + + trn1 \r0\().4S, \r3\().4S, \r4\().4S + 
trn2 \r4\().4S, \r3\().4S, \r4\().4S + + trn1 \r1\().4S, \r5\().4S, \r6\().4S + trn2 \r5\().4S, \r5\().4S, \r6\().4S + + trn2 \r6\().4S, \t0\().4S, \r2\().4S + trn1 \r2\().4S, \t0\().4S, \r2\().4S + + trn1 \r3\().4S, \t1\().4S, \r7\().4S + trn2 \r7\().4S, \t1\().4S, \r7\().4S +.endm + +.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().16B, \r0\().16B, \r1\().16B + trn2 \t5\().16B, \r0\().16B, \r1\().16B + trn1 \t6\().16B, \r2\().16B, \r3\().16B + trn2 \t7\().16B, \r2\().16B, \r3\().16B + + trn1 \r0\().8H, \t4\().8H, \t6\().8H + trn2 \r2\().8H, \t4\().8H, \t6\().8H + trn1 \r1\().8H, \t5\().8H, \t7\().8H + trn2 \r3\().8H, \t5\().8H, \t7\().8H +.endm + +.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8B, \r0\().8B, \r1\().8B + trn2 \t5\().8B, \r0\().8B, \r1\().8B + trn1 \t6\().8B, \r2\().8B, \r3\().8B + trn2 \t7\().8B, \r2\().8B, \r3\().8B + + trn1 \r0\().4H, \t4\().4H, \t6\().4H + trn2 \r2\().4H, \t4\().4H, \t6\().4H + trn1 \r1\().4H, \t5\().4H, \t7\().4H + trn2 \r3\().4H, \t5\().4H, \t7\().4H +.endm + +.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7 + trn1 \r4\().4H, \r0\().4H, \r1\().4H + trn2 \r5\().4H, \r0\().4H, \r1\().4H + trn1 \r6\().4H, \r2\().4H, \r3\().4H + trn2 \r7\().4H, \r2\().4H, \r3\().4H + + trn1 \r0\().2S, \r4\().2S, \r6\().2S + trn2 \r2\().2S, \r4\().2S, \r6\().2S + trn1 \r1\().2S, \r5\().2S, \r7\().2S + trn2 \r3\().2S, \r5\().2S, \r7\().2S +.endm + +.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().8H, \r0\().8H, \r1\().8H + trn2 \t5\().8H, \r0\().8H, \r1\().8H + trn1 \t6\().8H, \r2\().8H, \r3\().8H + trn2 \t7\().8H, \r2\().8H, \r3\().8H + + trn1 \r0\().4S, \t4\().4S, \t6\().4S + trn2 \r2\().4S, \t4\().4S, \t6\().4S + trn1 \r1\().4S, \t5\().4S, \t7\().4S + trn2 \r3\().4S, \t5\().4S, \t7\().4S +.endm + +.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 + trn1 \r8\().8H, \r0\().8H, \r1\().8H + trn2 \r9\().8H, \r0\().8H, \r1\().8H + trn1 \r1\().8H, \r2\().8H, \r3\().8H + trn2 
\r3\().8H, \r2\().8H, \r3\().8H + trn1 \r0\().8H, \r4\().8H, \r5\().8H + trn2 \r5\().8H, \r4\().8H, \r5\().8H + trn1 \r2\().8H, \r6\().8H, \r7\().8H + trn2 \r7\().8H, \r6\().8H, \r7\().8H + + trn1 \r4\().4S, \r0\().4S, \r2\().4S + trn2 \r2\().4S, \r0\().4S, \r2\().4S + trn1 \r6\().4S, \r5\().4S, \r7\().4S + trn2 \r7\().4S, \r5\().4S, \r7\().4S + trn1 \r5\().4S, \r9\().4S, \r3\().4S + trn2 \r9\().4S, \r9\().4S, \r3\().4S + trn1 \r3\().4S, \r8\().4S, \r1\().4S + trn2 \r8\().4S, \r8\().4S, \r1\().4S + + trn1 \r0\().2D, \r3\().2D, \r4\().2D + trn2 \r4\().2D, \r3\().2D, \r4\().2D + + trn1 \r1\().2D, \r5\().2D, \r6\().2D + trn2 \r5\().2D, \r5\().2D, \r6\().2D + + trn2 \r6\().2D, \r8\().2D, \r2\().2D + trn1 \r2\().2D, \r8\().2D, \r2\().2D + + trn1 \r3\().2D, \r9\().2D, \r7\().2D + trn2 \r7\().2D, \r9\().2D, \r7\().2D + +.endm diff --git a/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S new file mode 100644 index 0000000000..210182ff21 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S @@ -0,0 +1,362 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com> + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define Z4c ((1<<(COL_SHIFT-1))/Z4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define z1 v0.H[0] +#define z2 v0.H[1] +#define z3 v0.H[2] +#define z4 v0.H[3] +#define z5 v0.H[4] +#define z6 v0.H[5] +#define z7 v0.H[6] +#define z4c v0.H[7] + +const idct_coeff_neon, align=4 + .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c +endconst + +.macro idct_start data + prfm pldl1keep, [\data] + mov x10, x30 + movrel x3, idct_coeff_neon + ld1 {v0.2D}, [x3] +.endm + +.macro idct_end + ret x10 +.endm + +.macro smull1 a, b, c + smull \a, \b, \c +.endm + +.macro smlal1 a, b, c + smlal \a, \b, \c +.endm + +.macro smlsl1 a, b, c + smlsl \a, \b, \c +.endm + +.macro idct_col4_top y1, y2, y3, y4, i, l + smull\i v7.4S, \y3\l, z2 + smull\i v16.4S, \y3\l, z6 + smull\i v17.4S, \y2\l, z1 + add v19.4S, v23.4S, v7.4S + smull\i v18.4S, \y2\l, z3 + add v20.4S, v23.4S, v16.4S + smull\i v5.4S, \y2\l, z5 + sub v21.4S, v23.4S, v16.4S + smull\i v6.4S, \y2\l, z7 + sub v22.4S, v23.4S, v7.4S + + smlal\i v17.4S, \y4\l, z3 + smlsl\i v18.4S, \y4\l, z7 + smlsl\i v5.4S, \y4\l, z1 + smlsl\i v6.4S, \y4\l, z5 +.endm + +.macro idct_row4_neon y1, y2, y3, y4, pass + ld1 {\y1\().2D,\y2\().2D}, [x2], #32 + movi v23.4S, #1<<2, lsl #8 + orr v5.16B, \y1\().16B, \y2\().16B + ld1 {\y3\().2D,\y4\().2D}, [x2], #32 + orr v6.16B, \y3\().16B, \y4\().16B + orr v5.16B, v5.16B, 
v6.16B + mov x3, v5.D[1] + smlal v23.4S, \y1\().4H, z4 + + idct_col4_top \y1, \y2, \y3, \y4, 1, .4H + + cmp x3, #0 + b.eq \pass\()f + + smull2 v7.4S, \y1\().8H, z4 + smlal2 v17.4S, \y2\().8H, z5 + smlsl2 v18.4S, \y2\().8H, z1 + smull2 v16.4S, \y3\().8H, z2 + smlal2 v5.4S, \y2\().8H, z7 + add v19.4S, v19.4S, v7.4S + sub v20.4S, v20.4S, v7.4S + sub v21.4S, v21.4S, v7.4S + add v22.4S, v22.4S, v7.4S + smlal2 v6.4S, \y2\().8H, z3 + smull2 v7.4S, \y3\().8H, z6 + smlal2 v17.4S, \y4\().8H, z7 + smlsl2 v18.4S, \y4\().8H, z5 + smlal2 v5.4S, \y4\().8H, z3 + smlsl2 v6.4S, \y4\().8H, z1 + add v19.4S, v19.4S, v7.4S + sub v20.4S, v20.4S, v16.4S + add v21.4S, v21.4S, v16.4S + sub v22.4S, v22.4S, v7.4S + +\pass: add \y3\().4S, v19.4S, v17.4S + add \y4\().4S, v20.4S, v18.4S + shrn \y1\().4H, \y3\().4S, #ROW_SHIFT + shrn \y2\().4H, \y4\().4S, #ROW_SHIFT + add v7.4S, v21.4S, v5.4S + add v16.4S, v22.4S, v6.4S + shrn \y3\().4H, v7.4S, #ROW_SHIFT + shrn \y4\().4H, v16.4S, #ROW_SHIFT + sub v22.4S, v22.4S, v6.4S + sub v19.4S, v19.4S, v17.4S + sub v21.4S, v21.4S, v5.4S + shrn2 \y1\().8H, v22.4S, #ROW_SHIFT + sub v20.4S, v20.4S, v18.4S + shrn2 \y2\().8H, v21.4S, #ROW_SHIFT + shrn2 \y3\().8H, v20.4S, #ROW_SHIFT + shrn2 \y4\().8H, v19.4S, #ROW_SHIFT + + trn1 v16.8H, \y1\().8H, \y2\().8H + trn2 v17.8H, \y1\().8H, \y2\().8H + trn1 v18.8H, \y3\().8H, \y4\().8H + trn2 v19.8H, \y3\().8H, \y4\().8H + trn1 \y1\().4S, v16.4S, v18.4S + trn1 \y2\().4S, v17.4S, v19.4S + trn2 \y3\().4S, v16.4S, v18.4S + trn2 \y4\().4S, v17.4S, v19.4S +.endm + +.macro declare_idct_col4_neon i, l +function idct_col4_neon\i + dup v23.4H, z4c +.if \i == 1 + add v23.4H, v23.4H, v24.4H +.else + mov v5.D[0], v24.D[1] + add v23.4H, v23.4H, v5.4H +.endif + smull v23.4S, v23.4H, z4 + + idct_col4_top v24, v25, v26, v27, \i, \l + + mov x4, v28.D[\i - 1] + mov x5, v29.D[\i - 1] + cmp x4, #0 + b.eq 1f + + smull\i v7.4S, v28\l, z4 + add v19.4S, v19.4S, v7.4S + sub v20.4S, v20.4S, v7.4S + sub v21.4S, v21.4S, v7.4S + add v22.4S, v22.4S, 
v7.4S + +1: mov x4, v30.D[\i - 1] + cmp x5, #0 + b.eq 2f + + smlal\i v17.4S, v29\l, z5 + smlsl\i v18.4S, v29\l, z1 + smlal\i v5.4S, v29\l, z7 + smlal\i v6.4S, v29\l, z3 + +2: mov x5, v31.D[\i - 1] + cmp x4, #0 + b.eq 3f + + smull\i v7.4S, v30\l, z6 + smull\i v16.4S, v30\l, z2 + add v19.4S, v19.4S, v7.4S + sub v22.4S, v22.4S, v7.4S + sub v20.4S, v20.4S, v16.4S + add v21.4S, v21.4S, v16.4S + +3: cmp x5, #0 + b.eq 4f + + smlal\i v17.4S, v31\l, z7 + smlsl\i v18.4S, v31\l, z5 + smlal\i v5.4S, v31\l, z3 + smlsl\i v6.4S, v31\l, z1 + +4: addhn v7.4H, v19.4S, v17.4S + addhn2 v7.8H, v20.4S, v18.4S + subhn v18.4H, v20.4S, v18.4S + subhn2 v18.8H, v19.4S, v17.4S + + addhn v16.4H, v21.4S, v5.4S + addhn2 v16.8H, v22.4S, v6.4S + subhn v17.4H, v22.4S, v6.4S + subhn2 v17.8H, v21.4S, v5.4S + + ret +endfunc +.endm + +declare_idct_col4_neon 1, .4H +declare_idct_col4_neon 2, .8H + +function ff_simple_idct_put_neon, export=1 + idct_start x2 + + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 + bl idct_col4_neon1 + + sqshrun v1.8B, v7.8H, #COL_SHIFT-16 + sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16 + sqshrun v3.8B, v17.8H, #COL_SHIFT-16 + sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16 + + bl idct_col4_neon2 + + sqshrun v2.8B, v7.8H, #COL_SHIFT-16 + sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16 + sqshrun v4.8B, v17.8H, #COL_SHIFT-16 + sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16 + + zip1 v16.4S, v1.4S, v2.4S + zip2 v17.4S, v1.4S, v2.4S + + st1 {v16.D}[0], [x0], x1 + st1 {v16.D}[1], [x0], x1 + + zip1 v18.4S, v3.4S, v4.4S + zip2 v19.4S, v3.4S, v4.4S + + st1 {v17.D}[0], [x0], x1 + st1 {v17.D}[1], [x0], x1 + st1 {v18.D}[0], [x0], x1 + st1 {v18.D}[1], [x0], x1 + st1 {v19.D}[0], [x0], x1 + st1 {v19.D}[1], [x0], x1 + + idct_end +endfunc + +function ff_simple_idct_add_neon, export=1 + idct_start x2 + + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 + bl idct_col4_neon1 + + sshr v1.8H, v7.8H, #COL_SHIFT-16 + sshr v2.8H, v16.8H, #COL_SHIFT-16 + sshr v3.8H, v17.8H, 
#COL_SHIFT-16 + sshr v4.8H, v18.8H, #COL_SHIFT-16 + + bl idct_col4_neon2 + + sshr v7.8H, v7.8H, #COL_SHIFT-16 + sshr v16.8H, v16.8H, #COL_SHIFT-16 + sshr v17.8H, v17.8H, #COL_SHIFT-16 + sshr v18.8H, v18.8H, #COL_SHIFT-16 + + mov x9, x0 + ld1 {v19.D}[0], [x0], x1 + zip1 v23.2D, v1.2D, v7.2D + zip2 v24.2D, v1.2D, v7.2D + ld1 {v19.D}[1], [x0], x1 + zip1 v25.2D, v2.2D, v16.2D + zip2 v26.2D, v2.2D, v16.2D + ld1 {v20.D}[0], [x0], x1 + zip1 v27.2D, v3.2D, v17.2D + zip2 v28.2D, v3.2D, v17.2D + ld1 {v20.D}[1], [x0], x1 + zip1 v29.2D, v4.2D, v18.2D + zip2 v30.2D, v4.2D, v18.2D + ld1 {v21.D}[0], [x0], x1 + uaddw v23.8H, v23.8H, v19.8B + uaddw2 v24.8H, v24.8H, v19.16B + ld1 {v21.D}[1], [x0], x1 + sqxtun v23.8B, v23.8H + sqxtun2 v23.16B, v24.8H + ld1 {v22.D}[0], [x0], x1 + uaddw v24.8H, v25.8H, v20.8B + uaddw2 v25.8H, v26.8H, v20.16B + ld1 {v22.D}[1], [x0], x1 + sqxtun v24.8B, v24.8H + sqxtun2 v24.16B, v25.8H + st1 {v23.D}[0], [x9], x1 + uaddw v25.8H, v27.8H, v21.8B + uaddw2 v26.8H, v28.8H, v21.16B + st1 {v23.D}[1], [x9], x1 + sqxtun v25.8B, v25.8H + sqxtun2 v25.16B, v26.8H + st1 {v24.D}[0], [x9], x1 + uaddw v26.8H, v29.8H, v22.8B + uaddw2 v27.8H, v30.8H, v22.16B + st1 {v24.D}[1], [x9], x1 + sqxtun v26.8B, v26.8H + sqxtun2 v26.16B, v27.8H + st1 {v25.D}[0], [x9], x1 + st1 {v25.D}[1], [x9], x1 + st1 {v26.D}[0], [x9], x1 + st1 {v26.D}[1], [x9], x1 + + idct_end +endfunc + +function ff_simple_idct_neon, export=1 + idct_start x0 + + mov x2, x0 + idct_row4_neon v24, v25, v26, v27, 1 + idct_row4_neon v28, v29, v30, v31, 2 + sub x2, x2, #128 + bl idct_col4_neon1 + + sshr v1.8H, v7.8H, #COL_SHIFT-16 + sshr v2.8H, v16.8H, #COL_SHIFT-16 + sshr v3.8H, v17.8H, #COL_SHIFT-16 + sshr v4.8H, v18.8H, #COL_SHIFT-16 + + bl idct_col4_neon2 + + sshr v7.8H, v7.8H, #COL_SHIFT-16 + sshr v16.8H, v16.8H, #COL_SHIFT-16 + sshr v17.8H, v17.8H, #COL_SHIFT-16 + sshr v18.8H, v18.8H, #COL_SHIFT-16 + + zip1 v23.2D, v1.2D, v7.2D + zip2 v24.2D, v1.2D, v7.2D + st1 {v23.2D,v24.2D}, [x2], #32 + zip1 v25.2D, v2.2D, 
v16.2D + zip2 v26.2D, v2.2D, v16.2D + st1 {v25.2D,v26.2D}, [x2], #32 + zip1 v27.2D, v3.2D, v17.2D + zip2 v28.2D, v3.2D, v17.2D + st1 {v27.2D,v28.2D}, [x2], #32 + zip1 v29.2D, v4.2D, v18.2D + zip2 v30.2D, v4.2D, v18.2D + st1 {v29.2D,v30.2D}, [x2], #32 + + idct_end +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c new file mode 100644 index 0000000000..3bc0bd17ee --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c @@ -0,0 +1,141 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavutil/intreadwrite.h" +#include "libavcodec/vc1dsp.h" + +#include "config.h" + +void ff_vc1_inv_trans_8x8_neon(int16_t *block); +void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + +void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + +void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); +void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq); + +void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); +void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + +int 
ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); + +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) +{ + /* Dealing with starting and stopping, and removing escape bytes, are + * comparatively less time-sensitive, so are more clearly expressed using + * a C wrapper around the assembly inner loop. Note that we assume a + * little-endian machine that supports unaligned loads. */ + int dsize = 0; + while (size >= 4) + { + int found = 0; + while (!found && (((uintptr_t) dst) & 7) && size >= 4) + { + found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; + if (!found) + { + *dst++ = *src++; + --size; + ++dsize; + } + } + if (!found) + { + int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); + dst += skip; + src += skip; + size -= skip; + dsize += skip; + while (!found && size >= 4) + { + found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; + if (!found) + { + *dst++ = *src++; + --size; + ++dsize; + } + } + } + if (found) + { + *dst++ = *src++; + *dst++ = *src++; + ++src; + size -= 3; + dsize += 2; + } + } + while (size > 0) + { + *dst++ = *src++; + --size; + ++dsize; + } + return dsize; +} + +av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; + dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; + dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; + dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; + dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; + dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + + dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; + dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; + dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; + dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; + 
dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; + dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; + + dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; + + dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } +} diff --git a/media/ffvpx/libavcodec/aarch64/videodsp.S b/media/ffvpx/libavcodec/aarch64/videodsp.S new file mode 100644 index 0000000000..fe2da0658e --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/videodsp.S @@ -0,0 +1,29 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +function ff_prefetch_aarch64, export=1 +1: + subs w2, w2, #2 + prfm pldl1strm, [x0] + prfm pldl1strm, [x0, x1] + add x0, x0, x1, lsl #1 + b.gt 1b + ret +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/videodsp_init.c b/media/ffvpx/libavcodec/aarch64/videodsp_init.c new file mode 100644 index 0000000000..1f77a918d7 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/videodsp_init.c @@ -0,0 +1,32 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/videodsp.h" + +void ff_prefetch_aarch64(const uint8_t *mem, ptrdiff_t stride, int h); + +av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv8(cpu_flags)) + ctx->prefetch = ff_prefetch_aarch64; +} diff --git a/media/ffvpx/libavcodec/aarch64/vp8dsp.h b/media/ffvpx/libavcodec/aarch64/vp8dsp.h new file mode 100644 index 0000000000..4e59de28b1 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp8dsp.h @@ -0,0 +1,75 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AARCH64_VP8DSP_H +#define AVCODEC_AARCH64_VP8DSP_H + +#include "libavcodec/vp8dsp.h" + +#define VP8_LF_Y(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_UV(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_SIMPLE(hv, opt) \ + void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim) + +#define VP8_LF_HV(inner, opt) \ + VP8_LF_Y(h, inner, opt); \ + VP8_LF_Y(v, inner, opt); \ + VP8_LF_UV(h, inner, opt); \ + VP8_LF_UV(v, inner, opt) + +#define VP8_LF(opt) \ + VP8_LF_HV(, opt); \ + VP8_LF_HV(_inner, opt); \ + VP8_LF_SIMPLE(h, opt); \ + VP8_LF_SIMPLE(v, opt) + +#define VP8_MC(n, opt) \ + void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride,\ + int h, int x, int y) + +#define VP8_EPEL(w, opt) \ + VP8_MC(pixels ## w, opt); \ + VP8_MC(epel ## w ## _h4, opt); \ + VP8_MC(epel ## w ## _h6, opt); \ + VP8_MC(epel ## w ## _v4, opt); \ + VP8_MC(epel ## w ## _h4v4, opt); \ + VP8_MC(epel ## w ## _h6v4, opt); \ + VP8_MC(epel ## w ## _v6, opt); \ + VP8_MC(epel ## w ## _h4v6, opt); \ + VP8_MC(epel ## w ## _h6v6, opt) + +#define VP8_BILIN(w, opt) \ + VP8_MC(bilin ## w ## _h, opt); \ + VP8_MC(bilin ## w ## _v, opt); \ + VP8_MC(bilin ## w ## _hv, opt) + +#endif /* AVCODEC_AARCH64_VP8DSP_H */ diff --git a/media/ffvpx/libavcodec/aarch64/vp8dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp8dsp_init_aarch64.c new file mode 100644 index 0000000000..fc7e831d17 --- /dev/null +++ 
b/media/ffvpx/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -0,0 +1,124 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/vp8dsp.h" +#include "vp8dsp.h" +/* Prototypes of the NEON transform routines that are defined in vp8dsp_neon.S. */ +void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]); + +void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +/* Declare the loop-filter, epel and bilinear NEON prototypes through the helper macros from vp8dsp.h. */ +VP8_LF(neon); + +VP8_EPEL(16, neon); +VP8_EPEL(8, neon); +VP8_EPEL(4, neon); + +VP8_BILIN(16, neon); +VP8_BILIN(8, neon); +VP8_BILIN(4, neon); +/* Install the NEON put functions into the epel and bilinear tables of dsp. Returns immediately, leaving dsp untouched, when have_neon() reports no NEON support. Table slots that are not assigned below (for example [0][0][1] and [2][0][0] of the epel table and [2][0][0] of the bilinear table) keep the value they had on entry. In the bilinear table, indices 1 and 2 of each of the last two dimensions receive the same function. */ +av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) +{ + if (!have_neon(av_get_cpu_flags())) + return; + dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; + dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon; + dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon; + dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon; + + dsp->put_vp8_epel_pixels_tab[1][0][0] = 
ff_put_vp8_pixels8_neon; + dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon; + dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon; + dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon; + dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon; + dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon; + dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; + + dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon; + dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon; + dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; + + dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; + dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][2] = 
ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon; +} +/* Install the NEON luma DC WHT, IDCT add and loop filter functions into dsp. Returns immediately, leaving dsp untouched, when have_neon() reports no NEON support. */ +av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) +{ + if (!have_neon(av_get_cpu_flags())) + return; + dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon; + + dsp->vp8_idct_add = ff_vp8_idct_add_neon; + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; + dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon; + + dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; + dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon; + dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon; + + dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon; + dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon; + dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon; + dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon; + + dsp->vp8_v_loop_filter_simple = 
ff_vp8_v_loop_filter16_simple_neon; + dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon; +} diff --git a/media/ffvpx/libavcodec/aarch64/vp8dsp_neon.S b/media/ffvpx/libavcodec/aarch64/vp8dsp_neon.S new file mode 100644 index 0000000000..4bbf16d1a4 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp8dsp_neon.S @@ -0,0 +1,1790 @@ +/* + * VP8 NEON optimisations + * + * Copyright (c) 2010 Rob Clark <rob@ti.com> + * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com> + * Copyright (c) 2019 Martin Storsjo <martin@martin.st> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" +/* ff_vp8_luma_dc_wht_neon: x0 = block, x1 = dc. Loads the 16 dc values from x1, zeroes those 32 bytes in memory, runs two add/sub butterfly passes with a transpose in between (adding 3 before the second pass and shifting right by 3 after it), and stores the 16 results to x0 at a 32-byte pitch. */ +function ff_vp8_luma_dc_wht_neon, export=1 + ld1 {v0.4h - v3.4h}, [x1] + movi v30.8h, #0 + + add v4.4h, v0.4h, v3.4h + add v6.4h, v1.4h, v2.4h + st1 {v30.8h}, [x1], #16 + sub v7.4h, v1.4h, v2.4h + sub v5.4h, v0.4h, v3.4h + st1 {v30.8h}, [x1] + add v0.4h, v4.4h, v6.4h + add v1.4h, v5.4h, v7.4h + sub v2.4h, v4.4h, v6.4h + sub v3.4h, v5.4h, v7.4h + + movi v16.4h, #3 + + transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 + + add v0.4h, v0.4h, v16.4h + + add v4.4h, v0.4h, v3.4h + add v6.4h, v1.4h, v2.4h + sub v7.4h, v1.4h, v2.4h + sub v5.4h, v0.4h, v3.4h + add v0.4h, v4.4h, v6.4h + add v1.4h, v5.4h, v7.4h + sub v2.4h, v4.4h, v6.4h + sub v3.4h, v5.4h, v7.4h + + sshr v0.4h, v0.4h, #3 + sshr v1.4h, v1.4h, #3 + sshr v2.4h, v2.4h, #3 + sshr v3.4h, v3.4h, #3 + + mov x3, #32 + st1 {v0.h}[0], [x0], x3 + st1 {v1.h}[0], [x0], x3 + st1 {v2.h}[0], [x0], x3 + st1 {v3.h}[0], [x0], x3 + st1 {v0.h}[1], [x0], x3 + st1 {v1.h}[1], [x0], x3 + st1 {v2.h}[1], [x0], x3 + st1 {v3.h}[1], [x0], x3 + st1 {v0.h}[2], [x0], x3 + st1 {v1.h}[2], [x0], x3 + st1 {v2.h}[2], [x0], x3 + st1 {v3.h}[2], [x0], x3 + st1 {v0.h}[3], [x0], x3 + st1 {v1.h}[3], [x0], x3 + st1 {v2.h}[3], [x0], x3 + st1 {v3.h}[3], [x0], x3 + + ret +endfunc +/* ff_vp8_idct_add_neon: x0 = dst, x1 = block, x2 = stride. Runs two transform passes over the 4x4 coefficient block (transposing after each), zeroes the 32 bytes of coefficients at x1, applies a rounding shift right by 3, then adds the result to four 4-byte rows of dst with saturation to 0..255. */ +function ff_vp8_idct_add_neon, export=1 + ld1 {v0.8b - v3.8b}, [x1] + mov w4, #20091 + movk w4, #35468/2, lsl #16 + dup v4.2s, w4 + + smull v26.4s, v1.4h, v4.h[0] + smull v27.4s, v3.4h, v4.h[0] + sqdmulh v20.4h, v1.4h, v4.h[1] + sqdmulh v23.4h, v3.4h, v4.h[1] + shrn v21.4h, v26.4s, #16 + shrn v22.4h, v27.4s, #16 + add v21.4h, v21.4h, v1.4h + add v22.4h, v22.4h, v3.4h + + add v16.4h, v0.4h, v2.4h + sub v17.4h, v0.4h, v2.4h + + add v18.4h, v21.4h, v23.4h + sub v19.4h, v20.4h, v22.4h + + add v0.4h, v16.4h, 
v18.4h + add v1.4h, v17.4h, v19.4h + sub v3.4h, v16.4h, v18.4h + sub v2.4h, v17.4h, v19.4h + + transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7 + + movi v29.8h, #0 + smull v26.4s, v1.4h, v4.h[0] + st1 {v29.8h}, [x1], #16 + smull v27.4s, v3.4h, v4.h[0] + st1 {v29.16b}, [x1] + sqdmulh v21.4h, v1.4h, v4.h[1] + sqdmulh v23.4h, v3.4h, v4.h[1] + shrn v20.4h, v26.4s, #16 + shrn v22.4h, v27.4s, #16 + add v20.4h, v20.4h, v1.4h + add v22.4h, v22.4h, v3.4h + add v16.4h, v0.4h, v2.4h + sub v17.4h, v0.4h, v2.4h + + add v18.4h, v20.4h, v23.4h + ld1 {v24.s}[0], [x0], x2 + sub v19.4h, v21.4h, v22.4h + ld1 {v25.s}[0], [x0], x2 + add v0.4h, v16.4h, v18.4h + add v1.4h, v17.4h, v19.4h + ld1 {v26.s}[0], [x0], x2 + sub v3.4h, v16.4h, v18.4h + sub v2.4h, v17.4h, v19.4h + ld1 {v27.s}[0], [x0], x2 + srshr v0.4h, v0.4h, #3 + srshr v1.4h, v1.4h, #3 + srshr v2.4h, v2.4h, #3 + srshr v3.4h, v3.4h, #3 + + sub x0, x0, x2, lsl #2 + + transpose_4x4H v0, v1, v2, v3, v5, v6, v7, v16 + + uaddw v0.8h, v0.8h, v24.8b + uaddw v1.8h, v1.8h, v25.8b + uaddw v2.8h, v2.8h, v26.8b + uaddw v3.8h, v3.8h, v27.8b + sqxtun v0.8b, v0.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + + st1 {v0.s}[0], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v2.s}[0], [x0], x2 + st1 {v3.s}[0], [x0], x2 + + ret +endfunc +/* ff_vp8_idct_dc_add4uv_neon: x0 = dst, x1 = block, x2 = stride. Reads the first coefficient of four blocks spaced 32 bytes apart and zeroes each one, then adds (dc + 4) >> 3 to eight 8-byte rows of dst with saturation to 0..255: blocks 0 and 1 cover the left and right 4 bytes of rows 0-3, blocks 2 and 3 those of rows 4-7. */ +function ff_vp8_idct_dc_add4uv_neon, export=1 + movi v0.4h, #0 + mov x3, #32 + ld1r {v16.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v17.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v18.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v19.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + mov x3, x0 + srshr v16.8h, v16.8h, #3 // dc = (dc + 4) >> 3 (rounding shift) + ld1 {v0.8b}, [x0], x2 + srshr v18.8h, v18.8h, #3 + ld1 {v1.8b}, [x0], x2 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v2.8b}, [x0], x2 + uaddw v0.8h, v16.8h, v1.8b + ld1 {v3.8b}, [x0], x2 + uaddw v22.8h, v16.8h, v2.8b + ld1 {v4.8b}, [x0], x2 + uaddw v2.8h, v16.8h, v3.8b + ld1 {v5.8b}, [x0], x2 + uaddw v24.8h, v18.8h, v4.8b + ld1 {v6.8b}, 
[x0], x2 + uaddw v4.8h, v18.8h, v5.8b + ld1 {v7.8b}, [x0], x2 + uaddw v26.8h, v18.8h, v6.8b + sqxtun v20.8b, v20.8h + uaddw v6.8h, v18.8h, v7.8b + sqxtun v21.8b, v0.8h + sqxtun v22.8b, v22.8h + st1 {v20.8b}, [x3], x2 + sqxtun v23.8b, v2.8h + st1 {v21.8b}, [x3], x2 + sqxtun v24.8b, v24.8h + st1 {v22.8b}, [x3], x2 + sqxtun v25.8b, v4.8h + st1 {v23.8b}, [x3], x2 + sqxtun v26.8b, v26.8h + st1 {v24.8b}, [x3], x2 + sqxtun v27.8b, v6.8h + st1 {v25.8b}, [x3], x2 + st1 {v26.8b}, [x3], x2 + st1 {v27.8b}, [x3], x2 + + ret +endfunc +/* ff_vp8_idct_dc_add4y_neon: x0 = dst, x1 = block, x2 = stride. Reads and zeroes the first coefficient of four blocks spaced 32 bytes apart, then adds (dc + 4) >> 3 to four 16-byte rows of dst with saturation to 0..255; within each row the four dc values apply to successive 4-byte groups. */ +function ff_vp8_idct_dc_add4y_neon, export=1 + movi v0.16b, #0 + mov x3, #32 + ld1r {v16.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v17.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + zip1 v16.2d, v16.2d, v17.2d + ld1r {v18.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v19.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + zip1 v18.2d, v18.2d, v19.2d + srshr v16.8h, v16.8h, #3 // dc = (dc + 4) >> 3 (rounding shift) + ld1 {v0.16b}, [x0], x2 + srshr v18.8h, v18.8h, #3 + ld1 {v1.16b}, [x0], x2 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v2.16b}, [x0], x2 + uaddw2 v0.8h, v18.8h, v0.16b + ld1 {v3.16b}, [x0], x2 + uaddw v21.8h, v16.8h, v1.8b + uaddw2 v1.8h, v18.8h, v1.16b + uaddw v22.8h, v16.8h, v2.8b + uaddw2 v2.8h, v18.8h, v2.16b + uaddw v23.8h, v16.8h, v3.8b + uaddw2 v3.8h, v18.8h, v3.16b + sub x0, x0, x2, lsl #2 + sqxtun v20.8b, v20.8h + sqxtun2 v20.16b, v0.8h + sqxtun v21.8b, v21.8h + sqxtun2 v21.16b, v1.8h + sqxtun v22.8b, v22.8h + st1 {v20.16b}, [x0], x2 + sqxtun2 v22.16b, v2.8h + st1 {v21.16b}, [x0], x2 + sqxtun v23.8b, v23.8h + st1 {v22.16b}, [x0], x2 + sqxtun2 v23.16b, v3.8h + st1 {v23.16b}, [x0], x2 + + ret +endfunc +/* ff_vp8_idct_dc_add_neon: x0 = dst, x1 = block, x2 = stride. Reads block[0], zeroes it in memory, and adds (block[0] + 4) >> 3 to four 4-byte rows of dst with saturation to 0..255. */ +function ff_vp8_idct_dc_add_neon, export=1 + mov w3, #0 + ld1r {v2.8h}, [x1] + strh w3, [x1] + srshr v2.8h, v2.8h, #3 + ld1 {v0.s}[0], [x0], x2 + ld1 {v0.s}[1], [x0], x2 + uaddw v3.8h, v2.8h, v0.8b + ld1 {v1.s}[0], [x0], x2 + ld1 {v1.s}[1], [x0], x2 + uaddw v4.8h, v2.8h, v1.8b + sqxtun v0.8b, v3.8h + sqxtun v1.8b, v4.8h + sub x0, x0, x2, lsl #2 + st1 {v0.s}[0], [x0], x2 + st1 
{v0.s}[1], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v1.s}[1], [x0], x2 + ret +endfunc + +// Register layout on entry to vp8_loop_filter: +// P3..Q3 -> v0..v7 +// flim_E -> v22 +// flim_I -> v23 (read only when simple=0; v23 is reused as scratch afterwards) +// hev_thresh -> general register named by the hev_thresh macro argument (the callers pass w4 or w5) +// +.macro vp8_loop_filter, inner=0, simple=0, hev_thresh + .if \simple + uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0) + uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1) + uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2 + ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2 + uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + movi v21.16b, #0x80 + cmhs v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim + .else + // calculate hev and normal_limit: + uabd v20.16b, v2.16b, v3.16b // abs(P1-P0) + uabd v21.16b, v5.16b, v4.16b // abs(Q1-Q0) + uabd v18.16b, v0.16b, v1.16b // abs(P3-P2) + uabd v19.16b, v1.16b, v2.16b // abs(P2-P1) + cmhs v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I + cmhs v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I + cmhs v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I + cmhs v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I + and v16.16b, v17.16b, v16.16b + uabd v17.16b, v7.16b, v6.16b // abs(Q3-Q2) + and v16.16b, v16.16b, v19.16b + uabd v19.16b, v6.16b, v5.16b // abs(Q2-Q1) + and v16.16b, v16.16b, v18.16b + cmhs v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I + cmhs v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I + uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0) + uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1) + and v16.16b, v16.16b, v18.16b + uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2 + and v16.16b, v16.16b, v19.16b + ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2 + dup v23.16b, \hev_thresh // hev_thresh + uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + cmhi v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh + cmhs v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E + cmhi v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh + and v16.16b, v16.16b, v19.16b + movi v21.16b, #0x80 + orr v17.16b, 
v20.16b, v22.16b + .endif + + // at this point: + // v16: normal_limit + // v17: hev (simple=0 only; the simple path does not compute hev) + + // convert to signed value: + eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80 + eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80 + + movi v20.8h, #3 + ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0 + ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit) + eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80 + mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0) + mul v19.8h, v19.8h, v20.8h + + sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1) + movi v22.16b, #4 + movi v23.16b, #3 + .if \inner + and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1) + .endif + saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1) + saddw2 v19.8h, v19.8h, v20.16b + sqxtn v18.8b, v18.8h // narrow result back into v18 + sqxtn2 v18.16b, v19.8h + .if !\inner && !\simple + eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80 + eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80 + .endif + and v18.16b, v18.16b, v16.16b // w &= normal_limit + + // registers used at this point.. 
+ // v0 -> P3 (don't corrupt) + // v1-v6 -> PS2-QS2 (v1/v6 are still unsigned P2/Q2 unless !inner && !simple) + // v7 -> Q3 (don't corrupt) + // v17 -> hev + // v18 -> w + // v21 -> #0x80 + // v22 -> #4 + // v23 -> #3 + // v16, v19, v29 -> unused + // + // filter_common: is4tap==1 + // c1 = clamp(w + 4) >> 3; + // c2 = clamp(w + 3) >> 3; + // Q0 = s2u(QS0 - c1); + // P0 = s2u(PS0 + c2); + + .if \simple + sqadd v19.16b, v18.16b, v22.16b // c1 = clamp(w+4) + sqadd v20.16b, v18.16b, v23.16b // c2 = clamp(w+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + .elseif \inner + // the !is4tap case of filter_common, only used for inner blocks + // c3 = ((c1&~hev) + 1) >> 1; + // Q1 = s2u(QS1 - c3); + // P1 = s2u(PS1 + c3); + sqadd v19.16b, v18.16b, v22.16b // c1 = clamp(w+4) + sqadd v20.16b, v18.16b, v23.16b // c2 = clamp(w+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + bic v19.16b, v19.16b, v17.16b // c1 & ~hev + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + srshr v19.16b, v19.16b, #1 // c3 = ((c1&~hev) + 1) >> 1 + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3) + sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3) + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + .else + and v20.16b, v18.16b, v17.16b // w & hev + sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4) + sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + bic v18.16b, v18.16b, v17.16b 
// w &= ~hev + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + + // filter_mbedge: + // a = clamp((27*w + 63) >> 7); + // Q0 = s2u(QS0 - a); + // P0 = s2u(PS0 + a); + // a = clamp((18*w + 63) >> 7); + // Q1 = s2u(QS1 - a); + // P1 = s2u(PS1 + a); + // a = clamp((9*w + 63) >> 7); + // Q2 = s2u(QS2 - a); + // P2 = s2u(PS2 + a); + movi v17.8h, #63 + sshll v22.8h, v18.8b, #3 // 8*w, widened to 16 bit + sshll2 v23.8h, v18.16b, #3 + saddw v22.8h, v22.8h, v18.8b // 9*w + saddw2 v23.8h, v23.8h, v18.16b + add v16.8h, v17.8h, v22.8h + add v17.8h, v17.8h, v23.8h // 9*w + 63 + add v19.8h, v16.8h, v22.8h + add v20.8h, v17.8h, v23.8h // 18*w + 63 + add v22.8h, v19.8h, v22.8h + add v23.8h, v20.8h, v23.8h // 27*w + 63 + sqshrn v16.8b, v16.8h, #7 + sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7) + sqshrn v19.8b, v19.8h, #7 + sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7) + sqshrn v22.8b, v22.8h, #7 + sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7) + sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a) + sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a) + sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a) + sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a) + sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a) + sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a) + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80 + eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80 + .endif +.endm + +.macro vp8_v_loop_filter16 name, inner=0, simple=0 +function ff_vp8_v_loop_filter16\name\()_neon, export=1 + sub x0, x0, x1, lsl #1+!\simple + + // Load pixels: + .if !\simple + ld1 {v0.16b}, [x0], x1 // P3 + ld1 {v1.16b}, [x0], x1 // P2 + .endif + ld1 {v2.16b}, [x0], x1 // P1 + ld1 {v3.16b}, [x0], x1 // P0 + ld1 {v4.16b}, [x0], x1 // Q0 + ld1 {v5.16b}, [x0], x1 // Q1 + 
.if !\simple + ld1 {v6.16b}, [x0], x1 // Q2 + ld1 {v7.16b}, [x0] // Q3 + dup v23.16b, w3 // flim_I + .endif + dup v22.16b, w2 // flim_E + + vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4 + + // back up to P2: dst -= stride * 6 (simple: back up to P1, dst -= stride * 4) + sub x0, x0, x1, lsl #2 + .if !\simple + sub x0, x0, x1, lsl #1 + + // Store pixels: + st1 {v1.16b}, [x0], x1 // P2 + .endif + st1 {v2.16b}, [x0], x1 // P1 + st1 {v3.16b}, [x0], x1 // P0 + st1 {v4.16b}, [x0], x1 // Q0 + st1 {v5.16b}, [x0], x1 // Q1 + .if !\simple + st1 {v6.16b}, [x0] // Q2 + .endif + + ret +endfunc +.endm + +vp8_v_loop_filter16 +vp8_v_loop_filter16 _inner, inner=1 +vp8_v_loop_filter16 _simple, simple=1 + +.macro vp8_v_loop_filter8uv name, inner=0 +function ff_vp8_v_loop_filter8uv\name\()_neon, export=1 + sub x0, x0, x2, lsl #2 + sub x1, x1, x2, lsl #2 + // Load pixels: + ld1 {v0.d}[0], [x0], x2 // P3 + ld1 {v0.d}[1], [x1], x2 // P3 + ld1 {v1.d}[0], [x0], x2 // P2 + ld1 {v1.d}[1], [x1], x2 // P2 + ld1 {v2.d}[0], [x0], x2 // P1 + ld1 {v2.d}[1], [x1], x2 // P1 + ld1 {v3.d}[0], [x0], x2 // P0 + ld1 {v3.d}[1], [x1], x2 // P0 + ld1 {v4.d}[0], [x0], x2 // Q0 + ld1 {v4.d}[1], [x1], x2 // Q0 + ld1 {v5.d}[0], [x0], x2 // Q1 + ld1 {v5.d}[1], [x1], x2 // Q1 + ld1 {v6.d}[0], [x0], x2 // Q2 + ld1 {v6.d}[1], [x1], x2 // Q2 + ld1 {v7.d}[0], [x0] // Q3 + ld1 {v7.d}[1], [x1] // Q3 + + dup v22.16b, w3 // flim_E + dup v23.16b, w4 // flim_I + + vp8_loop_filter inner=\inner, hev_thresh=w5 + + // back up to P2: u,v -= stride * 6 + sub x0, x0, x2, lsl #2 + sub x1, x1, x2, lsl #2 + sub x0, x0, x2, lsl #1 + sub x1, x1, x2, lsl #1 + + // Store pixels: + + st1 {v1.d}[0], [x0], x2 // P2 + st1 {v1.d}[1], [x1], x2 // P2 + st1 {v2.d}[0], [x0], x2 // P1 + st1 {v2.d}[1], [x1], x2 // P1 + st1 {v3.d}[0], [x0], x2 // P0 + st1 {v3.d}[1], [x1], x2 // P0 + st1 {v4.d}[0], [x0], x2 // Q0 + st1 {v4.d}[1], [x1], x2 // Q0 + st1 {v5.d}[0], [x0], x2 // Q1 + st1 {v5.d}[1], [x1], x2 // Q1 + st1 {v6.d}[0], [x0] // Q2 + st1 {v6.d}[1], [x1] // Q2 + + ret +endfunc 
+.endm + +vp8_v_loop_filter8uv +vp8_v_loop_filter8uv _inner, inner=1 + +.macro vp8_h_loop_filter16 name, inner=0, simple=0 +function ff_vp8_h_loop_filter16\name\()_neon, export=1 + + sub x0, x0, #4 + // Load pixels: 16 rows of 8 bytes, starting 4 bytes to the left of dst + ld1 {v0.d}[0], [x0], x1 + ld1 {v1.d}[0], [x0], x1 + ld1 {v2.d}[0], [x0], x1 + ld1 {v3.d}[0], [x0], x1 + ld1 {v4.d}[0], [x0], x1 + ld1 {v5.d}[0], [x0], x1 + ld1 {v6.d}[0], [x0], x1 + ld1 {v7.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + dup v22.16b, w2 // flim_E + .if !\simple + dup v23.16b, w3 // flim_I + .endif + + vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4 + + sub x0, x0, x1, lsl #4 // backup 16 rows + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + // Store pixels: + st1 {v0.d}[0], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v3.d}[0], [x0], x1 + st1 {v4.d}[0], [x0], x1 + st1 {v5.d}[0], [x0], x1 + st1 {v6.d}[0], [x0], x1 + st1 {v7.d}[0], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v7.d}[1], [x0] + + ret +endfunc +.endm + +vp8_h_loop_filter16 +vp8_h_loop_filter16 _inner, inner=1 +vp8_h_loop_filter16 _simple, simple=1 + +.macro vp8_h_loop_filter8uv name, inner=0 +function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 + sub x0, x0, #4 + sub x1, x1, #4 + + // Load pixels: + ld1 {v0.d}[0], [x0], x2 // load u + ld1 {v0.d}[1], [x1], x2 // load v + ld1 {v1.d}[0], [x0], x2 + ld1 {v1.d}[1], [x1], x2 + ld1 {v2.d}[0], [x0], x2 + ld1 {v2.d}[1], [x1], x2 + ld1 {v3.d}[0], [x0], x2 + ld1 {v3.d}[1], [x1], x2 + ld1 {v4.d}[0], [x0], x2 + ld1 {v4.d}[1], [x1], x2 + ld1 {v5.d}[0], [x0], x2 + ld1 {v5.d}[1], [x1], x2 + ld1 
{v6.d}[0], [x0], x2 + ld1 {v6.d}[1], [x1], x2 + ld1 {v7.d}[0], [x0], x2 + ld1 {v7.d}[1], [x1], x2 + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + dup v22.16b, w3 // flim_E + dup v23.16b, w4 // flim_I + + vp8_loop_filter inner=\inner, hev_thresh=w5 + + sub x0, x0, x2, lsl #3 // backup u 8 rows + sub x1, x1, x2, lsl #3 // backup v 8 rows + + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + // Store pixels: + st1 {v0.d}[0], [x0], x2 // store u + st1 {v0.d}[1], [x1], x2 // store v + st1 {v1.d}[0], [x0], x2 + st1 {v1.d}[1], [x1], x2 + st1 {v2.d}[0], [x0], x2 + st1 {v2.d}[1], [x1], x2 + st1 {v3.d}[0], [x0], x2 + st1 {v3.d}[1], [x1], x2 + st1 {v4.d}[0], [x0], x2 + st1 {v4.d}[1], [x1], x2 + st1 {v5.d}[0], [x0], x2 + st1 {v5.d}[1], [x1], x2 + st1 {v6.d}[0], [x0], x2 + st1 {v6.d}[1], [x1], x2 + st1 {v7.d}[0], [x0] + st1 {v7.d}[1], [x1] + + ret + +endfunc +.endm + +vp8_h_loop_filter8uv +vp8_h_loop_filter8uv _inner, inner=1 + +/* ff_put_vp8_pixels16_neon: x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h. Copies four 16-byte rows per loop iteration, so rows are always written in groups of four until the count reaches zero or below. */ +function ff_put_vp8_pixels16_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x2], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x2], x3 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + b.gt 1b + ret +endfunc +/* ff_put_vp8_pixels8_neon: x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = h. Copies four 8-byte rows per loop iteration, packing two rows into each vector register. */ +function ff_put_vp8_pixels8_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v0.d}[1], [x2], x3 + ld1 {v1.8b}, [x2], x3 + ld1 {v1.d}[1], [x2], x3 + st1 {v0.8b}, [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v1.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +/* 4/6-tap 8th-pel MC */ + +.macro vp8_epel8_h6 d, s0, s1 + ext v22.8b, \s0\().8b, \s1\().8b, #1 + uxtl v18.8h, \s0\().8b + ext v23.8b, \s0\().8b, \s1\().8b, #2 + uxtl v19.8h, v22.8b + ext v24.8b, \s0\().8b, \s1\().8b, #3 + uxtl v21.8h, v23.8b + ext v25.8b, \s0\().8b, \s1\().8b, #4 + uxtl v22.8h, v24.8b + ext v26.8b, \s0\().8b, \s1\().8b, #5 + uxtl v25.8h, v25.8b + mul v21.8h, v21.8h, v0.h[2] + uxtl v26.8h, v26.8b + mul v22.8h, v22.8h, v0.h[3] + mls 
v21.8h, v19.8h, v0.h[1] + mls v22.8h, v25.8h, v0.h[4] + mla v21.8h, v18.8h, v0.h[0] + mla v22.8h, v26.8h, v0.h[5] + sqadd v22.8h, v21.8h, v22.8h + sqrshrun \d\().8b, v22.8h, #7 +.endm + +.macro vp8_epel16_h6 d0, v0, v1 + ext v22.16b, \v0\().16b, \v1\().16b, #3 + ext v23.16b, \v0\().16b, \v1\().16b, #4 + uxtl v19.8h, v22.8b + uxtl2 v22.8h, v22.16b + ext v3.16b, \v0\().16b, \v1\().16b, #2 + uxtl v20.8h, v23.8b + uxtl2 v23.8h, v23.16b + ext v16.16b, \v0\().16b, \v1\().16b, #1 + uxtl v18.8h, v3.8b + uxtl2 v3.8h, v3.16b + ext v2.16b, \v0\().16b, \v1\().16b, #5 + uxtl v21.8h, v2.8b + uxtl2 v2.8h, v2.16b + uxtl v17.8h, v16.8b + uxtl2 v16.8h, v16.16b + mul v19.8h, v19.8h, v0.h[3] + mul v18.8h, v18.8h, v0.h[2] + mul v3.8h, v3.8h, v0.h[2] + mul v22.8h, v22.8h, v0.h[3] + mls v19.8h, v20.8h, v0.h[4] + uxtl v20.8h, \v0\().8b + uxtl2 v1.8h, \v0\().16b + mls v18.8h, v17.8h, v0.h[1] + mls v3.8h, v16.8h, v0.h[1] + mls v22.8h, v23.8h, v0.h[4] + mla v18.8h, v20.8h, v0.h[0] + mla v19.8h, v21.8h, v0.h[5] + mla v3.8h, v1.8h, v0.h[0] + mla v22.8h, v2.8h, v0.h[5] + sqadd v19.8h, v18.8h, v19.8h + sqadd v22.8h, v3.8h, v22.8h + sqrshrun \d0\().8b, v19.8h, #7 + sqrshrun2 \d0\().16b, v22.8h, #7 +.endm + +.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 + uxtl \s0\().8h, \s0\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s6\().8h, \s6\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s4\().8h, \s4\().8b + uxtl \s2\().8h, \s2\().8b + uxtl \s5\().8h, \s5\().8b + mul \s0\().8h, \s0\().8h, v0.h[0] + mul v31.8h , \s3\().8h, v0.h[3] + mul \s3\().8h, \s3\().8h, v0.h[2] + mul \s6\().8h, \s6\().8h, v0.h[5] + + mls \s0\().8h, \s1\().8h, v0.h[1] + mls v31.8h , \s4\().8h, v0.h[4] + mls \s3\().8h, \s2\().8h, v0.h[1] + mls \s6\().8h, \s5\().8h, v0.h[4] + + mla \s0\().8h, \s2\().8h, v0.h[2] + mla v31.8h , \s5\().8h, v0.h[5] + mla \s3\().8h, \s1\().8h, v0.h[0] + mla \s6\().8h, \s4\().8h, v0.h[3] + sqadd v31.8h , \s0\().8h, v31.8h + sqadd \s6\().8h, \s3\().8h, \s6\().8h + sqrshrun \d0\().8b, v31.8h, #7 + sqrshrun 
\d1\().8b, \s6\().8h, #7 +.endm + +.macro vp8_epel8_h4 d, v0, v1 + ext v22.8b, \v0\().8b, \v1\().8b, #1 + uxtl v19.8h, \v0\().8b + ext v23.8b, \v0\().8b, \v1\().8b, #2 + uxtl v20.8h, v22.8b + ext v25.8b, \v0\().8b, \v1\().8b, #3 + uxtl v22.8h, v23.8b + uxtl v25.8h, v25.8b + mul v20.8h, v20.8h, v0.h[2] + mul v22.8h, v22.8h, v0.h[3] + mls v20.8h, v19.8h, v0.h[1] + mls v22.8h, v25.8h, v0.h[4] + sqadd v22.8h, v20.8h, v22.8h + sqrshrun \d\().8b, v22.8h, #7 +.endm + +.macro vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4 + uxtl \s0\().8h, \s0\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s2\().8h, \s2\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s4\().8h, \s4\().8b + mul v21.8h, \s1\().8h, v0.h[2] + mul v23.8h, \s2\().8h, v0.h[3] + mul \s2\().8h, \s2\().8h, v0.h[2] + mul v22.8h, \s3\().8h, v0.h[3] + mls v21.8h, \s0\().8h, v0.h[1] + mls v23.8h, \s3\().8h, v0.h[4] + mls \s2\().8h, \s1\().8h, v0.h[1] + mls v22.8h, \s4\().8h, v0.h[4] + sqadd v21.8h, v21.8h, v23.8h + sqadd \s2\().8h, \s2\().8h, v22.8h + sqrshrun \d0\().8b, v21.8h, #7 + sqrshrun2 \d0\().16b, \s2\().8h, #7 +.endm + + +// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit +// arithmetic can be used to apply filters +const subpel_filters, align=4 + .short 0, 6, 123, 12, 1, 0, 0, 0 + .short 2, 11, 108, 36, 8, 1, 0, 0 + .short 0, 9, 93, 50, 6, 0, 0, 0 + .short 3, 16, 77, 77, 16, 3, 0, 0 + .short 0, 6, 50, 93, 9, 0, 0, 0 + .short 1, 8, 36, 108, 11, 2, 0, 0 + .short 0, 1, 12, 123, 6, 0, 0, 0 +endconst + +function ff_put_vp8_epel16_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + sxtw x4, w4 + sxtw x6, w6 + movrel x17, subpel_filters, -16 + add x6, x17, x6, lsl #4 // y + ld1 {v0.8h}, [x6] +1: + ld1 {v1.1d - v2.1d}, [x2], x3 + ld1 {v3.1d - v4.1d}, [x2], x3 + ld1 {v16.1d - v17.1d}, [x2], x3 + ld1 {v18.1d - v19.1d}, [x2], x3 + ld1 {v20.1d - v21.1d}, [x2], x3 + ld1 {v22.1d - v23.1d}, [x2], x3 + ld1 {v24.1d - v25.1d}, [x2] + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 + 
vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 + + st1 {v1.1d - v2.1d}, [x0], x1 + st1 {v3.1d - v4.1d}, [x0], x1 + subs x4, x4, #2 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel16_h6_neon, export=1 + sub x2, x2, #2 + sxtw x5, w5 // x + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + add x5, x17, x5, lsl #4 // x + ld1 {v0.8h}, [x5] +1: + ld1 {v1.16b, v2.16b}, [x2], x3 + vp8_epel16_h6 v1, v1, v2 + st1 {v1.16b}, [x0], x1 + + subs w4, w4, #1 + b.ne 1b + ret +endfunc + + +function ff_put_vp8_epel16_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 // x + add x16, x17, x5, lsl #4 // x + sub sp, sp, #336+16 + ld1 {v0.8h}, [x16] + add x7, sp, #15 + sxtw x4, w4 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.16b, v2.16b}, [x2], x3 + vp8_epel16_h6 v1, v1, v2 + st1 {v1.16b}, [x7], #16 + subs x16, x16, #1 + b.ne 1b + + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v16.8b - v19.8b}, [x7], #32 + ld1 {v20.8b - v23.8b}, [x7], #32 + ld1 {v24.8b - v25.8b}, [x7] + sub x7, x7, #64 + + vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 + vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 + trn1 v1.2d, v1.2d, v2.2d + trn1 v3.2d, v3.2d, v4.2d + + st1 {v1.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + subs x4, x4, #2 + b.ne 2b + + add sp, sp, #336+16 + ret +endfunc + +function ff_put_vp8_epel8_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x2], x3 + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x2], x3 + ld1 {v28.8b}, [x2] + + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 + + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + subs w4, w4, 
#2 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel8_h6_neon: horizontal-only filter, 8 bytes out per row; loads 16 bytes starting 2 bytes left of x2 and applies vp8_epel8_h6 with v0 loaded via w5. */ +function ff_put_vp8_epel8_h6_neon, export=1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b, v3.8b}, [x2], x3 + + vp8_epel8_h6 v2, v2, v3 + + st1 {v2.8b}, [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel8_h6v6_neon: horizontal pass over h+5 rows into a 16-byte-aligned stack scratch buffer (8 bytes per row), then vp8_epel8_v6_y2 over 7 buffered rows, 2 output rows per iteration. NOTE(review): sp is lowered by 184, which is not a multiple of 16; the buffer is addressed through x7 only, never through sp. 168 = 21 rows * 8 bytes, which appears to assume h <= 16. */ +function ff_put_vp8_epel8_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + sxtw x4, w4 + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h6 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + b.ne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v5.8b - v7.8b}, [x7] + + sub x7, x7, #16 + + vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7 + + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + subs x4, x4, #2 + b.ne 2b + + add sp, sp, #168+16 + ret +endfunc +/* ff_put_vp8_epel8_v4_neon: vertical-only filter, 8 bytes wide. Starts 1 row above x2 and loads 5 rows per iteration (net advance 2 rows); both output rows are taken from v2 after vp8_epel8_v4_y2 and stored as its two 64-bit halves. */ +function ff_put_vp8_epel8_v4_neon, export=1 + sub x2, x2, x3 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x2], x3 + ld1 {v6.8b}, [x2] + sub x2, x2, x3, lsl #1 + + vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 + + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x0], x1 + subs w4, w4, #2 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel8_h4_neon: horizontal-only filter, 8 bytes out per row, starting 1 byte left of x2 and using vp8_epel8_h4. */ +function ff_put_vp8_epel8_h4_neon, export=1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + + vp8_epel8_h4 v2, v2, v3 + + st1 {v2.8b}, [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel8_h4v6_neon: same two-pass layout as ff_put_vp8_epel8_h6v6_neon, but pass 1 uses vp8_epel8_h4 and starts 1 byte (not 2) left of x2. */ +function ff_put_vp8_epel8_h4v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #1 + sxtw x4, w4 + + // first pass (horizontal): + movrel x17,
subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h4 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + b.ne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v5.8b - v7.8b}, [x7] + + sub x7, x7, #16 + + vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7 + + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + subs x4, x4, #2 + b.ne 2b + + add sp, sp, #168+16 + ret +endfunc +/* ff_put_vp8_epel8_h4v4_neon: pass 1 filters h+3 rows with vp8_epel8_h4 (source moved back 1 row and 1 byte) into the 16-byte-aligned stack buffer, 8 bytes per row; pass 2 reads 5 buffered rows per iteration, advances by 2 rows, and stores the two 64-bit halves of v1 after vp8_epel8_v4_y2. */ +function ff_put_vp8_epel8_h4v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #1 + sxtw x4, w4 + + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #3 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h4 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + b.ne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v2.8b}, [x7], #16 + ld1 {v3.8b - v5.8b}, [x7] + + vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5 + + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + subs x4, x4, #2 + b.ne 2b + + add sp, sp, #168+16 + ret +endfunc +/* ff_put_vp8_epel8_h6v4_neon: same layout as ff_put_vp8_epel8_h4v4_neon, with vp8_epel8_h6 in pass 1 and the source moved back 2 bytes instead of 1. */ +function ff_put_vp8_epel8_h6v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #2 + sxtw x4, w4 + + + // first pass (horizontal): + movrel x17, subpel_filters, -16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #3 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h6 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + b.ne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1
{v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v2.8b}, [x7], #16 + ld1 {v3.8b - v5.8b}, [x7] + + vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5 + + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + subs x4, x4, #2 + b.ne 2b + + add sp, sp, #168+16 + ret +endfunc +/* ff_put_vp8_epel4_v6_neon: vertical-only filter, 4 bytes wide, 4 rows per iteration. Each register ends up with one source row in lane 0 (from ld1r) and the row two below it in lane 1 (lane load after rewinding), so a single vp8_epel8_v6_y2 call covers 4 output rows; they are stored in the order v2[0], v3[0], v2[1], v3[1]. x2 advances 4 rows net per iteration. */ +function ff_put_vp8_epel4_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2], x3 + ld1r {v7.2s}, [x2], x3 + ld1r {v28.2s}, [x2] + sub x2, x2, x3, lsl #2 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2], x3 + ld1 {v7.s}[1], [x2], x3 + ld1 {v28.s}[1], [x2] + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 + + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel4_h6_neon: horizontal-only filter; runs the 8-wide vp8_epel8_h6 on 16 loaded bytes and stores only the low 4 result bytes per row. */ +function ff_put_vp8_epel4_h6_neon, export=1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel4_h6v6_neon: pass 1 writes h+5 horizontally filtered rows, 4 bytes each, to a scratch area at sp; pass 2 re-reads them, pairs each row with the row two below it (trn1/trn2 plus the two v28 lanes) and stores 4 output rows per iteration, advancing 16 bytes in the scratch area. NOTE(review): sp is lowered by 52 (13 rows * 4 bytes, which appears to assume h <= 8) and is therefore not 16-byte aligned while the area is live; the area is addressed through x9 only. */ +function ff_put_vp8_epel4_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s,
v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc +/* ff_put_vp8_epel4_h4v6_neon: same two-pass layout as ff_put_vp8_epel4_h6v6_neon; pass 1 loads 8 bytes starting 1 byte left of x2 and passes v2 as both inputs of vp8_epel8_h4. NOTE(review): sp is lowered by 52 and is not 16-byte aligned while the scratch area is live; the area is addressed through x9 only. */ +function ff_put_vp8_epel4_h4v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc +/* ff_put_vp8_epel4_h6v4_neon: pass 1 writes h+3 rows (vp8_epel8_h6, low 4 bytes kept per row) to a 44-byte scratch area at sp; pass 2 pairs rows two apart, calls vp8_epel8_v4_y2 and stores the v1 lanes in the order 0, 2, 1, 3. NOTE(review): 44 = 11 rows * 4 bytes, which appears to assume h <= 8; sp is not 16-byte aligned while the area is live (it is addressed through x9). */ +function ff_put_vp8_epel4_h6v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1],
[x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc +/* ff_put_vp8_epel4_h4_neon: horizontal-only filter; loads 8 bytes starting 1 byte left of x2, runs vp8_epel8_h4 with v2 as both inputs and stores the low 4 result bytes per row. */ +function ff_put_vp8_epel4_h4_neon, export=1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel4_v4_neon: vertical-only filter, 4 bytes wide, 4 rows per iteration. Rows are packed two per register (a row in lane 0, the row two below it in lane 1) before vp8_epel8_v4_y2, and the v2 lanes are stored in the order 0, 2, 1, 3. */ +function ff_put_vp8_epel4_v4_neon, export=1 + sub x2, x2, x3 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2] + sub x2, x2, x3, lsl #1 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2] + sub x2, x2, x3, lsl #1 + + vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 + + st1 {v2.s}[0], [x0], x1 + st1 {v2.s}[2], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v2.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc +/* ff_put_vp8_epel4_h4v4_neon: pass 1 writes h+3 rows (vp8_epel8_h4, low 4 bytes kept per row) to a 44-byte scratch area at sp; pass 2 pairs rows two apart, calls vp8_epel8_v4_y2 and stores the v1 lanes in the order 0, 2, 1, 3. NOTE(review): pass 1 hands v3 to vp8_epel8_h4 although only v2 is loaded in that loop (ff_put_vp8_epel4_h4_neon passes v2 twice); only the low 4 result bytes are stored, so this is presumably harmless, but confirm against the macro. sp is lowered by 44 and is not 16-byte aligned while the area is live (it is addressed through x9). */ +function ff_put_vp8_epel4_h4v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1], [x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc + +/* Bilinear MC */ +/* ff_put_vp8_bilin16_h_neon: v0 holds w5 and v1 holds 8 - w5 in every byte. Each output byte is src[i] * v1 + src[i+1] * v0, narrowed with a rounding right shift by 3 (rshrn). Two 16-byte rows are produced per iteration (24 source bytes are loaded for each); w4 is decremented by 2 at the top of the loop and the loop repeats while the result is positive. */ +function ff_put_vp8_bilin16_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4,
#2 + ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3 + ext v5.8b, v3.8b, v4.8b, #1 + ext v4.8b, v2.8b, v3.8b, #1 + umull v16.8h, v2.8b, v1.8b + umlal v16.8h, v4.8b, v0.8b + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v5.8b, v0.8b + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v6.8h, #3 + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin16_v_neon: vertical blend with v0 = w6 and v1 = 8 - w6 in every byte. The previous source row is carried in v2 across iterations; each iteration loads two more 16-byte rows and stores two rows of (upper * v1 + lower * v0), narrowed with a rounding right shift by 3. */ +function ff_put_vp8_bilin16_v_neon, export=1 + mov w7, #8 + dup v0.16b, w6 + sub w6, w7, w6 + dup v1.16b, w6 + + ld1 {v2.16b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v4.16b}, [x2], x3 + umull v6.8h, v2.8b, v1.8b + umlal v6.8h, v4.8b, v0.8b + umull2 v16.8h, v2.16b, v1.16b + umlal2 v16.8h, v4.16b, v0.16b + ld1 {v2.16b}, [x2], x3 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v2.8b, v0.8b + umull2 v20.8h, v4.16b, v1.16b + umlal2 v20.8h, v2.16b, v0.16b + rshrn v4.8b, v6.8h, #3 + rshrn2 v4.16b, v16.8h, #3 + rshrn v6.8b, v18.8h, #3 + rshrn2 v6.16b, v20.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin16_hv_neon: the first row is filtered horizontally (weights from w5) before the loop and kept in v4; inside the loop each of the two newly loaded rows is filtered horizontally and then blended vertically (weights from w6) with the previously filtered row. */ +function ff_put_vp8_bilin16_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.16b, w6 // my + sub w6, w7, w6 + dup v3.16b, w6 + + ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3 + + ext v7.8b, v5.8b, v6.8b, #1 + ext v6.8b, v4.8b, v5.8b, #1 + umull v16.8h, v4.8b, v1.8b + umlal v16.8h, v6.8b, v0.8b + umull v18.8h, v5.8b, v1.8b + umlal v18.8h, v7.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3 + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + ext
v29.8b, v27.8b, v28.8b, #1 + ext v28.8b, v26.8b, v27.8b, #1 + umull v16.8h, v26.8b, v1.8b + umlal v16.8h, v28.8b, v0.8b + umull v18.8h, v27.8b, v1.8b + umlal v18.8h, v29.8b, v0.8b + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + umull v24.8h, v4.8b, v3.8b + umlal v24.8h, v6.8b, v2.8b + umull2 v30.8h, v4.16b, v3.16b + umlal2 v30.8h, v6.16b, v2.16b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 + umull v20.8h, v6.8b, v3.8b + umlal v20.8h, v4.8b, v2.8b + umull2 v22.8h, v6.16b, v3.16b + umlal2 v22.8h, v4.16b, v2.16b + rshrn v24.8b, v24.8h, #3 + rshrn2 v24.16b, v30.8h, #3 + st1 {v24.16b}, [x0], x1 + rshrn v20.8b, v20.8h, #3 + rshrn2 v20.16b, v22.8h, #3 + st1 {v20.16b}, [x0], x1 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin8_h_neon: 8-byte-wide horizontal blend with v0 = w5 and v1 = 8 - w5; 16 source bytes are loaded per row so ext #1 can form the right-hand neighbours. Two rows per iteration, results narrowed with a rounding right shift by 3. */ +function ff_put_vp8_bilin8_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b,v3.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v16.8b, v16.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v16.8b}, [x0], x1 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin8_v_neon: 8-byte-wide vertical blend with v0 = w6 and v1 = 8 - w6; the last loaded row stays in v2 for the next iteration, two rows are stored per iteration. */ +function ff_put_vp8_bilin8_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1 {v2.8b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v3.8b}, [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v2.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v2.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v6.8b, v6.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin8_hv_neon: 8-byte-wide two-dimensional blend; the first row is filtered horizontally before the loop, and v22 carries the most recent horizontally filtered row from one iteration to the next. */ +function ff_put_vp8_bilin8_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 +
ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull v20.8h, v22.8b, v3.8b + umlal v20.8h, v16.8b, v2.8b + rshrn v22.8b, v18.8h, #3 + umull v24.8h, v16.8b, v3.8b + umlal v24.8h, v22.8b, v2.8b + rshrn v20.8b, v20.8h, #3 + st1 {v20.8b}, [x0], x1 + rshrn v23.8b, v24.8h, #3 + st1 {v23.8b}, [x0], x1 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin4_h_neon: two 4-byte rows are packed into one register with trn1 so a single umull/umlal pair handles both. NOTE(review): each ext takes its top byte from v3/v7, which are not loaded in this function; trn1 keeps only lane 0 of each ext result, so that byte is discarded. */ +function ff_put_vp8_bilin4_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + trn1 v2.2s, v2.2s, v6.2s + trn1 v3.2s, v3.2s, v7.2s + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin4_v_neon: v2 = {row n, row n+1} and v3 = {row n+1, row n+2} are assembled with ld1r plus single-lane loads and blended in one umull/umlal pair; trn2 then places row n+2 in lane 0 of v2 for the next iteration. */ +function ff_put_vp8_bilin4_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1r {v2.2s}, [x2], x3 +1: + ld1r {v3.2s}, [x2] + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + trn2 v2.2s, v3.2s, v2.2s + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + subs w4, w4, #2 + b.gt 1b + + ret +endfunc +/* ff_put_vp8_bilin4_hv_neon: two rows are filtered horizontally at once (packed with trn1) and then blended vertically; lane 0 of v22 carries the last horizontally filtered row into the next iteration (rev64 moves it there). */ +function ff_put_vp8_bilin4_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v6.8b, #1 + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + trn1 v6.2s, v6.2s, v4.2s + trn1 v7.2s, v7.2s, v5.2s + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull
v20.8h, v16.8b, v2.8b + trn1 v22.2s, v22.2s, v16.2s + umlal v20.8h, v22.8b, v3.8b + rev64 v22.2s, v16.2s + rshrn v20.8b, v20.8h, #3 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h new file mode 100644 index 0000000000..9df1752c62 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H +#define AVCODEC_AARCH64_VP9DSP_INIT_H + +#include "libavcodec/vp9dsp.h" +/* Init entry points for the 10- and 12-bit builds. Each is the INIT_FUNC of one compilation of vp9dsp_init_16bpp_aarch64_template.c (with BPP 10 and BPP 12 respectively) and fills the function-pointer tables of the VP9DSPContext it is given. */ +void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp); +void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp); + +#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */ diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c new file mode 100644 index 0000000000..0fa0d7f8c2 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +/* 10-bit instantiation of the shared template: BPP is pasted into the NEON symbol names the template references, and INIT_FUNC becomes the name of the exported init function the template defines. */ +#define BPP 10 +#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64 +#include "vp9dsp_init_16bpp_aarch64_template.c" diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c new file mode 100644 index 0000000000..dae2232403 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +/* 12-bit instantiation of the shared template: BPP is pasted into the NEON symbol names the template references, and INIT_FUNC becomes the name of the exported init function the template defines. */ +#define BPP 12 +#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64 +#include "vp9dsp_init_16bpp_aarch64_template.c" diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c new file mode 100644 index 0000000000..d2a4e90b3a --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavutil/mem_internal.h" +#include "libavutil/aarch64/cpu.h" +#include "vp9dsp_init.h" +/* Prototype of a full-pel routine named ff_vp9_<type><sz><suffix>_neon; the definition is not in this file. */ +#define declare_fpel(type, sz, suffix) \ +void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) +/* Prototype of a filtered MC routine named ff_vp9_<op>_<filter><sz>_<dir>_<bpp>_neon. */ +#define decl_mc_func(op, filter, dir, sz, bpp) \ +void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) +/* Defines the static two-dimensional (hv) filter <op>_<filter><sz>_hv_<bpp>_neon as a C wrapper: a horizontal put pass over h + 8 rows, starting 3 rows above src, into a 16-byte-aligned temp buffer with a row pitch of 2 * sz bytes, followed by the vertical <op> pass that starts 3 rows into that buffer. */ +#define define_8tap_2d_fn(op, filter, sz, bpp) \ +static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, \ + ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \ + /* We only need h + 7 lines, but the horizontal filter assumes an \ + * even number of rows, so filter h + 8 lines here.
*/ \ + ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \ + src - 3 * src_stride, src_stride, \ + h + 8, mx, 0); \ + ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \ + temp + 3 * 2 * sz, 2 * sz, \ + h, 0, my); \ +} +/* Declares one op/direction pair for the regular, sharp and smooth filter names. */ +#define decl_filter_funcs(op, dir, sz, bpp) \ + decl_mc_func(op, regular, dir, sz, bpp); \ + decl_mc_func(op, sharp, dir, sz, bpp); \ + decl_mc_func(op, smooth, dir, sz, bpp) +/* Declares put and avg prototypes for the h, v and hv directions of one block size. */ +#define decl_mc_funcs(sz, bpp) \ + decl_filter_funcs(put, h, sz, bpp); \ + decl_filter_funcs(avg, h, sz, bpp); \ + decl_filter_funcs(put, v, sz, bpp); \ + decl_filter_funcs(avg, v, sz, bpp); \ + decl_filter_funcs(put, hv, sz, bpp); \ + decl_filter_funcs(avg, hv, sz, bpp) +/* declare_fpel(copy, 32|64|128, ) pastes *_neon names; these defines redirect them to the *_aarch64 symbols, which are the ones init_copy() installs under have_armv8(). */ +#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64 +#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64 +#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64 + +declare_fpel(copy, 128, ); +declare_fpel(copy, 64, ); +declare_fpel(copy, 32, ); +declare_fpel(copy, 16, ); +declare_fpel(copy, 8, ); +declare_fpel(avg, 64, _16); +declare_fpel(avg, 32, _16); +declare_fpel(avg, 16, _16); +declare_fpel(avg, 8, _16); +declare_fpel(avg, 4, _16); + +decl_mc_funcs(64, BPP); +decl_mc_funcs(32, BPP); +decl_mc_funcs(16, BPP); +decl_mc_funcs(8, BPP); +decl_mc_funcs(4, BPP); +/* Emits the six static hv wrappers (put/avg for regular, sharp and smooth) for one block size. */ +#define define_8tap_2d_funcs(sz, bpp) \ + define_8tap_2d_fn(put, regular, sz, bpp) \ + define_8tap_2d_fn(put, sharp, sz, bpp) \ + define_8tap_2d_fn(put, smooth, sz, bpp) \ + define_8tap_2d_fn(avg, regular, sz, bpp) \ + define_8tap_2d_fn(avg, sharp, sz, bpp) \ + define_8tap_2d_fn(avg, smooth, sz, bpp) + +define_8tap_2d_funcs(64, BPP) +define_8tap_2d_funcs(32, BPP) +define_8tap_2d_funcs(16, BPP) +define_8tap_2d_funcs(8, BPP) +define_8tap_2d_funcs(4, BPP) +/* Installs motion-compensation function pointers into dsp->mc. Full-pel copy/avg routines go to the [0][0] entries of all four filter types; the filtered routines go to the [mx][my] entries [1][0] (h), [0][1] (v) and [1][1] (hv) of the three 8-tap filter types. Each group is written only when its CPU-flag test passes. */ +static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + +#define init_fpel(idx1, idx2, sz, type, suffix) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix +/* idx2 = 0 is the copy slot, idx2 = 1 the avg slot. */ +#define init_copy(idx, sz, suffix) \ + init_fpel(idx, 0, sz, copy, suffix) + +#define init_avg(idx, sz, suffix) \ + init_fpel(idx, 1, sz, avg, suffix) +/* sz1 names the avg routine, sz2 the copy routine. NOTE(review): the callers below pass sz2 = 2 * sz1, presumably because the copy routines are named by byte width and samples are two bytes wide in this template; confirm. */ +#define init_copy_avg(idx, sz1, sz2) \ + init_copy(idx, sz2, _neon); \ + init_avg (idx, sz1, _16_neon) +/* The 128/64/32 copy routines carry the _aarch64 suffix and are gated on have_armv8() only. */ + if (have_armv8(cpu_flags)) { + init_copy(0, 128, _aarch64); + init_copy(1, 64, _aarch64); + init_copy(2, 32, _aarch64); + } + + if (have_neon(cpu_flags)) { +#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \ + dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon + +#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \ + init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp) +/* The hv entries pass an empty prefix, so they resolve to the static C wrappers generated by define_8tap_2d_fn rather than to ff_vp9_* symbols. */ +#define init_mc_funcs_dirs(idx, sz, bpp) \ + init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \ + init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \ + init_mc_funcs(idx, hv, 1, 1, sz, , bpp) + + + init_avg(0, 64, _16_neon); + init_avg(1, 32, _16_neon); + init_avg(2, 16, _16_neon); + init_copy_avg(3, 8, 16); + init_copy_avg(4, 4, 8); + + init_mc_funcs_dirs(0, 64, BPP); + init_mc_funcs_dirs(1, 32, BPP); + init_mc_funcs_dirs(2, 16, BPP); + init_mc_funcs_dirs(3, 8, BPP); + init_mc_funcs_dirs(4, 4, BPP); + } +} +/* Prototype of ff_vp9_<type_a>_<type_b>_<sz>x<sz>_add_<bpp>_neon; the definition is not in this file. */ +#define define_itxfm2(type_a, type_b, sz, bpp) \ +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \ + ptrdiff_t stride, \ + int16_t *_block, int eob) +#define
define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp) +/* define_itxfm adds one level of macro indirection so that a BPP argument is expanded to its value before define_itxfm2 pastes it into the symbol name. define_itxfm_funcs declares the four idct/iadst combinations of one size. */ +#define define_itxfm_funcs(sz, bpp) \ + define_itxfm(idct, idct, sz, bpp); \ + define_itxfm(iadst, idct, sz, bpp); \ + define_itxfm(idct, iadst, sz, bpp); \ + define_itxfm(iadst, iadst, sz, bpp) + +define_itxfm_funcs(4, BPP); +define_itxfm_funcs(8, BPP); +define_itxfm_funcs(16, BPP); +define_itxfm(idct, idct, 32, BPP); +define_itxfm(iwht, iwht, 4, BPP); +/* Installs the NEON transform routines into dsp->itxfm_add when have_neon() holds: the four idct/iadst combinations for TX_4X4, TX_8X8 and TX_16X16, the idct_idct_32x32 routine in all four slots of TX_32X32, and the iwht_iwht_4x4 routine in all four slots of index 4 (written as a literal here). */ + +static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_itxfm2(tx, sz, bpp) \ + dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon +#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp) + +#define init_idct2(tx, nm, bpp) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon +#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp) + + init_itxfm(TX_4X4, 4x4, BPP); + init_itxfm(TX_8X8, 8x8, BPP); + init_itxfm(TX_16X16, 16x16, BPP); + init_idct(TX_32X32, idct_idct_32x32, BPP); + init_idct(4, iwht_iwht_4x4, BPP); + } +} +/* Prototype of ff_vp9_loop_filter_<dir>_<wd>_<size>_<bpp>_neon; define_loop_filters declares the h and v variants together. */ +#define define_loop_filter(dir, wd, size, bpp) \ +void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H) + +#define define_loop_filters(wd, size, bpp) \ + define_loop_filter(h, wd, size, bpp); \ + define_loop_filter(v, wd, size, bpp) + +define_loop_filters(4, 8, BPP); +define_loop_filters(8, 8, BPP); +define_loop_filters(16, 8, BPP); + +define_loop_filters(16, 16, BPP); + +define_loop_filters(44, 16, BPP); +define_loop_filters(48, 16, BPP); +define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP); +/* Installs the NEON loop filters when have_neon() holds: loop_filter_8[0..2] receive the wd 4/8/16 routines, loop_filter_16 the 16_16 routines, and loop_filter_mix2[a][b] the 44/48/84/88 routines. In every table the last index is 0 for the h routine and 1 for the v routine. The init_lpf_* helper macros are defined inside the function body and stay defined after it. */ +static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \ + dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon + +#define init_lpf_func_16(idx, dir, bpp) \ + dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon + +#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \ + dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon + +#define init_lpf_funcs_8_wd(idx, wd, bpp) \ + init_lpf_func_8(idx, 0, h, wd, bpp); \ + init_lpf_func_8(idx, 1, v, wd, bpp) + +#define init_lpf_funcs_16(bpp) \ + init_lpf_func_16(0, h, bpp); \ + init_lpf_func_16(1, v, bpp) + +#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \ + init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \ + init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp) + +#define init_lpf_funcs_8(bpp) \ + init_lpf_funcs_8_wd(0, 4, bpp); \ + init_lpf_funcs_8_wd(1, 8, bpp); \ + init_lpf_funcs_8_wd(2, 16, bpp) + +#define init_lpf_funcs_mix2(bpp) \ + init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \ + init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \ + init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \ + init_lpf_funcs_mix2_wd(1, 1, 88, bpp) + + init_lpf_funcs_8(BPP); + init_lpf_funcs_16(BPP); + init_lpf_funcs_mix2(BPP); + } +} +/* Exported entry point. INIT_FUNC is not defined in this file and has to be supplied by whatever includes it; the function runs the motion-compensation, loop-filter and inverse-transform initialisers on dsp, in that order. */ +av_cold void INIT_FUNC(VP9DSPContext *dsp) +{ + vp9dsp_mc_init_aarch64(dsp); + vp9dsp_loopfilter_init_aarch64(dsp); + vp9dsp_itxfm_init_aarch64(dsp); +} diff --git a/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c new file mode 100644 index 0000000000..4d1fee62de --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavutil/mem_internal.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/vp9dsp.h" +#include "vp9dsp_init.h" +/* Prototype of a full-pel routine named ff_vp9_<type><sz>_neon; the definition is not in this file. */ +#define declare_fpel(type, sz) \ +void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) +/* Declares both the copy and the avg routine of one size. */ +#define declare_copy_avg(sz) \ + declare_fpel(copy, sz); \ + declare_fpel(avg , sz) +/* Prototype of a filtered MC routine named ff_vp9_<op>_<filter><sz>_<dir>_neon. */ +#define decl_mc_func(op, filter, dir, sz) \ +void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) +/* Defines the static two-dimensional (hv) filter <op>_<filter><sz>_hv_neon as a C wrapper: a horizontal put pass over h + 8 rows, starting 3 rows above src, into a 16-byte-aligned temp buffer with a row pitch of sz bytes, followed by the vertical <op> pass that starts 3 rows into that buffer. */ +#define define_8tap_2d_fn(op, filter, sz) \ +static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \ + /* We only need h + 7 lines, but the horizontal filter assumes an \ + * even number of rows, so filter h + 8 lines here.
*/ \ + ff_vp9_put_##filter##sz##_h_neon(temp, sz, \ + src - 3 * src_stride, src_stride, \ + h + 8, mx, 0); \ + ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \ + temp + 3 * sz, sz, \ + h, 0, my); \ +} +/* Declares one op/direction pair for the regular, sharp and smooth filter names. */ +#define decl_filter_funcs(op, dir, sz) \ + decl_mc_func(op, regular, dir, sz); \ + decl_mc_func(op, sharp, dir, sz); \ + decl_mc_func(op, smooth, dir, sz) +/* Declares put and avg prototypes for the h, v and hv directions of one block size. */ +#define decl_mc_funcs(sz) \ + decl_filter_funcs(put, h, sz); \ + decl_filter_funcs(avg, h, sz); \ + decl_filter_funcs(put, v, sz); \ + decl_filter_funcs(avg, v, sz); \ + decl_filter_funcs(put, hv, sz); \ + decl_filter_funcs(avg, hv, sz) +/* declare_copy_avg(32) and declare_copy_avg(64) paste ff_vp9_copy32_neon / ff_vp9_copy64_neon; these defines redirect those names to the *_aarch64 symbols. */ +#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64 +#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64 + +declare_copy_avg(64); +declare_copy_avg(32); +declare_copy_avg(16); +declare_copy_avg(8); +declare_copy_avg(4); + +decl_mc_funcs(64); +decl_mc_funcs(32); +decl_mc_funcs(16); +decl_mc_funcs(8); +decl_mc_funcs(4); +/* Emits the six static hv wrappers (put/avg for regular, sharp and smooth) for one block size. */ +#define define_8tap_2d_funcs(sz) \ + define_8tap_2d_fn(put, regular, sz) \ + define_8tap_2d_fn(put, sharp, sz) \ + define_8tap_2d_fn(put, smooth, sz) \ + define_8tap_2d_fn(avg, regular, sz) \ + define_8tap_2d_fn(avg, sharp, sz) \ + define_8tap_2d_fn(avg, smooth, sz) + +define_8tap_2d_funcs(64) +define_8tap_2d_funcs(32) +define_8tap_2d_funcs(16) +define_8tap_2d_funcs(8) +define_8tap_2d_funcs(4) +/* Installs motion-compensation function pointers into dsp->mc. Full-pel copy/avg routines go to the [0][0] entries of all four filter types; the filtered routines go to the [mx][my] entries [1][0] (h), [0][1] (v) and [1][1] (hv) of the three 8-tap filter types. Each group is written only when its CPU-flag test passes. */ +static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + +#define init_fpel(idx1, idx2, sz, type, suffix) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix +/* idx2 = 0 is the copy slot, idx2 = 1 the avg slot. */ +#define init_copy(idx, sz, suffix) \ + init_fpel(idx, 0, sz, copy, suffix) + +#define init_avg(idx, sz, suffix) \ + init_fpel(idx, 1, sz, avg, suffix) + +#define init_copy_avg(idx, sz) \ + init_copy(idx, sz, _neon); \ + init_avg (idx, sz, _neon) +/* The 64/32 copy routines carry the _aarch64 suffix and are gated on have_armv8() only. */ + if (have_armv8(cpu_flags)) { +
init_copy(0, 64, _aarch64); + init_copy(1, 32, _aarch64); + } + + if (have_neon(cpu_flags)) { +#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \ + dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon + +#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \ + init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \ + init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \ + init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx) + +#define init_mc_funcs_dirs(idx, sz) \ + init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \ + init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \ + init_mc_funcs(idx, hv, 1, 1, sz,) + + init_avg(0, 64, _neon); + init_avg(1, 32, _neon); + init_copy_avg(2, 16); + init_copy_avg(3, 8); + init_copy_avg(4, 4); + + init_mc_funcs_dirs(0, 64); + init_mc_funcs_dirs(1, 32); + init_mc_funcs_dirs(2, 16); + init_mc_funcs_dirs(3, 8); + init_mc_funcs_dirs(4, 4); + } +} + +#define define_itxfm(type_a, type_b, sz) \ +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \ + ptrdiff_t stride, \ + int16_t *_block, int eob) + +#define define_itxfm_funcs(sz) \ + define_itxfm(idct, idct, sz); \ + define_itxfm(iadst, idct, sz); \ + define_itxfm(idct, iadst, sz); \ + define_itxfm(iadst, iadst, sz) + +define_itxfm_funcs(4); +define_itxfm_funcs(8); +define_itxfm_funcs(16); +define_itxfm(idct, idct, 32); +define_itxfm(iwht, iwht, 4); + + +static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { +#define init_itxfm(tx, sz) \ + dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \ + dsp->itxfm_add[tx][DCT_ADST] = 
ff_vp9_iadst_idct_##sz##_add_neon; \ + dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon + +#define init_idct(tx, nm) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon + + init_itxfm(TX_4X4, 4x4); + init_itxfm(TX_8X8, 8x8); + init_itxfm(TX_16X16, 16x16); + init_idct(TX_32X32, idct_idct_32x32); + init_idct(4, iwht_iwht_4x4); + } +} + +#define define_loop_filter(dir, wd, len) \ +void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H) + +#define define_loop_filters(wd, len) \ + define_loop_filter(h, wd, len); \ + define_loop_filter(v, wd, len) + +define_loop_filters(4, 8); +define_loop_filters(8, 8); +define_loop_filters(16, 8); + +define_loop_filters(16, 16); + +define_loop_filters(44, 16); +define_loop_filters(48, 16); +define_loop_filters(84, 16); +define_loop_filters(88, 16); + +static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon; + dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon; + dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon; + dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon; + dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon; + dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon; + + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon; + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon; + + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon; + dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon; + dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon; + dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon; + dsp->loop_filter_mix2[1][0][0] = 
ff_vp9_loop_filter_h_84_16_neon; + dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon; + dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon; + dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon; + } +} + +av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp) +{ + if (bpp == 10) { + ff_vp9dsp_init_10bpp_aarch64(dsp); + return; + } else if (bpp == 12) { + ff_vp9dsp_init_12bpp_aarch64(dsp); + return; + } else if (bpp != 8) + return; + + vp9dsp_mc_init_aarch64(dsp); + vp9dsp_loopfilter_init_aarch64(dsp); + vp9dsp_itxfm_init_aarch64(dsp); +} diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S new file mode 100644 index 0000000000..c5f43d36a3 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S @@ -0,0 +1,2017 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+// 16 bit coefficient tables. The macros and functions below sign-extend them
+// to 32 bit and narrow the products with a rounding shift by 14.
+//
+// iadst4_coeffs is placed directly after the four itxfm4 coefficients, so a
+// single 8 x .short load from itxfm4_coeffs fetches both sets.
+const itxfm4_coeffs, align=4
+        .short  11585, 0, 6270, 15137
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+// idct_coeffs directly follows the eight iadst8 coefficients, so code that
+// loads iadst8_coeffs with a 16 byte post-increment ends up pointing at
+// idct_coeffs.
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+// Transpose a 4x4 matrix of 32 bit elements in place.
+// r0-r3 hold one row each (.4s) on input and one column each on output;
+// r4-r7 are clobbered as temporaries.
+.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
+        // Interleave 32 bit lanes of row pairs (0,1) and (2,3)
+        trn1            \r4\().4s, \r0\().4s, \r1\().4s
+        trn2            \r5\().4s, \r0\().4s, \r1\().4s
+        trn1            \r6\().4s, \r2\().4s, \r3\().4s
+        trn2            \r7\().4s, \r2\().4s, \r3\().4s
+        // Combine the 64 bit halves to form the final columns
+        trn1            \r0\().2d, \r4\().2d, \r6\().2d
+        trn2            \r2\().2d, \r4\().2d, \r6\().2d
+        trn1            \r1\().2d, \r5\().2d, \r7\().2d
+        trn2            \r3\().2d, \r5\().2d, \r7\().2d
+.endm
+
+// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers.
+// Row i is held in the register pair r(2i), r(2i+1); t0-t3 are clobbered as
+// temporaries. The result is written back to r0-r15 in the same layout.
+.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+        // The two 4x4 quadrants on the diagonal are transposed in place
+        transpose_4x4s  \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
+        transpose_4x4s  \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3
+
+        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+        // while swapping the two 4x4 matrices between each other
+
+        // First step of the 4x4 transpose of r1,r3,r5,r7, into t0-t3
+        trn1            \t0\().4s, \r1\().4s, \r3\().4s
+        trn2            \t1\().4s, \r1\().4s, \r3\().4s
+        trn1            \t2\().4s, \r5\().4s, \r7\().4s
+        trn2            \t3\().4s, \r5\().4s, \r7\().4s
+
+        // First step of the 4x4 transpose of r8,r10,r12,r14, into r1,r3,r5,r7
+        trn1            \r1\().4s, \r8\().4s, \r10\().4s
+        trn2            \r3\().4s, \r8\().4s, \r10\().4s
+        trn1            \r5\().4s, \r12\().4s, \r14\().4s
+        trn2            \r7\().4s, \r12\().4s, \r14\().4s
+
+        // Second step of the 4x4 transpose of r1,r3,r5,r7 (now in t0-t3), into r8,r10,r12,r14
+        trn1            \r8\().2d, \t0\().2d, \t2\().2d
+        trn2            \r12\().2d, \t0\().2d, \t2\().2d
+        trn1            \r10\().2d, \t1\().2d, \t3\().2d
+        trn2            \r14\().2d, \t1\().2d, \t3\().2d
+
+        // Second step of the 4x4 transpose of r8,r10,r12,r14 (now in r1,r3,r5,r7), in place as far as possible
+        // (the trn1 results go to t0/t1 because r1/r3 are still needed as
+        // inputs by the following trn2)
+        trn1            \t0\().2d, \r1\().2d, \r5\().2d
+        trn2            \r5\().2d, \r1\().2d, \r5\().2d
+        trn1            \t1\().2d, \r3\().2d, \r7\().2d
+        trn2            \r7\().2d, \r3\().2d, \r7\().2d
+
+        // Move the outputs of trn1 back in place
+        mov             \r1\().16b, \t0\().16b
+        mov             \r3\().16b, \t1\().16b
+.endm
+
+// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
+// in/out are .4s registers; this can do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
+// With neg > 0, the coefficient used for out1 is negated (out2 is unchanged).
+// tmp5/tmp6 may be left blank, in which case tmp3/tmp4 are reused for the
+// second pair of products.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+        // tmp4 temporarily holds the negated coefficients
+        neg             \tmp4\().4s, v0.4s
+.endif
+        add             \tmp1\().4s, \in1\().4s, \in2\().4s
+        sub             \tmp2\().4s, \in1\().4s, \in2\().4s
+.if \neg > 0
+        // smull must read tmp4.s[0] before smull2 overwrites tmp4
+        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
+.else
+        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
+.endif
+.ifb \tmp5
+        // 4 temp variant: narrow out1 first, then reuse tmp3/tmp4 for out2
+        rshrn           \out1\().2s, \tmp3\().2d, #14
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out2\().2s, \tmp3\().2d, #14
+        rshrn2          \out2\().4s, \tmp4\().2d, #14
+.else
+        // 6 temp variant: all four products are issued before any narrowing
+        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
+        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out1\().2s, \tmp3\().2d, #14
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        rshrn           \out2\().2s, \tmp5\().2d, #14
+        rshrn2          \out2\().4s, \tmp6\().2d, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+// in2 and tmp3-tmp6 are accepted only to keep the argument list identical to
+// dmbutterfly0; they are not used.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().2d, \in1\().2s, v0.s[0]
+        smull2          \tmp2\().2d, \in1\().4s, v0.s[0]
+        rshrn           \out1\().2s, \tmp1\().2d, #14
+        rshrn2          \out1\().4s, \tmp2\().2d, #14
+        rshrn           \out2\().2s, \tmp1\().2d, #14
+        rshrn2          \out2\().4s, \tmp2\().2d, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .2d registers, in are 2 x .4s registers
+// (each out pair holds the widened 64 bit results for the low and the high
+// half of the inputs; no rounding or narrowing is done here)
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+        smull           \out1\().2d, \in1\().2s, \coef1
+        smull2          \out2\().2d, \in1\().4s, \coef1
+        smull           \out3\().2d, \in1\().2s, \coef2
+        smull2          \out4\().2d, \in1\().4s, \coef2
+        smlsl           \out1\().2d, \in2\().2s, \coef2
+        smlsl2          \out2\().2d, \in2\().4s, \coef2
+        smlal           \out3\().2d, \in2\().2s, \coef1
+        smlal2          \out4\().2d, \in2\().4s, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .4s registers
+// With neg > 0, the 64 bit sum for inout2 is negated before it is rounded
+// and narrowed.
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+        neg             \tmp3\().2d, \tmp3\().2d
+        neg             \tmp4\().2d, \tmp4\().2d
+.endif
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+// (inout2 is only written, never read)
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().2d, \inout1\().2s, \coef1
+        smull2          \tmp2\().2d, \inout1\().4s, \coef1
+        smull           \tmp3\().2d, \inout1\().2s, \coef2
+        smull2          \tmp4\().2d, \inout1\().4s, \coef2
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1
as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().2d, \inout2\().2s, \coef2 + smull2 \tmp2\().2d, \inout2\().4s, \coef2 + smull \tmp3\().2d, \inout2\().2s, \coef1 + smull2 \tmp4\().2d, \inout2\().4s, \coef1 + neg \tmp1\().2d, \tmp1\().2d + neg \tmp2\().2d, \tmp2\().2d + rshrn \inout2\().2s, \tmp3\().2d, #14 + rshrn2 \inout2\().4s, \tmp4\().2d, #14 + rshrn \inout1\().2s, \tmp1\().2d, #14 + rshrn2 \inout1\().4s, \tmp2\().2d, #14 +.endm + +.macro dsmull_h out1, out2, in, coef + smull \out1\().2d, \in\().2s, \coef + smull2 \out2\().2d, \in\().4s, \coef +.endm + +.macro drshrn_h out, in1, in2, shift + rshrn \out\().2s, \in1\().2d, \shift + rshrn2 \out\().4s, \in2\().2d, \shift +.endm + + +// out1 = in1 + in2 +// out2 = in1 - in2 +.macro butterfly_4s out1, out2, in1, in2 + add \out1\().4s, \in1\().4s, \in2\().4s + sub \out2\().4s, \in1\().4s, \in2\().4s +.endm + +// out1 = in1 - in2 +// out2 = in1 + in2 +.macro butterfly_4s_r out1, out2, in1, in2 + sub \out1\().4s, \in1\().4s, \in2\().4s + add \out2\().4s, \in1\().4s, \in2\().4s +.endm + +// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 +// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 +// out are 2 x .4s registers, in are 4 x .2d registers +.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4 + add \tmp1\().2d, \in1\().2d, \in3\().2d + add \tmp2\().2d, \in2\().2d, \in4\().2d + sub \tmp3\().2d, \in1\().2d, \in3\().2d + sub \tmp4\().2d, \in2\().2d, \in4\().2d + rshrn \out1\().2s, \tmp1\().2d, #14 + rshrn2 \out1\().4s, \tmp2\().2d, #14 + rshrn \out2\().2s, \tmp3\().2d, #14 + rshrn2 \out2\().4s, \tmp4\().2d, #14 +.endm + +.macro iwht4_10 c0, c1, c2, c3 + add \c0\().4s, \c0\().4s, \c1\().4s + sub v17.4s, \c2\().4s, \c3\().4s + sub v16.4s, \c0\().4s, v17.4s + sshr v16.4s, v16.4s, #1 + sub \c2\().4s, v16.4s, \c1\().4s + sub \c1\().4s, v16.4s, \c3\().4s + add \c3\().4s, v17.4s, \c2\().4s + sub \c0\().4s, \c0\().4s, \c1\().4s +.endm + +.macro iwht4_12 c0, c1, 
c2, c3 + iwht4_10 \c0, \c1, \c2, \c3 +.endm + +.macro idct4_10 c0, c1, c2, c3 + mul v22.4s, \c1\().4s, v0.s[3] + mul v20.4s, \c1\().4s, v0.s[2] + add v16.4s, \c0\().4s, \c2\().4s + sub v17.4s, \c0\().4s, \c2\().4s + mla v22.4s, \c3\().4s, v0.s[2] + mul v18.4s, v16.4s, v0.s[0] + mul v24.4s, v17.4s, v0.s[0] + mls v20.4s, \c3\().4s, v0.s[3] + srshr v22.4s, v22.4s, #14 + srshr v18.4s, v18.4s, #14 + srshr v24.4s, v24.4s, #14 + srshr v20.4s, v20.4s, #14 + add \c0\().4s, v18.4s, v22.4s + sub \c3\().4s, v18.4s, v22.4s + add \c1\().4s, v24.4s, v20.4s + sub \c2\().4s, v24.4s, v20.4s +.endm + +.macro idct4_12 c0, c1, c2, c3 + smull v22.2d, \c1\().2s, v0.s[3] + smull2 v23.2d, \c1\().4s, v0.s[3] + smull v20.2d, \c1\().2s, v0.s[2] + smull2 v21.2d, \c1\().4s, v0.s[2] + add v16.4s, \c0\().4s, \c2\().4s + sub v17.4s, \c0\().4s, \c2\().4s + smlal v22.2d, \c3\().2s, v0.s[2] + smlal2 v23.2d, \c3\().4s, v0.s[2] + smull v18.2d, v16.2s, v0.s[0] + smull2 v19.2d, v16.4s, v0.s[0] + smull v24.2d, v17.2s, v0.s[0] + smull2 v25.2d, v17.4s, v0.s[0] + smlsl v20.2d, \c3\().2s, v0.s[3] + smlsl2 v21.2d, \c3\().4s, v0.s[3] + rshrn v22.2s, v22.2d, #14 + rshrn2 v22.4s, v23.2d, #14 + rshrn v18.2s, v18.2d, #14 + rshrn2 v18.4s, v19.2d, #14 + rshrn v24.2s, v24.2d, #14 + rshrn2 v24.4s, v25.2d, #14 + rshrn v20.2s, v20.2d, #14 + rshrn2 v20.4s, v21.2d, #14 + add \c0\().4s, v18.4s, v22.4s + sub \c3\().4s, v18.4s, v22.4s + add \c1\().4s, v24.4s, v20.4s + sub \c2\().4s, v24.4s, v20.4s +.endm + +.macro iadst4_10 c0, c1, c2, c3 + mul v16.4s, \c0\().4s, v1.s[0] + mla v16.4s, \c2\().4s, v1.s[1] + mla v16.4s, \c3\().4s, v1.s[2] + mul v18.4s, \c0\().4s, v1.s[2] + mls v18.4s, \c2\().4s, v1.s[0] + sub \c0\().4s, \c0\().4s, \c2\().4s + mls v18.4s, \c3\().4s, v1.s[1] + add \c0\().4s, \c0\().4s, \c3\().4s + mul v22.4s, \c1\().4s, v1.s[3] + mul v20.4s, \c0\().4s, v1.s[3] + add v24.4s, v16.4s, v22.4s + add v26.4s, v18.4s, v22.4s + srshr \c0\().4s, v24.4s, #14 + add v16.4s, v16.4s, v18.4s + srshr \c1\().4s, v26.4s, #14 + sub 
v16.4s, v16.4s, v22.4s + srshr \c2\().4s, v20.4s, #14 + srshr \c3\().4s, v16.4s, #14 +.endm + +.macro iadst4_12 c0, c1, c2, c3 + smull v16.2d, \c0\().2s, v1.s[0] + smull2 v17.2d, \c0\().4s, v1.s[0] + smlal v16.2d, \c2\().2s, v1.s[1] + smlal2 v17.2d, \c2\().4s, v1.s[1] + smlal v16.2d, \c3\().2s, v1.s[2] + smlal2 v17.2d, \c3\().4s, v1.s[2] + smull v18.2d, \c0\().2s, v1.s[2] + smull2 v19.2d, \c0\().4s, v1.s[2] + smlsl v18.2d, \c2\().2s, v1.s[0] + smlsl2 v19.2d, \c2\().4s, v1.s[0] + sub \c0\().4s, \c0\().4s, \c2\().4s + smlsl v18.2d, \c3\().2s, v1.s[1] + smlsl2 v19.2d, \c3\().4s, v1.s[1] + add \c0\().4s, \c0\().4s, \c3\().4s + smull v22.2d, \c1\().2s, v1.s[3] + smull2 v23.2d, \c1\().4s, v1.s[3] + smull v20.2d, \c0\().2s, v1.s[3] + smull2 v21.2d, \c0\().4s, v1.s[3] + add v24.2d, v16.2d, v22.2d + add v25.2d, v17.2d, v23.2d + add v26.2d, v18.2d, v22.2d + add v27.2d, v19.2d, v23.2d + rshrn \c0\().2s, v24.2d, #14 + rshrn2 \c0\().4s, v25.2d, #14 + add v16.2d, v16.2d, v18.2d + add v17.2d, v17.2d, v19.2d + rshrn \c1\().2s, v26.2d, #14 + rshrn2 \c1\().4s, v27.2d, #14 + sub v16.2d, v16.2d, v22.2d + sub v17.2d, v17.2d, v23.2d + rshrn \c2\().2s, v20.2d, #14 + rshrn2 \c2\().4s, v21.2d, #14 + rshrn \c3\().2s, v16.2d, #14 + rshrn2 \c3\().4s, v17.2d, #14 +.endm + +// The public functions in this file have got the following signature: +// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +.macro itxfm_func4x4 txfm1, txfm2, bpp +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1 +.ifc \txfm1,\txfm2 +.ifc \txfm1,idct + movrel x4, itxfm4_coeffs + ld1 {v0.4h}, [x4] + sxtl v0.4s, v0.4h +.endif +.ifc \txfm1,iadst + movrel x4, iadst4_coeffs + ld1 {v0.d}[1], [x4] + sxtl2 v1.4s, v0.8h +.endif +.else + movrel x4, itxfm4_coeffs + ld1 {v0.8h}, [x4] + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h +.endif + + movi v30.4s, #0 + movi v31.4s, #0 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.ne 1f + // DC-only for idct/idct + ld1 {v2.s}[0], [x2] + smull v2.2d, 
v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v31.s}[0], [x2] + dup v4.4s, v2.s[0] + mov v5.16b, v4.16b + mov v6.16b, v4.16b + mov v7.16b, v4.16b + b 2f +.endif + +1: + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2] + st1 {v30.4s,v31.4s}, [x2], #32 + +.ifc \txfm1,iwht + sshr v4.4s, v4.4s, #2 + sshr v5.4s, v5.4s, #2 + sshr v6.4s, v6.4s, #2 + sshr v7.4s, v7.4s, #2 +.endif + + \txfm1\()4_\bpp v4, v5, v6, v7 + + st1 {v30.4s,v31.4s}, [x2], #32 + // Transpose 4x4 with 32 bit elements + transpose_4x4s v4, v5, v6, v7, v16, v17, v18, v19 + + \txfm2\()4_\bpp v4, v5, v6, v7 +2: + mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 + ld1 {v0.4h}, [x0], x1 + ld1 {v1.4h}, [x0], x1 +.ifnc \txfm1,iwht + srshr v4.4s, v4.4s, #4 + srshr v5.4s, v5.4s, #4 + srshr v6.4s, v6.4s, #4 + srshr v7.4s, v7.4s, #4 +.endif + uaddw v4.4s, v4.4s, v0.4h + uaddw v5.4s, v5.4s, v1.4h + ld1 {v2.4h}, [x0], x1 + ld1 {v3.4h}, [x0], x1 + sqxtun v0.4h, v4.4s + sqxtun2 v0.8h, v5.4s + sub x0, x0, x1, lsl #2 + + uaddw v6.4s, v6.4s, v2.4h + umin v0.8h, v0.8h, v31.8h + uaddw v7.4s, v7.4s, v3.4h + st1 {v0.4h}, [x0], x1 + sqxtun v2.4h, v6.4s + sqxtun2 v2.8h, v7.4s + umin v2.8h, v2.8h, v31.8h + + st1 {v0.d}[1], [x0], x1 + st1 {v2.4h}, [x0], x1 + st1 {v2.d}[1], [x0], x1 + + ret +endfunc +.endm + +.macro itxfm_funcs4x4 bpp +itxfm_func4x4 idct, idct, \bpp +itxfm_func4x4 iadst, idct, \bpp +itxfm_func4x4 idct, iadst, \bpp +itxfm_func4x4 iadst, iadst, \bpp +itxfm_func4x4 iwht, iwht, \bpp +.endm + +itxfm_funcs4x4 10 +itxfm_funcs4x4 12 + +function idct8x8_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + sxtl v0.4s, v0.4h + + ld1 {v2.s}[0], [x2] + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v1.s}[0], [x2] + dup v2.4s, v2.s[0] + + srshr v2.4s, v2.4s, #5 + + mov x4, #8 + mov x3, x0 + dup v31.8h, w5 +1: + // Loop to add the constant from v2 into all 8x8 outputs + subs x4, x4, #2 + 
ld1 {v3.8h}, [x0], x1 + ld1 {v4.8h}, [x0], x1 + uaddw v16.4s, v2.4s, v3.4h + uaddw2 v17.4s, v2.4s, v3.8h + uaddw v18.4s, v2.4s, v4.4h + uaddw2 v19.4s, v2.4s, v4.8h + sqxtun v3.4h, v16.4s + sqxtun2 v3.8h, v17.4s + sqxtun v4.4h, v18.4s + sqxtun2 v4.8h, v19.4s + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + st1 {v3.8h}, [x3], x1 + st1 {v4.8h}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 + dmbutterfly0 \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a + dmbutterfly \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3 // r2 = t2a, r6 = t3a + dmbutterfly \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3 // r1 = t4a, r7 = t7a + dmbutterfly \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3 // r5 = t5a, r3 = t6a + + butterfly_4s \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3 + butterfly_4s \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a + butterfly_4s \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a + butterfly_4s \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2 + + dmbutterfly0 \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5 + + butterfly_4s \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6] + butterfly_4s \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7] + butterfly_4s \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5] + butterfly_4s \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4] +.endm + +.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5 + dmbutterfly_l \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0] // t2,t3 = t1a, t0,t1 = t0a + dmbutterfly_l \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0] // r0,r7 = t5a, t4,t5 = t4a + + dbutterfly_n \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4 + dbutterfly_n \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5 + + dmbutterfly_l \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2] // t4,t5 = t3a, t2,t3 = t2a + dmbutterfly_l \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2] // r2,r5 = t7a, r0,r7 = t6a + + dbutterfly_n \r1, \t2, \t2, 
\t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6 + dbutterfly_n \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7 + + butterfly_4s \r7, \r4, \r4, \r0 // r7 = -out[7], r4 = t3 + neg \r7\().4s, \r7\().4s // r7 = out[7] + butterfly_4s \r0, \r1, \r3, \r1 // r0 = out[0], r1 = t2 + + dmbutterfly_l \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3] // r2,r3 = t5a, t3,t5 = t4a + dmbutterfly_l \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2] // t0,t1 = t6a, r5,r6 = t7a + + dbutterfly_n \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7 + + dmbutterfly0 \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2 // r3 = -out[3], r4 = out[4] + neg \r3\().4s, \r3\().4s // r3 = out[3] + + dbutterfly_n \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6 + neg \r1\().4s, \r1\().4s // r1 = out[1] + + dmbutterfly0 \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5 // r2 = out[2], r5 = -out[5] + neg \r5\().4s, \r5\().4s // r5 = out[5] +.endm + + +.macro itxfm_func8x8 txfm1, txfm2 +function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.eq idct8x8_dc_add_neon +.endif + // The iadst also uses a few coefficients from + // idct, so those always need to be loaded. +.ifc \txfm1\()_\txfm2,idct_idct + movrel x4, idct_coeffs +.else + movrel x4, iadst8_coeffs + ld1 {v1.8h}, [x4], #16 + stp d8, d9, [sp, #-0x10]! 
+ sxtl2 v3.4s, v1.8h + sxtl v2.4s, v1.4h +.endif + ld1 {v0.8h}, [x4] + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + + movi v4.4s, #0 + movi v5.4s, #0 + movi v6.4s, #0 + movi v7.4s, #0 + +1: + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64 + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64 + ld1 {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64 + ld1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 + sub x2, x2, #256 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64 + +.ifc \txfm1\()_\txfm2,idct_idct + idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7 + idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7 +.else + \txfm1\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9 + \txfm1\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9 +.endif + + // Transpose 8x8 with 16 bit elements + transpose_8x8s v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7 + +.ifc \txfm1\()_\txfm2,idct_idct + idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7 + idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7 +.else + \txfm2\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9 + \txfm2\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9 +.endif +2: + mov x3, x0 + // Add into the destination + ld1 {v0.8h}, [x0], x1 + srshr v16.4s, v16.4s, #5 + srshr v17.4s, v17.4s, #5 + ld1 {v1.8h}, [x0], x1 + srshr v18.4s, v18.4s, #5 + srshr v19.4s, v19.4s, #5 + ld1 {v2.8h}, [x0], x1 + srshr v20.4s, v20.4s, #5 + srshr v21.4s, v21.4s, #5 + uaddw v16.4s, v16.4s, v0.4h + uaddw2 v17.4s, v17.4s, v0.8h + ld1 {v3.8h}, [x0], x1 + srshr v22.4s, v22.4s, #5 + srshr v23.4s, v23.4s, #5 + uaddw v18.4s, v18.4s, v1.4h + uaddw2 v19.4s, v19.4s, v1.8h + ld1 {v4.8h}, [x0], x1 + srshr v24.4s, v24.4s, #5 + srshr v25.4s, v25.4s, #5 + uaddw v20.4s, v20.4s, v2.4h + 
uaddw2 v21.4s, v21.4s, v2.8h + sqxtun v0.4h, v16.4s + sqxtun2 v0.8h, v17.4s + dup v16.8h, w5 + ld1 {v5.8h}, [x0], x1 + srshr v26.4s, v26.4s, #5 + srshr v27.4s, v27.4s, #5 + uaddw v22.4s, v22.4s, v3.4h + uaddw2 v23.4s, v23.4s, v3.8h + sqxtun v1.4h, v18.4s + sqxtun2 v1.8h, v19.4s + umin v0.8h, v0.8h, v16.8h + ld1 {v6.8h}, [x0], x1 + srshr v28.4s, v28.4s, #5 + srshr v29.4s, v29.4s, #5 + uaddw v24.4s, v24.4s, v4.4h + uaddw2 v25.4s, v25.4s, v4.8h + sqxtun v2.4h, v20.4s + sqxtun2 v2.8h, v21.4s + umin v1.8h, v1.8h, v16.8h + ld1 {v7.8h}, [x0], x1 + srshr v30.4s, v30.4s, #5 + srshr v31.4s, v31.4s, #5 + uaddw v26.4s, v26.4s, v5.4h + uaddw2 v27.4s, v27.4s, v5.8h + sqxtun v3.4h, v22.4s + sqxtun2 v3.8h, v23.4s + umin v2.8h, v2.8h, v16.8h + + st1 {v0.8h}, [x3], x1 + uaddw v28.4s, v28.4s, v6.4h + uaddw2 v29.4s, v29.4s, v6.8h + st1 {v1.8h}, [x3], x1 + sqxtun v4.4h, v24.4s + sqxtun2 v4.8h, v25.4s + umin v3.8h, v3.8h, v16.8h + st1 {v2.8h}, [x3], x1 + uaddw v30.4s, v30.4s, v7.4h + uaddw2 v31.4s, v31.4s, v7.8h + st1 {v3.8h}, [x3], x1 + sqxtun v5.4h, v26.4s + sqxtun2 v5.8h, v27.4s + umin v4.8h, v4.8h, v16.8h + st1 {v4.8h}, [x3], x1 + sqxtun v6.4h, v28.4s + sqxtun2 v6.8h, v29.4s + umin v5.8h, v5.8h, v16.8h + st1 {v5.8h}, [x3], x1 + sqxtun v7.4h, v30.4s + sqxtun2 v7.8h, v31.4s + umin v6.8h, v6.8h, v16.8h + + st1 {v6.8h}, [x3], x1 + umin v7.8h, v7.8h, v16.8h + st1 {v7.8h}, [x3], x1 + +.ifnc \txfm1\()_\txfm2,idct_idct + ldp d8, d9, [sp], 0x10 +.endif + ret +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1 + mov x5, #0x03ff + b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1 + mov x5, #0x0fff + b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon +endfunc +.endm + +itxfm_func8x8 idct, idct +itxfm_func8x8 iadst, idct +itxfm_func8x8 idct, iadst +itxfm_func8x8 iadst, iadst + + +function idct16x16_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + sxtl v0.4s, v0.4h + + movi v1.4h, #0 + + ld1 {v2.s}[0], [x2] + 
smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v1.s}[0], [x2] + dup v2.4s, v2.s[0] + + srshr v0.4s, v2.4s, #6 + + mov x3, x0 + mov x4, #16 + dup v31.8h, w13 +1: + // Loop to add the constant from v2 into all 16x16 outputs + subs x4, x4, #2 + ld1 {v1.8h,v2.8h}, [x0], x1 + uaddw v16.4s, v0.4s, v1.4h + uaddw2 v17.4s, v0.4s, v1.8h + ld1 {v3.8h,v4.8h}, [x0], x1 + uaddw v18.4s, v0.4s, v2.4h + uaddw2 v19.4s, v0.4s, v2.8h + uaddw v20.4s, v0.4s, v3.4h + uaddw2 v21.4s, v0.4s, v3.8h + uaddw v22.4s, v0.4s, v4.4h + uaddw2 v23.4s, v0.4s, v4.8h + sqxtun v1.4h, v16.4s + sqxtun2 v1.8h, v17.4s + sqxtun v2.4h, v18.4s + sqxtun2 v2.8h, v19.4s + sqxtun v3.4h, v20.4s + sqxtun2 v3.8h, v21.4s + sqxtun v4.4h, v22.4s + sqxtun2 v4.8h, v23.4s + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + st1 {v1.8h,v2.8h}, [x3], x1 + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + st1 {v3.8h,v4.8h}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct16_end + butterfly_4s v18, v7, v4, v7 // v18 = t0a, v7 = t7a + butterfly_4s v19, v22, v5, v22 // v19 = t1a, v22 = t6 + butterfly_4s v4, v26, v20, v26 // v4 = t2a, v26 = t5 + butterfly_4s v5, v6, v28, v6 // v5 = t3a, v6 = t4 + butterfly_4s v20, v28, v16, v24 // v20 = t8a, v28 = t11a + butterfly_4s v24, v21, v23, v21 // v24 = t9, v21 = t10 + butterfly_4s v23, v27, v25, v27 // v23 = t14, v27 = t13 + butterfly_4s v25, v29, v29, v17 // v25 = t15a, v29 = t12a + + dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31 // v8 = t13a, v9 = t10a + dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 + + butterfly_4s v16, v31, v18, v25 // v16 = out[0], v31 = out[15] + butterfly_4s v17, v30, v19, v23 // v17 = out[1], v30 = out[14] + butterfly_4s_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] + butterfly_4s v23, v24, v7, v20 // v23 = out[7], v24 = out[8] + butterfly_4s v18, v29, v4, v8 // v18 = out[2], v29 = out[13] + butterfly_4s v19, v28, v5, v28 // 
v19 = out[3], v28 = out[12] + butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11] + butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10] + ret +.endm + +function idct16 + dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a + dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a + dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a + dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a + dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a + dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a + dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a + dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a + + butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_half + dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a + dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a + dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a + dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a + dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, 
v7 // v17 = t8a, v31 = t15a + dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a + dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a + dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a + + butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_quarter + dsmull_h v24, v25, v19, v3.s[3] + dsmull_h v4, v5, v17, v2.s[0] + dsmull_h v7, v6, v18, v1.s[1] + dsmull_h v30, v31, v18, v1.s[0] + neg v24.2d, v24.2d + neg v25.2d, v25.2d + dsmull_h v29, v28, v17, v2.s[1] + dsmull_h v26, v27, v19, v3.s[2] + dsmull_h v22, v23, v16, v0.s[0] + drshrn_h v24, v24, v25, #14 + drshrn_h v16, v4, v5, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v6, v30, v31, #14 + drshrn_h v29, v29, v28, #14 + drshrn_h v17, v26, v27, #14 + drshrn_h v28, v22, v23, #14 + + dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3] + dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3] + neg v22.2d, v22.2d + neg v23.2d, v23.2d + drshrn_h v27, v20, v21, #14 + drshrn_h v21, v22, v23, #14 + drshrn_h v23, v18, v19, #14 + drshrn_h v25, v30, v31, #14 + mov v4.16b, v28.16b + mov v5.16b, v28.16b + dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31 + mov v20.16b, v28.16b + idct16_end +endfunc + +function iadst16 + ld1 
{v0.8h,v1.8h}, [x11] + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + + dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0] // v6,v7 = t1, v4,v5 = t0 + dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0] // v10,v11 = t9, v8,v9 = t8 + dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a + dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2] // v14,v15 = t3, v12,v13 = t2 + dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a + + dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2] // v6,v7 = t11, v4,v5 = t10 + dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a + dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0] // v10,v11 = t5, v8,v9 = t4 + dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a + + dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0] // v14,v15 = t13, v12,v13 = t12 + dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a + dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2] // v6,v7 = t7, v4,v5 = t6 + dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a + + dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2] // v10,v11 = t15, v8,v9 = t14 + ld1 {v0.8h}, [x10] + dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1] // v14,v15 = t9, v12,v13 = t8 + dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a + + dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0] // v4,v5 = t12, v6,v7 = t13 + dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a + dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3] // v10,v11 = t11, v8,v9 = t10 + butterfly_4s_r v4, v27, v16, v27 // v4 = t4, v27 = t0 + dbutterfly_n v24, v28, 
v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a + + dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2] // v12,v13 = t14, v14,v15 = t15 + butterfly_4s_r v5, v20, v31, v20 // v5 = t5, v20 = t1 + dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a + dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a + + butterfly_4s_r v6, v25, v18, v25 // v6 = t6, v25 = t2 + butterfly_4s_r v7, v22, v29, v22 // v7 = t7, v22 = t3 + + dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3] // v10,v11 = t13, v8,v9 = t12 + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2] // v12,v13 = t14, v14,v15 = t15 + + dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a + dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a + neg v29.4s, v29.4s // v29 = out[13] + + dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3] // v10,v11 = t5a, v8,v9 = t4a + dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2] // v12,v13 = t6a, v14,v15 = t7a + + butterfly_4s v2, v6, v27, v25 // v2 = out[0], v6 = t2a + butterfly_4s v3, v7, v23, v21 // v3 =-out[1], v7 = t10 + + dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 + neg v19.4s, v19.4s // v19 = out[3] + dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 + + butterfly_4s v5, v8, v20, v22 // v5 =-out[15],v8 = t3a + butterfly_4s v4, v9, v24, v26 // v4 = out[14],v9 = t11 + + dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] + dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] + dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] + dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] + + neg v31.4s, v5.4s // v31 = out[15] + neg v17.4s, v3.4s // v17 = out[1] 
+ + mov v16.16b, v2.16b + mov v30.16b, v4.16b + ret +endfunc + +// Helper macros; we can't use these expressions directly within +// e.g. .irp due to the extra concatenation \(). Therefore wrap +// them in macros to allow using .irp below. +.macro load i, src, inc + ld1 {v\i\().4s}, [\src], \inc +.endm +.macro store i, dst, inc + st1 {v\i\().4s}, [\dst], \inc +.endm +.macro movi_v i, size, imm + movi v\i\()\size, \imm +.endm +.macro load_clear i, src, inc + ld1 {v\i\().4s}, [\src] + st1 {v4.4s}, [\src], \inc +.endm + +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7 + srshr \coef0, \coef0, #6 + ld1 {v4.4h}, [x0], x1 + srshr \coef1, \coef1, #6 + ld1 {v4.d}[1], [x3], x1 + srshr \coef2, \coef2, #6 + ld1 {v5.4h}, [x0], x1 + srshr \coef3, \coef3, #6 + uaddw \coef0, \coef0, v4.4h + ld1 {v5.d}[1], [x3], x1 + srshr \coef4, \coef4, #6 + uaddw2 \coef1, \coef1, v4.8h + ld1 {v6.4h}, [x0], x1 + srshr \coef5, \coef5, #6 + uaddw \coef2, \coef2, v5.4h + ld1 {v6.d}[1], [x3], x1 + sqxtun v4.4h, \coef0 + srshr \coef6, \coef6, #6 + uaddw2 \coef3, \coef3, v5.8h + ld1 {v7.4h}, [x0], x1 + sqxtun2 v4.8h, \coef1 + srshr \coef7, \coef7, #6 + uaddw \coef4, \coef4, v6.4h + ld1 {v7.d}[1], [x3], x1 + umin v4.8h, v4.8h, v8.8h + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + sqxtun v5.4h, \coef2 + uaddw2 \coef5, \coef5, v6.8h + st1 {v4.4h}, [x0], x1 + sqxtun2 v5.8h, \coef3 + uaddw \coef6, \coef6, v7.4h + st1 {v4.d}[1], [x3], x1 + umin v5.8h, v5.8h, v8.8h + sqxtun v6.4h, \coef4 + uaddw2 \coef7, \coef7, v7.8h + st1 {v5.4h}, [x0], x1 + sqxtun2 v6.8h, \coef5 + st1 {v5.d}[1], [x3], x1 + umin v6.8h, v6.8h, v8.8h + sqxtun v7.4h, \coef6 + st1 {v6.4h}, [x0], x1 + sqxtun2 v7.8h, \coef7 + st1 {v6.d}[1], [x3], x1 + umin v7.8h, v7.8h, v8.8h + st1 {v7.4h}, [x0], x1 + st1 {v7.d}[1], [x3], x1 +.endm + +// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, +// transpose into a horizontal 16x4 slice and store. 
+// x0 = dst (temp buffer) +// x1 = slice offset +// x2 = src +// x9 = input stride +.macro itxfm16_1d_funcs txfm +function \txfm\()16_1d_4x16_pass1_neon + mov x14, x30 + + movi v4.4s, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr + + bl \txfm\()16 + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. + transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the transposed 4x4 blocks horizontally. + cmp x1, #12 + b.eq 1f +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 + store \i, x0, #16 +.endr + ret x14 +1: + // Special case: For the last input column (x1 == 12), + // which would be stored as the last row in the temp buffer, + // don't store the first 4x4 block, but keep it in registers + // for the first slice of the second pass (where it is the + // last 4x4 block). + add x0, x0, #16 + st1 {v20.4s}, [x0], #16 + st1 {v24.4s}, [x0], #16 + st1 {v28.4s}, [x0], #16 + add x0, x0, #16 + st1 {v21.4s}, [x0], #16 + st1 {v25.4s}, [x0], #16 + st1 {v29.4s}, [x0], #16 + add x0, x0, #16 + st1 {v22.4s}, [x0], #16 + st1 {v26.4s}, [x0], #16 + st1 {v30.4s}, [x0], #16 + add x0, x0, #16 + st1 {v23.4s}, [x0], #16 + st1 {v27.4s}, [x0], #16 + st1 {v31.4s}, [x0], #16 + + mov v28.16b, v16.16b + mov v29.16b, v17.16b + mov v30.16b, v18.16b + mov v31.16b, v19.16b + ret x14 +endfunc + +// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, +// load the destination pixels (from a similar 4x16 slice), add and store back. 
+// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x3 = slice offset +// x9 = temp buffer stride +function \txfm\()16_1d_4x16_pass2_neon + mov x14, x30 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 + load \i, x2, x9 +.endr + cbz x3, 1f +.irp i, 28, 29, 30, 31 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl \txfm\()16 + + dup v8.8h, w13 + load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + + ret x14 +endfunc +.endm + +itxfm16_1d_funcs idct +itxfm16_1d_funcs iadst + +// This is the minimum eob value for each subpartition, in increments of 4 +const min_eob_idct_idct_16, align=4 + .short 0, 10, 38, 89 +endconst + +.macro itxfm_func16x16 txfm1, txfm2 +function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.eq idct16x16_dc_add_neon +.endif + mov x15, x30 + // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9. +.ifnc \txfm1\()_\txfm2,idct_idct + stp d14, d15, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! +.endif + stp d8, d9, [sp, #-0x10]! 
+ + sub sp, sp, #1024 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + movrel x10, idct_coeffs +.ifnc \txfm1\()_\txfm2,idct_idct + movrel x11, iadst16_coeffs +.endif +.ifc \txfm1,idct + ld1 {v0.8h,v1.8h}, [x10] + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h +.endif + mov x9, #64 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #10 + b.le idct16x16_quarter_add_16_neon + cmp w3, #38 + b.le idct16x16_half_add_16_neon + + movrel x12, min_eob_idct_idct_16, 2 +.endif + +.irp i, 0, 4, 8, 12 + add x0, sp, #(\i*64) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(16 - \i)/4 + b.le 1f +.endif +.endif + mov x1, #\i + add x2, x6, #(\i*4) + bl \txfm1\()16_1d_4x16_pass1_neon +.endr +.ifc \txfm1\()_\txfm2,iadst_idct + ld1 {v0.8h,v1.8h}, [x10] + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h +.endif + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + // Set v28-v31 to zero, for the in-register passthrough of + // coefficients to pass 2. 
+ movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9 +.endr + b.ne 2b +3: +.endif + +.irp i, 0, 4, 8, 12 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + mov x3, #\i + bl \txfm2\()16_1d_4x16_pass2_neon +.endr + + add sp, sp, #1024 + ldp d8, d9, [sp], 0x10 +.ifnc \txfm1\()_\txfm2,idct_idct + ldp d10, d11, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d14, d15, [sp], 0x10 +.endif + ret x15 +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1 + mov x13, #0x03ff + b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +endfunc + +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1 + mov x13, #0x0fff + b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon +endfunc +.endm + +itxfm_func16x16 idct, idct +itxfm_func16x16 iadst, idct +itxfm_func16x16 idct, iadst +itxfm_func16x16 iadst, iadst + +function idct16_1d_4x16_pass1_quarter_neon + mov x14, x30 + + movi v4.4s, #0 +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr + + bl idct16_quarter + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. + transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the transposed 4x4 blocks horizontally. + // The first 4x4 block is kept in registers for the second pass, + // store the rest in the temp buffer. 
+ add x0, x0, #16 + st1 {v20.4s}, [x0], #16 + st1 {v24.4s}, [x0], #16 + st1 {v28.4s}, [x0], #16 + add x0, x0, #16 + st1 {v21.4s}, [x0], #16 + st1 {v25.4s}, [x0], #16 + st1 {v29.4s}, [x0], #16 + add x0, x0, #16 + st1 {v22.4s}, [x0], #16 + st1 {v26.4s}, [x0], #16 + st1 {v30.4s}, [x0], #16 + add x0, x0, #16 + st1 {v23.4s}, [x0], #16 + st1 {v27.4s}, [x0], #16 + st1 {v31.4s}, [x0], #16 + ret x14 +endfunc + +function idct16_1d_4x16_pass2_quarter_neon + mov x14, x30 + + // Only load the top 4 lines, and only do it for the later slices. + // For the first slice, v16-v19 are kept in registers from the first pass. + cbz x3, 1f +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_quarter + + dup v8.8h, w13 + load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + + ret x14 +endfunc + +function idct16_1d_4x16_pass1_half_neon + mov x14, x30 + + movi v4.4s, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr + + bl idct16_half + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. + transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the transposed 4x4 blocks horizontally. + cmp x1, #4 + b.eq 1f +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 + store \i, x0, #16 +.endr + ret x14 +1: + // Special case: For the second input column (x1 == 4), + // which would be stored as the second row in the temp buffer, + // don't store the first 4x4 block, but keep it in registers + // for the first slice of the second pass (where it is the + // second 4x4 block). 
+ add x0, x0, #16 + st1 {v20.4s}, [x0], #16 + st1 {v24.4s}, [x0], #16 + st1 {v28.4s}, [x0], #16 + add x0, x0, #16 + st1 {v21.4s}, [x0], #16 + st1 {v25.4s}, [x0], #16 + st1 {v29.4s}, [x0], #16 + add x0, x0, #16 + st1 {v22.4s}, [x0], #16 + st1 {v26.4s}, [x0], #16 + st1 {v30.4s}, [x0], #16 + add x0, x0, #16 + st1 {v23.4s}, [x0], #16 + st1 {v27.4s}, [x0], #16 + st1 {v31.4s}, [x0], #16 + + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v22.16b, v18.16b + mov v23.16b, v19.16b + ret x14 +endfunc + +function idct16_1d_4x16_pass2_half_neon + mov x14, x30 + +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + cbz x3, 1f +.irp i, 20, 21, 22, 23 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_half + + dup v8.8h, w13 + load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + + ret x14 +endfunc + +.macro idct16_partial size +function idct16x16_\size\()_add_16_neon + add x0, sp, #(0*64) + mov x1, #0 + add x2, x6, #(0*4) + bl idct16_1d_4x16_pass1_\size\()_neon +.ifc \size,half + add x0, sp, #(4*64) + mov x1, #4 + add x2, x6, #(4*4) + bl idct16_1d_4x16_pass1_\size\()_neon +.endif + +.irp i, 0, 4, 8, 12 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + mov x3, #\i + bl idct16_1d_4x16_pass2_\size\()_neon +.endr + + add sp, sp, #1024 + ldp d8, d9, [sp], 0x10 + ret x15 +endfunc +.endm + +idct16_partial quarter +idct16_partial half + +function idct32x32_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + sxtl v0.4s, v0.4h + + movi v1.4h, #0 + + ld1 {v2.s}[0], [x2] + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + smull v2.2d, v2.2s, v0.s[0] + rshrn v2.2s, v2.2d, #14 + st1 {v1.s}[0], [x2] + dup v2.4s, v2.s[0] + + srshr v0.4s, v2.4s, #6 + + mov x3, x0 + mov x4, #32 + sub x1, x1, #32 + dup v31.8h, w13 +1: + // Loop to add the constant v0 into all 32x32 outputs + subs x4, x4, #1 + ld1 {v1.8h,v2.8h}, [x0], #32 + uaddw v16.4s, v0.4s, 
v1.4h + uaddw2 v17.4s, v0.4s, v1.8h + ld1 {v3.8h,v4.8h}, [x0], x1 + uaddw v18.4s, v0.4s, v2.4h + uaddw2 v19.4s, v0.4s, v2.8h + uaddw v20.4s, v0.4s, v3.4h + uaddw2 v21.4s, v0.4s, v3.8h + uaddw v22.4s, v0.4s, v4.4h + uaddw2 v23.4s, v0.4s, v4.8h + sqxtun v1.4h, v16.4s + sqxtun2 v1.8h, v17.4s + sqxtun v2.4h, v18.4s + sqxtun2 v2.8h, v19.4s + sqxtun v3.4h, v20.4s + sqxtun2 v3.8h, v21.4s + sqxtun v4.4h, v22.4s + sqxtun2 v4.8h, v23.4s + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + st1 {v1.8h,v2.8h}, [x3], #32 + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + st1 {v3.8h,v4.8h}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct32_end + butterfly_4s v16, v5, v4, v5 // v16 = t16a, v5 = t19a + butterfly_4s v17, v20, v23, v20 // v17 = t17, v20 = t18 + butterfly_4s v18, v6, v7, v6 // v18 = t23a, v6 = t20a + butterfly_4s v19, v21, v22, v21 // v19 = t22, v21 = t21 + butterfly_4s v4, v28, v28, v30 // v4 = t24a, v28 = t27a + butterfly_4s v23, v26, v25, v26 // v23 = t25, v26 = t26 + butterfly_4s v7, v8, v29, v31 // v7 = t31a, v8 = t28a + butterfly_4s v22, v27, v24, v27 // v22 = t30, v27 = t29 + + dmbutterfly v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a + dmbutterfly v8, v5, v0.s[2], v0.s[3], v24, v25, v30, v31 // v8 = t19, v5 = t28 + dmbutterfly v28, v6, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20 + dmbutterfly v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a + + butterfly_4s v31, v24, v7, v4 // v31 = t31, v24 = t24 + butterfly_4s v30, v25, v22, v23 // v30 = t30a, v25 = t25a + butterfly_4s_r v23, v16, v16, v18 // v23 = t23, v16 = t16 + butterfly_4s_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a + butterfly_4s v18, v21, v27, v21 // v18 = t18, v21 = t21 + butterfly_4s_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a + butterfly_4s v29, v26, v20, v26 // v29 = t29, v26 = t26 + butterfly_4s v19, v20, v8, v6 // v19 = t19a, v20 = t20 + + dmbutterfly0 v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 
// v27 = t27, v20 = t20 + dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a + dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25, v22 = t22 + dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a + ret +.endm + +function idct32_odd + dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_half + dmbutterfly_h1 v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v10.s[2], v10.s[3], 
v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_quarter + dsmull_h v4, v5, v16, v10.s[0] + dsmull_h v28, v29, v19, v11.s[3] + dsmull_h v30, v31, v16, v10.s[1] + dsmull_h v22, v23, v17, v13.s[2] + dsmull_h v7, v6, v17, v13.s[3] + dsmull_h v26, v27, v19, v11.s[2] + dsmull_h v20, v21, v18, v12.s[0] + dsmull_h v24, v25, v18, v12.s[1] + + neg v28.2d, v28.2d + neg v29.2d, v29.2d + neg v7.2d, v7.2d + neg v6.2d, v6.2d + + drshrn_h v4, v4, v5, #14 + drshrn_h v5, v28, v29, #14 + drshrn_h v29, v30, v31, #14 + drshrn_h v28, v22, v23, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v31, v26, v27, #14 + drshrn_h v6, v20, 
v21, #14 + drshrn_h v30, v24, v25, #14 + + dmbutterfly_l v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1] + dmbutterfly_l v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1] + drshrn_h v23, v16, v17, #14 + drshrn_h v24, v18, v19, #14 + neg v20.2d, v20.2d + neg v21.2d, v21.2d + drshrn_h v27, v27, v26, #14 + drshrn_h v20, v20, v21, #14 + dmbutterfly_l v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3] + drshrn_h v21, v16, v17, #14 + drshrn_h v26, v18, v19, #14 + dmbutterfly_l v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3] + drshrn_h v25, v16, v17, #14 + neg v18.2d, v18.2d + neg v19.2d, v19.2d + drshrn_h v22, v18, v19, #14 + + idct32_end +endfunc + +.macro idct32_funcs suffix +// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix. +// The 32-point IDCT can be decomposed into two 16-point IDCTs; +// a normal IDCT16 with every other input component (the even ones, with +// each output written twice), followed by a separate 16-point IDCT +// of the odd inputs, added/subtracted onto the outputs of the first idct16. +// x0 = dst (temp buffer) +// x1 = unused +// x2 = src +// x9 = double input stride +function idct32_1d_4x32_pass1\suffix\()_neon + mov x14, x30 + + movi v4.4s, #0 + + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct16\suffix + + // Do four 4x4 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31 + // contain the four transposed 4x4 blocks. 
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7 + transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7 + transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7 + + // Store the registers a, b, c, d horizontally, followed by the + // same registers d, c, b, a mirrored. +.macro store_rev a, b, c, d + // There's no rev128 instruction, but we reverse each 64 bit + // half, and then flip them using an ext with 8 bytes offset. + rev64 v7.4s, \d + st1 {\a}, [x0], #16 + ext v7.16b, v7.16b, v7.16b, #8 + st1 {\b}, [x0], #16 + rev64 v6.4s, \c + st1 {\c}, [x0], #16 + ext v6.16b, v6.16b, v6.16b, #8 + st1 {\d}, [x0], #16 + rev64 v5.4s, \b + st1 {v7.4s}, [x0], #16 + ext v5.16b, v5.16b, v5.16b, #8 + st1 {v6.4s}, [x0], #16 + rev64 v4.4s, \a + st1 {v5.4s}, [x0], #16 + ext v4.16b, v4.16b, v4.16b, #8 + st1 {v4.4s}, [x0], #16 +.endm + store_rev v16.4s, v20.4s, v24.4s, v28.4s + store_rev v17.4s, v21.4s, v25.4s, v29.4s + store_rev v18.4s, v22.4s, v26.4s, v30.4s + store_rev v19.4s, v23.4s, v27.4s, v31.4s + sub x0, x0, #512 +.purgem store_rev + + // Move x2 back to the start of the input, and move + // to the first odd row +.ifb \suffix + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half + sub x2, x2, x9, lsl #3 +.endif + add x2, x2, #128 + + movi v4.4s, #0 + // v16 = IN(1), v17 = IN(3) ... 
v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct32_odd\suffix + + transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7 + transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7 + transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7 + transpose_4x4s v19, v18, v17, v16, v4, v5, v6, v7 + + // Store the registers a, b, c, d horizontally, + // adding into the output first, and the mirrored, + // subtracted from the output. +.macro store_rev a, b, c, d, a16b, b16b + ld1 {v4.4s}, [x0] + rev64 v9.4s, \d + add v4.4s, v4.4s, \a + st1 {v4.4s}, [x0], #16 + rev64 v8.4s, \c + ld1 {v4.4s}, [x0] + ext v9.16b, v9.16b, v9.16b, #8 + add v4.4s, v4.4s, \b + st1 {v4.4s}, [x0], #16 + ext v8.16b, v8.16b, v8.16b, #8 + ld1 {v4.4s}, [x0] + rev64 \b, \b + add v4.4s, v4.4s, \c + st1 {v4.4s}, [x0], #16 + rev64 \a, \a + ld1 {v4.4s}, [x0] + ext \b16b, \b16b, \b16b, #8 + add v4.4s, v4.4s, \d + st1 {v4.4s}, [x0], #16 + ext \a16b, \a16b, \a16b, #8 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, v9.4s + st1 {v4.4s}, [x0], #16 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, v8.4s + st1 {v4.4s}, [x0], #16 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, \b + st1 {v4.4s}, [x0], #16 + ld1 {v4.4s}, [x0] + sub v4.4s, v4.4s, \a + st1 {v4.4s}, [x0], #16 +.endm + + store_rev v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b + store_rev v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b + store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b + store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b +.purgem store_rev + ret x14 +endfunc + +// This is mostly the same as 4x32_pass1, but without the transpose, +// and use the source as temp buffer between the two idct passes, and +// add into the destination. 
+// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x7 = negative double temp buffer stride +// x9 = double temp buffer stride +function idct32_1d_4x32_pass2\suffix\()_neon + mov x14, x30 + + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + + bl idct16\suffix + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x2, x9 +.endr + + sub x2, x2, x9, lsl #4 + add x2, x2, #128 + + // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + sub x2, x2, #128 + + bl idct32_odd\suffix + +.macro load_acc_store a, b, c, d, neg=0 +.if \neg == 0 + ld1 {v4.4s}, [x2], x9 + ld1 {v5.4s}, [x2], x9 + add v4.4s, v4.4s, \a + ld1 {v6.4s}, [x2], x9 + add v5.4s, v5.4s, \b + ld1 {v7.4s}, [x2], x9 + add v6.4s, v6.4s, \c + add v7.4s, v7.4s, \d +.else + ld1 {v4.4s}, [x2], x7 + ld1 {v5.4s}, [x2], x7 + sub v4.4s, v4.4s, \a + ld1 {v6.4s}, [x2], x7 + sub v5.4s, v5.4s, \b + ld1 {v7.4s}, [x2], x7 + sub v6.4s, v6.4s, \c + sub v7.4s, v7.4s, \d +.endif + ld1 {v8.4h}, [x0], x1 + ld1 {v8.d}[1], [x0], x1 + srshr v4.4s, v4.4s, #6 + ld1 {v9.4h}, [x0], x1 + srshr v5.4s, v5.4s, #6 + uaddw v4.4s, v4.4s, v8.4h + ld1 {v9.d}[1], [x0], x1 + srshr v6.4s, v6.4s, #6 + uaddw2 v5.4s, v5.4s, v8.8h + srshr v7.4s, v7.4s, #6 + sub x0, x0, x1, lsl #2 + uaddw v6.4s, v6.4s, 
v9.4h + sqxtun v4.4h, v4.4s + uaddw2 v7.4s, v7.4s, v9.8h + sqxtun2 v4.8h, v5.4s + umin v4.8h, v4.8h, v15.8h + st1 {v4.4h}, [x0], x1 + sqxtun v5.4h, v6.4s + st1 {v4.d}[1], [x0], x1 + sqxtun2 v5.8h, v7.4s + umin v5.8h, v5.8h, v15.8h + st1 {v5.4h}, [x0], x1 + st1 {v5.d}[1], [x0], x1 +.endm + load_acc_store v31.4s, v30.4s, v29.4s, v28.4s + load_acc_store v27.4s, v26.4s, v25.4s, v24.4s + load_acc_store v23.4s, v22.4s, v21.4s, v20.4s + load_acc_store v19.4s, v18.4s, v17.4s, v16.4s + sub x2, x2, x9 + load_acc_store v16.4s, v17.4s, v18.4s, v19.4s, 1 + load_acc_store v20.4s, v21.4s, v22.4s, v23.4s, 1 + load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1 + load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1 +.purgem load_acc_store + ret x14 +endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half + +const min_eob_idct_idct_32, align=4 + .short 0, 9, 34, 70, 135, 240, 336, 448 +endconst + +function vp9_idct_idct_32x32_add_16_neon + cmp w3, #1 + b.eq idct32x32_dc_add_neon + + movrel x10, idct_coeffs + + mov x15, x30 + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
+ + sub sp, sp, #4096 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + // Double stride of the input, since we only read every other line + mov x9, #256 + neg x7, x9 + + ld1 {v0.8h,v1.8h}, [x10], #32 + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + sxtl2 v1.4s, v0.8h + sxtl v0.4s, v0.4h + ld1 {v10.8h,v11.8h}, [x10] + sxtl v12.4s, v11.4h + sxtl2 v13.4s, v11.8h + sxtl2 v11.4s, v10.8h + sxtl v10.4s, v10.4h + + dup v15.8h, w13 + + cmp w3, #34 + b.le idct32x32_quarter_add_16_neon + cmp w3, #135 + b.le idct32x32_half_add_16_neon + + movrel x12, min_eob_idct_idct_32, 2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x0, sp, #(\i*128) +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(32 - \i)/4 + b.le 1f +.endif + add x2, x6, #(\i*4) + bl idct32_1d_4x32_pass1_neon +.endr + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 +.endr + b.ne 2b +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + bl idct32_1d_4x32_pass2_neon +.endr + + add sp, sp, #4096 + ldp d14, d15, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d8, d9, [sp], 0x10 + + ret x15 +endfunc + +function ff_vp9_idct_idct_32x32_add_10_neon, export=1 + mov x13, #0x03ff + b vp9_idct_idct_32x32_add_16_neon +endfunc + +function ff_vp9_idct_idct_32x32_add_12_neon, export=1 + mov x13, #0x0fff + b vp9_idct_idct_32x32_add_16_neon +endfunc + +.macro idct32_partial size +function idct32x32_\size\()_add_16_neon +.irp i, 0, 4 + add x0, sp, #(\i*128) +.ifc \size,quarter +.if \i == 4 + cmp w3, #9 + b.le 1f +.endif +.endif + add x2, x6, #(\i*4) + bl idct32_1d_4x32_pass1_\size\()_neon +.endr + +.ifc \size,half +.irp i, 8, 12 + add x0, sp, #(\i*128) +.if \i == 12 + cmp w3, #70 + b.le 1f +.endif + add x2, x6, #(\i*4) + bl idct32_1d_4x32_pass1_\size\()_neon +.endr 
+.endif + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.4s, #0 + movi v17.4s, #0 + movi v18.4s, #0 + movi v19.4s, #0 + +.rept 4 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 + st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 +.endr + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x0, x4, #(\i*2) + mov x1, x5 + add x2, sp, #(\i*4) + bl idct32_1d_4x32_pass2_\size\()_neon +.endr + + add sp, sp, #4096 + ldp d14, d15, [sp], 0x10 + ldp d12, d13, [sp], 0x10 + ldp d10, d11, [sp], 0x10 + ldp d8, d9, [sp], 0x10 + + ret x15 +endfunc +.endm + +idct32_partial quarter +idct32_partial half diff --git a/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S new file mode 100644 index 0000000000..a27f7b8ae5 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S @@ -0,0 +1,1580 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +const itxfm4_coeffs, align=4 + .short 11585, 0, 6270, 15137 +iadst4_coeffs: + .short 5283, 15212, 9929, 13377 +endconst + +const iadst8_coeffs, align=4 + .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679 +idct_coeffs: + .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102 + .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756 + .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520 + .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404 +endconst + +const iadst16_coeffs, align=4 + .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053 + .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207 +endconst + +// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14 +// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14 +// in/out are .8h registers; this can do with 4 temp registers, but is +// more efficient if 6 temp registers are available. +// With neg > 0, out1 is computed with -v0[0] in place of v0[0] (the negated +// coefficient is staged in tmp4); out2 is unaffected.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0 +.if \neg > 0 + neg \tmp4\().4h, v0.4h +.endif + add \tmp1\().8h, \in1\().8h, \in2\().8h + sub \tmp2\().8h, \in1\().8h, \in2\().8h +.if \neg > 0 + smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0] + smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0] +.else + smull \tmp3\().4s, \tmp1\().4h, v0.h[0] + smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0] +.endif +.ifb \tmp5 + rshrn \out1\().4h, \tmp3\().4s, #14 + rshrn2 \out1\().8h, \tmp4\().4s, #14 + smull \tmp3\().4s, \tmp2\().4h, v0.h[0] + smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0] + rshrn \out2\().4h, \tmp3\().4s, #14 + rshrn2 \out2\().8h, \tmp4\().4s, #14 +.else + smull \tmp5\().4s, \tmp2\().4h, v0.h[0] + smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0] + rshrn \out1\().4h, \tmp3\().4s, #14 + rshrn2 \out1\().8h, \tmp4\().4s, #14 + rshrn \out2\().4h, \tmp5\().4s, #14 + rshrn2 \out2\().8h, \tmp6\().4s, #14 +.endif +.endm + +// Same as dmbutterfly0 above, but treating the input in in2 as zero, +// writing the same output into both out1 and out2. +// Only in1, tmp1 and tmp2 are used by the body; in2 and tmp3-tmp6 are +// accepted but ignored, and there is no neg parameter.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 + smull \tmp1\().4s, \in1\().4h, v0.h[0] + smull2 \tmp2\().4s, \in1\().8h, v0.h[0] + rshrn \out1\().4h, \tmp1\().4s, #14 + rshrn2 \out1\().8h, \tmp2\().4s, #14 + rshrn \out2\().4h, \tmp1\().4s, #14 + rshrn2 \out2\().8h, \tmp2\().4s, #14 +.endm + +// out1,out2 = in1 * coef1 - in2 * coef2 +// out3,out4 = in1 * coef2 + in2 * coef1 +// out are 4 x .4s registers, in are 2 x .8h registers +// out1/out3 receive the results for the low four lanes (smull/smlal/smlsl), +// out2/out4 those for the high four lanes (the "2" variants). No rounding +// or narrowing is done here; callers do that themselves. +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2 + smull \out1\().4s, \in1\().4h, \coef1 + smull2 \out2\().4s, \in1\().8h, \coef1 + smull \out3\().4s, \in1\().4h, \coef2 + smull2 \out4\().4s, \in1\().8h, \coef2 + smlsl \out1\().4s, \in2\().4h, \coef2 + smlsl2 \out2\().4s, \in2\().8h, \coef2 + smlal \out3\().4s, \in2\().4h, \coef1 + smlal2 \out4\().4s, \in2\().8h, \coef1 +.endm + +// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14 +// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 +// inout are 2 x .8h registers +// Both results are computed from the original input values: all four wide +// products land in tmp1-tmp4 before either inout register is overwritten. +// With neg > 0, the wide inout2 result is negated before the rounding shift. +.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0 + dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2 +.if \neg > 0 + neg \tmp3\().4s, \tmp3\().4s + neg \tmp4\().4s, \tmp4\().4s +.endif + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout1\().4h, \coef1 + smull2 \tmp2\().4s, \inout1\().8h, \coef1 + smull \tmp3\().4s, \inout1\().4h, \coef2 + smull2 \tmp4\().4s, \inout1\().8h, \coef2 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1
as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 + smull \tmp1\().4s, \inout2\().4h, \coef2 + smull2 \tmp2\().4s, \inout2\().8h, \coef2 + smull \tmp3\().4s, \inout2\().4h, \coef1 + smull2 \tmp4\().4s, \inout2\().8h, \coef1 + neg \tmp1\().4s, \tmp1\().4s + neg \tmp2\().4s, \tmp2\().4s + rshrn \inout2\().4h, \tmp3\().4s, #14 + rshrn2 \inout2\().8h, \tmp4\().4s, #14 + rshrn \inout1\().4h, \tmp1\().4s, #14 + rshrn2 \inout1\().8h, \tmp2\().4s, #14 +.endm + +.macro dsmull_h out1, out2, in, coef + smull \out1\().4s, \in\().4h, \coef + smull2 \out2\().4s, \in\().8h, \coef +.endm + +.macro drshrn_h out, in1, in2, shift + rshrn \out\().4h, \in1\().4s, \shift + rshrn2 \out\().8h, \in2\().4s, \shift +.endm + + +// out1 = in1 + in2 +// out2 = in1 - in2 +.macro butterfly_8h out1, out2, in1, in2 + add \out1\().8h, \in1\().8h, \in2\().8h + sub \out2\().8h, \in1\().8h, \in2\().8h +.endm + +// out1 = in1 - in2 +// out2 = in1 + in2 +.macro butterfly_8h_r out1, out2, in1, in2 + sub \out1\().8h, \in1\().8h, \in2\().8h + add \out2\().8h, \in1\().8h, \in2\().8h +.endm + +// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14 +// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14 +// out are 2 x .8h registers, in are 4 x .4s registers +.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4 + add \tmp1\().4s, \in1\().4s, \in3\().4s + add \tmp2\().4s, \in2\().4s, \in4\().4s + sub \tmp3\().4s, \in1\().4s, \in3\().4s + sub \tmp4\().4s, \in2\().4s, \in4\().4s + rshrn \out1\().4h, \tmp1\().4s, #14 + rshrn2 \out1\().8h, \tmp2\().4s, #14 + rshrn \out2\().4h, \tmp3\().4s, #14 + rshrn2 \out2\().8h, \tmp4\().4s, #14 +.endm + +.macro iwht4 c0, c1, c2, c3 + add \c0\().4h, \c0\().4h, \c1\().4h + sub v17.4h, \c2\().4h, \c3\().4h + sub v16.4h, \c0\().4h, v17.4h + sshr v16.4h, v16.4h, #1 + sub \c2\().4h, v16.4h, \c1\().4h + sub \c1\().4h, v16.4h, \c3\().4h + add \c3\().4h, v17.4h, \c2\().4h + sub \c0\().4h, \c0\().4h, \c1\().4h +.endm + +.macro idct4 c0, c1, c2, 
c3 + smull v22.4s, \c1\().4h, v0.h[3] + smull v20.4s, \c1\().4h, v0.h[2] + add v16.4h, \c0\().4h, \c2\().4h + sub v17.4h, \c0\().4h, \c2\().4h + smlal v22.4s, \c3\().4h, v0.h[2] + smull v18.4s, v16.4h, v0.h[0] + smull v19.4s, v17.4h, v0.h[0] + smlsl v20.4s, \c3\().4h, v0.h[3] + rshrn v22.4h, v22.4s, #14 + rshrn v18.4h, v18.4s, #14 + rshrn v19.4h, v19.4s, #14 + rshrn v20.4h, v20.4s, #14 + add \c0\().4h, v18.4h, v22.4h + sub \c3\().4h, v18.4h, v22.4h + add \c1\().4h, v19.4h, v20.4h + sub \c2\().4h, v19.4h, v20.4h +.endm + +.macro iadst4 c0, c1, c2, c3 + smull v16.4s, \c0\().4h, v0.h[4] + smlal v16.4s, \c2\().4h, v0.h[5] + smlal v16.4s, \c3\().4h, v0.h[6] + smull v17.4s, \c0\().4h, v0.h[6] + smlsl v17.4s, \c2\().4h, v0.h[4] + sub \c0\().4h, \c0\().4h, \c2\().4h + smlsl v17.4s, \c3\().4h, v0.h[5] + add \c0\().4h, \c0\().4h, \c3\().4h + smull v19.4s, \c1\().4h, v0.h[7] + smull v18.4s, \c0\().4h, v0.h[7] + add v20.4s, v16.4s, v19.4s + add v21.4s, v17.4s, v19.4s + rshrn \c0\().4h, v20.4s, #14 + add v16.4s, v16.4s, v17.4s + rshrn \c1\().4h, v21.4s, #14 + sub v16.4s, v16.4s, v19.4s + rshrn \c2\().4h, v18.4s, #14 + rshrn \c3\().4h, v16.4s, #14 +.endm + +// The public functions in this file have got the following signature: +// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); + +.macro itxfm_func4x4 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1 +.ifc \txfm1,\txfm2 +.ifc \txfm1,idct + movrel x4, itxfm4_coeffs + ld1 {v0.4h}, [x4] +.endif +.ifc \txfm1,iadst + movrel x4, iadst4_coeffs + ld1 {v0.d}[1], [x4] +.endif +.else + movrel x4, itxfm4_coeffs + ld1 {v0.8h}, [x4] +.endif + + movi v31.8h, #0 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.ne 1f + // DC-only for idct/idct + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + st1 {v31.h}[0], [x2] + dup v4.4h, v2.h[0] + mov v5.16b, v4.16b + mov v6.16b, v4.16b + mov v7.16b, v4.16b + b 2f +.endif + 
+1: + ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2] + st1 {v31.8h}, [x2], #16 + +.ifc \txfm1,iwht + sshr v4.4h, v4.4h, #2 + sshr v5.4h, v5.4h, #2 + sshr v6.4h, v6.4h, #2 + sshr v7.4h, v7.4h, #2 +.endif + + \txfm1\()4 v4, v5, v6, v7 + + st1 {v31.8h}, [x2], #16 + // Transpose 4x4 with 16 bit elements + transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19 + + \txfm2\()4 v4, v5, v6, v7 +2: + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[0], [x0], x1 +.ifnc \txfm1,iwht + srshr v4.4h, v4.4h, #4 + srshr v5.4h, v5.4h, #4 + srshr v6.4h, v6.4h, #4 + srshr v7.4h, v7.4h, #4 +.endif + uaddw v4.8h, v4.8h, v0.8b + uaddw v5.8h, v5.8h, v1.8b + ld1 {v2.s}[0], [x0], x1 + ld1 {v3.s}[0], [x0], x1 + sqxtun v0.8b, v4.8h + sqxtun v1.8b, v5.8h + sub x0, x0, x1, lsl #2 + + uaddw v6.8h, v6.8h, v2.8b + uaddw v7.8h, v7.8h, v3.8b + st1 {v0.s}[0], [x0], x1 + sqxtun v2.8b, v6.8h + sqxtun v3.8b, v7.8h + + st1 {v1.s}[0], [x0], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + + ret +endfunc +.endm + +itxfm_func4x4 idct, idct +itxfm_func4x4 iadst, idct +itxfm_func4x4 idct, iadst +itxfm_func4x4 iadst, iadst +itxfm_func4x4 iwht, iwht + + +// In-place 8-point transform of v16-v23 (.8h each): inputs are read from and +// outputs written back to v16-v23. Reads coefficients from v0 and clobbers +// v2-v7 and v24-v31. +.macro idct8 + dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a + dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a + dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a + dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a + + butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3 + butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a + butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a + butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2 + + dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5 + + butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7] + butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6] + butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5] + butterfly_8h v19, v20, v25, v28 // v19 = out[3],
v20 = out[4] +.endm + +.macro iadst8 + dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a + dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a + dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a + dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a + + dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4 + dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5 + dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6 + dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7 + + butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2 + butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3 + neg v23.8h, v23.8h // v23 = out[7] + + dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4] + neg v19.8h, v19.8h // v19 = out[3] + + dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a + dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a + + dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6 + dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7 + neg v17.8h, v17.8h // v17 = out[1] + + dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5] + neg v21.8h, v21.8h // v21 = out[5] +.endm + + +.macro itxfm_func8x8 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1 + // The iadst also uses a few coefficients from + // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct + movrel x4, idct_coeffs +.else + movrel x4, iadst8_coeffs + ld1 {v1.8h}, [x4], #16 +.endif + ld1 {v0.8h}, [x4] + + movi v2.8h, #0 + movi v3.8h, #0 + movi v4.8h, #0 + movi v5.8h, #0 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.ne 1f + // DC-only for idct/idct + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + st1 {v3.h}[0], [x2] + dup v16.8h, v2.h[0] + mov v17.16b, v16.16b + mov v18.16b, v16.16b + mov v19.16b, v16.16b + mov v20.16b, v16.16b + mov v21.16b, v16.16b + mov v22.16b, v16.16b + mov v23.16b, v16.16b + b 2f +.endif +1: + ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64 + ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64 + sub x2, x2, #128 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 + st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64 + + \txfm1\()8 + + // Transpose 8x8 with 16 bit elements + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + + \txfm2\()8 +2: + mov x3, x0 + // Add into the destination + ld1 {v0.8b}, [x0], x1 + srshr v16.8h, v16.8h, #5 + ld1 {v1.8b}, [x0], x1 + srshr v17.8h, v17.8h, #5 + ld1 {v2.8b}, [x0], x1 + srshr v18.8h, v18.8h, #5 + uaddw v16.8h, v16.8h, v0.8b + ld1 {v3.8b}, [x0], x1 + srshr v19.8h, v19.8h, #5 + uaddw v17.8h, v17.8h, v1.8b + ld1 {v4.8b}, [x0], x1 + srshr v20.8h, v20.8h, #5 + uaddw v18.8h, v18.8h, v2.8b + sqxtun v0.8b, v16.8h + ld1 {v5.8b}, [x0], x1 + srshr v21.8h, v21.8h, #5 + uaddw v19.8h, v19.8h, v3.8b + sqxtun v1.8b, v17.8h + ld1 {v6.8b}, [x0], x1 + srshr v22.8h, v22.8h, #5 + uaddw v20.8h, v20.8h, v4.8b + sqxtun v2.8b, v18.8h + ld1 {v7.8b}, [x0], x1 + srshr v23.8h, v23.8h, #5 + uaddw v21.8h, v21.8h, v5.8b + sqxtun v3.8b, v19.8h + + st1 {v0.8b}, [x3], x1 + uaddw v22.8h, v22.8h, v6.8b + st1 {v1.8b}, [x3], x1 + sqxtun v4.8b, v20.8h + st1 {v2.8b}, [x3], x1 + uaddw v23.8h, v23.8h, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v5.8b, v21.8h + st1 {v4.8b}, [x3], x1 + sqxtun v6.8b, v22.8h + st1 {v5.8b}, [x3], x1 + 
sqxtun v7.8b, v23.8h + + st1 {v6.8b}, [x3], x1 + st1 {v7.8b}, [x3], x1 + + ret +endfunc +.endm + +itxfm_func8x8 idct, idct +itxfm_func8x8 iadst, idct +itxfm_func8x8 idct, iadst +itxfm_func8x8 iadst, iadst + + +function idct16x16_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + dup v2.8h, v2.h[0] + st1 {v1.h}[0], [x2] + + srshr v2.8h, v2.8h, #6 + + mov x3, x0 + mov x4, #16 +1: + // Loop to add the constant from v2 into all 16x16 outputs + subs x4, x4, #2 + ld1 {v3.16b}, [x0], x1 + ld1 {v4.16b}, [x0], x1 + uaddw v16.8h, v2.8h, v3.8b + uaddw2 v17.8h, v2.8h, v3.16b + uaddw v18.8h, v2.8h, v4.8b + uaddw2 v19.8h, v2.8h, v4.16b + sqxtun v3.8b, v16.8h + sqxtun2 v3.16b, v17.8h + sqxtun v4.8b, v18.8h + sqxtun2 v4.16b, v19.8h + st1 {v3.16b}, [x3], x1 + st1 {v4.16b}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct16_end + butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a + butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6 + butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5 + butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4 + butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a + butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10 + butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13 + butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a + + dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a + dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11 + + butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15] + butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14] + butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6] + butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8] + butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13] + butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12] 
+ butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11] + butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10] + ret +.endm + +function idct16 + dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + dmbutterfly v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + dmbutterfly v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a + + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_half + dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a + dmbutterfly_h1 v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a, v28 = t3a + dmbutterfly_h1 v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a, v30 = t7a + dmbutterfly_h2 v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a, v22 = t6a + dmbutterfly_h1 v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a, v31 = t15a + 
dmbutterfly_h2 v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a, v23 = t14a + dmbutterfly_h1 v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a + dmbutterfly_h2 v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a + + butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3 + butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2 + butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5 + butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6 + butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9 + butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10 + butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13 + butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14 + + dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a + dmbutterfly v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a + dmbutterfly v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a + idct16_end +endfunc + +function idct16_quarter + dsmull_h v24, v25, v19, v1.h[7] + dsmull_h v4, v5, v17, v1.h[0] + dsmull_h v7, v6, v18, v0.h[5] + dsmull_h v30, v31, v18, v0.h[4] + neg v24.4s, v24.4s + neg v25.4s, v25.4s + dsmull_h v29, v28, v17, v1.h[1] + dsmull_h v26, v27, v19, v1.h[6] + dsmull_h v22, v23, v16, v0.h[0] + drshrn_h v24, v24, v25, #14 + drshrn_h v16, v4, v5, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v6, v30, v31, #14 + drshrn_h v29, v29, v28, #14 + drshrn_h v17, v26, v27, #14 + drshrn_h v28, v22, v23, #14 + + dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3] + dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3] + neg v22.4s, v22.4s + neg v23.4s, v23.4s + drshrn_h v27, v20, v21, #14 + drshrn_h v21, v22, v23, #14 + drshrn_h v23, v18, v19, #14 + drshrn_h v25, v30, v31, #14 + mov v4.16b, v28.16b + mov v5.16b, v28.16b + dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31 + mov v20.16b, v28.16b + idct16_end +endfunc + +function iadst16 + ld1 {v0.8h,v1.8h}, [x11] + + dmbutterfly_l v6, 
v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0 + dmbutterfly_l v10, v11, v8, v9, v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9, v8,v9 = t8 + dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a + dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2 + dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a + + dmbutterfly_l v6, v7, v4, v5, v21, v26, v0.h[7], v0.h[6] // v6,v7 = t11, v4,v5 = t10 + dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a + dmbutterfly_l v10, v11, v8, v9, v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5, v8,v9 = t4 + dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a + + dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12 + dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a + dmbutterfly_l v6, v7, v4, v5, v25, v22, v1.h[3], v1.h[2] // v6,v7 = t7, v4,v5 = t6 + dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a + + dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14 + ld1 {v0.8h}, [x10] + dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a + dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9, v12,v13 = t8 + dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a + + dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[5], v0.h[4] // v4,v5 = t12, v6,v7 = t13 + dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a + dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9 = t10 + butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0 + dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a + + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15 + 
butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1 + dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a + dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a + + butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2 + butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3 + + dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9 = t12 + dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15 + + dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a + dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a + neg v29.8h, v29.8h // v29 = out[13] + + dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9 = t4a + dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a + + butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a + butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10 + + dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6 + neg v19.8h, v19.8h // v19 = out[3] + dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7 + + butterfly_8h v5, v8, v20, v22 // v5 =-out[15],v8 = t3a + butterfly_8h v4, v9, v24, v26 // v4 = out[14],v9 = t11 + + dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8] + dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10] + dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11] + dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9] + + neg v31.8h, v5.8h // v31 = out[15] + neg v17.8h, v3.8h // v17 = out[1] + + mov v16.16b, v2.16b + mov v30.16b, v4.16b + ret +endfunc + +// Helper macros; we can't use these expressions directly within +// e.g. 
.irp due to the extra concatenation \(). Therefore wrap +// them in macros to allow using .irp below. +.macro load i, src, inc + ld1 {v\i\().8h}, [\src], \inc +.endm +.macro store i, dst, inc + st1 {v\i\().8h}, [\dst], \inc +.endm +.macro movi_v i, size, imm + movi v\i\()\size, \imm +.endm +.macro load_clear i, src, inc + ld1 {v\i\().8h}, [\src] + st1 {v2.8h}, [\src], \inc +.endm + +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 + srshr \coef0, \coef0, #6 + ld1 {v2.8b}, [x0], x1 + srshr \coef1, \coef1, #6 + ld1 {v3.8b}, [x3], x1 + srshr \coef2, \coef2, #6 + ld1 {v4.8b}, [x0], x1 + srshr \coef3, \coef3, #6 + uaddw \coef0, \coef0, v2.8b + ld1 {v5.8b}, [x3], x1 + uaddw \coef1, \coef1, v3.8b + srshr \coef4, \coef4, #6 + ld1 {v6.8b}, [x0], x1 + srshr \coef5, \coef5, #6 + ld1 {v7.8b}, [x3], x1 + sqxtun v2.8b, \coef0 + srshr \coef6, \coef6, #6 + sqxtun v3.8b, \coef1 + srshr \coef7, \coef7, #6 + uaddw \coef2, \coef2, v4.8b + ld1 {\tmp1}, [x0], x1 + uaddw \coef3, \coef3, v5.8b + ld1 {\tmp2}, [x3], x1 + sqxtun v4.8b, \coef2 + sub x0, x0, x1, lsl #2 + sub x3, x3, x1, lsl #2 + sqxtun v5.8b, \coef3 + uaddw \coef4, \coef4, v6.8b + st1 {v2.8b}, [x0], x1 + uaddw \coef5, \coef5, v7.8b + st1 {v3.8b}, [x3], x1 + sqxtun v6.8b, \coef4 + st1 {v4.8b}, [x0], x1 + sqxtun v7.8b, \coef5 + st1 {v5.8b}, [x3], x1 + uaddw \coef6, \coef6, \tmp1 + st1 {v6.8b}, [x0], x1 + uaddw \coef7, \coef7, \tmp2 + st1 {v7.8b}, [x3], x1 + sqxtun \tmp1, \coef6 + sqxtun \tmp2, \coef7 + st1 {\tmp1}, [x0], x1 + st1 {\tmp2}, [x3], x1 +.endm + +// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, +// transpose into a horizontal 16x8 slice and store. 
+// x0 = dst (temp buffer) +// x1 = slice offset +// x2 = src +// x9 = input stride +.macro itxfm16_1d_funcs txfm +function \txfm\()16_1d_8x16_pass1_neon + mov x14, x30 + + movi v2.8h, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr + + bl \txfm\()16 + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + cmp x1, #8 + b.eq 1f +.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 + store \i, x0, #16 +.endr + ret x14 +1: + // Special case: For the last input column (x1 == 8), + // which would be stored as the last row in the temp buffer, + // don't store the first 8x8 block, but keep it in registers + // for the first slice of the second pass (where it is the + // last 8x8 block). +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + add x0, x0, #16 + store \i, x0, #16 +.endr + mov v24.16b, v16.16b + mov v25.16b, v17.16b + mov v26.16b, v18.16b + mov v27.16b, v19.16b + mov v28.16b, v20.16b + mov v29.16b, v21.16b + mov v30.16b, v22.16b + mov v31.16b, v23.16b + ret x14 +endfunc + +// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, +// load the destination pixels (from a similar 8x16 slice), add and store back. 
+// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x3 = slice offset; when it is zero, v24-v31 are not reloaded from the temp +// buffer and are used as already present in registers. x3 is then +// reused as the pointer to the second destination row. +// x9 = temp buffer stride +function \txfm\()16_1d_8x16_pass2_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + cbz x3, 1f +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl \txfm\()16 + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + ret x14 +endfunc +.endm + +itxfm16_1d_funcs idct +itxfm16_1d_funcs iadst + +.macro itxfm_func16x16 txfm1, txfm2 +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #1 + b.eq idct16x16_dc_add_neon +.endif + mov x15, x30 + // iadst16 requires clobbering v8-v15, but idct16 doesn't need to. +.ifnc \txfm1\()_\txfm2,idct_idct + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] +.endif + + sub sp, sp, #512 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + movrel x10, idct_coeffs +.ifnc \txfm1\()_\txfm2,idct_idct + movrel x11, iadst16_coeffs +.endif +.ifc \txfm1,idct + ld1 {v0.8h,v1.8h}, [x10] +.endif + mov x9, #32 + +.ifc \txfm1\()_\txfm2,idct_idct + cmp w3, #10 + b.le idct16x16_quarter_add_neon + cmp w3, #38 + b.le idct16x16_half_add_neon +.endif + +.irp i, 0, 8 + add x0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i == 8 + // NOTE(review): w3 <= 38 was already dispatched to + // idct16x16_half_add_neon above, so this branch (and the + // zero-fill path at 1:) looks unreachable - confirm before removing. + cmp w3, #38 + b.le 1f +.endif +.endif + mov x1, #\i + add x2, x6, #(\i*2) + bl \txfm1\()16_1d_8x16_pass1_neon +.endr +.ifc \txfm1\()_\txfm2,iadst_idct + ld1 {v0.8h,v1.8h}, [x10] +.endif + +.ifc \txfm1\()_\txfm2,idct_idct + b 3f +1: + // Set v24-v31 to zero, for the in-register passthrough of + // coefficients to pass 2. Since we only do two slices, this can + // only ever happen for the second slice.
So we only need to store + // zeros to the temp buffer for the second half of the buffer. + // Move x0 to the second half, and use x9 == 32 as increment. + add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + movi_v \i, .16b, #0 + st1 {v24.8h}, [x0], x9 +.endr +3: +.endif + +.irp i, 0, 8 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + mov x3, #\i + bl \txfm2\()16_1d_8x16_pass2_neon +.endr + + add sp, sp, #512 +.ifnc \txfm1\()_\txfm2,idct_idct + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 +.endif + ret x15 +endfunc +.endm + +itxfm_func16x16 idct, idct +itxfm_func16x16 iadst, idct +itxfm_func16x16 idct, iadst +itxfm_func16x16 iadst, iadst + +function idct16_1d_8x16_pass1_quarter_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr + + bl idct16_quarter + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. + // Since only a 4x4 part of the input was nonzero, this means that + // only 4 rows are nonzero after transposing, and the second pass + // only reads the topmost 4 rows. Therefore only store the topmost + // 4 rows. 
+ add x0, x0, #16 +.irp i, 24, 25, 26, 27 + store \i, x0, x9 +.endr + ret x14 +endfunc + +function idct16_1d_8x16_pass2_quarter_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_quarter + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + ret x14 +endfunc + +function idct16_1d_8x16_pass1_half_neon + mov x14, x30 + movi v2.8h, #0 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr + + bl idct16_half + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two + // transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the transposed 8x8 blocks horizontally. + // The first 8x8 block is kept in registers for the second pass, + // store the rest in the temp buffer. 
+ add x0, x0, #16 +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x0, x9 +.endr + ret x14 +endfunc + +function idct16_1d_8x16_pass2_half_neon + mov x14, x30 + cbz x3, 1f +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr +1: + + add x3, x0, x1 + lsl x1, x1, #1 + bl idct16_half + + load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b + load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b + + ret x14 +endfunc + +.macro idct16_partial size +function idct16x16_\size\()_add_neon + add x0, sp, #(0*32) + add x2, x6, #(0*2) + bl idct16_1d_8x16_pass1_\size\()_neon +.irp i, 0, 8 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + mov x3, #\i + bl idct16_1d_8x16_pass2_\size\()_neon +.endr + + add sp, sp, #512 + ret x15 +endfunc +.endm + +idct16_partial quarter +idct16_partial half + +function idct32x32_dc_add_neon + movrel x4, idct_coeffs + ld1 {v0.4h}, [x4] + + movi v1.4h, #0 + + ld1 {v2.h}[0], [x2] + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + smull v2.4s, v2.4h, v0.h[0] + rshrn v2.4h, v2.4s, #14 + dup v2.8h, v2.h[0] + st1 {v1.h}[0], [x2] + + srshr v0.8h, v2.8h, #6 + + mov x3, x0 + mov x4, #32 +1: + // Loop to add the constant v0 into all 32x32 outputs + subs x4, x4, #2 + ld1 {v1.16b,v2.16b}, [x0], x1 + uaddw v16.8h, v0.8h, v1.8b + uaddw2 v17.8h, v0.8h, v1.16b + ld1 {v3.16b,v4.16b}, [x0], x1 + uaddw v18.8h, v0.8h, v2.8b + uaddw2 v19.8h, v0.8h, v2.16b + uaddw v20.8h, v0.8h, v3.8b + uaddw2 v21.8h, v0.8h, v3.16b + uaddw v22.8h, v0.8h, v4.8b + uaddw2 v23.8h, v0.8h, v4.16b + sqxtun v1.8b, v16.8h + sqxtun2 v1.16b, v17.8h + sqxtun v2.8b, v18.8h + sqxtun2 v2.16b, v19.8h + sqxtun v3.8b, v20.8h + sqxtun2 v3.16b, v21.8h + st1 {v1.16b,v2.16b}, [x3], x1 + sqxtun v4.8b, v22.8h + sqxtun2 v4.16b, v23.8h + st1 {v3.16b,v4.16b}, [x3], x1 + b.ne 1b + + ret +endfunc + +.macro idct32_end + butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a + butterfly_8h v17, v20, v23, v20 // 
v17 = t17, v20 = t18 + butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a + butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21 + butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a + butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26 + butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a + butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29 + + dmbutterfly v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a + dmbutterfly v3, v5, v0.h[2], v0.h[3], v24, v25, v30, v31 // v3 = t19, v5 = t28 + dmbutterfly v28, v6, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20 + dmbutterfly v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a + + butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24 + butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a + butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16 + butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a + butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21 + butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a + butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26 + butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20 + + dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20 + dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a + dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22 + dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a + ret +.endm + +function idct32_odd + dmbutterfly v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly v26, 
v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_half + dmbutterfly_h1 v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a + dmbutterfly_h2 v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a + dmbutterfly_h1 v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a + dmbutterfly_h2 v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a + dmbutterfly_h1 v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a + dmbutterfly_h2 v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a + dmbutterfly_h1 v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a + dmbutterfly_h2 v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a + + butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17 + butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18 + butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21 + butterfly_8h v7, v22, v30, v22 // v7 
= t23, v22 = t22 + butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25 + butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26 + butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30 + butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29 + + dmbutterfly v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19 // v23 = t17a, v24 = t30a + dmbutterfly v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a + dmbutterfly v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19 // v21 = t21a, v26 = t26a + dmbutterfly v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a + idct32_end +endfunc + +function idct32_odd_quarter + dsmull_h v4, v5, v16, v8.h[0] + dsmull_h v28, v29, v19, v8.h[7] + dsmull_h v30, v31, v16, v8.h[1] + dsmull_h v22, v23, v17, v9.h[6] + dsmull_h v7, v6, v17, v9.h[7] + dsmull_h v26, v27, v19, v8.h[6] + dsmull_h v20, v21, v18, v9.h[0] + dsmull_h v24, v25, v18, v9.h[1] + + neg v28.4s, v28.4s + neg v29.4s, v29.4s + neg v7.4s, v7.4s + neg v6.4s, v6.4s + + drshrn_h v4, v4, v5, #14 + drshrn_h v5, v28, v29, #14 + drshrn_h v29, v30, v31, #14 + drshrn_h v28, v22, v23, #14 + drshrn_h v7, v7, v6, #14 + drshrn_h v31, v26, v27, #14 + drshrn_h v6, v20, v21, #14 + drshrn_h v30, v24, v25, #14 + + dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5] + dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5] + drshrn_h v23, v16, v17, #14 + drshrn_h v24, v18, v19, #14 + neg v20.4s, v20.4s + neg v21.4s, v21.4s + drshrn_h v27, v27, v26, #14 + drshrn_h v20, v20, v21, #14 + dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7] + drshrn_h v21, v16, v17, #14 + drshrn_h v26, v18, v19, #14 + dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7] + drshrn_h v25, v16, v17, #14 + neg v18.4s, v18.4s + neg v19.4s, v19.4s + drshrn_h v22, v18, v19, #14 + + idct32_end +endfunc + +.macro idct32_funcs suffix +// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix. 
+// The 32-point IDCT can be decomposed into two 16-point IDCTs; +// a normal IDCT16 with every other input component (the even ones, with +// each output written twice), followed by a separate 16-point IDCT +// of the odd inputs, added/subtracted onto the outputs of the first idct16. +// x0 = dst (temp buffer) +// x1 = unused +// x2 = src +// x9 = double input stride +function idct32_1d_8x32_pass1\suffix\()_neon + mov x14, x30 + movi v2.8h, #0 + + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct16\suffix + + // Do two 8x8 transposes. Originally, v16-v31 contain the + // 16 rows. Afterwards, v16-v23 and v24-v31 contain the + // two transposed 8x8 blocks. + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 + + // Store the registers a, b horizontally, followed by the + // same registers b, a mirrored. +.macro store_rev a, b + // There's no rev128 instruction, but we reverse each 64 bit + // half, and then flip them using an ext with 8 bytes offset. 
+ rev64 v3.8h, \b + st1 {\a}, [x0], #16 + rev64 v2.8h, \a + ext v3.16b, v3.16b, v3.16b, #8 + st1 {\b}, [x0], #16 + ext v2.16b, v2.16b, v2.16b, #8 + st1 {v3.8h}, [x0], #16 + st1 {v2.8h}, [x0], #16 +.endm + store_rev v16.8h, v24.8h + store_rev v17.8h, v25.8h + store_rev v18.8h, v26.8h + store_rev v19.8h, v27.8h + store_rev v20.8h, v28.8h + store_rev v21.8h, v29.8h + store_rev v22.8h, v30.8h + store_rev v23.8h, v31.8h + sub x0, x0, #512 +.purgem store_rev + + // Move x2 back to the start of the input, and move + // to the first odd row +.ifb \suffix + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half + sub x2, x2, x9, lsl #3 +.endif + add x2, x2, #64 + + movi v2.8h, #0 + // v16 = IN(1), v17 = IN(3) ... v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load_clear \i, x2, x9 +.endr +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load_clear \i, x2, x9 +.endr +.endif + + bl idct32_odd\suffix + + transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3 + transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3 + + // Store the registers a, b horizontally, + // adding into the output first, and the mirrored, + // subtracted from the output. 
+.macro store_rev a, b + ld1 {v4.8h}, [x0] + rev64 v3.8h, \b + add v4.8h, v4.8h, \a + rev64 v2.8h, \a + st1 {v4.8h}, [x0], #16 + ext v3.16b, v3.16b, v3.16b, #8 + ld1 {v5.8h}, [x0] + ext v2.16b, v2.16b, v2.16b, #8 + add v5.8h, v5.8h, \b + st1 {v5.8h}, [x0], #16 + ld1 {v6.8h}, [x0] + sub v6.8h, v6.8h, v3.8h + st1 {v6.8h}, [x0], #16 + ld1 {v7.8h}, [x0] + sub v7.8h, v7.8h, v2.8h + st1 {v7.8h}, [x0], #16 +.endm + + store_rev v31.8h, v23.8h + store_rev v30.8h, v22.8h + store_rev v29.8h, v21.8h + store_rev v28.8h, v20.8h + store_rev v27.8h, v19.8h + store_rev v26.8h, v18.8h + store_rev v25.8h, v17.8h + store_rev v24.8h, v16.8h +.purgem store_rev + ret x14 +endfunc + +// This is mostly the same as 8x32_pass1, but without the transpose, +// and use the source as temp buffer between the two idct passes, and +// add into the destination. +// x0 = dst +// x1 = dst stride +// x2 = src (temp buffer) +// x7 = negative double temp buffer stride +// x9 = double temp buffer stride +function idct32_1d_8x32_pass2\suffix\()_neon + mov x14, x30 + // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + + bl idct16\suffix + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + store \i, x2, x9 +.endr + + sub x2, x2, x9, lsl #4 + add x2, x2, #64 + + // v16 = IN(1), v17 = IN(3) ... 
v31 = IN(31) +.ifb \suffix +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #4 +.endif +.ifc \suffix,_quarter +.irp i, 16, 17, 18, 19 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #2 +.endif +.ifc \suffix,_half +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + load \i, x2, x9 +.endr + sub x2, x2, x9, lsl #3 +.endif + sub x2, x2, #64 + + bl idct32_odd\suffix + +.macro load_acc_store a, b, c, d, neg=0 +.if \neg == 0 + ld1 {v4.8h}, [x2], x9 + ld1 {v5.8h}, [x2], x9 + add v4.8h, v4.8h, \a + ld1 {v6.8h}, [x2], x9 + add v5.8h, v5.8h, \b + ld1 {v7.8h}, [x2], x9 + add v6.8h, v6.8h, \c + add v7.8h, v7.8h, \d +.else + ld1 {v4.8h}, [x2], x7 + ld1 {v5.8h}, [x2], x7 + sub v4.8h, v4.8h, \a + ld1 {v6.8h}, [x2], x7 + sub v5.8h, v5.8h, \b + ld1 {v7.8h}, [x2], x7 + sub v6.8h, v6.8h, \c + sub v7.8h, v7.8h, \d +.endif + ld1 {v10.8b}, [x0], x1 + ld1 {v11.8b}, [x0], x1 + srshr v4.8h, v4.8h, #6 + ld1 {v2.8b}, [x0], x1 + srshr v5.8h, v5.8h, #6 + uaddw v4.8h, v4.8h, v10.8b + ld1 {v3.8b}, [x0], x1 + srshr v6.8h, v6.8h, #6 + uaddw v5.8h, v5.8h, v11.8b + srshr v7.8h, v7.8h, #6 + sub x0, x0, x1, lsl #2 + uaddw v6.8h, v6.8h, v2.8b + sqxtun v4.8b, v4.8h + uaddw v7.8h, v7.8h, v3.8b + sqxtun v5.8b, v5.8h + st1 {v4.8b}, [x0], x1 + sqxtun v6.8b, v6.8h + st1 {v5.8b}, [x0], x1 + sqxtun v7.8b, v7.8h + st1 {v6.8b}, [x0], x1 + st1 {v7.8b}, [x0], x1 +.endm + load_acc_store v31.8h, v30.8h, v29.8h, v28.8h + load_acc_store v27.8h, v26.8h, v25.8h, v24.8h + load_acc_store v23.8h, v22.8h, v21.8h, v20.8h + load_acc_store v19.8h, v18.8h, v17.8h, v16.8h + sub x2, x2, x9 + load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1 + load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1 + load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1 + load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1 +.purgem load_acc_store + ret x14 +endfunc +.endm + +idct32_funcs +idct32_funcs _quarter +idct32_funcs _half + +const min_eob_idct_idct_32, align=4 + .short 0, 34, 135, 336 +endconst + 
+function ff_vp9_idct_idct_32x32_add_neon, export=1 + cmp w3, #1 + b.eq idct32x32_dc_add_neon + + movrel x10, idct_coeffs + + mov x15, x30 + + stp d8, d9, [sp, #-0x20]! + stp d10, d11, [sp, #0x10] + + sub sp, sp, #2048 + + mov x4, x0 + mov x5, x1 + mov x6, x2 + + // x4/x5/x6 keep the incoming x0/x1/x2 while x0-x2 are reloaded for each + // helper call below. The 2048 bytes just reserved on the stack are the + // temp buffer written by pass 1 and read back by pass 2. + + // Double stride of the input, since we only read every other line + mov x9, #128 + neg x7, x9 + + ld1 {v0.8h,v1.8h}, [x10], #32 + ld1 {v8.8h,v9.8h}, [x10] + + // Small w3 values take the reduced variants below, which finish the + // transform and return through x15 themselves. + // NOTE(review): w3 is presumably the eob value, judging by the + // min_eob_idct_idct_32 table name - confirm against the C prototype. + cmp w3, #34 + b.le idct32x32_quarter_add_neon + cmp w3, #135 + b.le idct32x32_half_add_neon + + movrel x12, min_eob_idct_idct_32, 2 + + // Pass 1: one 8x32 slice per iteration, written to sp + i*64. For i > 0, + // w3 is first compared with the next table entry (x12 starts 2 bytes into + // min_eob_idct_idct_32); if w3 <= entry the remaining slices are skipped + // and zero-filled at 1: instead, with x1 = number of 256-byte fill rounds. +.irp i, 0, 8, 16, 24 + add x0, sp, #(\i*64) +.if \i > 0 + ldrh w1, [x12], #2 + cmp w3, w1 + mov x1, #(32 - \i)/4 + b.le 1f +.endif + add x2, x6, #(\i*2) + bl idct32_1d_8x32_pass1_neon +.endr + b 3f + +1: + // Write zeros to the temp buffer for pass 2 + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 +2: + subs x1, x1, #1 +.rept 4 + st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64 +.endr + b.ne 2b +3: + // Pass 2: four 8-byte-wide column groups of the destination, each fed + // from the matching column offset of the temp buffer. +.irp i, 0, 8, 16, 24 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + bl idct32_1d_8x32_pass2_neon +.endr + + add sp, sp, #2048 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], #0x20 + + ret x15 +endfunc + +// Reduced variants, entered by branch from ff_vp9_idct_idct_32x32_add_neon +// after its prologue: pass 1 runs for one slice (quarter) or two (half), +// then they release the 2048-byte buffer, restore d8-d11 and return via x15. +.macro idct32_partial size +function idct32x32_\size\()_add_neon + add x0, sp, #(0*64) + add x2, x6, #(0*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.ifc \size,half + add x0, sp, #(8*64) + add x2, x6, #(8*2) + bl idct32_1d_8x32_pass1_\size\()_neon +.endif +.irp i, 0, 8, 16, 24 + add x0, x4, #(\i) + mov x1, x5 + add x2, sp, #(\i*2) + bl idct32_1d_8x32_pass2_\size\()_neon +.endr + + add sp, sp, #2048 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], #0x20 + + ret x15 +endfunc +.endm + +idct32_partial quarter +idct32_partial half diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S new file mode 100644 index 0000000000..e3e70491c6 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S @@ -0,0 +1,861 @@ +/* + * 
Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + + +// The input to and output from this macro is in the registers v16-v31, +// and v0-v7 are used as scratch registers. +// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31 +// Depending on the width of the loop filter, we either use v16-v19 +// and v28-v31 as temp registers, or v8-v15. 
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 + dup v0.8h, w2 // E + dup v2.8h, w3 // I + dup v3.8h, w4 // H + + uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2) + uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1) + uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1) + uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2) + uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3) + umax v4.8h, v4.8h, v5.8h + umax v5.8h, v6.8h, v7.8h + umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h + uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0) + umax v4.8h, v4.8h, v5.8h + add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2 + uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1) + umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3)) + ushr v5.8h, v5.8h, #1 + cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I + add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + cmhs v6.8h, v0.8h, v6.8h + and v4.16b, v4.16b, v6.16b // fm + + // If no pixels need filtering, just exit as soon as possible + mov x11, v4.d[0] + mov x12, v4.d[1] + adds x11, x11, x12 + b.ne 1f + ret x10 +1: + +.if \wd >= 8 + dup v0.8h, w5 + + uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0) + uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) + uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0) + uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0) + uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0) + umax v6.8h, v6.8h, v2.8h + umax v1.8h, v1.8h, \tmp1\().8h + umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h +.if \wd == 16 + uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0) + umax v6.8h, v6.8h, v1.8h + uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0) + umax v6.8h, v6.8h, \tmp2\().8h + uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0) + cmhs v6.8h, v0.8h, v6.8h // flat8in + uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0) + and v6.16b, v6.16b, v4.16b // flat8in && fm + uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0) + bic v4.16b, v4.16b, v6.16b // fm && !flat8in + uabd v10.8h, v29.8h, v24.8h // abs(q5 - 
q0) + uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0) + uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0) + + umax v7.8h, v7.8h, v2.8h + umax v1.8h, v1.8h, v8.8h + umax v9.8h, v9.8h, v10.8h + umax v11.8h, v11.8h, v12.8h + // The rest of the calculation of flat8out is interleaved below +.else + // The rest of the calculation of flat8in is interleaved below +.endif +.endif + + // Calculate the normal inner loop filter for 2 or 4 pixels + uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0) +.if \wd == 16 + umax v7.8h, v7.8h, v1.8h + umax v9.8h, v9.8h, v11.8h +.elseif \wd == 8 + umax v6.8h, v6.8h, v1.8h +.endif + uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0) +.if \wd == 16 + umax v7.8h, v7.8h, v9.8h +.elseif \wd == 8 + umax v6.8h, v6.8h, \tmp2\().8h +.endif + dup \tmp2\().8h, w6 // left shift for saturation + sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1 + neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation + umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0)) + sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0 + movi \tmp5\().8h, #3 +.if \wd == 8 + cmhs v6.8h, v0.8h, v6.8h // flat8in +.endif + cmhs v5.8h, v3.8h, v5.8h // !hev +.if \wd == 8 + and v6.16b, v6.16b, v4.16b // flat8in && fm +.endif + sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h +.if \wd == 16 + cmhs v7.8h, v0.8h, v7.8h // flat8out +.elseif \wd == 8 + bic v4.16b, v4.16b, v6.16b // fm && !flat8in +.endif + and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in +.if \wd == 16 + and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm +.endif + sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1) + + mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0) + bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int2p = 0 + movi v2.8h, #4 + add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)] + movi v3.8h, #3 + sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h + movi \tmp5\().8h, #0 + sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ 
av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f + dup \tmp6\().8h, w7 // max pixel value +.if \wd == 16 + bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out +.endif + + ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1 + + add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4 + add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3 + smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) + smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) + sshr \tmp3\().8h, \tmp3\().8h, #3 // f1 + sshr \tmp4\().8h, \tmp4\().8h, #3 // f2 + + add v0.8h, v23.8h, \tmp4\().8h // p0 + f2 + sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1 + smin v0.8h, v0.8h, \tmp6\().8h + smin v2.8h, v2.8h, \tmp6\().8h + srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1 + smax v0.8h, v0.8h, \tmp5\().8h // out p0 + smax v2.8h, v2.8h, \tmp5\().8h // out q0 + bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in) + bit v24.16b, v2.16b, v4.16b + + add v0.8h, v22.8h, \tmp3\().8h // p1 + f + sub v2.8h, v25.8h, \tmp3\().8h // q1 - f +.if \wd >= 8 + mov x11, v6.d[0] +.endif + smin v0.8h, v0.8h, \tmp6\().8h + smin v2.8h, v2.8h, \tmp6\().8h +.if \wd >= 8 + mov x12, v6.d[1] +.endif + smax v0.8h, v0.8h, \tmp5\().8h // out p1 + smax v2.8h, v2.8h, \tmp5\().8h // out q1 +.if \wd >= 8 + adds x11, x11, x12 +.endif + bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in) + bit v25.16b, v2.16b, v5.16b + + // If no pixels need flat8in, jump to flat8out + // (or to a writeout of the inner 4 pixels, for wd=8) +.if \wd >= 8 +.if \wd == 16 + b.eq 6f +.else + b.ne 1f + ret x13 +1: +.endif + + // flat8in + add \tmp1\().8h, v20.8h, v21.8h + add \tmp3\().8h, v22.8h, v25.8h + add \tmp5\().8h, v20.8h, v22.8h + add \tmp7\().8h, v23.8h, v26.8h + add v0.8h, \tmp1\().8h, \tmp1\().8h + add v0.8h, v0.8h, v23.8h + add v0.8h, v0.8h, v24.8h + add v0.8h, v0.8h, \tmp5\().8h + sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h + sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h + urshr v2.8h, 
v0.8h, #3 // out p2 + + add v0.8h, v0.8h, \tmp3\().8h + add \tmp1\().8h, v20.8h, v23.8h + add \tmp3\().8h, v24.8h, v27.8h + urshr v3.8h, v0.8h, #3 // out p1 + + add v0.8h, v0.8h, \tmp7\().8h + sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h + add \tmp5\().8h, v21.8h, v24.8h + add \tmp7\().8h, v25.8h, v27.8h + urshr v4.8h, v0.8h, #3 // out p0 + + add v0.8h, v0.8h, \tmp3\().8h + sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h + add \tmp1\().8h, v22.8h, v25.8h + add \tmp3\().8h, v26.8h, v27.8h + urshr v5.8h, v0.8h, #3 // out q0 + + add v0.8h, v0.8h, \tmp7\().8h + sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h + urshr \tmp5\().8h, v0.8h, #3 // out q1 + + add v0.8h, v0.8h, \tmp3\().8h + // The output here is written back into the input registers. This doesn't + // matter for the flat8out part below, since we only update those pixels + // which won't be touched below. + bit v21.16b, v2.16b, v6.16b + bit v22.16b, v3.16b, v6.16b + bit v23.16b, v4.16b, v6.16b + urshr \tmp6\().8h, v0.8h, #3 // out q2 + bit v24.16b, v5.16b, v6.16b + bit v25.16b, \tmp5\().16b, v6.16b + bit v26.16b, \tmp6\().16b, v6.16b +.endif +.if \wd == 16 +6: + orr v2.16b, v6.16b, v7.16b + mov x11, v2.d[0] + mov x12, v2.d[1] + adds x11, x11, x12 + b.ne 1f + // If no pixels needed flat8in nor flat8out, jump to a + // writeout of the inner 4 pixels + ret x14 +1: + + mov x11, v7.d[0] + mov x12, v7.d[1] + adds x11, x11, x12 + b.ne 1f + // If no pixels need flat8out, jump to a writeout of the inner 6 pixels + ret x15 + +1: + // flat8out + // This writes all outputs into v2-v17 (skipping v7 and v16). + // If this part is skipped, the output is read from v21-v26 (which is the input + // to this section). 
+ shl v0.8h, v16.8h, #3 // 8 * v16 + sub v0.8h, v0.8h, v16.8h // 7 * v16 + add v0.8h, v0.8h, v17.8h + add v8.8h, v17.8h, v18.8h + add v10.8h, v19.8h, v20.8h + add v0.8h, v0.8h, v8.8h + add v8.8h, v16.8h, v17.8h + add v12.8h, v21.8h, v22.8h + add v0.8h, v0.8h, v10.8h + add v10.8h, v18.8h, v25.8h + add v14.8h, v23.8h, v24.8h + sub v10.8h, v10.8h, v8.8h + add v0.8h, v0.8h, v12.8h + add v0.8h, v0.8h, v14.8h + add v12.8h, v16.8h, v18.8h + add v14.8h, v19.8h, v26.8h + urshr v2.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + add v8.8h, v16.8h, v19.8h + add v10.8h, v20.8h, v27.8h + sub v14.8h, v14.8h, v12.8h + bif v2.16b, v17.16b, v7.16b + urshr v3.8h , v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v12.8h, v16.8h, v20.8h + add v14.8h, v21.8h, v28.8h + sub v10.8h, v10.8h, v8.8h + bif v3.16b, v18.16b, v7.16b + urshr v4.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + add v8.8h, v16.8h, v21.8h + add v10.8h, v22.8h, v29.8h + sub v14.8h, v14.8h, v12.8h + bif v4.16b, v19.16b, v7.16b + urshr v5.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v12.8h, v16.8h, v22.8h + add v14.8h, v23.8h, v30.8h + sub v10.8h, v10.8h, v8.8h + bif v5.16b, v20.16b, v7.16b + urshr v6.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + add v10.8h, v16.8h, v23.8h + sub v14.8h, v14.8h, v12.8h + add v12.8h, v24.8h, v31.8h + bif v6.16b, v21.16b, v7.16b + urshr v8.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + sub v10.8h, v12.8h, v10.8h + add v12.8h, v17.8h, v24.8h + add v14.8h, v25.8h, v31.8h + bif v8.16b, v22.16b, v7.16b + urshr v9.8h, v0.8h, #4 + + add v0.8h, v0.8h, v10.8h + sub v14.8h, v14.8h, v12.8h + add v12.8h, v26.8h, v31.8h + bif v9.16b, v23.16b, v7.16b + urshr v10.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v14.8h, v18.8h, v25.8h + add v18.8h, v19.8h, v26.8h + sub v12.8h, v12.8h, v14.8h + add v14.8h, v27.8h, v31.8h + bif v10.16b, v24.16b, v7.16b + urshr v11.8h, v0.8h, #4 + + add v0.8h, v0.8h, v12.8h + add v12.8h, v20.8h, v27.8h + sub v14.8h, v14.8h, v18.8h + add v18.8h, v28.8h, v31.8h + bif v11.16b, v25.16b, 
v7.16b + sub v18.8h, v18.8h, v12.8h + urshr v12.8h, v0.8h, #4 + + add v0.8h, v0.8h, v14.8h + add v14.8h, v21.8h, v28.8h + add v20.8h, v29.8h, v31.8h + bif v12.16b, v26.16b, v7.16b + urshr v13.8h, v0.8h, #4 + + add v0.8h, v0.8h, v18.8h + sub v20.8h, v20.8h, v14.8h + add v18.8h, v22.8h, v29.8h + add v22.8h, v30.8h, v31.8h + bif v13.16b, v27.16b, v7.16b + urshr v14.8h, v0.8h, #4 + + add v0.8h, v0.8h, v20.8h + sub v22.8h, v22.8h, v18.8h + bif v14.16b, v28.16b, v7.16b + urshr v15.8h, v0.8h, #4 + + add v0.8h, v0.8h, v22.8h + bif v15.16b, v29.16b, v7.16b + urshr v17.8h, v0.8h, #4 + bif v17.16b, v30.16b, v7.16b +.endif +.endm + +// For wd <= 8, we use v16-v19 and v28-v31 for temp registers, +// while we need those for inputs/outputs in wd=16 and use v8-v15 +// for temp registers there instead. +function vp9_loop_filter_4 + loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31 + ret +endfunc + +function vp9_loop_filter_8 + loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31 + ret +endfunc + +function vp9_loop_filter_16 + loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15 + ret +endfunc + +.macro loop_filter_4 + bl vp9_loop_filter_4 +.endm + +.macro loop_filter_8 + // calculate alternative 'return' targets + adr x13, 6f + bl vp9_loop_filter_8 +.endm + +.macro loop_filter_16 + // calculate alternative 'return' targets + adr x14, 7f + adr x15, 8f + bl vp9_loop_filter_16 +.endm + + +// The public functions in this file have got the following signature: +// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr); + +.macro bpp_frontend func, bpp, push +function ff_\func\()_\bpp\()_neon, export=1 +.if \push + mov x16, x30 + stp d8, d9, [sp, #-0x40]! 
+ stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] +.endif + lsl w2, w2, #\bpp - 8 + lsl w3, w3, #\bpp - 8 + lsl w4, w4, #\bpp - 8 + mov x5, #1 << (\bpp - 8) + mov x6, #16 - \bpp + mov x7, #((1 << \bpp) - 1) +.if \push + bl \func\()_16_neon + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 + ret x16 +.else + b \func\()_16_neon +.endif +endfunc +.endm + +.macro bpp_frontends func, push=0 + bpp_frontend \func, 10, \push + bpp_frontend \func, 12, \push +.endm + +.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push +function ff_\func\()_\suffix\()_\bpp\()_neon, export=1 + mov x16, x30 +.if \push + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] +.endif + lsl w2, w2, #\bpp - 8 + lsl w3, w3, #\bpp - 8 + lsl w4, w4, #\bpp - 8 + mov x5, #1 << (\bpp - 8) + mov x6, #16 - \bpp + mov x7, #((1 << \bpp) - 1) + bl \func\()_\int_suffix\()_16_neon +.ifc \dir,h + add x0, x0, x1, lsl #3 +.else + add x0, x0, #16 +.endif + bl \func\()_\int_suffix\()_16_neon +.if \push + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], 0x40 +.endif + ret x16 +endfunc +.endm + +.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0 + bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push + bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push +.endm + +.macro bpp_frontend_mix2 wd1, wd2, dir, bpp +function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1 + mov x16, x30 + lsr w8, w2, #8 + lsr w14, w3, #8 + lsr w15, w4, #8 + and w2, w2, #0xff + and w3, w3, #0xff + and w4, w4, #0xff + lsl w2, w2, #\bpp - 8 + lsl w3, w3, #\bpp - 8 + lsl w4, w4, #\bpp - 8 + mov x5, #1 << (\bpp - 8) + mov x6, #16 - \bpp + mov x7, #((1 << \bpp) - 1) + bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon +.ifc \dir,h + add x0, x0, x1, lsl #3 +.else + add x0, x0, #16 +.endif + lsl 
w2, w8, #\bpp - 8 + lsl w3, w14, #\bpp - 8 + lsl w4, w15, #\bpp - 8 + bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon + ret x16 +endfunc +.endm + +.macro bpp_frontends_mix2 wd1, wd2 + bpp_frontend_mix2 \wd1, \wd2, v, 10 + bpp_frontend_mix2 \wd1, \wd2, v, 12 + bpp_frontend_mix2 \wd1, \wd2, h, 10 + bpp_frontend_mix2 \wd1, \wd2, h, 12 +.endm + +function vp9_loop_filter_v_4_8_16_neon + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 // p3 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v21.8h}, [x9], x1 // p2 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v22.8h}, [x9], x1 // p1 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v23.8h}, [x9], x1 // p0 + ld1 {v27.8h}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + sub x9, x9, x1, lsl #1 + + loop_filter_4 + + st1 {v22.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + + ret x10 +endfunc + +bpp_frontends vp9_loop_filter_v_4_8 + +function vp9_loop_filter_h_4_8_16_neon + mov x10, x30 + sub x9, x0, #8 + add x0, x9, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x9], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x9], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x9], x1 + ld1 {v27.8h}, [x0], x1 + + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + + transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_4 + + // Move x9 forward by 2 pixels; we don't need to rewrite the + // outermost 2 pixels since they aren't changed. + add x9, x9, #4 + add x0, x9, x1, lsl #2 + + // We only will write the mid 4 pixels back; after the loop filter, + // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels). + // We need to transpose them to columns, done with a 4x8 transpose + // (which in practice is two 4x4 transposes of the two 4x4 halves + // of the 8x4 pixels; into 4x8 pixels). 
+ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.d}[0], [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x9], x1 + st1 {v25.d}[1], [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #4 + + ret x10 +endfunc + +bpp_frontends vp9_loop_filter_h_4_8 + +function vp9_loop_filter_v_8_8_16_neon + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 // p3 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v21.8h}, [x9], x1 // p2 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v22.8h}, [x9], x1 // p1 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v23.8h}, [x9], x1 // p0 + ld1 {v27.8h}, [x0], x1 // q3 + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #2 + add x9, x9, x1 + + loop_filter_8 + + st1 {v21.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + + ret x10 +6: + sub x9, x0, x1, lsl #1 + st1 {v22.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + ret x10 +endfunc + +bpp_frontends vp9_loop_filter_v_8_8 + +function vp9_loop_filter_h_8_8_16_neon + mov x10, x30 + sub x9, x0, #8 + add x0, x9, x1, lsl #2 + ld1 {v20.8h}, [x9], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x9], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x9], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x9], x1 + ld1 {v27.8h}, [x0], x1 + + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + + transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_8 + + add x0, x9, x1, lsl #2 + + // Even though only 6 pixels per row have been changed, we write the + // full 8 pixel registers. 
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v27.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + + ret x10 +6: + // If we didn't need to do the flat8in part, we use the same writeback + // as in loop_filter_h_4_8. + add x9, x9, #4 + add x0, x9, x1, lsl #2 + transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.d}[0], [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x9], x1 + st1 {v25.d}[1], [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #4 + ret x10 +endfunc + +bpp_frontends vp9_loop_filter_h_8_8 + +bpp_frontends_mix2 4, 4 +bpp_frontends_mix2 4, 8 +bpp_frontends_mix2 8, 4 +bpp_frontends_mix2 8, 8 + +function vp9_loop_filter_v_16_8_16_neon + mov x10, x30 + sub x9, x0, x1, lsl #3 + ld1 {v16.8h}, [x9], x1 // p7 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v17.8h}, [x9], x1 // p6 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v18.8h}, [x9], x1 // p5 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v19.8h}, [x9], x1 // p4 + ld1 {v27.8h}, [x0], x1 // q3 + ld1 {v20.8h}, [x9], x1 // p3 + ld1 {v28.8h}, [x0], x1 // q4 + ld1 {v21.8h}, [x9], x1 // p2 + ld1 {v29.8h}, [x0], x1 // q5 + ld1 {v22.8h}, [x9], x1 // p1 + ld1 {v30.8h}, [x0], x1 // q6 + ld1 {v23.8h}, [x9], x1 // p0 + ld1 {v31.8h}, [x0], x1 // q7 + sub x9, x9, x1, lsl #3 + sub x0, x0, x1, lsl #3 + add x9, x9, x1 + + loop_filter_16 + + // If we did the flat8out part, we get the output in + // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride, + // store v2-v9 there, and v10-v17 into x0. 
+ st1 {v2.8h}, [x9], x1 + st1 {v10.8h}, [x0], x1 + st1 {v3.8h}, [x9], x1 + st1 {v11.8h}, [x0], x1 + st1 {v4.8h}, [x9], x1 + st1 {v12.8h}, [x0], x1 + st1 {v5.8h}, [x9], x1 + st1 {v13.8h}, [x0], x1 + st1 {v6.8h}, [x9], x1 + st1 {v14.8h}, [x0], x1 + st1 {v8.8h}, [x9], x1 + st1 {v15.8h}, [x0], x1 + st1 {v9.8h}, [x9], x1 + st1 {v17.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, x1 + + ret x10 +8: + add x9, x9, x1, lsl #2 + // If we didn't do the flat8out part, the output is left in the + // input registers. + st1 {v21.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + ret x10 +7: + sub x9, x0, x1, lsl #1 + st1 {v22.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + sub x0, x0, x1, lsl #1 + ret x10 +endfunc + +bpp_frontends vp9_loop_filter_v_16_8, push=1 +bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1 + +function vp9_loop_filter_h_16_8_16_neon + mov x10, x30 + sub x9, x0, #16 + ld1 {v16.8h}, [x9], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v17.8h}, [x9], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v18.8h}, [x9], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v19.8h}, [x9], x1 + ld1 {v27.8h}, [x0], x1 + ld1 {v20.8h}, [x9], x1 + ld1 {v28.8h}, [x0], x1 + ld1 {v21.8h}, [x9], x1 + ld1 {v29.8h}, [x0], x1 + ld1 {v22.8h}, [x9], x1 + ld1 {v30.8h}, [x0], x1 + ld1 {v23.8h}, [x9], x1 + ld1 {v31.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + sub x9, x9, x1, lsl #3 + + // The 16x8 pixels read above is in two 8x8 blocks; the left + // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes + // of this, to get one column per register. 
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + loop_filter_16 + + transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1 + transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1 + + st1 {v16.8h}, [x9], x1 + st1 {v10.8h}, [x0], x1 + st1 {v2.8h}, [x9], x1 + st1 {v11.8h}, [x0], x1 + st1 {v3.8h}, [x9], x1 + st1 {v12.8h}, [x0], x1 + st1 {v4.8h}, [x9], x1 + st1 {v13.8h}, [x0], x1 + st1 {v5.8h}, [x9], x1 + st1 {v14.8h}, [x0], x1 + st1 {v6.8h}, [x9], x1 + st1 {v15.8h}, [x0], x1 + st1 {v8.8h}, [x9], x1 + st1 {v17.8h}, [x0], x1 + st1 {v9.8h}, [x9], x1 + st1 {v31.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + + ret x10 +8: + // The same writeback as in loop_filter_h_8_8 + sub x9, x0, #8 + add x0, x9, x1, lsl #2 + transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8h}, [x9], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x9], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x9], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x9], x1 + st1 {v27.8h}, [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + ret x10 +7: + // The same writeback as in loop_filter_h_4_8 + sub x9, x0, #4 + add x0, x9, x1, lsl #2 + transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.d}[0], [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x9], x1 + st1 {v25.d}[1], [x0], x1 + sub x0, x0, x1, lsl #3 + add x0, x0, #4 + ret x10 +endfunc + +bpp_frontends vp9_loop_filter_h_16_8, push=1 +bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1 diff --git a/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S new file mode 100644 index 0000000000..9a79f48df3 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S @@ -0,0 +1,1334 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + + +// The main loop filter macro is templated and can produce filters for +// vectors of 8 or 16 bytes. The register mapping throughout the filter +// is close to identical to the arm version (please try to maintain this, +// if either is changed!). When the arm version uses e.g. d20 for the +// input variable p3, the aarch64 version uses v20.8b or v20.16b, depending +// on vector length. +// +// The number of elements in the vector is passed in via the macro parameter +// \sz, which is either .8b or .16b. For simple instructions that don't +// lengthen or narrow things, this can easily be templated like this: +// uabd v4\sz, v20\sz, v21\sz +// +// For instructions that lengthen or narrow content, the arm version would +// have used q registers. For these instructions, we have macros that expand +// into either a single e.g. uaddl instruction, or into a uaddl + uaddl2 +// pair, depending on the \sz parameter. Wherever the arm version would have +// used a q register, these macros instead take two v registers, i.e. q3 +// is mapped to v6+v7. 
For the case with 8 byte input vectors, such a +// lengthening operation is only stored in v6.8h (what was in q3 in the arm +// case), while the 16 byte input vectors will use v6.8h + v7.8h. +// Such a macro invocation would look like this: +// uaddl_sz v8.8h, v9.8h, v17, v18, \sz +// +// That is, in the 8 byte input vector case, the second register in these +// register pairs will be unused. +// Unfortunately, this makes the code quite hard to read. For readability, +// see the arm version instead. + + +// Each *_sz helper macro below emits its first instruction unconditionally; +// the second one (producing \dst2, or the .16b form of \dst for the +// narrowing macros) is assembled only when \sz is .16b. +.macro add_sz dst1, dst2, in1, in2, in3, in4, sz + add \dst1, \in1, \in3 +.ifc \sz, .16b + add \dst2, \in2, \in4 +.endif +.endm + +.macro sub_sz dst1, dst2, in1, in2, in3, in4, sz + sub \dst1, \in1, \in3 +.ifc \sz, .16b + sub \dst2, \in2, \in4 +.endif +.endm + +.macro uaddw_sz dst1, dst2, in1, in2, in3, sz + uaddw \dst1, \in1, \in3\().8b +.ifc \sz, .16b + uaddw2 \dst2, \in2, \in3\().16b +.endif +.endm + +.macro usubw_sz dst1, dst2, in1, in2, in3, sz + usubw \dst1, \in1, \in3\().8b +.ifc \sz, .16b + usubw2 \dst2, \in2, \in3\().16b +.endif +.endm + +.macro usubl_sz dst1, dst2, in1, in2, sz + usubl \dst1, \in1\().8b, \in2\().8b +.ifc \sz, .16b + usubl2 \dst2, \in1\().16b, \in2\().16b +.endif +.endm + +.macro sqxtn_sz dst, in1, in2, sz + sqxtn \dst\().8b, \in1 +.ifc \sz, .16b + sqxtn2 \dst\().16b, \in2 +.endif +.endm + +.macro sqxtun_sz dst, in1, in2, sz + sqxtun \dst\().8b, \in1 +.ifc \sz, .16b + sqxtun2 \dst\().16b, \in2 +.endif +.endm + +.macro mul_sz dst1, dst2, in1, in2, in3, in4, sz + mul \dst1, \in1, \in3 +.ifc \sz, .16b + mul \dst2, \in2, \in4 +.endif +.endm + +.macro saddw_sz dst1, dst2, in1, in2, in3, sz + saddw \dst1, \in1, \in3\().8b +.ifc \sz, .16b + saddw2 \dst2, \in2, \in3\().16b +.endif +.endm + +.macro ssubw_sz dst1, dst2, in1, in2, in3, sz + ssubw \dst1, \in1, \in3\().8b +.ifc \sz, .16b + ssubw2 \dst2, \in2, \in3\().16b +.endif +.endm + +.macro uxtl_sz dst1, dst2, in, sz + uxtl \dst1, \in\().8b +.ifc \sz, .16b + uxtl2 \dst2, \in\().16b +.endif +.endm 
+ +.macro uaddl_sz dst1, dst2, in1, in2, sz + uaddl \dst1, \in1\().8b, \in2\().8b +.ifc \sz, .16b + uaddl2 \dst2, \in1\().16b, \in2\().16b +.endif +.endm + +.macro rshrn_sz dst, in1, in2, shift, sz + rshrn \dst\().8b, \in1, \shift +.ifc \sz, .16b + rshrn2 \dst\().16b, \in2, \shift +.endif +.endm + +.macro ushll_sz dst1, dst2, in, shift, sz + ushll \dst1, \in\().8b, \shift +.ifc \sz, .16b + ushll2 \dst2, \in\().16b, \shift +.endif +.endm + +// The input to and output from this macro is in the registers v16-v31, +// and v0-v7 are used as scratch registers. +// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31 +// Depending on the width of the loop filter, we either use v16-v19 +// and v28-v31 as temp registers, or v8-v15. +// When comparing to the arm version, tmpq1 == tmp1 + tmp2, +// tmpq2 == tmp3 + tmp4, etc. +.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 +.if \mix == 0 + dup v0\sz, w2 // E + dup v2\sz, w3 // I + dup v3\sz, w4 // H +.else + dup v0.8h, w2 // E + dup v2.8h, w3 // I + dup v3.8h, w4 // H + rev16 v1.16b, v0.16b // E + rev16 v4.16b, v2.16b // I + rev16 v5.16b, v3.16b // H + uzp1 v0.16b, v0.16b, v1.16b + uzp1 v2.16b, v2.16b, v4.16b + uzp1 v3.16b, v3.16b, v5.16b +.endif + + uabd v4\sz, v20\sz, v21\sz // abs(p3 - p2) + uabd v5\sz, v21\sz, v22\sz // abs(p2 - p1) + uabd v6\sz, v22\sz, v23\sz // abs(p1 - p0) + uabd v7\sz, v24\sz, v25\sz // abs(q0 - q1) + uabd \tmp1\sz, v25\sz, v26\sz // abs(q1 - q2) + uabd \tmp2\sz, v26\sz, v27\sz // abs(q2 - q3) + umax v4\sz, v4\sz, v5\sz + umax v5\sz, v6\sz, v7\sz + umax \tmp1\sz, \tmp1\sz, \tmp2\sz + uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0) + umax v4\sz, v4\sz, v5\sz + uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2 + uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1) + umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3)) + ushr v5\sz, v5\sz, #1 + cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I + uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 + cmhs v5\sz, 
v0\sz, v6\sz + and v4\sz, v4\sz, v5\sz // fm + + // If no pixels need filtering, just exit as soon as possible + mov x5, v4.d[0] +.ifc \sz, .16b + mov x6, v4.d[1] + adds x5, x5, x6 + b.eq 9f +.else + cbz x5, 9f +.endif + +.if \wd >= 8 + movi v0\sz, #1 + + uabd v6\sz, v20\sz, v23\sz // abs(p3 - p0) + uabd v2\sz, v21\sz, v23\sz // abs(p2 - p0) + uabd v1\sz, v22\sz, v23\sz // abs(p1 - p0) + uabd \tmp1\sz, v25\sz, v24\sz // abs(q1 - q0) + uabd \tmp2\sz, v26\sz, v24\sz // abs(q2 - q0) + uabd \tmp3\sz, v27\sz, v24\sz // abs(q3 - q0) + umax v6\sz, v6\sz, v2\sz + umax v1\sz, v1\sz, \tmp1\sz + umax \tmp2\sz, \tmp2\sz, \tmp3\sz +.if \wd == 16 + uabd v7\sz, v16\sz, v23\sz // abs(p7 - p0) + umax v6\sz, v6\sz, v1\sz + uabd v2\sz, v17\sz, v23\sz // abs(p6 - p0) + umax v6\sz, v6\sz, \tmp2\sz + uabd v1\sz, v18\sz, v23\sz // abs(p5 - p0) + cmhs v6\sz, v0\sz, v6\sz // flat8in + uabd v8\sz, v19\sz, v23\sz // abs(p4 - p0) + and v6\sz, v6\sz, v4\sz // flat8in && fm + uabd v9\sz, v28\sz, v24\sz // abs(q4 - q0) + bic v4\sz, v4\sz, v6\sz // fm && !flat8in + uabd v10\sz, v29\sz, v24\sz // abs(q5 - q0) + uabd v11\sz, v30\sz, v24\sz // abs(q6 - q0) + uabd v12\sz, v31\sz, v24\sz // abs(q7 - q0) + + umax v7\sz, v7\sz, v2\sz + umax v1\sz, v1\sz, v8\sz + umax v9\sz, v9\sz, v10\sz + umax v11\sz, v11\sz, v12\sz + // The rest of the calculation of flat8out is interleaved below +.else + // The rest of the calculation of flat8in is interleaved below +.endif +.endif + + // Calculate the normal inner loop filter for 2 or 4 pixels + uabd v5\sz, v22\sz, v23\sz // abs(p1 - p0) +.if \wd == 16 + umax v7\sz, v7\sz, v1\sz + umax v9\sz, v9\sz, v11\sz +.elseif \wd == 8 + umax v6\sz, v6\sz, v1\sz +.endif + uabd v1\sz, v25\sz, v24\sz // abs(q1 - q0) +.if \wd == 16 + umax v7\sz, v7\sz, v9\sz +.elseif \wd == 8 + umax v6\sz, v6\sz, \tmp2\sz +.endif + usubl_sz \tmp1\().8h, \tmp2\().8h, v22, v25, \sz // p1 - q1 + umax v5\sz, v5\sz, v1\sz // max(abs(p1 - p0), abs(q1 - q0)) +.if \mix != 0 + mov v1.d[0], x11 +.endif + 
usubl_sz \tmp3\().8h, \tmp4\().8h, v24, v23, \sz // q0 - p0 + movi \tmp5\().8h, #3 +.if \wd == 8 + cmhs v6\sz, v0\sz, v6\sz // flat8in +.endif +.if \mix != 0 + sxtl v1.8h, v1.8b +.endif + cmhs v5\sz, v3\sz, v5\sz // !hev +.if \wd == 8 + // If a 4/8 or 8/4 mix is used, clear the relevant half of v6 +.if \mix != 0 + and v6\sz, v6\sz, v1.16b +.endif + and v6\sz, v6\sz, v4\sz // flat8in && fm +.endif + sqxtn_sz \tmp1, \tmp1\().8h, \tmp2\().8h, \sz // av_clip_int8(p1 - q1) +.if \wd == 16 + cmhs v7\sz, v0\sz, v7\sz // flat8out +.elseif \wd == 8 + bic v4\sz, v4\sz, v6\sz // fm && !flat8in +.endif + and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in +.if \wd == 16 + and v7\sz, v7\sz, v6\sz // flat8out && flat8in && fm +.endif + + mul_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0) + bic \tmp1\sz, \tmp1\sz, v5\sz // if (!hev) av_clip_int8 = 0 + movi v2\sz, #4 + saddw_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1, \sz // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)] + movi v3\sz, #3 + sqxtn_sz \tmp1, \tmp3\().8h, \tmp4\().8h, \sz // f +.if \wd == 16 + bic v6\sz, v6\sz, v7\sz // fm && flat8in && !flat8out +.endif + + sqadd \tmp3\sz, \tmp1\sz, v2\sz // FFMIN(f + 4, 127) + sqadd \tmp4\sz, \tmp1\sz, v3\sz // FFMIN(f + 3, 127) + uxtl_sz v0.8h, v1.8h, v23, \sz // p0 + sshr \tmp3\sz, \tmp3\sz, #3 // f1 + sshr \tmp4\sz, \tmp4\sz, #3 // f2 + + uxtl_sz v2.8h, v3.8h, v24, \sz // q0 + saddw_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp4, \sz // p0 + f2 + ssubw_sz v2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q0 - f1 + sqxtun_sz v0, v0.8h, v1.8h, \sz // out p0 + sqxtun_sz v1, v2.8h, v3.8h, \sz // out q0 + srshr \tmp3\sz, \tmp3\sz, #1 // f = (f1 + 1) >> 1 + bit v23\sz, v0\sz, v4\sz // if (fm && !flat8in) + bit v24\sz, v1\sz, v4\sz + + uxtl_sz v0.8h, v1.8h, v22, \sz // p1 + uxtl_sz v2.8h, v3.8h, v25, \sz // q1 +.if \wd >= 8 + mov x5, v6.d[0] +.ifc \sz, .16b + mov x6, v6.d[1] +.endif +.endif + saddw_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3, \sz // 
p1 + f + ssubw_sz v2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q1 - f + sqxtun_sz v0, v0.8h, v1.8h, \sz // out p1 + sqxtun_sz v2, v2.8h, v3.8h, \sz // out q1 +.if \wd >= 8 +.ifc \sz, .16b + adds x5, x5, x6 +.endif +.endif + bit v22\sz, v0\sz, v5\sz // if (!hev && fm && !flat8in) + bit v25\sz, v2\sz, v5\sz + + // If no pixels need flat8in, jump to flat8out + // (or to a writeout of the inner 4 pixels, for wd=8) +.if \wd >= 8 +.ifc \sz, .16b + b.eq 6f +.else + cbz x5, 6f +.endif + + // flat8in + uaddl_sz \tmp1\().8h, \tmp2\().8h, v20, v21, \sz + uaddl_sz \tmp3\().8h, \tmp4\().8h, v22, v25, \sz + uaddl_sz \tmp5\().8h, \tmp6\().8h, v20, v22, \sz + uaddl_sz \tmp7\().8h, \tmp8\().8h, v23, v26, \sz + add_sz v0.8h, v1.8h, \tmp1\().8h, \tmp2\().8h, \tmp1\().8h, \tmp2\().8h, \sz + uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v23, \sz + uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v24, \sz + add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp5\().8h, \tmp6\().8h, \sz + sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz + sub_sz \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz + rshrn_sz v2, v0.8h, v1.8h, #3, \sz // out p2 + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz + uaddl_sz \tmp1\().8h, \tmp2\().8h, v20, v23, \sz + uaddl_sz \tmp3\().8h, \tmp4\().8h, v24, v27, \sz + rshrn_sz v3, v0.8h, v1.8h, #3, \sz // out p1 + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp7\().8h, \tmp8\().8h, \sz + sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz + uaddl_sz \tmp5\().8h, \tmp6\().8h, v21, v24, \sz + uaddl_sz \tmp7\().8h, \tmp8\().8h, v25, v27, \sz + rshrn_sz v4, v0.8h, v1.8h, #3, \sz // out p0 + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz + sub_sz \tmp7\().8h, \tmp8\().8h, \tmp7\().8h, \tmp8\().8h, \tmp5\().8h, \tmp6\().8h, \sz + uaddl_sz \tmp1\().8h, \tmp2\().8h, v22, v25, \sz + uaddl_sz \tmp3\().8h, \tmp4\().8h, v26, v27, \sz + rshrn_sz v5, v0.8h, v1.8h, #3, \sz // 
out q0 + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp7\().8h, \tmp8\().8h, \sz + sub_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp1\().8h, \tmp2\().8h, \sz + rshrn_sz \tmp5, v0.8h, v1.8h, #3, \sz // out q1 + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, \tmp3\().8h, \tmp4\().8h, \sz + // The output here is written back into the input registers. This doesn't + // matter for the flat8out part below, since we only update those pixels + // which won't be touched below. + bit v21\sz, v2\sz, v6\sz + bit v22\sz, v3\sz, v6\sz + bit v23\sz, v4\sz, v6\sz + rshrn_sz \tmp6, v0.8h, v1.8h, #3, \sz // out q2 + bit v24\sz, v5\sz, v6\sz + bit v25\sz, \tmp5\sz, v6\sz + bit v26\sz, \tmp6\sz, v6\sz +.endif +.if \wd == 16 +6: + orr v2\sz, v6\sz, v7\sz + mov x5, v2.d[0] +.ifc \sz, .16b + mov x6, v2.d[1] + adds x5, x5, x6 + b.ne 1f +.else + cbnz x5, 1f +.endif + // If no pixels needed flat8in nor flat8out, jump to a + // writeout of the inner 4 pixels + ret x14 +1: + + mov x5, v7.d[0] +.ifc \sz, .16b + mov x6, v7.d[1] + adds x5, x5, x6 + b.ne 1f +.else + cbnz x5, 1f +.endif + // If no pixels need flat8out, jump to a writeout of the inner 6 pixels + ret x15 + +1: + // flat8out + // This writes all outputs into v2-v17 (skipping v7 and v16; v7 holds the + // flat8out mask used by the bif instructions below). + // If this part is skipped, the output is read from v21-v26 (which is the input + // to this section). 
+ ushll_sz v0.8h, v1.8h, v16, #3, \sz // 8 * v16 + usubw_sz v0.8h, v1.8h, v0.8h, v1.8h, v16, \sz // 7 * v16 + uaddw_sz v0.8h, v1.8h, v0.8h, v1.8h, v17, \sz + uaddl_sz v8.8h, v9.8h, v17, v18, \sz + uaddl_sz v10.8h, v11.8h, v19, v20, \sz + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v8.8h, v9.8h, \sz + uaddl_sz v8.8h, v9.8h, v16, v17, \sz + uaddl_sz v12.8h, v13.8h, v21, v22, \sz + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz + uaddl_sz v10.8h, v11.8h, v18, v25, \sz + uaddl_sz v14.8h, v15.8h, v23, v24, \sz + sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v12.8h, v13.8h, \sz + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz + uaddl_sz v12.8h, v13.8h, v16, v18, \sz + uaddl_sz v14.8h, v15.8h, v19, v26, \sz + rshrn_sz v2, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz + uaddl_sz v8.8h, v9.8h, v16, v19, \sz + uaddl_sz v10.8h, v11.8h, v20, v27, \sz + sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz + bif v2\sz, v17\sz, v7\sz + rshrn_sz v3, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz + uaddl_sz v12.8h, v13.8h, v16, v20, \sz + uaddl_sz v14.8h, v15.8h, v21, v28, \sz + sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz + bif v3\sz, v18\sz, v7\sz + rshrn_sz v4, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz + uaddl_sz v8.8h, v9.8h, v16, v21, \sz + uaddl_sz v10.8h, v11.8h, v22, v29, \sz + sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz + bif v4\sz, v19\sz, v7\sz + rshrn_sz v5, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz + uaddl_sz v12.8h, v13.8h, v16, v22, \sz + uaddl_sz v14.8h, v15.8h, v23, v30, \sz + sub_sz v10.8h, v11.8h, v10.8h, v11.8h, v8.8h, v9.8h, \sz + bif v5\sz, v20\sz, v7\sz + rshrn_sz v6, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz + uaddl_sz v10.8h, v11.8h, v16, v23, \sz + sub_sz v14.8h, v15.8h, v14.8h, v15.8h, 
v12.8h, v13.8h, \sz + uaddl_sz v12.8h, v13.8h, v24, v31, \sz + bif v6\sz, v21\sz, v7\sz + rshrn_sz v8, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz + sub_sz v10.8h, v11.8h, v12.8h, v13.8h, v10.8h, v11.8h, \sz + uaddl_sz v12.8h, v13.8h, v17, v24, \sz + uaddl_sz v14.8h, v15.8h, v25, v31, \sz + bif v8\sz, v22\sz, v7\sz + rshrn_sz v9, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v10.8h, v11.8h, \sz + sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v12.8h, v13.8h, \sz + uaddl_sz v12.8h, v13.8h, v26, v31, \sz + bif v9\sz, v23\sz, v7\sz + rshrn_sz v10, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz + uaddl_sz v14.8h, v15.8h, v18, v25, \sz + uaddl_sz v18.8h, v19.8h, v19, v26, \sz + sub_sz v12.8h, v13.8h, v12.8h, v13.8h, v14.8h, v15.8h, \sz + uaddl_sz v14.8h, v15.8h, v27, v31, \sz + bif v10\sz, v24\sz, v7\sz + rshrn_sz v11, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v12.8h, v13.8h, \sz + uaddl_sz v12.8h, v13.8h, v20, v27, \sz + sub_sz v14.8h, v15.8h, v14.8h, v15.8h, v18.8h, v19.8h, \sz + uaddl_sz v18.8h, v19.8h, v28, v31, \sz + bif v11\sz, v25\sz, v7\sz + sub_sz v18.8h, v19.8h, v18.8h, v19.8h, v12.8h, v13.8h, \sz + rshrn_sz v12, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v14.8h, v15.8h, \sz + uaddl_sz v14.8h, v15.8h, v21, v28, \sz + uaddl_sz v20.8h, v21.8h, v29, v31, \sz + bif v12\sz, v26\sz, v7\sz + rshrn_sz v13, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v18.8h, v19.8h, \sz + sub_sz v20.8h, v21.8h, v20.8h, v21.8h, v14.8h, v15.8h, \sz + uaddl_sz v18.8h, v19.8h, v22, v29, \sz + uaddl_sz v22.8h, v23.8h, v30, v31, \sz + bif v13\sz, v27\sz, v7\sz + rshrn_sz v14, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v20.8h, v21.8h, \sz + sub_sz v22.8h, v23.8h, v22.8h, v23.8h, v18.8h, v19.8h, \sz + bif v14\sz, v28\sz, v7\sz + rshrn_sz v15, v0.8h, v1.8h, #4, \sz + + add_sz v0.8h, v1.8h, v0.8h, v1.8h, v22.8h, v23.8h, \sz + bif v15\sz, v29\sz, 
v7\sz + rshrn_sz v17, v0.8h, v1.8h, #4, \sz + bif v17\sz, v30\sz, v7\sz +.endif +.endm + +// For wd <= 8, we use v16-v19 and v28-v31 for temp registers, +// while we need those for inputs/outputs in wd=16 and use v8-v15 +// for temp registers there instead. +function vp9_loop_filter_4 + loop_filter 4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 + ret +9: + ret x10 +endfunc + +function vp9_loop_filter_4_16b_mix_44 + loop_filter 4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31 + ret +9: + ret x10 +endfunc + +function vp9_loop_filter_8 + loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 + ret +6: + ret x13 +9: + ret x10 +endfunc + +function vp9_loop_filter_8_16b_mix + loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31 + ret +6: + ret x13 +9: + ret x10 +endfunc + +function vp9_loop_filter_16 + loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15 + ret +9: + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 + ret x10 +endfunc + +function vp9_loop_filter_16_16b + loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15 + ret +9: + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 + ret x10 +endfunc + +.macro loop_filter_4 + bl vp9_loop_filter_4 +.endm + +.macro loop_filter_4_16b_mix mix + bl vp9_loop_filter_4_16b_mix_\mix +.endm + +.macro loop_filter_8 + // calculate alternative 'return' targets + adr x13, 6f + bl vp9_loop_filter_8 +.endm + +.macro loop_filter_8_16b_mix mix + // calculate alternative 'return' targets + adr x13, 6f +.if \mix == 48 + mov x11, #0xffffffff00000000 +.elseif \mix == 84 + mov x11, #0x00000000ffffffff +.else + mov x11, #0xffffffffffffffff +.endif + bl vp9_loop_filter_8_16b_mix +.endm + +.macro loop_filter_16 + // calculate alternative 'return' targets + adr x14, 7f + adr x15, 8f + bl vp9_loop_filter_16 +.endm + +.macro loop_filter_16_16b + // calculate alternative 'return' targets + adr 
x14, 7f + adr x15, 8f + bl vp9_loop_filter_16_16b +.endm + + +// The public functions in this file have got the following signature: +// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr); + +function ff_vp9_loop_filter_v_4_8_neon, export=1 + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.8b}, [x9], x1 // p3 + ld1 {v24.8b}, [x0], x1 // q0 + ld1 {v21.8b}, [x9], x1 // p2 + ld1 {v25.8b}, [x0], x1 // q1 + ld1 {v22.8b}, [x9], x1 // p1 + ld1 {v26.8b}, [x0], x1 // q2 + ld1 {v23.8b}, [x9], x1 // p0 + ld1 {v27.8b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + sub x9, x9, x1, lsl #1 + + loop_filter_4 + + st1 {v22.8b}, [x9], x1 + st1 {v24.8b}, [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v25.8b}, [x0], x1 + + ret x10 +endfunc + +function ff_vp9_loop_filter_v_44_16_neon, export=1 + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.16b}, [x9], x1 // p3 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v21.16b}, [x9], x1 // p2 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v22.16b}, [x9], x1 // p1 + ld1 {v26.16b}, [x0], x1 // q2 + ld1 {v23.16b}, [x9], x1 // p0 + ld1 {v27.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + sub x9, x9, x1, lsl #1 + + loop_filter_4_16b_mix 44 + + st1 {v22.16b}, [x9], x1 + st1 {v24.16b}, [x0], x1 + st1 {v23.16b}, [x9], x1 + st1 {v25.16b}, [x0], x1 + + ret x10 +endfunc + +function ff_vp9_loop_filter_h_4_8_neon, export=1 + mov x10, x30 + sub x9, x0, #4 + add x0, x9, x1, lsl #2 + ld1 {v20.8b}, [x9], x1 + ld1 {v24.8b}, [x0], x1 + ld1 {v21.8b}, [x9], x1 + ld1 {v25.8b}, [x0], x1 + ld1 {v22.8b}, [x9], x1 + ld1 {v26.8b}, [x0], x1 + ld1 {v23.8b}, [x9], x1 + ld1 {v27.8b}, [x0], x1 + + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #2 + // Move x0/x9 forward by 2 pixels; we don't need to rewrite the + // outermost 2 pixels since they aren't changed. 
+ add x9, x9, #2 + add x0, x0, #2 + + transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_4 + + // We only will write the mid 4 pixels back; after the loop filter, + // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels). + // We need to transpose them to columns, done with a 4x8 transpose + // (which in practice is two 4x4 transposes of the two 4x4 halves + // of the 8x4 pixels; into 4x8 pixels). + transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.s}[0], [x9], x1 + st1 {v22.s}[1], [x0], x1 + st1 {v23.s}[0], [x9], x1 + st1 {v23.s}[1], [x0], x1 + st1 {v24.s}[0], [x9], x1 + st1 {v24.s}[1], [x0], x1 + st1 {v25.s}[0], [x9], x1 + st1 {v25.s}[1], [x0], x1 + + ret x10 +endfunc + +function ff_vp9_loop_filter_h_44_16_neon, export=1 + mov x10, x30 + sub x9, x0, #4 + add x0, x9, x1, lsl #3 + ld1 {v20.8b}, [x9], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v21.8b}, [x9], x1 + ld1 {v21.d}[1], [x0], x1 + ld1 {v22.8b}, [x9], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.8b}, [x9], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.8b}, [x9], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.8b}, [x9], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v26.8b}, [x9], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v27.8b}, [x9], x1 + ld1 {v27.d}[1], [x0], x1 + + sub x9, x9, x1, lsl #3 + sub x0, x0, x1, lsl #3 + add x9, x9, #2 + add x0, x0, #2 + + transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_4_16b_mix 44 + + transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v22.s}[0], [x9], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x9], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x9], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x9], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x9], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x9], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x9], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x9], x1 + st1 {v25.s}[3], [x0], x1 + + ret x10 +endfunc + +function ff_vp9_loop_filter_v_8_8_neon, 
export=1 + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.8b}, [x9], x1 // p3 + ld1 {v24.8b}, [x0], x1 // q0 + ld1 {v21.8b}, [x9], x1 // p2 + ld1 {v25.8b}, [x0], x1 // q1 + ld1 {v22.8b}, [x9], x1 // p1 + ld1 {v26.8b}, [x0], x1 // q2 + ld1 {v23.8b}, [x9], x1 // p0 + ld1 {v27.8b}, [x0], x1 // q3 + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #2 + add x9, x9, x1 + + loop_filter_8 + + st1 {v21.8b}, [x9], x1 + st1 {v24.8b}, [x0], x1 + st1 {v22.8b}, [x9], x1 + st1 {v25.8b}, [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v26.8b}, [x0], x1 + + ret x10 +6: + sub x9, x0, x1, lsl #1 + st1 {v22.8b}, [x9], x1 + st1 {v24.8b}, [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v25.8b}, [x0], x1 + ret x10 +endfunc + +.macro mix_v_16 mix +function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1 + mov x10, x30 + sub x9, x0, x1, lsl #2 + ld1 {v20.16b}, [x9], x1 // p3 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v21.16b}, [x9], x1 // p2 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v22.16b}, [x9], x1 // p1 + ld1 {v26.16b}, [x0], x1 // q2 + ld1 {v23.16b}, [x9], x1 // p0 + ld1 {v27.16b}, [x0], x1 // q3 + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #2 + add x9, x9, x1 + + loop_filter_8_16b_mix \mix + + st1 {v21.16b}, [x9], x1 + st1 {v24.16b}, [x0], x1 + st1 {v22.16b}, [x9], x1 + st1 {v25.16b}, [x0], x1 + st1 {v23.16b}, [x9], x1 + st1 {v26.16b}, [x0], x1 + + ret x10 +6: + sub x9, x0, x1, lsl #1 + st1 {v22.16b}, [x9], x1 + st1 {v24.16b}, [x0], x1 + st1 {v23.16b}, [x9], x1 + st1 {v25.16b}, [x0], x1 + ret x10 +endfunc +.endm + +mix_v_16 48 +mix_v_16 84 +mix_v_16 88 + +function ff_vp9_loop_filter_h_8_8_neon, export=1 + mov x10, x30 + sub x9, x0, #4 + add x0, x9, x1, lsl #2 + ld1 {v20.8b}, [x9], x1 + ld1 {v24.8b}, [x0], x1 + ld1 {v21.8b}, [x9], x1 + ld1 {v25.8b}, [x0], x1 + ld1 {v22.8b}, [x9], x1 + ld1 {v26.8b}, [x0], x1 + ld1 {v23.8b}, [x9], x1 + ld1 {v27.8b}, [x0], x1 + + sub x9, x9, x1, lsl #2 + sub x0, x0, x1, lsl #2 + + transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_8 + + // Even 
though only 6 pixels per row have been changed, we write the + // full 8 pixel registers. + transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8b}, [x9], x1 + st1 {v24.8b}, [x0], x1 + st1 {v21.8b}, [x9], x1 + st1 {v25.8b}, [x0], x1 + st1 {v22.8b}, [x9], x1 + st1 {v26.8b}, [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v27.8b}, [x0], x1 + + ret x10 +6: + // If we didn't need to do the flat8in part, we use the same writeback + // as in loop_filter_h_4_8. + add x9, x9, #2 + add x0, x0, #2 + transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.s}[0], [x9], x1 + st1 {v22.s}[1], [x0], x1 + st1 {v23.s}[0], [x9], x1 + st1 {v23.s}[1], [x0], x1 + st1 {v24.s}[0], [x9], x1 + st1 {v24.s}[1], [x0], x1 + st1 {v25.s}[0], [x9], x1 + st1 {v25.s}[1], [x0], x1 + ret x10 +endfunc + +.macro mix_h_16 mix +function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1 + mov x10, x30 + sub x9, x0, #4 + add x0, x9, x1, lsl #3 + ld1 {v20.8b}, [x9], x1 + ld1 {v20.d}[1], [x0], x1 + ld1 {v21.8b}, [x9], x1 + ld1 {v21.d}[1], [x0], x1 + ld1 {v22.8b}, [x9], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.8b}, [x9], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.8b}, [x9], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.8b}, [x9], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v26.8b}, [x9], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v27.8b}, [x9], x1 + ld1 {v27.d}[1], [x0], x1 + + sub x9, x9, x1, lsl #3 + sub x0, x0, x1, lsl #3 + + transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + loop_filter_8_16b_mix \mix + + transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8b}, [x9], x1 + st1 {v20.d}[1], [x0], x1 + st1 {v21.8b}, [x9], x1 + st1 {v21.d}[1], [x0], x1 + st1 {v22.8b}, [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.8b}, [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.8b}, [x9], x1 + st1 {v25.d}[1], [x0], x1 + st1 {v26.8b}, [x9], x1 + st1 {v26.d}[1], [x0], x1 + st1 {v27.8b}, [x9], x1 + st1 {v27.d}[1], [x0], x1 + + 
ret x10 +6: + add x9, x9, #2 + add x0, x0, #2 + transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.s}[0], [x9], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x9], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x9], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x9], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x9], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x9], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x9], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x9], x1 + st1 {v25.s}[3], [x0], x1 + ret x10 +endfunc +.endm + +mix_h_16 48 +mix_h_16 84 +mix_h_16 88 + +function ff_vp9_loop_filter_v_16_8_neon, export=1 + mov x10, x30 + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] + sub x9, x0, x1, lsl #3 + ld1 {v16.8b}, [x9], x1 // p7 + ld1 {v24.8b}, [x0], x1 // q0 + ld1 {v17.8b}, [x9], x1 // p6 + ld1 {v25.8b}, [x0], x1 // q1 + ld1 {v18.8b}, [x9], x1 // p5 + ld1 {v26.8b}, [x0], x1 // q2 + ld1 {v19.8b}, [x9], x1 // p4 + ld1 {v27.8b}, [x0], x1 // q3 + ld1 {v20.8b}, [x9], x1 // p3 + ld1 {v28.8b}, [x0], x1 // q4 + ld1 {v21.8b}, [x9], x1 // p2 + ld1 {v29.8b}, [x0], x1 // q5 + ld1 {v22.8b}, [x9], x1 // p1 + ld1 {v30.8b}, [x0], x1 // q6 + ld1 {v23.8b}, [x9], x1 // p0 + ld1 {v31.8b}, [x0], x1 // q7 + sub x9, x9, x1, lsl #3 + sub x0, x0, x1, lsl #3 + add x9, x9, x1 + + loop_filter_16 + + // If we did the flat8out part, we get the output in + // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride, + // store v2-v9 there, and v10-v17 into x0. 
+ st1 {v2.8b}, [x9], x1 + st1 {v10.8b}, [x0], x1 + st1 {v3.8b}, [x9], x1 + st1 {v11.8b}, [x0], x1 + st1 {v4.8b}, [x9], x1 + st1 {v12.8b}, [x0], x1 + st1 {v5.8b}, [x9], x1 + st1 {v13.8b}, [x0], x1 + st1 {v6.8b}, [x9], x1 + st1 {v14.8b}, [x0], x1 + st1 {v8.8b}, [x9], x1 + st1 {v15.8b}, [x0], x1 + st1 {v9.8b}, [x9], x1 + st1 {v17.8b}, [x0], x1 +9: + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 + ret x10 +8: + add x9, x9, x1, lsl #2 + // If we didn't do the flat8out part, the output is left in the + // input registers. + st1 {v21.8b}, [x9], x1 + st1 {v24.8b}, [x0], x1 + st1 {v22.8b}, [x9], x1 + st1 {v25.8b}, [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v26.8b}, [x0], x1 + b 9b +7: + sub x9, x0, x1, lsl #1 + st1 {v22.8b}, [x9], x1 + st1 {v24.8b}, [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v25.8b}, [x0], x1 + b 9b +endfunc + +function ff_vp9_loop_filter_v_16_16_neon, export=1 + mov x10, x30 + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] + sub x9, x0, x1, lsl #3 + ld1 {v16.16b}, [x9], x1 // p7 + ld1 {v24.16b}, [x0], x1 // q0 + ld1 {v17.16b}, [x9], x1 // p6 + ld1 {v25.16b}, [x0], x1 // q1 + ld1 {v18.16b}, [x9], x1 // p5 + ld1 {v26.16b}, [x0], x1 // q2 + ld1 {v19.16b}, [x9], x1 // p4 + ld1 {v27.16b}, [x0], x1 // q3 + ld1 {v20.16b}, [x9], x1 // p3 + ld1 {v28.16b}, [x0], x1 // q4 + ld1 {v21.16b}, [x9], x1 // p2 + ld1 {v29.16b}, [x0], x1 // q5 + ld1 {v22.16b}, [x9], x1 // p1 + ld1 {v30.16b}, [x0], x1 // q6 + ld1 {v23.16b}, [x9], x1 // p0 + ld1 {v31.16b}, [x0], x1 // q7 + sub x9, x9, x1, lsl #3 + sub x0, x0, x1, lsl #3 + add x9, x9, x1 + + loop_filter_16_16b + + st1 {v2.16b}, [x9], x1 + st1 {v10.16b}, [x0], x1 + st1 {v3.16b}, [x9], x1 + st1 {v11.16b}, [x0], x1 + st1 {v4.16b}, [x9], x1 + st1 {v12.16b}, [x0], x1 + st1 {v5.16b}, [x9], x1 + st1 {v13.16b}, [x0], x1 + st1 {v6.16b}, [x9], x1 + st1 {v14.16b}, [x0], x1 + st1 {v8.16b}, [x9], x1 + st1 {v15.16b}, [x0], 
x1 + st1 {v9.16b}, [x9], x1 + st1 {v17.16b}, [x0], x1 +9: + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 + ret x10 +8: + add x9, x9, x1, lsl #2 + st1 {v21.16b}, [x9], x1 + st1 {v24.16b}, [x0], x1 + st1 {v22.16b}, [x9], x1 + st1 {v25.16b}, [x0], x1 + st1 {v23.16b}, [x9], x1 + st1 {v26.16b}, [x0], x1 + b 9b +7: + sub x9, x0, x1, lsl #1 + st1 {v22.16b}, [x9], x1 + st1 {v24.16b}, [x0], x1 + st1 {v23.16b}, [x9], x1 + st1 {v25.16b}, [x0], x1 + b 9b +endfunc + +function ff_vp9_loop_filter_h_16_8_neon, export=1 + mov x10, x30 + stp d8, d9, [sp, #-0x40]! + stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] + sub x9, x0, #8 + ld1 {v16.8b}, [x9], x1 + ld1 {v24.8b}, [x0], x1 + ld1 {v17.8b}, [x9], x1 + ld1 {v25.8b}, [x0], x1 + ld1 {v18.8b}, [x9], x1 + ld1 {v26.8b}, [x0], x1 + ld1 {v19.8b}, [x9], x1 + ld1 {v27.8b}, [x0], x1 + ld1 {v20.8b}, [x9], x1 + ld1 {v28.8b}, [x0], x1 + ld1 {v21.8b}, [x9], x1 + ld1 {v29.8b}, [x0], x1 + ld1 {v22.8b}, [x9], x1 + ld1 {v30.8b}, [x0], x1 + ld1 {v23.8b}, [x9], x1 + ld1 {v31.8b}, [x0], x1 + sub x0, x0, x1, lsl #3 + sub x9, x9, x1, lsl #3 + + // The 16x8 pixels read above is in two 8x8 blocks; the left + // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes + // of this, to get one column per register. 
+ transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x8B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + loop_filter_16 + + transpose_8x8B v16, v2, v3, v4, v5, v6, v8, v9, v0, v1 + transpose_8x8B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1 + + st1 {v16.8b}, [x9], x1 + st1 {v10.8b}, [x0], x1 + st1 {v2.8b}, [x9], x1 + st1 {v11.8b}, [x0], x1 + st1 {v3.8b}, [x9], x1 + st1 {v12.8b}, [x0], x1 + st1 {v4.8b}, [x9], x1 + st1 {v13.8b}, [x0], x1 + st1 {v5.8b}, [x9], x1 + st1 {v14.8b}, [x0], x1 + st1 {v6.8b}, [x9], x1 + st1 {v15.8b}, [x0], x1 + st1 {v8.8b}, [x9], x1 + st1 {v17.8b}, [x0], x1 + st1 {v9.8b}, [x9], x1 + st1 {v31.8b}, [x0], x1 +9: + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 + ret x10 +8: + // The same writeback as in loop_filter_h_8_8 + sub x9, x0, #4 + add x0, x9, x1, lsl #2 + transpose_8x8B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8b}, [x9], x1 + st1 {v24.8b}, [x0], x1 + st1 {v21.8b}, [x9], x1 + st1 {v25.8b}, [x0], x1 + st1 {v22.8b}, [x9], x1 + st1 {v26.8b}, [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v27.8b}, [x0], x1 + b 9b +7: + // The same writeback as in loop_filter_h_4_8 + sub x9, x0, #2 + add x0, x9, x1, lsl #2 + transpose_4x8B v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.s}[0], [x9], x1 + st1 {v22.s}[1], [x0], x1 + st1 {v23.s}[0], [x9], x1 + st1 {v23.s}[1], [x0], x1 + st1 {v24.s}[0], [x9], x1 + st1 {v24.s}[1], [x0], x1 + st1 {v25.s}[0], [x9], x1 + st1 {v25.s}[1], [x0], x1 + b 9b +endfunc + +function ff_vp9_loop_filter_h_16_16_neon, export=1 + mov x10, x30 + stp d8, d9, [sp, #-0x40]! 
+ stp d14, d15, [sp, #0x30] + stp d12, d13, [sp, #0x20] + stp d10, d11, [sp, #0x10] + sub x9, x0, #8 + ld1 {v16.8b}, [x9], x1 + ld1 {v24.8b}, [x0], x1 + ld1 {v17.8b}, [x9], x1 + ld1 {v25.8b}, [x0], x1 + ld1 {v18.8b}, [x9], x1 + ld1 {v26.8b}, [x0], x1 + ld1 {v19.8b}, [x9], x1 + ld1 {v27.8b}, [x0], x1 + ld1 {v20.8b}, [x9], x1 + ld1 {v28.8b}, [x0], x1 + ld1 {v21.8b}, [x9], x1 + ld1 {v29.8b}, [x0], x1 + ld1 {v22.8b}, [x9], x1 + ld1 {v30.8b}, [x0], x1 + ld1 {v23.8b}, [x9], x1 + ld1 {v31.8b}, [x0], x1 + ld1 {v16.d}[1], [x9], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v17.d}[1], [x9], x1 + ld1 {v25.d}[1], [x0], x1 + ld1 {v18.d}[1], [x9], x1 + ld1 {v26.d}[1], [x0], x1 + ld1 {v19.d}[1], [x9], x1 + ld1 {v27.d}[1], [x0], x1 + ld1 {v20.d}[1], [x9], x1 + ld1 {v28.d}[1], [x0], x1 + ld1 {v21.d}[1], [x9], x1 + ld1 {v29.d}[1], [x0], x1 + ld1 {v22.d}[1], [x9], x1 + ld1 {v30.d}[1], [x0], x1 + ld1 {v23.d}[1], [x9], x1 + ld1 {v31.d}[1], [x0], x1 + sub x0, x0, x1, lsl #4 + sub x9, x9, x1, lsl #4 + + transpose_8x16B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + loop_filter_16_16b + + transpose_8x16B v16, v2, v3, v4, v5, v6, v8, v9, v0, v1 + transpose_8x16B v10, v11, v12, v13, v14, v15, v17, v31, v0, v1 + + st1 {v16.8b}, [x9], x1 + st1 {v10.8b}, [x0], x1 + st1 {v2.8b}, [x9], x1 + st1 {v11.8b}, [x0], x1 + st1 {v3.8b}, [x9], x1 + st1 {v12.8b}, [x0], x1 + st1 {v4.8b}, [x9], x1 + st1 {v13.8b}, [x0], x1 + st1 {v5.8b}, [x9], x1 + st1 {v14.8b}, [x0], x1 + st1 {v6.8b}, [x9], x1 + st1 {v15.8b}, [x0], x1 + st1 {v8.8b}, [x9], x1 + st1 {v17.8b}, [x0], x1 + st1 {v9.8b}, [x9], x1 + st1 {v31.8b}, [x0], x1 + st1 {v16.d}[1], [x9], x1 + st1 {v10.d}[1], [x0], x1 + st1 {v2.d}[1], [x9], x1 + st1 {v11.d}[1], [x0], x1 + st1 {v3.d}[1], [x9], x1 + st1 {v12.d}[1], [x0], x1 + st1 {v4.d}[1], [x9], x1 + st1 {v13.d}[1], [x0], x1 + st1 {v5.d}[1], [x9], x1 + st1 {v14.d}[1], [x0], x1 + st1 {v6.d}[1], [x9], x1 + st1 {v15.d}[1], [x0], x1 + st1 {v8.d}[1], [x9], 
x1 + st1 {v17.d}[1], [x0], x1 + st1 {v9.d}[1], [x9], x1 + st1 {v31.d}[1], [x0], x1 +9: + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + ldp d8, d9, [sp], #0x40 + ret x10 +8: + sub x9, x0, #4 + add x0, x9, x1, lsl #3 + transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + st1 {v20.8b}, [x9], x1 + st1 {v20.d}[1], [x0], x1 + st1 {v21.8b}, [x9], x1 + st1 {v21.d}[1], [x0], x1 + st1 {v22.8b}, [x9], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.8b}, [x9], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.8b}, [x9], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.8b}, [x9], x1 + st1 {v25.d}[1], [x0], x1 + st1 {v26.8b}, [x9], x1 + st1 {v26.d}[1], [x0], x1 + st1 {v27.8b}, [x9], x1 + st1 {v27.d}[1], [x0], x1 + b 9b +7: + sub x9, x0, #2 + add x0, x9, x1, lsl #3 + transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29 + st1 {v22.s}[0], [x9], x1 + st1 {v22.s}[2], [x0], x1 + st1 {v23.s}[0], [x9], x1 + st1 {v23.s}[2], [x0], x1 + st1 {v24.s}[0], [x9], x1 + st1 {v24.s}[2], [x0], x1 + st1 {v25.s}[0], [x9], x1 + st1 {v25.s}[2], [x0], x1 + st1 {v22.s}[1], [x9], x1 + st1 {v22.s}[3], [x0], x1 + st1 {v23.s}[1], [x9], x1 + st1 {v23.s}[3], [x0], x1 + st1 {v24.s}[1], [x9], x1 + st1 {v24.s}[3], [x0], x1 + st1 {v25.s}[1], [x9], x1 + st1 {v25.s}[3], [x0], x1 + b 9b +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S new file mode 100644 index 0000000000..53b372c262 --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S @@ -0,0 +1,606 @@ +/* + * Copyright (c) 2017 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// All public functions in this file have the following signature: +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, +// const uint8_t *ref, ptrdiff_t ref_stride, +// int h, int mx, int my); + +function ff_vp9_avg64_16_neon, export=1 + mov x5, x0 + sub x1, x1, #64 + sub x3, x3, #64 +1: + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v4.8h + urhadd v1.8h, v1.8h, v5.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1 + urhadd v2.8h, v2.8h, v6.8h + urhadd v3.8h, v3.8h, v7.8h + subs w4, w4, #1 + urhadd v16.8h, v16.8h, v20.8h + urhadd v17.8h, v17.8h, v21.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64 + urhadd v18.8h, v18.8h, v22.8h + urhadd v19.8h, v19.8h, v23.8h + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg32_16_neon, export=1 + mov x5, x0 +1: + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v4.8h + urhadd v1.8h, v1.8h, v5.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1 + urhadd v2.8h, v2.8h, v6.8h + urhadd v3.8h, v3.8h, v7.8h + subs w4, w4, #2 + urhadd v16.8h, v16.8h, v20.8h + urhadd v17.8h, v17.8h, v21.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1 + urhadd v18.8h, v18.8h, v22.8h + urhadd v19.8h, v19.8h, v23.8h + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], 
x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg16_16_neon, export=1 +1: + ld1 {v2.8h, v3.8h}, [x2], x3 + ld1 {v0.8h, v1.8h}, [x0] + urhadd v0.8h, v0.8h, v2.8h + urhadd v1.8h, v1.8h, v3.8h + subs w4, w4, #1 + st1 {v0.8h, v1.8h}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg8_16_neon, export=1 + mov x5, x0 +1: + ld1 {v2.8h}, [x2], x3 + ld1 {v0.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v2.8h + ld1 {v1.8h}, [x0], x1 + urhadd v1.8h, v1.8h, v3.8h + subs w4, w4, #2 + st1 {v0.8h}, [x5], x1 + st1 {v1.8h}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg4_16_neon, export=1 + mov x5, x0 +1: + ld1 {v2.4h}, [x2], x3 + ld1 {v0.4h}, [x0], x1 + ld1 {v3.4h}, [x2], x3 + urhadd v0.4h, v0.4h, v2.4h + ld1 {v1.4h}, [x0], x1 + urhadd v1.4h, v1.4h, v3.4h + subs w4, w4, #2 + st1 {v0.4h}, [x5], x1 + st1 {v1.8b}, [x5], x1 + b.ne 1b + ret +endfunc + + +// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 +// for size >= 16), and multiply-accumulate into dst1 and dst5 (or +// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8 +// for size >= 16) +.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size + ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) + ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) + smlal \dst1\().4s, v20.4h, v0.h[\offset] + smlal \dst5\().4s, v22.4h, v0.h[\offset] +.if \size >= 16 + ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) + ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) +.endif +.if \size >= 8 + smlal2 \dst2\().4s, v20.8h, v0.h[\offset] + smlal2 \dst6\().4s, v22.8h, v0.h[\offset] +.endif +.if \size >= 16 + smlal \dst3\().4s, v21.4h, v0.h[\offset] + smlal \dst7\().4s, v23.4h, v0.h[\offset] + smlal2 \dst4\().4s, v21.8h, v0.h[\offset] + smlal2 \dst8\().4s, v23.8h, v0.h[\offset] +.endif +.endm + + +// Instantiate a horizontal filter function for the given size. 
+// This can work on 4, 8 or 16 pixels in parallel; for larger +// widths it will do 16 pixels at a time and loop horizontally. +// The actual width (in bytes) is passed in x5, the height in w4 and +// the filter coefficients in x9. +.macro do_8tap_h type, size +function \type\()_8tap_\size\()h + sub x2, x2, #6 + add x6, x0, x1 + add x7, x2, x3 + add x1, x1, x1 + add x3, x3, x3 + // Only size >= 16 loops horizontally and needs + // reduced dst stride +.if \size >= 16 + sub x1, x1, x5 +.endif + // size >= 16 loads two qwords and increments r2, + // for size 4/8 it's enough with one qword and no + // postincrement +.if \size >= 16 + sub x3, x3, x5 + sub x3, x3, #16 +.endif + // Load the filter vector + ld1 {v0.8h}, [x9] +1: +.if \size >= 16 + mov x9, x5 +.endif + // Load src +.if \size >= 16 + ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48 + ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48 +.else + ld1 {v5.8h, v6.8h}, [x2] + ld1 {v16.8h, v17.8h}, [x7] +.endif +2: + + smull v1.4s, v5.4h, v0.h[0] + smull v24.4s, v16.4h, v0.h[0] +.if \size >= 8 + smull2 v2.4s, v5.8h, v0.h[0] + smull2 v25.4s, v16.8h, v0.h[0] +.endif +.if \size >= 16 + smull v3.4s, v6.4h, v0.h[0] + smull v26.4s, v17.4h, v0.h[0] + smull2 v4.4s, v6.8h, v0.h[0] + smull2 v27.4s, v17.8h, v0.h[0] +.endif + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size + extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size + + // Round, shift and saturate + // The sqrshrun takes care of clamping negative values to zero, but + // we manually need to do umin with the max pixel value. 
+ sqrshrun v1.4h, v1.4s, #7 + sqrshrun v24.4h, v24.4s, #7 +.if \size >= 8 + sqrshrun2 v1.8h, v2.4s, #7 + sqrshrun2 v24.8h, v25.4s, #7 + umin v1.8h, v1.8h, v31.8h + umin v24.8h, v24.8h, v31.8h +.if \size >= 16 + sqrshrun v2.4h, v3.4s, #7 + sqrshrun v25.4h, v26.4s, #7 + sqrshrun2 v2.8h, v4.4s, #7 + sqrshrun2 v25.8h, v27.4s, #7 + umin v2.8h, v2.8h, v31.8h + umin v25.8h, v25.8h, v31.8h +.endif +.else + umin v1.4h, v1.4h, v31.4h + umin v24.4h, v24.4h, v31.4h +.endif + // Average +.ifc \type,avg +.if \size >= 16 + ld1 {v3.8h, v4.8h}, [x0] + ld1 {v29.8h, v30.8h}, [x6] + urhadd v1.8h, v1.8h, v3.8h + urhadd v2.8h, v2.8h, v4.8h + urhadd v24.8h, v24.8h, v29.8h + urhadd v25.8h, v25.8h, v30.8h +.elseif \size >= 8 + ld1 {v3.8h}, [x0] + ld1 {v4.8h}, [x6] + urhadd v1.8h, v1.8h, v3.8h + urhadd v24.8h, v24.8h, v4.8h +.else + ld1 {v3.4h}, [x0] + ld1 {v4.4h}, [x6] + urhadd v1.4h, v1.4h, v3.4h + urhadd v24.4h, v24.4h, v4.4h +.endif +.endif + // Store and loop horizontally (for size >= 16) +.if \size >= 16 + subs x9, x9, #32 + st1 {v1.8h, v2.8h}, [x0], #32 + st1 {v24.8h, v25.8h}, [x6], #32 + b.eq 3f + mov v5.16b, v7.16b + mov v16.16b, v18.16b + ld1 {v6.8h, v7.8h}, [x2], #32 + ld1 {v17.8h, v18.8h}, [x7], #32 + b 2b +.elseif \size == 8 + st1 {v1.8h}, [x0] + st1 {v24.8h}, [x6] +.else // \size == 4 + st1 {v1.4h}, [x0] + st1 {v24.4h}, [x6] +.endif +3: + // Loop vertically + add x0, x0, x1 + add x6, x6, x1 + add x2, x2, x3 + add x7, x7, x3 + subs w4, w4, #2 + b.ne 1b + ret +endfunc +.endm + +.macro do_8tap_h_size size +do_8tap_h put, \size +do_8tap_h avg, \size +.endm + +do_8tap_h_size 4 +do_8tap_h_size 8 +do_8tap_h_size 16 + +.macro do_8tap_h_func type, filter, offset, size, bpp +function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1 + mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 + movrel x6, X(ff_vp9_subpel_filters), 256*\offset + cmp w5, #8 + add x9, x6, w5, uxtw #4 + mov x5, #2*\size +.if \size >= 16 + b \type\()_8tap_16h +.else + b \type\()_8tap_\size\()h +.endif 
+endfunc +.endm + +.macro do_8tap_h_filters size, bpp +do_8tap_h_func put, regular, 1, \size, \bpp +do_8tap_h_func avg, regular, 1, \size, \bpp +do_8tap_h_func put, sharp, 2, \size, \bpp +do_8tap_h_func avg, sharp, 2, \size, \bpp +do_8tap_h_func put, smooth, 0, \size, \bpp +do_8tap_h_func avg, smooth, 0, \size, \bpp +.endm + +.macro do_8tap_h_filters_bpp bpp +do_8tap_h_filters 64, \bpp +do_8tap_h_filters 32, \bpp +do_8tap_h_filters 16, \bpp +do_8tap_h_filters 8, \bpp +do_8tap_h_filters 4, \bpp +.endm + +do_8tap_h_filters_bpp 10 +do_8tap_h_filters_bpp 12 + + +// Vertical filters + +// Round, shift and saturate and store reg1-reg4 +.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type + sqrshrun \reg1\().4h, \reg1\().4s, #7 + sqrshrun \reg2\().4h, \reg2\().4s, #7 + sqrshrun \reg3\().4h, \reg3\().4s, #7 + sqrshrun \reg4\().4h, \reg4\().4s, #7 +.ifc \type,avg + ld1 {\tmp1\().4h}, [x7], x1 + ld1 {\tmp2\().4h}, [x7], x1 + ld1 {\tmp3\().4h}, [x7], x1 + ld1 {\tmp4\().4h}, [x7], x1 +.endif + umin \reg1\().4h, \reg1\().4h, \minreg\().4h + umin \reg2\().4h, \reg2\().4h, \minreg\().4h + umin \reg3\().4h, \reg3\().4h, \minreg\().4h + umin \reg4\().4h, \reg4\().4h, \minreg\().4h +.ifc \type,avg + urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h + urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h + urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h + urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h +.endif + st1 {\reg1\().4h}, [x0], x1 + st1 {\reg2\().4h}, [x0], x1 + st1 {\reg3\().4h}, [x0], x1 + st1 {\reg4\().4h}, [x0], x1 +.endm + +// Round, shift and saturate and store reg1-8, where +// reg1-2, reg3-4 etc pairwise correspond to 4 rows. 
// do_store8: narrow, clamp and store four rows of eight 16-bit lanes.
//   reg1-reg8  32-bit accumulators (.4s); reg1/reg2 are the low/high four
//              lanes of the first row, reg3/reg4 of the second, and so on.
//   minreg     upper bound applied with umin (the visible caller passes v1).
//   type       when it is "avg", the result is blended with rows read
//              through x7.
// sqrshrun/sqrshrun2 #7 pack each register pair into one .8h row, leaving
// the four rows in reg1-reg4. Every register is read as a source before it
// is overwritten as a destination, so the narrowing order must not change.
// For avg, reg5-reg8 (free after the narrowing) receive four rows loaded
// from x7, post-incremented by x1, and are merged with urhadd after the
// clamp. The rows are then written through x0, which advances by x1 per row.
+.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type + sqrshrun \reg1\().4h, \reg1\().4s, #7 + sqrshrun2 \reg1\().8h, \reg2\().4s, #7 + sqrshrun \reg2\().4h, \reg3\().4s, #7 + sqrshrun2 \reg2\().8h, \reg4\().4s, #7 + sqrshrun \reg3\().4h, \reg5\().4s, #7 + sqrshrun2 \reg3\().8h, \reg6\().4s, #7 + sqrshrun \reg4\().4h, \reg7\().4s, #7 + sqrshrun2 \reg4\().8h, \reg8\().4s, #7 +.ifc \type,avg + ld1 {\reg5\().8h}, [x7], x1 + ld1 {\reg6\().8h}, [x7], x1 + ld1 {\reg7\().8h}, [x7], x1 + ld1 {\reg8\().8h}, [x7], x1 +.endif + umin \reg1\().8h, \reg1\().8h, \minreg\().8h + umin \reg2\().8h, \reg2\().8h, \minreg\().8h + umin \reg3\().8h, \reg3\().8h, \minreg\().8h + umin \reg4\().8h, \reg4\().8h, \minreg\().8h +.ifc \type,avg + urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h + urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h + urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h + urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h +.endif + st1 {\reg1\().8h}, [x0], x1 + st1 {\reg2\().8h}, [x0], x1 + st1 {\reg3\().8h}, [x0], x1 + st1 {\reg4\().8h}, [x0], x1 +.endm + +// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2 +// (src1-src8 into dst1, src2-src9 into dst2). 
// convolve4: two 8-tap sums over four lanes (.4h inputs, .4s outputs).
//   dst1 = sum of src<1+k> * v0.h[k] for k = 0..7
//   dst2 = sum of src<2+k> * v0.h[k] for k = 0..7
//   tmp1/tmp2 are scratch registers and are clobbered.
// Even taps accumulate in dst1/dst2 and odd taps in tmp1/tmp2; the two
// halves are added at the end. NOTE(review): presumably this split keeps
// the multiply-accumulate dependency chains short - not confirmed here.
// The eight 16-bit coefficients are read from v0, which this macro does
// not load itself.
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2 + smull \dst1\().4s, \src1\().4h, v0.h[0] + smull \dst2\().4s, \src2\().4h, v0.h[0] + smull \tmp1\().4s, \src2\().4h, v0.h[1] + smull \tmp2\().4s, \src3\().4h, v0.h[1] + smlal \dst1\().4s, \src3\().4h, v0.h[2] + smlal \dst2\().4s, \src4\().4h, v0.h[2] + smlal \tmp1\().4s, \src4\().4h, v0.h[3] + smlal \tmp2\().4s, \src5\().4h, v0.h[3] + smlal \dst1\().4s, \src5\().4h, v0.h[4] + smlal \dst2\().4s, \src6\().4h, v0.h[4] + smlal \tmp1\().4s, \src6\().4h, v0.h[5] + smlal \tmp2\().4s, \src7\().4h, v0.h[5] + smlal \dst1\().4s, \src7\().4h, v0.h[6] + smlal \dst2\().4s, \src8\().4h, v0.h[6] + smlal \tmp1\().4s, \src8\().4h, v0.h[7] + smlal \tmp2\().4s, \src9\().4h, v0.h[7] + add \dst1\().4s, \dst1\().4s, \tmp1\().4s + add \dst2\().4s, \dst2\().4s, \tmp2\().4s +.endm + +// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4 +// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4). 
// convolve8: two 8-tap sums over eight lanes (.8h inputs, .4s outputs).
//   dst1/dst2 = low/high four lanes of the sum of src<1+k> * v0.h[k], k = 0..7
//   dst3/dst4 = low/high four lanes of the sum of src<2+k> * v0.h[k], k = 0..7
// smull/smlal produce the low halves and smull2/smlal2 the high halves.
// No temporaries are used: each destination is one accumulation chain that
// starts with smull (tap 0) and is followed by seven smlal steps.
// The eight 16-bit coefficients are read from v0, which this macro does
// not load itself.
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9 + smull \dst1\().4s, \src1\().4h, v0.h[0] + smull2 \dst2\().4s, \src1\().8h, v0.h[0] + smull \dst3\().4s, \src2\().4h, v0.h[0] + smull2 \dst4\().4s, \src2\().8h, v0.h[0] + smlal \dst1\().4s, \src2\().4h, v0.h[1] + smlal2 \dst2\().4s, \src2\().8h, v0.h[1] + smlal \dst3\().4s, \src3\().4h, v0.h[1] + smlal2 \dst4\().4s, \src3\().8h, v0.h[1] + smlal \dst1\().4s, \src3\().4h, v0.h[2] + smlal2 \dst2\().4s, \src3\().8h, v0.h[2] + smlal \dst3\().4s, \src4\().4h, v0.h[2] + smlal2 \dst4\().4s, \src4\().8h, v0.h[2] + smlal \dst1\().4s, \src4\().4h, v0.h[3] + smlal2 \dst2\().4s, \src4\().8h, v0.h[3] + smlal \dst3\().4s, \src5\().4h, v0.h[3] + smlal2 \dst4\().4s, \src5\().8h, v0.h[3] + smlal \dst1\().4s, \src5\().4h, v0.h[4] + smlal2 \dst2\().4s, \src5\().8h, v0.h[4] + smlal \dst3\().4s, \src6\().4h, v0.h[4] + smlal2 \dst4\().4s, \src6\().8h, v0.h[4] + smlal \dst1\().4s, \src6\().4h, v0.h[5] + smlal2 \dst2\().4s, \src6\().8h, v0.h[5] + smlal \dst3\().4s, \src7\().4h, v0.h[5] + smlal2 \dst4\().4s, \src7\().8h, v0.h[5] + smlal \dst1\().4s, \src7\().4h, v0.h[6] + smlal2 \dst2\().4s, \src7\().8h, v0.h[6] + smlal \dst3\().4s, \src8\().4h, v0.h[6] + smlal2 \dst4\().4s, \src8\().8h, v0.h[6] + smlal \dst1\().4s, \src8\().4h, v0.h[7] + smlal2 \dst2\().4s, \src8\().8h, v0.h[7] + smlal \dst3\().4s, \src9\().4h, v0.h[7] + smlal2 \dst4\().4s, \src9\().8h, v0.h[7] +.endm + +// Instantiate a vertical filter function for filtering 8 pixels at a time. +// The height is passed in x4, the width in x5 and the filter coefficients +// in x6. 
// do_8tap_8v type: emits \type\()_8tap_8v, instantiated below for put and avg.
// Registers on entry, as read by the code: x0 dst, x1 dst stride, x2 src,
// x3 src stride, x4 row count, x5 width in pixels, x6 -> coefficients,
// v1 clamp value handed to do_store8.
// - x2 is first moved up by three rows. Each column pass then reads seven
//   rows up front plus four rows per group of four output rows.
// - Label 1 starts one 8-pixel column: x6 is reused as the row counter
//   (the coefficients are already in v0) and, for avg, x7 becomes the
//   destination read pointer consumed by do_store8.
// - Label 2 is unrolled three times, rotating the row window through
//   v16-v27 so that no register moves are needed; after the third group
//   the window is back in v17-v23, matching the state at label 2.
// - The counter drops by 4 and is only ever tested for zero, so the row
//   count is assumed to be a multiple of 4.
// - Label 8 steps to the next column: dst goes back h rows, src goes back
//   h + 7 rows, and both advance by 16 bytes (eight 2-byte pixels).
// Besides x0, x2 and x5, this overwrites x6, x7 (avg only), v0, v2-v7,
// v16-v27, v30 and v31.
+.macro do_8tap_8v type +function \type\()_8tap_8v + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +1: +.ifc \type,avg + mov x7, x0 +.endif + mov x6, x4 + + ld1 {v17.8h}, [x2], x3 + ld1 {v18.8h}, [x2], x3 + ld1 {v19.8h}, [x2], x3 + ld1 {v20.8h}, [x2], x3 + ld1 {v21.8h}, [x2], x3 + ld1 {v22.8h}, [x2], x3 + ld1 {v23.8h}, [x2], x3 +2: + ld1 {v24.8h}, [x2], x3 + ld1 {v25.8h}, [x2], x3 + ld1 {v26.8h}, [x2], x3 + ld1 {v27.8h}, [x2], x3 + + convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25 + convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27 + do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type + + subs x6, x6, #4 + b.eq 8f + + ld1 {v16.8h}, [x2], x3 + ld1 {v17.8h}, [x2], x3 + ld1 {v18.8h}, [x2], x3 + ld1 {v19.8h}, [x2], x3 + convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17 + convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19 + do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type + + subs x6, x6, #4 + b.eq 8f + + ld1 {v20.8h}, [x2], x3 + ld1 {v21.8h}, [x2], x3 + ld1 {v22.8h}, [x2], x3 + ld1 {v23.8h}, [x2], x3 + convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21 + convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23 + do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type + + subs x6, x6, #4 + b.ne 2b + +8: + subs x5, x5, #8 + b.eq 9f + // x0 -= h * dst_stride + msub x0, x1, x4, x0 + // x2 -= h * src_stride + msub x2, x3, x4, x2 + // x2 -= 8 * src_stride + sub x2, x2, x3, lsl #3 + // x2 += 1 * src_stride + add x2, x2, x3 + add x2, x2, #16 + add x0, x0, #16 + b 1b +9: + ret +endfunc +.endm + +do_8tap_8v put +do_8tap_8v avg + + +// Instantiate a vertical filter function for filtering a 4 pixels wide +// slice. This only is designed to work for 4 or 8 output lines. 
// do_8tap_4v type: emits \type\()_8tap_4v. Registers on entry, as read by
// the code: x0 dst, x1 dst stride, x2 src, x3 src stride, x4 row count,
// x6 -> coefficients, v1 clamp value handed to do_store4.
// x2 is moved up by three rows, then all 11 source rows needed for the
// first four output rows are loaded into v16-v26. Unless x4 was exactly 4,
// four further rows (v27-v30) produce output rows 4-7. There is no loop,
// which is why only 4 or 8 output rows are supported.
// The second pass uses v16-v19 as scratch; those source rows are no longer
// read at that point.
//
// The exported entry points follow the macro. do_8tap_v_func builds
// ff_vp9_<type>_<filter><size>_v_<bpp>_neon, which:
//   - zero-extends the row count w4 into x4,
//   - sets every 16-bit lane of v1 to ~(((0xff << (bpp - 8)) & 0xff) << 8),
//     i.e. 0x03ff for bpp 10 and 0x0fff for bpp 12,
//   - points x6 at the address movrel yields for ff_vp9_subpel_filters with
//     offset 256 * \offset, plus 16 * w6 (w6 is the 7th argument, my, of
//     the vp9_mc_func signature),
//   - puts the width in x5 and branches to the 8v worker (size >= 8) or
//     the 4v worker.
// do_8tap_v_filters and do_8tap_v_filters_bpp expand this for put and avg,
// filter offsets regular = 1, sharp = 2 and smooth = 0, sizes 64, 32, 16,
// 8 and 4, and bit depths 10 and 12.
+.macro do_8tap_4v type +function \type\()_8tap_4v + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +.ifc \type,avg + mov x7, x0 +.endif + + ld1 {v16.4h}, [x2], x3 + ld1 {v17.4h}, [x2], x3 + ld1 {v18.4h}, [x2], x3 + ld1 {v19.4h}, [x2], x3 + ld1 {v20.4h}, [x2], x3 + ld1 {v21.4h}, [x2], x3 + ld1 {v22.4h}, [x2], x3 + ld1 {v23.4h}, [x2], x3 + ld1 {v24.4h}, [x2], x3 + ld1 {v25.4h}, [x2], x3 + ld1 {v26.4h}, [x2], x3 + + convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31 + convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31 + do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type + + subs x4, x4, #4 + b.eq 9f + + ld1 {v27.4h}, [x2], x3 + ld1 {v28.4h}, [x2], x3 + ld1 {v29.4h}, [x2], x3 + ld1 {v30.4h}, [x2], x3 + + convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17 + convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17 + do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type + +9: + ret +endfunc +.endm + +do_8tap_4v put +do_8tap_4v avg + + +.macro do_8tap_v_func type, filter, offset, size, bpp +function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1 + uxtw x4, w4 + mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8 + movrel x5, X(ff_vp9_subpel_filters), 256*\offset + add x6, x5, w6, uxtw #4 + mov x5, #\size +.if \size >= 8 + b \type\()_8tap_8v +.else + b \type\()_8tap_4v +.endif +endfunc +.endm + +.macro do_8tap_v_filters size, bpp +do_8tap_v_func put, regular, 1, \size, \bpp +do_8tap_v_func avg, regular, 1, \size, \bpp +do_8tap_v_func put, sharp, 2, \size, \bpp +do_8tap_v_func avg, sharp, 2, \size, \bpp +do_8tap_v_func put, smooth, 0, \size, \bpp +do_8tap_v_func avg, smooth, 0, \size, \bpp +.endm + +.macro do_8tap_v_filters_bpp bpp +do_8tap_v_filters 64, \bpp +do_8tap_v_filters 32, \bpp +do_8tap_v_filters 16, \bpp +do_8tap_v_filters 8, \bpp +do_8tap_v_filters 4, \bpp +.endm + +do_8tap_v_filters_bpp 10 +do_8tap_v_filters_bpp 12 diff --git 
a/media/ffvpx/libavcodec/aarch64/vp9mc_aarch64.S b/media/ffvpx/libavcodec/aarch64/vp9mc_aarch64.S new file mode 100644 index 0000000000..f17a8cf04a --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9mc_aarch64.S @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// All public functions in this file have the following signature: +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, +// const uint8_t *ref, ptrdiff_t ref_stride, +// int h, int mx, int my); + +function ff_vp9_copy128_aarch64, export=1 +1: + ldp x5, x6, [x2] + ldp x7, x8, [x2, #16] + stp x5, x6, [x0] + ldp x9, x10, [x2, #32] + stp x7, x8, [x0, #16] + subs w4, w4, #1 + ldp x11, x12, [x2, #48] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + ldp x5, x6, [x2, #64] + ldp x7, x8, [x2, #80] + stp x5, x6, [x0, #64] + ldp x9, x10, [x2, #96] + stp x7, x8, [x0, #80] + ldp x11, x12, [x2, #112] + stp x9, x10, [x0, #96] + stp x11, x12, [x0, #112] + add x2, x2, x3 + add x0, x0, x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy64_aarch64, export=1 +1: + ldp x5, x6, [x2] + ldp x7, x8, [x2, #16] + stp x5, x6, [x0] + ldp x9, x10, [x2, #32] + stp x7, x8, [x0, #16] + subs w4, w4, #1 + ldp 
x11, x12, [x2, #48] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + add x2, x2, x3 + add x0, x0, x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy32_aarch64, export=1 +1: + ldp x5, x6, [x2] + ldp x7, x8, [x2, #16] + stp x5, x6, [x0] + subs w4, w4, #1 + stp x7, x8, [x0, #16] + add x2, x2, x3 + add x0, x0, x1 + b.ne 1b + ret +endfunc diff --git a/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S new file mode 100644 index 0000000000..abf2bae9db --- /dev/null +++ b/media/ffvpx/libavcodec/aarch64/vp9mc_neon.S @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2016 Google Inc. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +// All public functions in this file have the following signature: +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, +// const uint8_t *ref, ptrdiff_t ref_stride, +// int h, int mx, int my); + +function ff_vp9_avg64_neon, export=1 + mov x5, x0 +1: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3 + urhadd v0.16b, v0.16b, v4.16b + urhadd v1.16b, v1.16b, v5.16b + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 + urhadd v2.16b, v2.16b, v6.16b + urhadd v3.16b, v3.16b, v7.16b + subs w4, w4, #2 + urhadd v16.16b, v16.16b, v20.16b + urhadd v17.16b, v17.16b, v21.16b + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1 + urhadd v18.16b, v18.16b, v22.16b + urhadd v19.16b, v19.16b, v23.16b + st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg32_neon, export=1 +1: + ld1 {v2.16b, v3.16b}, [x2], x3 + ld1 {v0.16b, v1.16b}, [x0] + urhadd v0.16b, v0.16b, v2.16b + urhadd v1.16b, v1.16b, v3.16b + subs w4, w4, #1 + st1 {v0.16b, v1.16b}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy16_neon, export=1 + add x5, x0, x1 + lsl x1, x1, #1 + add x6, x2, x3 + lsl x3, x3, #1 +1: + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x6], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x6], x3 + subs w4, w4, #4 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x5], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg16_neon, export=1 + mov x5, x0 +1: + ld1 {v2.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + urhadd v0.16b, v0.16b, v2.16b + ld1 {v1.16b}, [x0], x1 + urhadd v1.16b, v1.16b, v3.16b + subs w4, w4, #2 + st1 
{v0.16b}, [x5], x1 + st1 {v1.16b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy8_neon, export=1 +1: + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x2], x3 + subs w4, w4, #2 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg8_neon, export=1 + mov x5, x0 +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.8b}, [x0], x1 + urhadd v1.8b, v1.8b, v3.8b + subs w4, w4, #2 + st1 {v0.8b}, [x5], x1 + st1 {v1.8b}, [x5], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_copy4_neon, export=1 +1: + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + st1 {v0.s}[0], [x0], x1 + ld1 {v2.s}[0], [x2], x3 + st1 {v1.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + subs w4, w4, #4 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + b.ne 1b + ret +endfunc + +function ff_vp9_avg4_neon, export=1 + mov x5, x0 +1: + ld1 {v2.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v2.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v1.s}[0], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v1.s}[1], [x0], x1 + subs w4, w4, #4 + urhadd v0.8b, v0.8b, v2.8b + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.s}[0], [x5], x1 + st1 {v0.s}[1], [x5], x1 + st1 {v1.s}[0], [x5], x1 + st1 {v1.s}[1], [x5], x1 + b.ne 1b + ret +endfunc + + +// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 +// for size >= 16), and multiply-accumulate into dst1 and dst3 (or +// dst1-dst2 and dst3-dst4 for size >= 16) +.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size + ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) + ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) +.if \size >= 16 + mla \dst1\().8h, v20.8h, v0.h[\offset] + ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) + mla \dst3\().8h, v22.8h, v0.h[\offset] + ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) + mla \dst2\().8h, v21.8h, v0.h[\offset] + mla \dst4\().8h, v23.8h, 
v0.h[\offset] +.elseif \size == 8 + mla \dst1\().8h, v20.8h, v0.h[\offset] + mla \dst3\().8h, v22.8h, v0.h[\offset] +.else + mla \dst1\().4h, v20.4h, v0.h[\offset] + mla \dst3\().4h, v22.4h, v0.h[\offset] +.endif +.endm +// The same as above, but don't accumulate straight into the +// destination, but use a temp register and accumulate with saturation. +.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size + ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) + ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) +.if \size >= 16 + mul v20.8h, v20.8h, v0.h[\offset] + ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) + mul v22.8h, v22.8h, v0.h[\offset] + ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) + mul v21.8h, v21.8h, v0.h[\offset] + mul v23.8h, v23.8h, v0.h[\offset] +.elseif \size == 8 + mul v20.8h, v20.8h, v0.h[\offset] + mul v22.8h, v22.8h, v0.h[\offset] +.else + mul v20.4h, v20.4h, v0.h[\offset] + mul v22.4h, v22.4h, v0.h[\offset] +.endif +.if \size == 4 + sqadd \dst1\().4h, \dst1\().4h, v20.4h + sqadd \dst3\().4h, \dst3\().4h, v22.4h +.else + sqadd \dst1\().8h, \dst1\().8h, v20.8h + sqadd \dst3\().8h, \dst3\().8h, v22.8h +.if \size >= 16 + sqadd \dst2\().8h, \dst2\().8h, v21.8h + sqadd \dst4\().8h, \dst4\().8h, v23.8h +.endif +.endif +.endm + + +// Instantiate a horizontal filter function for the given size. +// This can work on 4, 8 or 16 pixels in parallel; for larger +// widths it will do 16 pixels at a time and loop horizontally. +// The actual width is passed in x5, the height in w4 and the +// filter coefficients in x9. idx2 is the index of the largest +// filter coefficient (3 or 4) and idx1 is the other one of them. 
+.macro do_8tap_h type, size, idx1, idx2 +function \type\()_8tap_\size\()h_\idx1\idx2 + sub x2, x2, #3 + add x6, x0, x1 + add x7, x2, x3 + add x1, x1, x1 + add x3, x3, x3 + // Only size >= 16 loops horizontally and needs + // reduced dst stride +.if \size >= 16 + sub x1, x1, x5 +.endif + // size >= 16 loads two qwords and increments x2, + // for size 4/8 it's enough with one qword and no + // postincrement +.if \size >= 16 + sub x3, x3, x5 + sub x3, x3, #8 +.endif + // Load the filter vector + ld1 {v0.8h}, [x9] +1: +.if \size >= 16 + mov x9, x5 +.endif + // Load src +.if \size >= 16 + ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24 + ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24 +.else + ld1 {v4.8b, v5.8b}, [x2] + ld1 {v16.8b, v17.8b}, [x7] +.endif + uxtl v4.8h, v4.8b + uxtl v5.8h, v5.8b + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b +.if \size >= 16 + uxtl v6.8h, v6.8b + uxtl v18.8h, v18.8b +.endif +2: + + // Accumulate, adding idx2 last with a separate + // saturating add. The positive filter coefficients + // for all indices except idx2 must add up to less + // than 127 for this not to overflow. 
+ mul v1.8h, v4.8h, v0.h[0] + mul v24.8h, v16.8h, v0.h[0] +.if \size >= 16 + mul v2.8h, v5.8h, v0.h[0] + mul v25.8h, v17.8h, v0.h[0] +.endif + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size + extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size + extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size + + // Round, shift and saturate + sqrshrun v1.8b, v1.8h, #7 + sqrshrun v24.8b, v24.8h, #7 +.if \size >= 16 + sqrshrun2 v1.16b, v2.8h, #7 + sqrshrun2 v24.16b, v25.8h, #7 +.endif + // Average +.ifc \type,avg +.if \size >= 16 + ld1 {v2.16b}, [x0] + ld1 {v3.16b}, [x6] + urhadd v1.16b, v1.16b, v2.16b + urhadd v24.16b, v24.16b, v3.16b +.elseif \size == 8 + ld1 {v2.8b}, [x0] + ld1 {v3.8b}, [x6] + urhadd v1.8b, v1.8b, v2.8b + urhadd v24.8b, v24.8b, v3.8b +.else + ld1 {v2.s}[0], [x0] + ld1 {v3.s}[0], [x6] + urhadd v1.8b, v1.8b, v2.8b + urhadd v24.8b, v24.8b, v3.8b +.endif +.endif + // Store and loop horizontally (for size >= 16) +.if \size >= 16 + subs x9, x9, #16 + st1 {v1.16b}, [x0], #16 + st1 {v24.16b}, [x6], #16 + b.eq 3f + mov v4.16b, v6.16b + mov v16.16b, v18.16b + ld1 {v6.16b}, [x2], #16 + ld1 {v18.16b}, [x7], #16 + uxtl v5.8h, v6.8b + uxtl2 v6.8h, v6.16b + uxtl v17.8h, v18.8b + uxtl2 v18.8h, v18.16b + b 2b +.elseif \size == 8 + st1 {v1.8b}, [x0] + st1 {v24.8b}, [x6] +.else // \size == 4 + st1 {v1.s}[0], [x0] + st1 {v24.s}[0], [x6] +.endif +3: + // Loop vertically + add x0, x0, x1 + add x6, x6, x1 + add x2, x2, x3 + add x7, x7, x3 + subs w4, w4, #2 + b.ne 1b + ret +endfunc +.endm + +.macro do_8tap_h_size size +do_8tap_h put, \size, 3, 4 +do_8tap_h avg, \size, 3, 4 +do_8tap_h put, \size, 4, 3 +do_8tap_h avg, \size, 4, 3 +.endm + +do_8tap_h_size 4 +do_8tap_h_size 8 +do_8tap_h_size 16 + 
+.macro do_8tap_h_func type, filter, offset, size +function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1 + movrel x6, X(ff_vp9_subpel_filters), 256*\offset + cmp w5, #8 + add x9, x6, w5, uxtw #4 + mov x5, #\size +.if \size >= 16 + b.ge \type\()_8tap_16h_34 + b \type\()_8tap_16h_43 +.else + b.ge \type\()_8tap_\size\()h_34 + b \type\()_8tap_\size\()h_43 +.endif +endfunc +.endm + +.macro do_8tap_h_filters size +do_8tap_h_func put, regular, 1, \size +do_8tap_h_func avg, regular, 1, \size +do_8tap_h_func put, sharp, 2, \size +do_8tap_h_func avg, sharp, 2, \size +do_8tap_h_func put, smooth, 0, \size +do_8tap_h_func avg, smooth, 0, \size +.endm + +do_8tap_h_filters 64 +do_8tap_h_filters 32 +do_8tap_h_filters 16 +do_8tap_h_filters 8 +do_8tap_h_filters 4 + + +// Vertical filters + +// Round, shift and saturate and store reg1-reg2 over 4 lines +.macro do_store4 reg1, reg2, tmp1, tmp2, type + sqrshrun \reg1\().8b, \reg1\().8h, #7 + sqrshrun \reg2\().8b, \reg2\().8h, #7 +.ifc \type,avg + ld1 {\tmp1\().s}[0], [x7], x1 + ld1 {\tmp2\().s}[0], [x7], x1 + ld1 {\tmp1\().s}[1], [x7], x1 + ld1 {\tmp2\().s}[1], [x7], x1 + urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b + urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b +.endif + st1 {\reg1\().s}[0], [x0], x1 + st1 {\reg2\().s}[0], [x0], x1 + st1 {\reg1\().s}[1], [x0], x1 + st1 {\reg2\().s}[1], [x0], x1 +.endm + +// Round, shift and saturate and store reg1-4 +.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type + sqrshrun \reg1\().8b, \reg1\().8h, #7 + sqrshrun \reg2\().8b, \reg2\().8h, #7 + sqrshrun \reg3\().8b, \reg3\().8h, #7 + sqrshrun \reg4\().8b, \reg4\().8h, #7 +.ifc \type,avg + ld1 {\tmp1\().8b}, [x7], x1 + ld1 {\tmp2\().8b}, [x7], x1 + ld1 {\tmp3\().8b}, [x7], x1 + ld1 {\tmp4\().8b}, [x7], x1 + urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b + urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b + urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b + urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b +.endif + st1 {\reg1\().8b}, [x0], 
x1 + st1 {\reg2\().8b}, [x0], x1 + st1 {\reg3\().8b}, [x0], x1 + st1 {\reg4\().8b}, [x0], x1 +.endm + +// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2 +// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately +// at the end with saturation. Indices 0 and 7 always have negative or zero +// coefficients, so they can be accumulated into tmp1-tmp2 together with the +// largest coefficient. +.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2 + mul \dst1\().8h, \src2\().8h, v0.h[1] + mul \dst2\().8h, \src3\().8h, v0.h[1] + mul \tmp1\().8h, \src1\().8h, v0.h[0] + mul \tmp2\().8h, \src2\().8h, v0.h[0] + mla \dst1\().8h, \src3\().8h, v0.h[2] + mla \dst2\().8h, \src4\().8h, v0.h[2] +.if \idx1 == 3 + mla \dst1\().8h, \src4\().8h, v0.h[3] + mla \dst2\().8h, \src5\().8h, v0.h[3] +.else + mla \dst1\().8h, \src5\().8h, v0.h[4] + mla \dst2\().8h, \src6\().8h, v0.h[4] +.endif + mla \dst1\().8h, \src6\().8h, v0.h[5] + mla \dst2\().8h, \src7\().8h, v0.h[5] + mla \tmp1\().8h, \src8\().8h, v0.h[7] + mla \tmp2\().8h, \src9\().8h, v0.h[7] + mla \dst1\().8h, \src7\().8h, v0.h[6] + mla \dst2\().8h, \src8\().8h, v0.h[6] +.if \idx2 == 3 + mla \tmp1\().8h, \src4\().8h, v0.h[3] + mla \tmp2\().8h, \src5\().8h, v0.h[3] +.else + mla \tmp1\().8h, \src5\().8h, v0.h[4] + mla \tmp2\().8h, \src6\().8h, v0.h[4] +.endif + sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h + sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h +.endm + +// Load pixels and extend them to 16 bit +.macro loadl dst1, dst2, dst3, dst4 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 +.ifnb \dst4 + ld1 {v4.8b}, [x2], x3 +.endif + uxtl \dst1\().8h, v1.8b + uxtl \dst2\().8h, v2.8b + uxtl \dst3\().8h, v3.8b +.ifnb \dst4 + uxtl \dst4\().8h, v4.8b +.endif +.endm + +// Instantiate a vertical filter function for filtering 8 pixels at a time. +// The height is passed in x4, the width in x5 and the filter coefficients +// in x6. 
idx2 is the index of the largest filter coefficient (3 or 4) +// and idx1 is the other one of them. +.macro do_8tap_8v type, idx1, idx2 +function \type\()_8tap_8v_\idx1\idx2 + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +1: +.ifc \type,avg + mov x7, x0 +.endif + mov x6, x4 + + loadl v17, v18, v19 + + loadl v20, v21, v22, v23 +2: + loadl v24, v25, v26, v27 + convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6 + convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6 + do_store v1, v2, v3, v4, v5, v6, v7, v28, \type + + subs x6, x6, #4 + b.eq 8f + + loadl v16, v17, v18, v19 + convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6 + convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6 + do_store v1, v2, v3, v4, v5, v6, v7, v28, \type + + subs x6, x6, #4 + b.eq 8f + + loadl v20, v21, v22, v23 + convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6 + convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6 + do_store v1, v2, v3, v4, v5, v6, v7, v28, \type + + subs x6, x6, #4 + b.ne 2b + +8: + subs x5, x5, #8 + b.eq 9f + // x0 -= h * dst_stride + msub x0, x1, x4, x0 + // x2 -= h * src_stride + msub x2, x3, x4, x2 + // x2 -= 8 * src_stride + sub x2, x2, x3, lsl #3 + // x2 += 1 * src_stride + add x2, x2, x3 + add x2, x2, #8 + add x0, x0, #8 + b 1b +9: + ret +endfunc +.endm + +do_8tap_8v put, 3, 4 +do_8tap_8v put, 4, 3 +do_8tap_8v avg, 3, 4 +do_8tap_8v avg, 4, 3 + + +// Instantiate a vertical filter function for filtering a 4 pixels wide +// slice. The first half of the registers contain one row, while the second +// half of a register contains the second-next row (also stored in the first +// half of the register two steps ahead). The convolution does two outputs +// at a time; the output of v17-v24 into one, and v18-v25 into another one. 
+// The first half of first output is the first output row, the first half +// of the other output is the second output row. The second halves of the +// registers are rows 3 and 4. +// This only is designed to work for 4 or 8 output lines. +.macro do_8tap_4v type, idx1, idx2 +function \type\()_8tap_4v_\idx1\idx2 + sub x2, x2, x3, lsl #1 + sub x2, x2, x3 + ld1 {v0.8h}, [x6] +.ifc \type,avg + mov x7, x0 +.endif + + ld1 {v1.s}[0], [x2], x3 + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x2], x3 + ld1 {v4.s}[0], [x2], x3 + ld1 {v5.s}[0], [x2], x3 + ld1 {v6.s}[0], [x2], x3 + trn1 v1.2s, v1.2s, v3.2s + ld1 {v7.s}[0], [x2], x3 + trn1 v2.2s, v2.2s, v4.2s + ld1 {v26.s}[0], [x2], x3 + uxtl v17.8h, v1.8b + trn1 v3.2s, v3.2s, v5.2s + ld1 {v27.s}[0], [x2], x3 + uxtl v18.8h, v2.8b + trn1 v4.2s, v4.2s, v6.2s + ld1 {v28.s}[0], [x2], x3 + uxtl v19.8h, v3.8b + trn1 v5.2s, v5.2s, v7.2s + ld1 {v29.s}[0], [x2], x3 + uxtl v20.8h, v4.8b + trn1 v6.2s, v6.2s, v26.2s + uxtl v21.8h, v5.8b + trn1 v7.2s, v7.2s, v27.2s + uxtl v22.8h, v6.8b + trn1 v26.2s, v26.2s, v28.2s + uxtl v23.8h, v7.8b + trn1 v27.2s, v27.2s, v29.2s + uxtl v24.8h, v26.8b + uxtl v25.8h, v27.8b + + convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4 + do_store4 v1, v2, v5, v6, \type + + subs x4, x4, #4 + b.eq 9f + + ld1 {v1.s}[0], [x2], x3 + ld1 {v2.s}[0], [x2], x3 + trn1 v28.2s, v28.2s, v1.2s + trn1 v29.2s, v29.2s, v2.2s + ld1 {v1.s}[1], [x2], x3 + uxtl v26.8h, v28.8b + ld1 {v2.s}[1], [x2], x3 + uxtl v27.8h, v29.8b + uxtl v28.8h, v1.8b + uxtl v29.8h, v2.8b + + convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4 + do_store4 v1, v2, v5, v6, \type + +9: + ret +endfunc +.endm + +do_8tap_4v put, 3, 4 +do_8tap_4v put, 4, 3 +do_8tap_4v avg, 3, 4 +do_8tap_4v avg, 4, 3 + + +.macro do_8tap_v_func type, filter, offset, size +function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1 + uxtw x4, w4 + movrel x5, X(ff_vp9_subpel_filters), 256*\offset + cmp w6, #8 + add x6, x5, w6, 
uxtw #4 + mov x5, #\size +.if \size >= 8 + b.ge \type\()_8tap_8v_34 + b \type\()_8tap_8v_43 +.else + b.ge \type\()_8tap_4v_34 + b \type\()_8tap_4v_43 +.endif +endfunc +.endm + +.macro do_8tap_v_filters size +do_8tap_v_func put, regular, 1, \size +do_8tap_v_func avg, regular, 1, \size +do_8tap_v_func put, sharp, 2, \size +do_8tap_v_func avg, sharp, 2, \size +do_8tap_v_func put, smooth, 0, \size +do_8tap_v_func avg, smooth, 0, \size +.endm + +do_8tap_v_filters 64 +do_8tap_v_filters 32 +do_8tap_v_filters 16 +do_8tap_v_filters 8 +do_8tap_v_filters 4 diff --git a/media/ffvpx/libavcodec/allcodecs.c b/media/ffvpx/libavcodec/allcodecs.c new file mode 100644 index 0000000000..e593ad19af --- /dev/null +++ b/media/ffvpx/libavcodec/allcodecs.c @@ -0,0 +1,995 @@ +/* + * Provide registration of all codecs, parsers and bitstream filters for libavcodec. + * Copyright (c) 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Provide registration of all codecs, parsers and bitstream filters for libavcodec. 
+ */ + +#include <stdint.h> +#include <string.h> + +#include "config.h" +#include "config_components.h" +#include "libavutil/thread.h" +#include "codec.h" +#include "codec_id.h" +#include "codec_internal.h" + +extern const FFCodec ff_a64multi_encoder; +extern const FFCodec ff_a64multi5_encoder; +extern const FFCodec ff_aasc_decoder; +extern const FFCodec ff_aic_decoder; +extern const FFCodec ff_alias_pix_encoder; +extern const FFCodec ff_alias_pix_decoder; +extern const FFCodec ff_agm_decoder; +extern const FFCodec ff_amv_encoder; +extern const FFCodec ff_amv_decoder; +extern const FFCodec ff_anm_decoder; +extern const FFCodec ff_ansi_decoder; +extern const FFCodec ff_apng_encoder; +extern const FFCodec ff_apng_decoder; +extern const FFCodec ff_arbc_decoder; +extern const FFCodec ff_argo_decoder; +extern const FFCodec ff_asv1_encoder; +extern const FFCodec ff_asv1_decoder; +extern const FFCodec ff_asv2_encoder; +extern const FFCodec ff_asv2_decoder; +extern const FFCodec ff_aura_decoder; +extern const FFCodec ff_aura2_decoder; +extern const FFCodec ff_avrp_encoder; +extern const FFCodec ff_avrp_decoder; +extern const FFCodec ff_avrn_decoder; +extern const FFCodec ff_avs_decoder; +extern const FFCodec ff_avui_encoder; +extern const FFCodec ff_avui_decoder; +#if FF_API_AYUV_CODECID +extern const FFCodec ff_ayuv_encoder; +extern const FFCodec ff_ayuv_decoder; +#endif +extern const FFCodec ff_bethsoftvid_decoder; +extern const FFCodec ff_bfi_decoder; +extern const FFCodec ff_bink_decoder; +extern const FFCodec ff_bitpacked_decoder; +extern const FFCodec ff_bitpacked_encoder; +extern const FFCodec ff_bmp_encoder; +extern const FFCodec ff_bmp_decoder; +extern const FFCodec ff_bmv_video_decoder; +extern const FFCodec ff_brender_pix_decoder; +extern const FFCodec ff_c93_decoder; +extern const FFCodec ff_cavs_decoder; +extern const FFCodec ff_cdgraphics_decoder; +extern const FFCodec ff_cdtoons_decoder; +extern const FFCodec ff_cdxl_decoder; +extern const FFCodec 
ff_cfhd_encoder; +extern const FFCodec ff_cfhd_decoder; +extern const FFCodec ff_cinepak_encoder; +extern const FFCodec ff_cinepak_decoder; +extern const FFCodec ff_clearvideo_decoder; +extern const FFCodec ff_cljr_encoder; +extern const FFCodec ff_cljr_decoder; +extern const FFCodec ff_cllc_decoder; +extern const FFCodec ff_comfortnoise_encoder; +extern const FFCodec ff_comfortnoise_decoder; +extern const FFCodec ff_cpia_decoder; +extern const FFCodec ff_cri_decoder; +extern const FFCodec ff_cscd_decoder; +extern const FFCodec ff_cyuv_decoder; +extern const FFCodec ff_dds_decoder; +extern const FFCodec ff_dfa_decoder; +extern const FFCodec ff_dirac_decoder; +extern const FFCodec ff_dnxhd_encoder; +extern const FFCodec ff_dnxhd_decoder; +extern const FFCodec ff_dpx_encoder; +extern const FFCodec ff_dpx_decoder; +extern const FFCodec ff_dsicinvideo_decoder; +extern const FFCodec ff_dvaudio_decoder; +extern const FFCodec ff_dvvideo_encoder; +extern const FFCodec ff_dvvideo_decoder; +extern const FFCodec ff_dxa_decoder; +extern const FFCodec ff_dxtory_decoder; +extern const FFCodec ff_dxv_decoder; +extern const FFCodec ff_eacmv_decoder; +extern const FFCodec ff_eamad_decoder; +extern const FFCodec ff_eatgq_decoder; +extern const FFCodec ff_eatgv_decoder; +extern const FFCodec ff_eatqi_decoder; +extern const FFCodec ff_eightbps_decoder; +extern const FFCodec ff_eightsvx_exp_decoder; +extern const FFCodec ff_eightsvx_fib_decoder; +extern const FFCodec ff_escape124_decoder; +extern const FFCodec ff_escape130_decoder; +extern const FFCodec ff_exr_encoder; +extern const FFCodec ff_exr_decoder; +extern const FFCodec ff_ffv1_encoder; +extern const FFCodec ff_ffv1_decoder; +extern const FFCodec ff_ffvhuff_encoder; +extern const FFCodec ff_ffvhuff_decoder; +extern const FFCodec ff_fic_decoder; +extern const FFCodec ff_fits_encoder; +extern const FFCodec ff_fits_decoder; +extern const FFCodec ff_flashsv_encoder; +extern const FFCodec ff_flashsv_decoder; +extern const FFCodec 
ff_flashsv2_encoder; +extern const FFCodec ff_flashsv2_decoder; +extern const FFCodec ff_flic_decoder; +extern const FFCodec ff_flv_encoder; +extern const FFCodec ff_flv_decoder; +extern const FFCodec ff_fmvc_decoder; +extern const FFCodec ff_fourxm_decoder; +extern const FFCodec ff_fraps_decoder; +extern const FFCodec ff_frwu_decoder; +extern const FFCodec ff_g2m_decoder; +extern const FFCodec ff_gdv_decoder; +extern const FFCodec ff_gem_decoder; +extern const FFCodec ff_gif_encoder; +extern const FFCodec ff_gif_decoder; +extern const FFCodec ff_h261_encoder; +extern const FFCodec ff_h261_decoder; +extern const FFCodec ff_h263_encoder; +extern const FFCodec ff_h263_decoder; +extern const FFCodec ff_h263i_decoder; +extern const FFCodec ff_h263p_encoder; +extern const FFCodec ff_h263p_decoder; +extern const FFCodec ff_h263_v4l2m2m_decoder; +extern const FFCodec ff_h264_decoder; +extern const FFCodec ff_h264_crystalhd_decoder; +extern const FFCodec ff_h264_v4l2m2m_decoder; +extern const FFCodec ff_h264_mediacodec_decoder; +extern const FFCodec ff_h264_mediacodec_encoder; +extern const FFCodec ff_h264_mmal_decoder; +extern const FFCodec ff_h264_qsv_decoder; +extern const FFCodec ff_h264_rkmpp_decoder; +extern const FFCodec ff_hap_encoder; +extern const FFCodec ff_hap_decoder; +extern const FFCodec ff_hevc_decoder; +extern const FFCodec ff_hevc_qsv_decoder; +extern const FFCodec ff_hevc_rkmpp_decoder; +extern const FFCodec ff_hevc_v4l2m2m_decoder; +extern const FFCodec ff_hnm4_video_decoder; +extern const FFCodec ff_hq_hqa_decoder; +extern const FFCodec ff_hqx_decoder; +extern const FFCodec ff_huffyuv_encoder; +extern const FFCodec ff_huffyuv_decoder; +extern const FFCodec ff_hymt_decoder; +extern const FFCodec ff_idcin_decoder; +extern const FFCodec ff_iff_ilbm_decoder; +extern const FFCodec ff_imm4_decoder; +extern const FFCodec ff_imm5_decoder; +extern const FFCodec ff_indeo2_decoder; +extern const FFCodec ff_indeo3_decoder; +extern const FFCodec ff_indeo4_decoder; 
+extern const FFCodec ff_indeo5_decoder; +extern const FFCodec ff_interplay_video_decoder; +extern const FFCodec ff_ipu_decoder; +extern const FFCodec ff_jpeg2000_encoder; +extern const FFCodec ff_jpeg2000_decoder; +extern const FFCodec ff_jpegls_encoder; +extern const FFCodec ff_jpegls_decoder; +extern const FFCodec ff_jv_decoder; +extern const FFCodec ff_kgv1_decoder; +extern const FFCodec ff_kmvc_decoder; +extern const FFCodec ff_lagarith_decoder; +extern const FFCodec ff_ljpeg_encoder; +extern const FFCodec ff_loco_decoder; +extern const FFCodec ff_lscr_decoder; +extern const FFCodec ff_m101_decoder; +extern const FFCodec ff_magicyuv_encoder; +extern const FFCodec ff_magicyuv_decoder; +extern const FFCodec ff_mdec_decoder; +extern const FFCodec ff_media100_decoder; +extern const FFCodec ff_mimic_decoder; +extern const FFCodec ff_mjpeg_encoder; +extern const FFCodec ff_mjpeg_decoder; +extern const FFCodec ff_mjpegb_decoder; +extern const FFCodec ff_mmvideo_decoder; +extern const FFCodec ff_mobiclip_decoder; +extern const FFCodec ff_motionpixels_decoder; +extern const FFCodec ff_mpeg1video_encoder; +extern const FFCodec ff_mpeg1video_decoder; +extern const FFCodec ff_mpeg2video_encoder; +extern const FFCodec ff_mpeg2video_decoder; +extern const FFCodec ff_mpeg4_encoder; +extern const FFCodec ff_mpeg4_decoder; +extern const FFCodec ff_mpeg4_crystalhd_decoder; +extern const FFCodec ff_mpeg4_v4l2m2m_decoder; +extern const FFCodec ff_mpeg4_mmal_decoder; +extern const FFCodec ff_mpegvideo_decoder; +extern const FFCodec ff_mpeg1_v4l2m2m_decoder; +extern const FFCodec ff_mpeg2_mmal_decoder; +extern const FFCodec ff_mpeg2_crystalhd_decoder; +extern const FFCodec ff_mpeg2_v4l2m2m_decoder; +extern const FFCodec ff_mpeg2_qsv_decoder; +extern const FFCodec ff_mpeg2_mediacodec_decoder; +extern const FFCodec ff_msa1_decoder; +extern const FFCodec ff_mscc_decoder; +extern const FFCodec ff_msmpeg4v1_decoder; +extern const FFCodec ff_msmpeg4v2_encoder; +extern const FFCodec 
ff_msmpeg4v2_decoder; +extern const FFCodec ff_msmpeg4v3_encoder; +extern const FFCodec ff_msmpeg4v3_decoder; +extern const FFCodec ff_msmpeg4_crystalhd_decoder; +extern const FFCodec ff_msp2_decoder; +extern const FFCodec ff_msrle_decoder; +extern const FFCodec ff_mss1_decoder; +extern const FFCodec ff_mss2_decoder; +extern const FFCodec ff_msvideo1_encoder; +extern const FFCodec ff_msvideo1_decoder; +extern const FFCodec ff_mszh_decoder; +extern const FFCodec ff_mts2_decoder; +extern const FFCodec ff_mv30_decoder; +extern const FFCodec ff_mvc1_decoder; +extern const FFCodec ff_mvc2_decoder; +extern const FFCodec ff_mvdv_decoder; +extern const FFCodec ff_mvha_decoder; +extern const FFCodec ff_mwsc_decoder; +extern const FFCodec ff_mxpeg_decoder; +extern const FFCodec ff_notchlc_decoder; +extern const FFCodec ff_nuv_decoder; +extern const FFCodec ff_paf_video_decoder; +extern const FFCodec ff_pam_encoder; +extern const FFCodec ff_pam_decoder; +extern const FFCodec ff_pbm_encoder; +extern const FFCodec ff_pbm_decoder; +extern const FFCodec ff_pcx_encoder; +extern const FFCodec ff_pcx_decoder; +extern const FFCodec ff_pfm_encoder; +extern const FFCodec ff_pfm_decoder; +extern const FFCodec ff_pgm_encoder; +extern const FFCodec ff_pgm_decoder; +extern const FFCodec ff_pgmyuv_encoder; +extern const FFCodec ff_pgmyuv_decoder; +extern const FFCodec ff_pgx_decoder; +extern const FFCodec ff_phm_encoder; +extern const FFCodec ff_phm_decoder; +extern const FFCodec ff_photocd_decoder; +extern const FFCodec ff_pictor_decoder; +extern const FFCodec ff_pixlet_decoder; +extern const FFCodec ff_png_encoder; +extern const FFCodec ff_png_decoder; +extern const FFCodec ff_ppm_encoder; +extern const FFCodec ff_ppm_decoder; +extern const FFCodec ff_prores_encoder; +extern const FFCodec ff_prores_decoder; +extern const FFCodec ff_prores_aw_encoder; +extern const FFCodec ff_prores_ks_encoder; +extern const FFCodec ff_prosumer_decoder; +extern const FFCodec ff_psd_decoder; +extern const 
FFCodec ff_ptx_decoder; +extern const FFCodec ff_qdraw_decoder; +extern const FFCodec ff_qoi_encoder; +extern const FFCodec ff_qoi_decoder; +extern const FFCodec ff_qpeg_decoder; +extern const FFCodec ff_qtrle_encoder; +extern const FFCodec ff_qtrle_decoder; +extern const FFCodec ff_r10k_encoder; +extern const FFCodec ff_r10k_decoder; +extern const FFCodec ff_r210_encoder; +extern const FFCodec ff_r210_decoder; +extern const FFCodec ff_rasc_decoder; +extern const FFCodec ff_rawvideo_encoder; +extern const FFCodec ff_rawvideo_decoder; +extern const FFCodec ff_rka_decoder; +extern const FFCodec ff_rl2_decoder; +extern const FFCodec ff_roq_encoder; +extern const FFCodec ff_roq_decoder; +extern const FFCodec ff_rpza_encoder; +extern const FFCodec ff_rpza_decoder; +extern const FFCodec ff_rscc_decoder; +extern const FFCodec ff_rv10_encoder; +extern const FFCodec ff_rv10_decoder; +extern const FFCodec ff_rv20_encoder; +extern const FFCodec ff_rv20_decoder; +extern const FFCodec ff_rv30_decoder; +extern const FFCodec ff_rv40_decoder; +extern const FFCodec ff_s302m_encoder; +extern const FFCodec ff_s302m_decoder; +extern const FFCodec ff_sanm_decoder; +extern const FFCodec ff_scpr_decoder; +extern const FFCodec ff_screenpresso_decoder; +extern const FFCodec ff_sga_decoder; +extern const FFCodec ff_sgi_encoder; +extern const FFCodec ff_sgi_decoder; +extern const FFCodec ff_sgirle_decoder; +extern const FFCodec ff_sheervideo_decoder; +extern const FFCodec ff_simbiosis_imx_decoder; +extern const FFCodec ff_smacker_decoder; +extern const FFCodec ff_smc_encoder; +extern const FFCodec ff_smc_decoder; +extern const FFCodec ff_smvjpeg_decoder; +extern const FFCodec ff_snow_encoder; +extern const FFCodec ff_snow_decoder; +extern const FFCodec ff_sp5x_decoder; +extern const FFCodec ff_speedhq_decoder; +extern const FFCodec ff_speedhq_encoder; +extern const FFCodec ff_speex_decoder; +extern const FFCodec ff_srgc_decoder; +extern const FFCodec ff_sunrast_encoder; +extern const FFCodec 
ff_sunrast_decoder; +extern const FFCodec ff_svq1_encoder; +extern const FFCodec ff_svq1_decoder; +extern const FFCodec ff_svq3_decoder; +extern const FFCodec ff_targa_encoder; +extern const FFCodec ff_targa_decoder; +extern const FFCodec ff_targa_y216_decoder; +extern const FFCodec ff_tdsc_decoder; +extern const FFCodec ff_theora_decoder; +extern const FFCodec ff_thp_decoder; +extern const FFCodec ff_tiertexseqvideo_decoder; +extern const FFCodec ff_tiff_encoder; +extern const FFCodec ff_tiff_decoder; +extern const FFCodec ff_tmv_decoder; +extern const FFCodec ff_truemotion1_decoder; +extern const FFCodec ff_truemotion2_decoder; +extern const FFCodec ff_truemotion2rt_decoder; +extern const FFCodec ff_tscc_decoder; +extern const FFCodec ff_tscc2_decoder; +extern const FFCodec ff_txd_decoder; +extern const FFCodec ff_ulti_decoder; +extern const FFCodec ff_utvideo_encoder; +extern const FFCodec ff_utvideo_decoder; +extern const FFCodec ff_v210_encoder; +extern const FFCodec ff_v210_decoder; +extern const FFCodec ff_v210x_decoder; +extern const FFCodec ff_v308_encoder; +extern const FFCodec ff_v308_decoder; +extern const FFCodec ff_v408_encoder; +extern const FFCodec ff_v408_decoder; +extern const FFCodec ff_v410_encoder; +extern const FFCodec ff_v410_decoder; +extern const FFCodec ff_vb_decoder; +extern const FFCodec ff_vbn_encoder; +extern const FFCodec ff_vbn_decoder; +extern const FFCodec ff_vble_decoder; +extern const FFCodec ff_vc1_decoder; +extern const FFCodec ff_vc1_crystalhd_decoder; +extern const FFCodec ff_vc1image_decoder; +extern const FFCodec ff_vc1_mmal_decoder; +extern const FFCodec ff_vc1_qsv_decoder; +extern const FFCodec ff_vc1_v4l2m2m_decoder; +extern const FFCodec ff_vc2_encoder; +extern const FFCodec ff_vcr1_decoder; +extern const FFCodec ff_vmdvideo_decoder; +extern const FFCodec ff_vmnc_decoder; +extern const FFCodec ff_vp3_decoder; +extern const FFCodec ff_vp4_decoder; +extern const FFCodec ff_vp5_decoder; +extern const FFCodec 
ff_vp6_decoder; +extern const FFCodec ff_vp6a_decoder; +extern const FFCodec ff_vp6f_decoder; +extern const FFCodec ff_vp7_decoder; +extern const FFCodec ff_vp8_decoder; +extern const FFCodec ff_vp8_rkmpp_decoder; +extern const FFCodec ff_vp8_v4l2m2m_decoder; +extern const FFCodec ff_vp9_decoder; +extern const FFCodec ff_vp9_rkmpp_decoder; +extern const FFCodec ff_vp9_v4l2m2m_decoder; +extern const FFCodec ff_vqa_decoder; +extern const FFCodec ff_vqc_decoder; +extern const FFCodec ff_wbmp_decoder; +extern const FFCodec ff_wbmp_encoder; +extern const FFCodec ff_webp_decoder; +extern const FFCodec ff_wcmv_decoder; +extern const FFCodec ff_wrapped_avframe_encoder; +extern const FFCodec ff_wrapped_avframe_decoder; +extern const FFCodec ff_wmv1_encoder; +extern const FFCodec ff_wmv1_decoder; +extern const FFCodec ff_wmv2_encoder; +extern const FFCodec ff_wmv2_decoder; +extern const FFCodec ff_wmv3_decoder; +extern const FFCodec ff_wmv3_crystalhd_decoder; +extern const FFCodec ff_wmv3image_decoder; +extern const FFCodec ff_wnv1_decoder; +extern const FFCodec ff_xan_wc3_decoder; +extern const FFCodec ff_xan_wc4_decoder; +extern const FFCodec ff_xbm_encoder; +extern const FFCodec ff_xbm_decoder; +extern const FFCodec ff_xface_encoder; +extern const FFCodec ff_xface_decoder; +extern const FFCodec ff_xl_decoder; +extern const FFCodec ff_xpm_decoder; +extern const FFCodec ff_xwd_encoder; +extern const FFCodec ff_xwd_decoder; +extern const FFCodec ff_y41p_encoder; +extern const FFCodec ff_y41p_decoder; +extern const FFCodec ff_ylc_decoder; +extern const FFCodec ff_yop_decoder; +extern const FFCodec ff_yuv4_encoder; +extern const FFCodec ff_yuv4_decoder; +extern const FFCodec ff_zero12v_decoder; +extern const FFCodec ff_zerocodec_decoder; +extern const FFCodec ff_zlib_encoder; +extern const FFCodec ff_zlib_decoder; +extern const FFCodec ff_zmbv_encoder; +extern const FFCodec ff_zmbv_decoder; + +/* audio codecs */ +extern const FFCodec ff_aac_encoder; +extern const FFCodec 
ff_aac_decoder; +extern const FFCodec ff_aac_fixed_decoder; +extern const FFCodec ff_aac_latm_decoder; +extern const FFCodec ff_ac3_encoder; +extern const FFCodec ff_ac3_decoder; +extern const FFCodec ff_ac3_fixed_encoder; +extern const FFCodec ff_ac3_fixed_decoder; +extern const FFCodec ff_acelp_kelvin_decoder; +extern const FFCodec ff_alac_encoder; +extern const FFCodec ff_alac_decoder; +extern const FFCodec ff_als_decoder; +extern const FFCodec ff_amrnb_decoder; +extern const FFCodec ff_amrwb_decoder; +extern const FFCodec ff_apac_decoder; +extern const FFCodec ff_ape_decoder; +extern const FFCodec ff_aptx_encoder; +extern const FFCodec ff_aptx_decoder; +extern const FFCodec ff_aptx_hd_encoder; +extern const FFCodec ff_aptx_hd_decoder; +extern const FFCodec ff_atrac1_decoder; +extern const FFCodec ff_atrac3_decoder; +extern const FFCodec ff_atrac3al_decoder; +extern const FFCodec ff_atrac3p_decoder; +extern const FFCodec ff_atrac3pal_decoder; +extern const FFCodec ff_atrac9_decoder; +extern const FFCodec ff_binkaudio_dct_decoder; +extern const FFCodec ff_binkaudio_rdft_decoder; +extern const FFCodec ff_bmv_audio_decoder; +extern const FFCodec ff_bonk_decoder; +extern const FFCodec ff_cook_decoder; +extern const FFCodec ff_dca_encoder; +extern const FFCodec ff_dca_decoder; +extern const FFCodec ff_dfpwm_encoder; +extern const FFCodec ff_dfpwm_decoder; +extern const FFCodec ff_dolby_e_decoder; +extern const FFCodec ff_dsd_lsbf_decoder; +extern const FFCodec ff_dsd_msbf_decoder; +extern const FFCodec ff_dsd_lsbf_planar_decoder; +extern const FFCodec ff_dsd_msbf_planar_decoder; +extern const FFCodec ff_dsicinaudio_decoder; +extern const FFCodec ff_dss_sp_decoder; +extern const FFCodec ff_dst_decoder; +extern const FFCodec ff_eac3_encoder; +extern const FFCodec ff_eac3_decoder; +extern const FFCodec ff_evrc_decoder; +extern const FFCodec ff_fastaudio_decoder; +extern const FFCodec ff_ffwavesynth_decoder; +extern const FFCodec ff_flac_encoder; +extern const FFCodec 
ff_flac_decoder; +extern const FFCodec ff_ftr_decoder; +extern const FFCodec ff_g723_1_encoder; +extern const FFCodec ff_g723_1_decoder; +extern const FFCodec ff_g729_decoder; +extern const FFCodec ff_gsm_decoder; +extern const FFCodec ff_gsm_ms_decoder; +extern const FFCodec ff_hca_decoder; +extern const FFCodec ff_hcom_decoder; +extern const FFCodec ff_hdr_encoder; +extern const FFCodec ff_hdr_decoder; +extern const FFCodec ff_iac_decoder; +extern const FFCodec ff_ilbc_decoder; +extern const FFCodec ff_imc_decoder; +extern const FFCodec ff_interplay_acm_decoder; +extern const FFCodec ff_mace3_decoder; +extern const FFCodec ff_mace6_decoder; +extern const FFCodec ff_metasound_decoder; +extern const FFCodec ff_misc4_decoder; +extern const FFCodec ff_mlp_encoder; +extern const FFCodec ff_mlp_decoder; +extern const FFCodec ff_mp1_decoder; +extern const FFCodec ff_mp1float_decoder; +extern const FFCodec ff_mp2_encoder; +extern const FFCodec ff_mp2_decoder; +extern const FFCodec ff_mp2float_decoder; +extern const FFCodec ff_mp2fixed_encoder; +extern const FFCodec ff_mp3float_decoder; +extern const FFCodec ff_mp3_decoder; +extern const FFCodec ff_mp3adufloat_decoder; +extern const FFCodec ff_mp3adu_decoder; +extern const FFCodec ff_mp3on4float_decoder; +extern const FFCodec ff_mp3on4_decoder; +extern const FFCodec ff_mpc7_decoder; +extern const FFCodec ff_mpc8_decoder; +extern const FFCodec ff_msnsiren_decoder; +extern const FFCodec ff_nellymoser_encoder; +extern const FFCodec ff_nellymoser_decoder; +extern const FFCodec ff_on2avc_decoder; +extern const FFCodec ff_opus_encoder; +extern const FFCodec ff_opus_decoder; +extern const FFCodec ff_paf_audio_decoder; +extern const FFCodec ff_qcelp_decoder; +extern const FFCodec ff_qdm2_decoder; +extern const FFCodec ff_qdmc_decoder; +extern const FFCodec ff_ra_144_encoder; +extern const FFCodec ff_ra_144_decoder; +extern const FFCodec ff_ra_288_decoder; +extern const FFCodec ff_ralf_decoder; +extern const FFCodec 
ff_sbc_encoder; +extern const FFCodec ff_sbc_decoder; +extern const FFCodec ff_shorten_decoder; +extern const FFCodec ff_sipr_decoder; +extern const FFCodec ff_siren_decoder; +extern const FFCodec ff_smackaud_decoder; +extern const FFCodec ff_sonic_encoder; +extern const FFCodec ff_sonic_decoder; +extern const FFCodec ff_sonic_ls_encoder; +extern const FFCodec ff_tak_decoder; +extern const FFCodec ff_truehd_encoder; +extern const FFCodec ff_truehd_decoder; +extern const FFCodec ff_truespeech_decoder; +extern const FFCodec ff_tta_encoder; +extern const FFCodec ff_tta_decoder; +extern const FFCodec ff_twinvq_decoder; +extern const FFCodec ff_vmdaudio_decoder; +extern const FFCodec ff_vorbis_encoder; +extern const FFCodec ff_vorbis_decoder; +extern const FFCodec ff_wavarc_decoder; +extern const FFCodec ff_wavpack_encoder; +extern const FFCodec ff_wavpack_decoder; +extern const FFCodec ff_wmalossless_decoder; +extern const FFCodec ff_wmapro_decoder; +extern const FFCodec ff_wmav1_encoder; +extern const FFCodec ff_wmav1_decoder; +extern const FFCodec ff_wmav2_encoder; +extern const FFCodec ff_wmav2_decoder; +extern const FFCodec ff_wmavoice_decoder; +extern const FFCodec ff_ws_snd1_decoder; +extern const FFCodec ff_xma1_decoder; +extern const FFCodec ff_xma2_decoder; + +/* PCM codecs */ +extern const FFCodec ff_pcm_alaw_encoder; +extern const FFCodec ff_pcm_alaw_decoder; +extern const FFCodec ff_pcm_bluray_encoder; +extern const FFCodec ff_pcm_bluray_decoder; +extern const FFCodec ff_pcm_dvd_encoder; +extern const FFCodec ff_pcm_dvd_decoder; +extern const FFCodec ff_pcm_f16le_decoder; +extern const FFCodec ff_pcm_f24le_decoder; +extern const FFCodec ff_pcm_f32be_encoder; +extern const FFCodec ff_pcm_f32be_decoder; +extern const FFCodec ff_pcm_f32le_encoder; +extern const FFCodec ff_pcm_f32le_decoder; +extern const FFCodec ff_pcm_f64be_encoder; +extern const FFCodec ff_pcm_f64be_decoder; +extern const FFCodec ff_pcm_f64le_encoder; +extern const FFCodec 
ff_pcm_f64le_decoder; +extern const FFCodec ff_pcm_lxf_decoder; +extern const FFCodec ff_pcm_mulaw_encoder; +extern const FFCodec ff_pcm_mulaw_decoder; +extern const FFCodec ff_pcm_s8_encoder; +extern const FFCodec ff_pcm_s8_decoder; +extern const FFCodec ff_pcm_s8_planar_encoder; +extern const FFCodec ff_pcm_s8_planar_decoder; +extern const FFCodec ff_pcm_s16be_encoder; +extern const FFCodec ff_pcm_s16be_decoder; +extern const FFCodec ff_pcm_s16be_planar_encoder; +extern const FFCodec ff_pcm_s16be_planar_decoder; +extern const FFCodec ff_pcm_s16le_encoder; +extern const FFCodec ff_pcm_s16le_decoder; +extern const FFCodec ff_pcm_s16le_planar_encoder; +extern const FFCodec ff_pcm_s16le_planar_decoder; +extern const FFCodec ff_pcm_s24be_encoder; +extern const FFCodec ff_pcm_s24be_decoder; +extern const FFCodec ff_pcm_s24daud_encoder; +extern const FFCodec ff_pcm_s24daud_decoder; +extern const FFCodec ff_pcm_s24le_encoder; +extern const FFCodec ff_pcm_s24le_decoder; +extern const FFCodec ff_pcm_s24le_planar_encoder; +extern const FFCodec ff_pcm_s24le_planar_decoder; +extern const FFCodec ff_pcm_s32be_encoder; +extern const FFCodec ff_pcm_s32be_decoder; +extern const FFCodec ff_pcm_s32le_encoder; +extern const FFCodec ff_pcm_s32le_decoder; +extern const FFCodec ff_pcm_s32le_planar_encoder; +extern const FFCodec ff_pcm_s32le_planar_decoder; +extern const FFCodec ff_pcm_s64be_encoder; +extern const FFCodec ff_pcm_s64be_decoder; +extern const FFCodec ff_pcm_s64le_encoder; +extern const FFCodec ff_pcm_s64le_decoder; +extern const FFCodec ff_pcm_sga_decoder; +extern const FFCodec ff_pcm_u8_encoder; +extern const FFCodec ff_pcm_u8_decoder; +extern const FFCodec ff_pcm_u16be_encoder; +extern const FFCodec ff_pcm_u16be_decoder; +extern const FFCodec ff_pcm_u16le_encoder; +extern const FFCodec ff_pcm_u16le_decoder; +extern const FFCodec ff_pcm_u24be_encoder; +extern const FFCodec ff_pcm_u24be_decoder; +extern const FFCodec ff_pcm_u24le_encoder; +extern const FFCodec 
ff_pcm_u24le_decoder; +extern const FFCodec ff_pcm_u32be_encoder; +extern const FFCodec ff_pcm_u32be_decoder; +extern const FFCodec ff_pcm_u32le_encoder; +extern const FFCodec ff_pcm_u32le_decoder; +extern const FFCodec ff_pcm_vidc_encoder; +extern const FFCodec ff_pcm_vidc_decoder; + +/* DPCM codecs */ +extern const FFCodec ff_cbd2_dpcm_decoder; +extern const FFCodec ff_derf_dpcm_decoder; +extern const FFCodec ff_gremlin_dpcm_decoder; +extern const FFCodec ff_interplay_dpcm_decoder; +extern const FFCodec ff_roq_dpcm_encoder; +extern const FFCodec ff_roq_dpcm_decoder; +extern const FFCodec ff_sdx2_dpcm_decoder; +extern const FFCodec ff_sol_dpcm_decoder; +extern const FFCodec ff_xan_dpcm_decoder; +extern const FFCodec ff_wady_dpcm_decoder; + +/* ADPCM codecs */ +extern const FFCodec ff_adpcm_4xm_decoder; +extern const FFCodec ff_adpcm_adx_encoder; +extern const FFCodec ff_adpcm_adx_decoder; +extern const FFCodec ff_adpcm_afc_decoder; +extern const FFCodec ff_adpcm_agm_decoder; +extern const FFCodec ff_adpcm_aica_decoder; +extern const FFCodec ff_adpcm_argo_decoder; +extern const FFCodec ff_adpcm_argo_encoder; +extern const FFCodec ff_adpcm_ct_decoder; +extern const FFCodec ff_adpcm_dtk_decoder; +extern const FFCodec ff_adpcm_ea_decoder; +extern const FFCodec ff_adpcm_ea_maxis_xa_decoder; +extern const FFCodec ff_adpcm_ea_r1_decoder; +extern const FFCodec ff_adpcm_ea_r2_decoder; +extern const FFCodec ff_adpcm_ea_r3_decoder; +extern const FFCodec ff_adpcm_ea_xas_decoder; +extern const FFCodec ff_adpcm_g722_encoder; +extern const FFCodec ff_adpcm_g722_decoder; +extern const FFCodec ff_adpcm_g726_encoder; +extern const FFCodec ff_adpcm_g726_decoder; +extern const FFCodec ff_adpcm_g726le_encoder; +extern const FFCodec ff_adpcm_g726le_decoder; +extern const FFCodec ff_adpcm_ima_acorn_decoder; +extern const FFCodec ff_adpcm_ima_amv_decoder; +extern const FFCodec ff_adpcm_ima_amv_encoder; +extern const FFCodec ff_adpcm_ima_alp_decoder; +extern const FFCodec 
ff_adpcm_ima_alp_encoder; +extern const FFCodec ff_adpcm_ima_apc_decoder; +extern const FFCodec ff_adpcm_ima_apm_decoder; +extern const FFCodec ff_adpcm_ima_apm_encoder; +extern const FFCodec ff_adpcm_ima_cunning_decoder; +extern const FFCodec ff_adpcm_ima_dat4_decoder; +extern const FFCodec ff_adpcm_ima_dk3_decoder; +extern const FFCodec ff_adpcm_ima_dk4_decoder; +extern const FFCodec ff_adpcm_ima_ea_eacs_decoder; +extern const FFCodec ff_adpcm_ima_ea_sead_decoder; +extern const FFCodec ff_adpcm_ima_iss_decoder; +extern const FFCodec ff_adpcm_ima_moflex_decoder; +extern const FFCodec ff_adpcm_ima_mtf_decoder; +extern const FFCodec ff_adpcm_ima_oki_decoder; +extern const FFCodec ff_adpcm_ima_qt_encoder; +extern const FFCodec ff_adpcm_ima_qt_decoder; +extern const FFCodec ff_adpcm_ima_rad_decoder; +extern const FFCodec ff_adpcm_ima_ssi_decoder; +extern const FFCodec ff_adpcm_ima_ssi_encoder; +extern const FFCodec ff_adpcm_ima_smjpeg_decoder; +extern const FFCodec ff_adpcm_ima_wav_encoder; +extern const FFCodec ff_adpcm_ima_wav_decoder; +extern const FFCodec ff_adpcm_ima_ws_encoder; +extern const FFCodec ff_adpcm_ima_ws_decoder; +extern const FFCodec ff_adpcm_ms_encoder; +extern const FFCodec ff_adpcm_ms_decoder; +extern const FFCodec ff_adpcm_mtaf_decoder; +extern const FFCodec ff_adpcm_psx_decoder; +extern const FFCodec ff_adpcm_sbpro_2_decoder; +extern const FFCodec ff_adpcm_sbpro_3_decoder; +extern const FFCodec ff_adpcm_sbpro_4_decoder; +extern const FFCodec ff_adpcm_swf_encoder; +extern const FFCodec ff_adpcm_swf_decoder; +extern const FFCodec ff_adpcm_thp_decoder; +extern const FFCodec ff_adpcm_thp_le_decoder; +extern const FFCodec ff_adpcm_vima_decoder; +extern const FFCodec ff_adpcm_xa_decoder; +extern const FFCodec ff_adpcm_xmd_decoder; +extern const FFCodec ff_adpcm_yamaha_encoder; +extern const FFCodec ff_adpcm_yamaha_decoder; +extern const FFCodec ff_adpcm_zork_decoder; + +/* subtitles */ +extern const FFCodec ff_ssa_encoder; +extern const FFCodec 
ff_ssa_decoder; +extern const FFCodec ff_ass_encoder; +extern const FFCodec ff_ass_decoder; +extern const FFCodec ff_ccaption_decoder; +extern const FFCodec ff_dvbsub_encoder; +extern const FFCodec ff_dvbsub_decoder; +extern const FFCodec ff_dvdsub_encoder; +extern const FFCodec ff_dvdsub_decoder; +extern const FFCodec ff_jacosub_decoder; +extern const FFCodec ff_microdvd_decoder; +extern const FFCodec ff_movtext_encoder; +extern const FFCodec ff_movtext_decoder; +extern const FFCodec ff_mpl2_decoder; +extern const FFCodec ff_pgssub_decoder; +extern const FFCodec ff_pjs_decoder; +extern const FFCodec ff_realtext_decoder; +extern const FFCodec ff_sami_decoder; +extern const FFCodec ff_srt_encoder; +extern const FFCodec ff_srt_decoder; +extern const FFCodec ff_stl_decoder; +extern const FFCodec ff_subrip_encoder; +extern const FFCodec ff_subrip_decoder; +extern const FFCodec ff_subviewer_decoder; +extern const FFCodec ff_subviewer1_decoder; +extern const FFCodec ff_text_encoder; +extern const FFCodec ff_text_decoder; +extern const FFCodec ff_ttml_encoder; +extern const FFCodec ff_vplayer_decoder; +extern const FFCodec ff_webvtt_encoder; +extern const FFCodec ff_webvtt_decoder; +extern const FFCodec ff_xsub_encoder; +extern const FFCodec ff_xsub_decoder; + +/* external libraries */ +extern const FFCodec ff_aac_at_encoder; +extern const FFCodec ff_aac_at_decoder; +extern const FFCodec ff_ac3_at_decoder; +extern const FFCodec ff_adpcm_ima_qt_at_decoder; +extern const FFCodec ff_alac_at_encoder; +extern const FFCodec ff_alac_at_decoder; +extern const FFCodec ff_amr_nb_at_decoder; +extern const FFCodec ff_eac3_at_decoder; +extern const FFCodec ff_gsm_ms_at_decoder; +extern const FFCodec ff_ilbc_at_encoder; +extern const FFCodec ff_ilbc_at_decoder; +extern const FFCodec ff_mp1_at_decoder; +extern const FFCodec ff_mp2_at_decoder; +extern const FFCodec ff_mp3_at_decoder; +extern const FFCodec ff_pcm_alaw_at_encoder; +extern const FFCodec ff_pcm_alaw_at_decoder; +extern const 
FFCodec ff_pcm_mulaw_at_encoder; +extern const FFCodec ff_pcm_mulaw_at_decoder; +extern const FFCodec ff_qdmc_at_decoder; +extern const FFCodec ff_qdm2_at_decoder; +extern FFCodec ff_libaom_av1_encoder; +extern const FFCodec ff_libaribb24_decoder; +extern const FFCodec ff_libcelt_decoder; +extern const FFCodec ff_libcodec2_encoder; +extern const FFCodec ff_libcodec2_decoder; +extern const FFCodec ff_libdav1d_decoder; +extern const FFCodec ff_libdavs2_decoder; +extern const FFCodec ff_libfdk_aac_encoder; +extern const FFCodec ff_libfdk_aac_decoder; +extern const FFCodec ff_libgsm_encoder; +extern const FFCodec ff_libgsm_decoder; +extern const FFCodec ff_libgsm_ms_encoder; +extern const FFCodec ff_libgsm_ms_decoder; +extern const FFCodec ff_libilbc_encoder; +extern const FFCodec ff_libilbc_decoder; +extern const FFCodec ff_libjxl_decoder; +extern const FFCodec ff_libjxl_encoder; +extern const FFCodec ff_libmp3lame_encoder; +extern const FFCodec ff_libopencore_amrnb_encoder; +extern const FFCodec ff_libopencore_amrnb_decoder; +extern const FFCodec ff_libopencore_amrwb_decoder; +extern const FFCodec ff_libopenjpeg_encoder; +extern const FFCodec ff_libopenjpeg_decoder; +extern const FFCodec ff_libopus_encoder; +extern const FFCodec ff_libopus_decoder; +extern const FFCodec ff_librav1e_encoder; +extern const FFCodec ff_librsvg_decoder; +extern const FFCodec ff_libshine_encoder; +extern const FFCodec ff_libspeex_encoder; +extern const FFCodec ff_libspeex_decoder; +extern const FFCodec ff_libsvtav1_encoder; +extern const FFCodec ff_libtheora_encoder; +extern const FFCodec ff_libtwolame_encoder; +extern const FFCodec ff_libuavs3d_decoder; +extern const FFCodec ff_libvo_amrwbenc_encoder; +extern const FFCodec ff_libvorbis_encoder; +extern const FFCodec ff_libvorbis_decoder; +extern const FFCodec ff_libvpx_vp8_encoder; +extern const FFCodec ff_libvpx_vp8_decoder; +extern FFCodec ff_libvpx_vp9_encoder; +extern FFCodec ff_libvpx_vp9_decoder; +/* preferred over libwebp */ 
+extern const FFCodec ff_libwebp_anim_encoder; +extern const FFCodec ff_libwebp_encoder; +extern const FFCodec ff_libx262_encoder; +#if CONFIG_LIBX264_ENCODER +#include <x264.h> +#if X264_BUILD < 153 +#define LIBX264_CONST +#else +#define LIBX264_CONST const +#endif +extern LIBX264_CONST FFCodec ff_libx264_encoder; +#endif +extern const FFCodec ff_libx264rgb_encoder; +extern FFCodec ff_libx265_encoder; +extern const FFCodec ff_libxavs_encoder; +extern const FFCodec ff_libxavs2_encoder; +extern const FFCodec ff_libxvid_encoder; +extern const FFCodec ff_libzvbi_teletext_decoder; + +/* text */ +extern const FFCodec ff_bintext_decoder; +extern const FFCodec ff_xbin_decoder; +extern const FFCodec ff_idf_decoder; + +/* external libraries, that shouldn't be used by default if one of the + * above is available */ +extern const FFCodec ff_aac_mf_encoder; +extern const FFCodec ff_ac3_mf_encoder; +extern const FFCodec ff_h263_v4l2m2m_encoder; +extern const FFCodec ff_libaom_av1_decoder; +/* hwaccel hooks only, so prefer external decoders */ +extern const FFCodec ff_av1_decoder; +extern const FFCodec ff_av1_cuvid_decoder; +extern const FFCodec ff_av1_mediacodec_decoder; +extern const FFCodec ff_av1_nvenc_encoder; +extern const FFCodec ff_av1_qsv_decoder; +extern const FFCodec ff_av1_qsv_encoder; +extern const FFCodec ff_av1_amf_encoder; +extern const FFCodec ff_libopenh264_encoder; +extern const FFCodec ff_libopenh264_decoder; +extern const FFCodec ff_h264_amf_encoder; +extern const FFCodec ff_h264_cuvid_decoder; +extern const FFCodec ff_h264_mf_encoder; +extern const FFCodec ff_h264_nvenc_encoder; +extern const FFCodec ff_h264_omx_encoder; +extern const FFCodec ff_h264_qsv_encoder; +extern const FFCodec ff_h264_v4l2m2m_encoder; +extern const FFCodec ff_h264_vaapi_encoder; +extern const FFCodec ff_h264_videotoolbox_encoder; +extern const FFCodec ff_hevc_amf_encoder; +extern const FFCodec ff_hevc_cuvid_decoder; +extern const FFCodec ff_hevc_mediacodec_decoder; +extern const 
FFCodec ff_hevc_mediacodec_encoder; +extern const FFCodec ff_hevc_mf_encoder; +extern const FFCodec ff_hevc_nvenc_encoder; +extern const FFCodec ff_hevc_qsv_encoder; +extern const FFCodec ff_hevc_v4l2m2m_encoder; +extern const FFCodec ff_hevc_vaapi_encoder; +extern const FFCodec ff_hevc_videotoolbox_encoder; +extern const FFCodec ff_libkvazaar_encoder; +extern const FFCodec ff_mjpeg_cuvid_decoder; +extern const FFCodec ff_mjpeg_qsv_encoder; +extern const FFCodec ff_mjpeg_qsv_decoder; +extern const FFCodec ff_mjpeg_vaapi_encoder; +extern const FFCodec ff_mp3_mf_encoder; +extern const FFCodec ff_mpeg1_cuvid_decoder; +extern const FFCodec ff_mpeg2_cuvid_decoder; +extern const FFCodec ff_mpeg2_qsv_encoder; +extern const FFCodec ff_mpeg2_vaapi_encoder; +extern const FFCodec ff_mpeg4_cuvid_decoder; +extern const FFCodec ff_mpeg4_mediacodec_decoder; +extern const FFCodec ff_mpeg4_omx_encoder; +extern const FFCodec ff_mpeg4_v4l2m2m_encoder; +extern const FFCodec ff_prores_videotoolbox_encoder; +extern const FFCodec ff_vc1_cuvid_decoder; +extern const FFCodec ff_vp8_cuvid_decoder; +extern const FFCodec ff_vp8_mediacodec_decoder; +extern const FFCodec ff_vp8_qsv_decoder; +extern const FFCodec ff_vp8_v4l2m2m_encoder; +extern const FFCodec ff_vp8_vaapi_encoder; +extern const FFCodec ff_vp9_cuvid_decoder; +extern const FFCodec ff_vp9_mediacodec_decoder; +extern const FFCodec ff_vp9_qsv_decoder; +extern const FFCodec ff_vp9_vaapi_encoder; +extern const FFCodec ff_vp9_qsv_encoder; + +// null codecs +extern const FFCodec ff_vnull_decoder; +extern const FFCodec ff_vnull_encoder; +extern const FFCodec ff_anull_decoder; +extern const FFCodec ff_anull_encoder; + +// The iterate API is not usable with ossfuzz due to the excessive size of binaries created +#if CONFIG_OSSFUZZ +const FFCodec * codec_list[] = { + NULL, + NULL, + NULL +}; +#else +#include "libavcodec/codec_list.c" +#endif + +static AVOnce av_codec_static_init = AV_ONCE_INIT; +static void av_codec_init_static(void) +{ + for 
(int i = 0; codec_list[i]; i++) { + if (codec_list[i]->init_static_data) + codec_list[i]->init_static_data((FFCodec*)codec_list[i]); + } +} + +/* Codec iterator: *opaque carries the index of the next codec_list entry. Runs av_codec_init_static() once via ff_thread_once(), then returns the public AVCodec (&c->p) of the current entry and stores index + 1 in *opaque, or returns NULL without touching *opaque when the entry is NULL. */ const AVCodec *av_codec_iterate(void **opaque) +{ + uintptr_t i = (uintptr_t)*opaque; + const FFCodec *c = codec_list[i]; + + ff_thread_once(&av_codec_static_init, av_codec_init_static); + + if (c) { + *opaque = (void*)(i + 1); + return &c->p; + } + return NULL; +} + +static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id) +{ + switch(id){ + //This is for future deprecated codec ids, it's empty since + //last major bump but will fill up again over time, please don't remove it + default : return id; + } +} + +/* Scan every codec accepted by the filter x for one whose id equals the (remapped) id. The first match flagged AV_CODEC_CAP_EXPERIMENTAL is only remembered as a fallback; any other match is returned immediately. Returns the fallback (possibly NULL) when the scan ends. */ static const AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *)) +{ + const AVCodec *p, *experimental = NULL; + void *i = 0; + + id = remap_deprecated_codec_id(id); + + while ((p = av_codec_iterate(&i))) { + if (!x(p)) + continue; + if (p->id == id) { + if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { + experimental = p; + } else + return p; + } + } + + return experimental; +} + +const AVCodec *avcodec_find_encoder(enum AVCodecID id) +{ + return find_codec(id, av_codec_is_encoder); +} + +const AVCodec *avcodec_find_decoder(enum AVCodecID id) +{ + return find_codec(id, av_codec_is_decoder); +} + +/* Return the first codec accepted by the filter x whose name compares equal to name with strcmp(); NULL when name is NULL or nothing matches. */ static const AVCodec *find_codec_by_name(const char *name, int (*x)(const AVCodec *)) +{ + void *i = 0; + const AVCodec *p; + + if (!name) + return NULL; + + while ((p = av_codec_iterate(&i))) { + if (!x(p)) + continue; + if (strcmp(name, p->name) == 0) + return p; + } + + return NULL; +} + +const AVCodec *avcodec_find_encoder_by_name(const char *name) +{ + return find_codec_by_name(name, av_codec_is_encoder); +} + +const AVCodec *avcodec_find_decoder_by_name(const char *name) +{ + return find_codec_by_name(name, av_codec_is_decoder); +} diff --git a/media/ffvpx/libavcodec/arm/fft_init_arm.c b/media/ffvpx/libavcodec/arm/fft_init_arm.c new file mode 100644 index 0000000000..8ae22dfb4e --- 
/dev/null +++ b/media/ffvpx/libavcodec/arm/fft_init_arm.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" + +#include "libavcodec/fft.h" + +void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); + +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); + +void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); + +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); + +av_cold void ff_fft_init_arm(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_vfp_vm(cpu_flags)) { + s->fft_calc = ff_fft_calc_vfp; +#if CONFIG_MDCT + s->imdct_half = ff_imdct_half_vfp; +#endif + } + + if (have_neon(cpu_flags)) { +#if CONFIG_FFT + if (s->nbits < 17) { + s->fft_permute = ff_fft_permute_neon; + s->fft_calc = ff_fft_calc_neon; + } +#endif +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = 
ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; +#endif + } +} diff --git a/media/ffvpx/libavcodec/arm/fft_neon.S b/media/ffvpx/libavcodec/arm/fft_neon.S new file mode 100644 index 0000000000..48f8dfc424 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/fft_neon.S @@ -0,0 +1,375 @@ +/* + * ARM NEON optimised FFT + * + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2009 Naotoshi Nojiri + * + * This algorithm (though not any of the implementation details) is + * based on libdjbfft by D. J. Bernstein. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define M_SQRT1_2 0.70710678118654752440 + + +function fft4_neon + vld1.32 {d0-d3}, [r0,:128] + + vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 + vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 + vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 + vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 + vadd.f32 d1, d6, d7 + vsub.f32 d3, d6, d7 + vadd.f32 d0, d4, d5 + vsub.f32 d2, d4, d5 + + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft8_neon + mov r1, r0 + vld1.32 {d0-d3}, [r1,:128]! 
+ vld1.32 {d16-d19}, [r1,:128] + + movw r2, #0x04f3 @ sqrt(1/2) + movt r2, #0x3f35 + eor r3, r2, #1<<31 + vdup.32 d31, r2 + + vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 + vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 + vmov d28, r3, r2 + vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 + vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 + vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 + vrev64.32 d29, d28 + vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 + vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w + vext.32 q3, q2, q2, #1 + vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w + vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 + vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 + vmul.f32 d24, d17, d31 @ a2r*w,a2i*w + vmul.f32 d25, d19, d31 @ a3r*w,a3i*w + vadd.f32 d0, d20, d21 + vsub.f32 d2, d20, d21 + vadd.f32 d1, d22, d23 + vrev64.32 q13, q13 + vsub.f32 d3, d22, d23 + vsub.f32 d6, d6, d7 + vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 + vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 + vadd.f32 d7, d4, d5 + vsub.f32 d18, d2, d6 + vext.32 q13, q12, q12, #1 + vadd.f32 d2, d2, d6 + vsub.f32 d16, d0, d7 + vadd.f32 d5, d25, d24 + vsub.f32 d4, d26, d27 + vadd.f32 d0, d0, d7 + vsub.f32 d17, d1, d5 + vsub.f32 d19, d3, d4 + vadd.f32 d3, d3, d4 + vadd.f32 d1, d1, d5 + + vst1.32 {d16-d19}, [r1,:128] + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft16_neon + movrel r1, mppm + vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} + pld [r0, #32] + vld1.32 {d2-d3}, [r1,:128] + vext.32 q13, q9, q9, #1 + vld1.32 {d22-d25}, [r0,:128]! 
@ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7} + vadd.f32 d4, d16, d17 + vsub.f32 d5, d16, d17 + vadd.f32 d18, d18, d19 + vsub.f32 d19, d26, d27 + + vadd.f32 d20, d22, d23 + vsub.f32 d22, d22, d23 + vsub.f32 d23, d24, d25 + vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} + vadd.f32 d21, d24, d25 + vmul.f32 d24, d22, d2 + vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} + vmul.f32 d25, d23, d3 + vuzp.32 d16, d17 @ {r0,r1,i0,i1} + vmul.f32 q1, q11, d2[1] + vuzp.32 d18, d19 @ {r2,r3,i2,i3} + vrev64.32 q12, q12 + vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} + vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} + vzip.32 q10, q11 + vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + sub r0, r0, #96 + vext.32 q13, q13, q13, #1 + vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vext.32 q15, q15, q15, #1 + vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} + vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} + vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} + vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} + vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} + vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} + movrelx r2, X(ff_cos_16) + vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} + vrev64.32 d1, d1 + vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} + vrev64.32 d3, d3 + movrel r3, pmmp + vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} + vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} + vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} + vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} + vld1.32 {d4-d5}, [r2,:64] + vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} + vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} + vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} + vld1.32 {d6-d7}, [r3,:128] + vrev64.32 q1, q14 + vmul.f32 q14, q14, d4[1] + vmul.f32 q1, q1, q3 + vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} + vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} + vzip.32 q12, q14 + vadd.f32 d0, d28, d24 + vadd.f32 
d1, d25, d29 + vsub.f32 d2, d25, d29 + vsub.f32 d3, d28, d24 + vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} + mov r1, #32 + vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} + vrev64.32 q0, q13 + vmul.f32 q13, q13, d5[0] + vrev64.32 q1, q15 + vmul.f32 q15, q15, d5[1] + vst2.32 {d16-d17},[r0,:128], r1 + vmul.f32 q0, q0, q3 + vst2.32 {d20-d21},[r0,:128], r1 + vmul.f32 q1, q1, q3 + vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} + vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} + vst2.32 {d24-d25},[r0,:128], r1 + vst2.32 {d28-d29},[r0,:128] + vzip.32 q13, q15 + sub r0, r0, #80 + vadd.f32 d0, d30, d26 + vadd.f32 d1, d27, d31 + vsub.f32 d2, d27, d31 + vsub.f32 d3, d30, d26 + vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} + vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} + vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} + vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} + vst2.32 {d18-d19},[r0,:128], r1 + vst2.32 {d22-d23},[r0,:128], r1 + vst2.32 {d26-d27},[r0,:128], r1 + vst2.32 {d30-d31},[r0,:128] + bx lr +endfunc + +function fft_pass_neon + push {r4-r6,lr} + mov r6, r2 @ n + lsl r5, r2, #3 @ 2 * n * sizeof FFTSample + lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex + lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex + add r3, r2, r4 + add r4, r4, r0 @ &z[o1] + add r2, r2, r0 @ &z[o2] + add r3, r3, r0 @ &z[o3] + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + movrel r12, pmmp + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + add r5, r5, r1 @ wim + vld1.32 {d6-d7}, [r12,:128] @ pmmp + vswp d21, d22 + vld1.32 {d4}, [r1,:64]! 
@ {wre[0],wre[1]} + sub r5, r5, #4 @ wim-- + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vmul.f32 q1, q1, q3 + vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + sub r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} + sub r5, r5, #8 @ wim -= 2 +1: + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + vswp d21, d22 + vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} + vrev64.32 q0, q10 + vmul.f32 q10, q10, d4[0] + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} + vmul.f32 q0, q0, q3 + sub r5, r5, #8 @ wim -= 2 + vmul.f32 q1, q1, q3 + vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + subs r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19}, [r4,:128]! 
@ {z[o1],z[o1+1]} + bne 1b + + pop {r4-r6,pc} +endfunc + +.macro def_fft n, n2, n4 + .align 6 +function fft\n\()_neon + push {r4, lr} + mov r4, r0 + bl fft\n2\()_neon + add r0, r4, #\n4*2*8 + bl fft\n4\()_neon + add r0, r4, #\n4*3*8 + bl fft\n4\()_neon + mov r0, r4 + pop {r4, lr} + movrelx r1, X(ff_cos_\n) + mov r2, #\n4/2 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 + +function ff_fft_calc_neon, export=1 + ldr r2, [r0] + sub r2, r2, #2 + movrel r3, fft_tab_neon + ldr r3, [r3, r2, lsl #2] + mov r0, r1 + bx r3 +endfunc + +function ff_fft_permute_neon, export=1 + push {r4,lr} + mov r12, #1 + ldr r2, [r0] @ nbits + ldr r3, [r0, #12] @ tmp_buf + ldr r0, [r0, #8] @ revtab + lsl r12, r12, r2 + mov r2, r12 +1: + vld1.32 {d0-d1}, [r1,:128]! + ldr r4, [r0], #4 + uxth lr, r4 + uxth r4, r4, ror #16 + add lr, r3, lr, lsl #3 + add r4, r3, r4, lsl #3 + vst1.32 {d0}, [lr,:64] + vst1.32 {d1}, [r4,:64] + subs r12, r12, #2 + bgt 1b + + sub r1, r1, r2, lsl #3 +1: + vld1.32 {d0-d3}, [r3,:128]! + vst1.32 {d0-d3}, [r1,:128]! 
+ subs r2, r2, #4 + bgt 1b + + pop {r4,pc} +endfunc + +const fft_tab_neon, relocate=1 + .word fft4_neon + .word fft8_neon + .word fft16_neon + .word fft32_neon + .word fft64_neon + .word fft128_neon + .word fft256_neon + .word fft512_neon + .word fft1024_neon + .word fft2048_neon + .word fft4096_neon + .word fft8192_neon + .word fft16384_neon + .word fft32768_neon + .word fft65536_neon +endconst + +const pmmp, align=4 + .float +1.0, -1.0, -1.0, +1.0 +endconst + +const mppm, align=4 + .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +endconst diff --git a/media/ffvpx/libavcodec/arm/fft_vfp.S b/media/ffvpx/libavcodec/arm/fft_vfp.S new file mode 100644 index 0000000000..ac601325f2 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/fft_vfp.S @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2013 RISC OS Open Ltd + * Author: Ben Avison <bavison@riscosopen.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +@ The fftx_internal_vfp versions of the functions obey a modified AAPCS: +@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and +@ all single-precision VFP registers may be corrupted on exit. The a2 +@ register may not be clobbered in these functions, as it holds the +@ stored original FPSCR. 
+ +function ff_fft_calc_vfp, export=1 + ldr ip, [a1, #0] @ nbits + mov a1, a2 + movrel a2, (fft_tab_vfp - 8) + ldr pc, [a2, ip, lsl #2] +endfunc +const fft_tab_vfp, relocate=1 + .word fft4_vfp + .word fft8_vfp + .word X(ff_fft16_vfp) @ this one alone is exported + .word fft32_vfp + .word fft64_vfp + .word fft128_vfp + .word fft256_vfp + .word fft512_vfp + .word fft1024_vfp + .word fft2048_vfp + .word fft4096_vfp + .word fft8192_vfp + .word fft16384_vfp + .word fft32768_vfp + .word fft65536_vfp +endconst + +function fft4_vfp + vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] + vldr d4, [a1, #1*2*4] @ s8,s9 = z[1] + vldr d1, [a1, #2*2*4] @ s2,s3 = z[2] + vldr d5, [a1, #3*2*4] @ s10,s11 = z[3] + @ stall + vadd.f s12, s0, s8 @ i0 + vadd.f s13, s1, s9 @ i1 + vadd.f s14, s2, s10 @ i2 + vadd.f s15, s3, s11 @ i3 + vsub.f s8, s0, s8 @ i4 + vsub.f s9, s1, s9 @ i5 + vsub.f s10, s2, s10 @ i6 + vsub.f s11, s3, s11 @ i7 + @ stall + @ stall + vadd.f s0, s12, s14 @ z[0].re + vsub.f s4, s12, s14 @ z[2].re + vadd.f s1, s13, s15 @ z[0].im + vsub.f s5, s13, s15 @ z[2].im + vadd.f s7, s9, s10 @ z[3].im + vsub.f s3, s9, s10 @ z[1].im + vadd.f s2, s8, s11 @ z[1].re + vsub.f s6, s8, s11 @ z[3].re + @ stall + @ stall + vstr d0, [a1, #0*2*4] + vstr d2, [a1, #2*2*4] + @ stall + @ stall + vstr d1, [a1, #1*2*4] + vstr d3, [a1, #3*2*4] + + bx lr +endfunc + +.macro macro_fft8_head + @ FFT4 + vldr d4, [a1, #0 * 2*4] + vldr d6, [a1, #1 * 2*4] + vldr d5, [a1, #2 * 2*4] + vldr d7, [a1, #3 * 2*4] + @ BF + vldr d12, [a1, #4 * 2*4] + vadd.f s16, s8, s12 @ vector op + vldr d14, [a1, #5 * 2*4] + vldr d13, [a1, #6 * 2*4] + vldr d15, [a1, #7 * 2*4] + vsub.f s20, s8, s12 @ vector op + vadd.f s0, s16, s18 + vsub.f s2, s16, s18 + vadd.f s1, s17, s19 + vsub.f s3, s17, s19 + vadd.f s7, s21, s22 + vsub.f s5, s21, s22 + vadd.f s4, s20, s23 + vsub.f s6, s20, s23 + vsub.f s20, s24, s28 @ vector op + vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory + vstr d1, [a1, #1 * 2*4] + vldr s0, cos1pi4 + vadd.f s16, s24, 
s28 @ vector op + vstr d2, [a1, #2 * 2*4] + vstr d3, [a1, #3 * 2*4] + vldr d12, [a1, #0 * 2*4] + @ TRANSFORM + vmul.f s20, s20, s0 @ vector x scalar op + vldr d13, [a1, #1 * 2*4] + vldr d14, [a1, #2 * 2*4] + vldr d15, [a1, #3 * 2*4] + @ BUTTERFLIES + vadd.f s0, s18, s16 + vadd.f s1, s17, s19 + vsub.f s2, s17, s19 + vsub.f s3, s18, s16 + vadd.f s4, s21, s20 + vsub.f s5, s21, s20 + vadd.f s6, s22, s23 + vsub.f s7, s22, s23 + vadd.f s8, s0, s24 @ vector op + vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory + vstr d1, [a1, #1 * 2*4] + vldr d6, [a1, #0 * 2*4] + vldr d7, [a1, #1 * 2*4] + vadd.f s1, s5, s6 + vadd.f s0, s7, s4 + vsub.f s2, s5, s6 + vsub.f s3, s7, s4 + vsub.f s12, s24, s12 @ vector op + vsub.f s5, s29, s1 + vsub.f s4, s28, s0 + vsub.f s6, s30, s2 + vsub.f s7, s31, s3 + vadd.f s16, s0, s28 @ vector op + vstr d6, [a1, #4 * 2*4] + vstr d7, [a1, #6 * 2*4] + vstr d4, [a1, #0 * 2*4] + vstr d5, [a1, #2 * 2*4] + vstr d2, [a1, #5 * 2*4] + vstr d3, [a1, #7 * 2*4] +.endm + +.macro macro_fft8_tail + vstr d8, [a1, #1 * 2*4] + vstr d9, [a1, #3 * 2*4] +.endm + +function .Lfft8_internal_vfp + macro_fft8_head + macro_fft8_tail + bx lr +endfunc + +function fft8_vfp + ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 + fmrx a2, FPSCR + fmxr FPSCR, a3 + vpush {s16-s31} + mov ip, lr + bl .Lfft8_internal_vfp + vpop {s16-s31} + fmxr FPSCR, a2 + bx ip +endfunc + +.align 3 +cos1pi4: @ cos(1*pi/4) = sqrt(2)/2 + .float 0.707106769084930419921875 +cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 + .float 0.92387950420379638671875 +cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2 + .float 0.3826834261417388916015625 + +function .Lfft16_internal_vfp + macro_fft8_head + @ FFT4(z+8) + vldr d10, [a1, #8 * 2*4] + vldr d12, [a1, #9 * 2*4] + vldr d11, [a1, #10 * 2*4] + vldr d13, [a1, #11 * 2*4] + macro_fft8_tail + vadd.f s16, s20, s24 @ vector op + @ FFT4(z+12) + vldr d4, [a1, #12 * 2*4] + vldr d6, [a1, #13 * 2*4] + vldr d5, [a1, #14 * 2*4] + vsub.f s20, s20, s24 @ vector op + 
vldr d7, [a1, #15 * 2*4] + vadd.f s0, s16, s18 + vsub.f s4, s16, s18 + vadd.f s1, s17, s19 + vsub.f s5, s17, s19 + vadd.f s7, s21, s22 + vsub.f s3, s21, s22 + vadd.f s2, s20, s23 + vsub.f s6, s20, s23 + vadd.f s16, s8, s12 @ vector op + vstr d0, [a1, #8 * 2*4] + vstr d2, [a1, #10 * 2*4] + vstr d1, [a1, #9 * 2*4] + vsub.f s20, s8, s12 + vstr d3, [a1, #11 * 2*4] + @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4) + vldr d12, [a1, #10 * 2*4] + vadd.f s0, s16, s18 + vadd.f s1, s17, s19 + vsub.f s6, s16, s18 + vsub.f s7, s17, s19 + vsub.f s3, s21, s22 + vadd.f s2, s20, s23 + vadd.f s5, s21, s22 + vsub.f s4, s20, s23 + vstr d0, [a1, #12 * 2*4] + vmov s0, s6 + @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8) + vldr d6, [a1, #9 * 2*4] + vstr d1, [a1, #13 * 2*4] + vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8 + vstr d2, [a1, #15 * 2*4] + vldr d7, [a1, #13 * 2*4] + vadd.f s4, s25, s24 + vsub.f s5, s25, s24 + vsub.f s6, s0, s7 + vadd.f s7, s0, s7 + vmul.f s20, s12, s3 @ vector op + @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8) + vldr d4, [a1, #11 * 2*4] + vldr d5, [a1, #15 * 2*4] + vldr s1, cos3pi8 + vmul.f s24, s4, s2 @ vector * scalar op + vmul.f s28, s12, s1 @ vector * scalar op + vmul.f s12, s8, s1 @ vector * scalar op + vadd.f s4, s20, s29 + vsub.f s5, s21, s28 + vsub.f s6, s22, s31 + vadd.f s7, s23, s30 + vmul.f s8, s8, s3 @ vector * scalar op + vldr d8, [a1, #1 * 2*4] + vldr d9, [a1, #5 * 2*4] + vldr d10, [a1, #3 * 2*4] + vldr d11, [a1, #7 * 2*4] + vldr d14, [a1, #2 * 2*4] + vadd.f s0, s6, s4 + vadd.f s1, s5, s7 + vsub.f s2, s5, s7 + vsub.f s3, s6, s4 + vadd.f s4, s12, s9 + vsub.f s5, s13, s8 + vsub.f s6, s14, s11 + vadd.f s7, s15, s10 + vadd.f s12, s0, s16 @ vector op + vstr d0, [a1, #1 * 2*4] + vstr d1, [a1, #5 * 2*4] + vldr d4, [a1, #1 * 2*4] + vldr d5, [a1, #5 * 2*4] + vadd.f s0, s6, s4 + vadd.f s1, s5, s7 + vsub.f s2, s5, s7 + vsub.f s3, s6, s4 + vsub.f s8, s16, s8 @ vector op + vstr d6, [a1, #1 * 2*4] + vstr d7, [a1, #5 * 2*4] + vldr d15, [a1, #6 * 2*4] + 
vsub.f s4, s20, s0 + vsub.f s5, s21, s1 + vsub.f s6, s22, s2 + vsub.f s7, s23, s3 + vadd.f s20, s0, s20 @ vector op + vstr d4, [a1, #9 * 2*4] + @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12]) + vldr d6, [a1, #8 * 2*4] + vstr d5, [a1, #13 * 2*4] + vldr d7, [a1, #12 * 2*4] + vstr d2, [a1, #11 * 2*4] + vldr d8, [a1, #0 * 2*4] + vstr d3, [a1, #15 * 2*4] + vldr d9, [a1, #4 * 2*4] + vadd.f s0, s26, s24 + vadd.f s1, s25, s27 + vsub.f s2, s25, s27 + vsub.f s3, s26, s24 + vadd.f s4, s14, s12 + vadd.f s5, s13, s15 + vsub.f s6, s13, s15 + vsub.f s7, s14, s12 + vadd.f s8, s0, s28 @ vector op + vstr d0, [a1, #3 * 2*4] + vstr d1, [a1, #7 * 2*4] + vldr d6, [a1, #3 * 2*4] + vldr d7, [a1, #7 * 2*4] + vsub.f s0, s16, s4 + vsub.f s1, s17, s5 + vsub.f s2, s18, s6 + vsub.f s3, s19, s7 + vsub.f s12, s28, s12 @ vector op + vadd.f s16, s4, s16 @ vector op + vstr d10, [a1, #3 * 2*4] + vstr d11, [a1, #7 * 2*4] + vstr d4, [a1, #2 * 2*4] + vstr d5, [a1, #6 * 2*4] + vstr d0, [a1, #8 * 2*4] + vstr d1, [a1, #12 * 2*4] + vstr d6, [a1, #10 * 2*4] + vstr d7, [a1, #14 * 2*4] + vstr d8, [a1, #0 * 2*4] + vstr d9, [a1, #4 * 2*4] + + bx lr +endfunc + +function ff_fft16_vfp, export=1 + ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 + fmrx a2, FPSCR + fmxr FPSCR, a3 + vpush {s16-s31} + mov ip, lr + bl .Lfft16_internal_vfp + vpop {s16-s31} + fmxr FPSCR, a2 + bx ip +endfunc + +.macro pass n, z0, z1, z2, z3 + add v6, v5, #4*2*\n + @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) + @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 + vldmdb v6!, {s2} + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 + vldmia v5!, {s0,s1} @ s0 is unused + vldr s7, [\z2, #8*o2] @ t1 + vmul.f s20, s16, s2 @ vector * scalar + vldr s0, [\z3, #8*o3] @ t5 + vldr s6, [\z2, #8*o2+4] @ t2 + vldr s3, [\z3, #8*o3+4] @ t6 + vmul.f s16, s16, s1 @ vector * scalar + ldr a4, =\n-1 +1: add \z0, \z0, #8*2 
+ .if \n*4*2 >= 512 + add \z1, \z1, #8*2 + .endif + .if \n*4*2 >= 256 + add \z2, \z2, #8*2 + .endif + .if \n*4*2 >= 512 + add \z3, \z3, #8*2 + .endif + @ up to 2 stalls (VFP vector issuing / waiting for s0) + @ depending upon whether this is the first iteration and + @ how many add instructions are inserted above + vadd.f s4, s0, s7 @ t5 + vadd.f s5, s6, s3 @ t6 + vsub.f s6, s6, s3 @ t4 + vsub.f s7, s0, s7 @ t3 + vldr d6, [\z0, #8*0-8*2] @ s12,s13 + vadd.f s0, s16, s21 @ t1 + vldr d7, [\z1, #8*o1-8*2] @ s14,s15 + vsub.f s1, s18, s23 @ t5 + vadd.f s8, s4, s12 @ vector + vector + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vsub.f s2, s17, s20 @ t2 + vadd.f s3, s19, s22 @ t6 + vstr d4, [\z0, #8*0-8*2] @ s8,s9 + vstr d5, [\z1, #8*o1-8*2] @ s10,s11 + @ stall (waiting for s5) + vstr d2, [\z2, #8*o2-8*2] @ s4,s5 + vadd.f s4, s1, s0 @ t5 + vstr d3, [\z3, #8*o3-8*2] @ s6,s7 + vsub.f s7, s1, s0 @ t3 + vadd.f s5, s2, s3 @ t6 + vsub.f s6, s2, s3 @ t4 + vldr d6, [\z0, #8*1-8*2] @ s12,s13 + vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15 + vldr d4, [\z2, #8*o2] @ s8,s9 + vldmdb v6!, {s2,s3} + vldr d5, [\z3, #8*o3] @ s10,s11 + vadd.f s20, s4, s12 @ vector + vector + vldmia v5!, {s0,s1} + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 + @ stall (VFP vector issuing) + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vmul.f s12, s8, s3 @ vector * scalar + vstr d10, [\z0, #8*1-8*2] @ s20,s21 + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 + vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 + vmul.f s8, s8, s0 @ vector * scalar + vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 + @ stall (waiting for s7) + vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 + vmul.f s20, s16, s2 @ vector * scalar + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + vadd.f s7, s8, s13 @ t1 + vsub.f s6, s9, s12 @ t2 + vsub.f s0, s10, s15 @ t5 + vadd.f s3, 
s11, s14 @ t6 + vmul.f s16, s16, s1 @ vector * scalar + subs a4, a4, #1 + bne 1b + @ What remains is identical to the first two indentations of + @ the above, but without the increment of z + vadd.f s4, s0, s7 @ t5 + vadd.f s5, s6, s3 @ t6 + vsub.f s6, s6, s3 @ t4 + vsub.f s7, s0, s7 @ t3 + vldr d6, [\z0, #8*0] @ s12,s13 + vadd.f s0, s16, s21 @ t1 + vldr d7, [\z1, #8*o1] @ s14,s15 + vsub.f s1, s18, s23 @ t5 + vadd.f s8, s4, s12 @ vector + vector + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vsub.f s2, s17, s20 @ t2 + vadd.f s3, s19, s22 @ t6 + vstr d4, [\z0, #8*0] @ s8,s9 + vstr d5, [\z1, #8*o1] @ s10,s11 + vstr d2, [\z2, #8*o2] @ s4,s5 + vadd.f s4, s1, s0 @ t5 + vstr d3, [\z3, #8*o3] @ s6,s7 + vsub.f s7, s1, s0 @ t3 + vadd.f s5, s2, s3 @ t6 + vsub.f s6, s2, s3 @ t4 + vldr d6, [\z0, #8*1] @ s12,s13 + vldr d7, [\z1, #8*(o1+1)] @ s14,s15 + vadd.f s20, s4, s12 @ vector + vector + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vstr d10, [\z0, #8*1] @ s20,s21 + vstr d11, [\z1, #8*(o1+1)] @ s22,s23 + vstr d2, [\z2, #8*(o2+1)] @ s4,s5 + vstr d3, [\z3, #8*(o3+1)] @ s6,s7 +.endm + +.macro def_fft n, n2, n4 +function .Lfft\n\()_internal_vfp + .if \n >= 512 + push {v1-v6,lr} + .elseif \n >= 256 + push {v1-v2,v5-v6,lr} + .else + push {v1,v5-v6,lr} + .endif + mov v1, a1 + bl .Lfft\n2\()_internal_vfp + add a1, v1, #8*(\n/4)*2 + bl .Lfft\n4\()_internal_vfp + movrelx v5, X(ff_cos_\n), a1 + add a1, v1, #8*(\n/4)*3 + bl .Lfft\n4\()_internal_vfp + .if \n >= 512 + .set o1, 0*(\n/4/2) + .set o2, 0*(\n/4/2) + .set o3, 0*(\n/4/2) + add v2, v1, #8*2*(\n/4/2) + add v3, v1, #8*4*(\n/4/2) + add v4, v1, #8*6*(\n/4/2) + pass (\n/4/2), v1, v2, v3, v4 + pop {v1-v6,pc} + .elseif \n >= 256 + .set o1, 2*(\n/4/2) + .set o2, 0*(\n/4/2) + .set o3, 2*(\n/4/2) + add v2, v1, #8*4*(\n/4/2) + pass (\n/4/2), v1, v1, v2, v2 + pop {v1-v2,v5-v6,pc} + .else + .set o1, 2*(\n/4/2) + .set o2, 4*(\n/4/2) + .set o3, 6*(\n/4/2) + pass 
(\n/4/2), v1, v1, v1, v1 + pop {v1,v5-v6,pc} + .endif +endfunc + +function fft\n\()_vfp + ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ + fmrx a2, FPSCR + fmxr FPSCR, a3 + vpush {s16-s31} + mov ip, lr + bl .Lfft\n\()_internal_vfp + vpop {s16-s31} + fmxr FPSCR, a2 + bx ip +endfunc + +.ltorg +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 diff --git a/media/ffvpx/libavcodec/arm/flacdsp_arm.S b/media/ffvpx/libavcodec/arm/flacdsp_arm.S new file mode 100644 index 0000000000..f8861c5967 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/flacdsp_arm.S @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function flac_lpc_16_1_arm + ldr r12, [sp] + push {r4, lr} + ldr r1, [r1] + subs r12, r12, #2 + ldr lr, [r0], #4 + beq 2f + it lt + poplt {r4, pc} +1: + mul r4, lr, r1 + ldm r0, {r2, lr} + add_sh r2, r2, r4, asr r3 + mul r4, r2, r1 + subs r12, r12, #2 + add_sh lr, lr, r4, asr r3 + stm r0!, {r2, lr} + bgt 1b + it lt + poplt {r4, pc} +2: + mul r4, lr, r1 + ldr r2, [r0] + add_sh r2, r2, r4, asr r3 + str r2, [r0] + pop {r4, pc} +endfunc + +function flac_lpc_16_2_arm + ldr r12, [sp] + subs r12, r12, r2 + it le + bxle lr + + push {r4-r9, lr} + ldm r0!, {r6, r7} + ldm r1, {r8, r9} + subs r12, r12, #1 + beq 2f +1: + mul r4, r6, r8 + mul r5, r7, r8 + mla r4, r7, r9, r4 + ldm r0, {r6, r7} + add_sh r6, r6, r4, asr r3 + mla r5, r6, r9, r5 + add_sh r7, r7, r5, asr r3 + stm r0!, {r6, r7} + subs r12, r12, #2 + bgt 1b + it lt + poplt {r4-r9, pc} +2: + mul r4, r6, r8 + mla r4, r7, r9, r4 + ldr r5, [r0] + add_sh r5, r5, r4, asr r3 + str r5, [r0] + pop {r4-r9, pc} +endfunc + +function ff_flac_lpc_16_arm, export=1 + cmp r2, #2 + blt flac_lpc_16_1_arm + beq flac_lpc_16_2_arm + + ldr r12, [sp] + subs r12, r12, r2 + it le + bxle lr + + push {r4-r9, lr} + + subs r12, r12, #1 + beq 3f +1: + sub lr, r2, #2 + mov r4, #0 + mov r5, #0 + + ldr r7, [r0], #4 + ldr r9, [r1], #4 +2: + mla r4, r7, r9, r4 + ldm r0!, {r6, r7} + mla r5, r6, r9, r5 + ldm r1!, {r8, r9} + mla r4, r6, r8, r4 + subs lr, lr, #2 + mla r5, r7, r8, r5 + bgt 2b + blt 6f + + mla r4, r7, r9, r4 + ldr r7, [r0], #4 + mla r5, r7, r9, r5 + ldr r9, [r1], #4 +6: + mla r4, r7, r9, r4 + ldm r0, {r6, r7} + add_sh r6, r6, r4, asr r3 + mla r5, r6, r9, r5 + add_sh r7, r7, r5, asr r3 + stm r0!, {r6, r7} + sub r0, r0, r2, lsl #2 + sub r1, r1, r2, lsl #2 + + subs r12, r12, #2 + bgt 1b + 
it lt + poplt {r4-r9, pc} +3: + mov r4, #0 +4: + ldr r5, [r1], #4 + ldr r6, [r0], #4 + mla r4, r5, r6, r4 + subs r2, r2, #1 + bgt 4b + ldr r5, [r0] + add_sh r5, r5, r4, asr r3 + str r5, [r0] + pop {r4-r9, pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/flacdsp_init_arm.c b/media/ffvpx/libavcodec/arm/flacdsp_init_arm.c new file mode 100644 index 0000000000..9962cc89f4 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/flacdsp_init_arm.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/flacdsp.h" + +void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + +av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels) +{ + c->lpc16 = ff_flac_lpc_16_arm; +} diff --git a/media/ffvpx/libavcodec/arm/idct.h b/media/ffvpx/libavcodec/arm/idct.h new file mode 100644 index 0000000000..6c79a69c5f --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idct.h @@ -0,0 +1,41 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_IDCT_H +#define AVCODEC_ARM_IDCT_H + +#include <stddef.h> +#include <stdint.h> + +void ff_j_rev_dct_arm(int16_t *data); + +void ff_simple_idct_arm(int16_t *data); + +void ff_simple_idct_armv5te(int16_t *data); +void ff_simple_idct_put_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +void ff_simple_idct_armv6(int16_t *data); +void ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +void ff_simple_idct_neon(int16_t *data); +void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +#endif /* AVCODEC_ARM_IDCT_H */ diff --git a/media/ffvpx/libavcodec/arm/idctdsp_arm.S b/media/ffvpx/libavcodec/arm/idctdsp_arm.S new file mode 100644 index 0000000000..057eff9be8 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_arm.S @@ -0,0 +1,120 @@ +@ +@ ARMv4-optimized IDCT functions +@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> +@ +@ This file is part of FFmpeg. 
+@ +@ FFmpeg is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. +@ +@ FFmpeg is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. +@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with FFmpeg; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "libavutil/arm/asm.S" + +@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride) +function ff_add_pixels_clamped_arm, export=1, align=5 + push {r4-r10} + mov r10, #8 +1: + ldr r4, [r1] /* load dest */ + /* block[0] and block[1]*/ + ldrsh r5, [r0] + ldrsh r7, [r0, #2] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #4] /* moved from [A] */ + orr r9, r9, r8, lsl #8 + /* block[2] and block[3] */ + /* [A] */ + ldrsh r7, [r0, #6] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + ldr r4, [r1, #4] /* moved from [B] */ + orr r9, r9, r8, lsl #24 + /* store dest */ + ldrsh r5, [r0, #8] /* moved from [C] */ + str r9, [r1] + + /* load dest */ + /* [B] */ + /* block[4] and block[5] */ + /* [C] */ + ldrsh r7, [r0, #10] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn 
r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #12] /* moved from [D] */ + orr r9, r9, r8, lsl #8 + /* block[6] and block[7] */ + /* [D] */ + ldrsh r7, [r0, #14] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + add r0, r0, #16 /* moved from [E] */ + orr r9, r9, r8, lsl #24 + subs r10, r10, #1 /* moved from [F] */ + /* store dest */ + str r9, [r1, #4] + + /* [E] */ + /* [F] */ + add r1, r1, r2 + bne 1b + + pop {r4-r10} + bx lr +endfunc diff --git a/media/ffvpx/libavcodec/arm/idctdsp_arm.h b/media/ffvpx/libavcodec/arm/idctdsp_arm.h new file mode 100644 index 0000000000..d7bc5cd02a --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_arm.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_IDCTDSP_ARM_H +#define AVCODEC_ARM_IDCTDSP_ARM_H + +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" + +void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); + +#endif /* AVCODEC_ARM_IDCTDSP_ARM_H */ diff --git a/media/ffvpx/libavcodec/arm/idctdsp_armv6.S b/media/ffvpx/libavcodec/arm/idctdsp_armv6.S new file mode 100644 index 0000000000..a6e77d6da1 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_armv6.S @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_add_pixels_clamped_armv6, export=1 + push {r4-r8,lr} + mov r3, #8 +1: + ldm r0!, {r4,r5,r12,lr} + ldrd r6, r7, [r1] + pkhbt r8, r4, r5, lsl #16 + pkhtb r5, r5, r4, asr #16 + pkhbt r4, r12, lr, lsl #16 + pkhtb lr, lr, r12, asr #16 + pld [r1, r2] + uxtab16 r8, r8, r6 + uxtab16 r5, r5, r6, ror #8 + uxtab16 r4, r4, r7 + uxtab16 lr, lr, r7, ror #8 + usat16 r8, #8, r8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 lr, #8, lr + orr r6, r8, r5, lsl #8 + orr r7, r4, lr, lsl #8 + subs r3, r3, #1 + strd_post r6, r7, r1, r2 + bgt 1b + pop {r4-r8,pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_arm.c b/media/ffvpx/libavcodec/arm/idctdsp_init_arm.c new file mode 100644 index 0000000000..ebc90e4b49 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_arm.c @@ -0,0 +1,94 @@ +/* + * ARM-optimized IDCT functions + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest, + ptrdiff_t line_size); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + * converted */ +static void j_rev_dct_arm_put(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_j_rev_dct_arm(block); + ff_put_pixels_clamped_c(block, dest, line_size); +} + +static void j_rev_dct_arm_add(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_j_rev_dct_arm(block); + ff_add_pixels_clamped_arm(block, dest, line_size); +} + +static void simple_idct_arm_put(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_simple_idct_arm(block); + ff_put_pixels_clamped_c(block, dest, line_size); +} + +static void simple_idct_arm_add(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_simple_idct_arm(block); + ff_add_pixels_clamped_arm(block, dest, line_size); +} + +av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + if (!avctx->lowres && !high_bit_depth) { + if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) || + avctx->idct_algo == FF_IDCT_ARM) { + c->idct_put = j_rev_dct_arm_put; + c->idct_add = j_rev_dct_arm_add; + c->idct = ff_j_rev_dct_arm; + c->perm_type = FF_IDCT_PERM_LIBMPEG2; + } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) { + c->idct_put = simple_idct_arm_put; + c->idct_add = simple_idct_arm_add; + c->idct = ff_simple_idct_arm; + c->perm_type = 
FF_IDCT_PERM_NONE; + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_arm; + + if (have_armv5te(cpu_flags)) + ff_idctdsp_init_armv5te(c, avctx, high_bit_depth); + if (have_armv6(cpu_flags)) + ff_idctdsp_init_armv6(c, avctx, high_bit_depth); + if (have_neon(cpu_flags)) + ff_idctdsp_init_neon(c, avctx, high_bit_depth); +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_armv5te.c b/media/ffvpx/libavcodec/arm/idctdsp_init_armv5te.c new file mode 100644 index 0000000000..3d881e1f18 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_armv5te.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!avctx->lowres && !high_bit_depth && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { + c->idct_put = ff_simple_idct_put_armv5te; + c->idct_add = ff_simple_idct_add_armv5te; + c->idct = ff_simple_idct_armv5te; + c->perm_type = FF_IDCT_PERM_NONE; + } +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_armv6.c b/media/ffvpx/libavcodec/arm/idctdsp_init_armv6.c new file mode 100644 index 0000000000..edf3070e15 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_armv6.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); + +av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!avctx->lowres && !high_bit_depth) { + if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) || + avctx->idct_algo == FF_IDCT_SIMPLEARMV6) { + c->idct_put = ff_simple_idct_put_armv6; + c->idct_add = ff_simple_idct_add_armv6; + c->idct = ff_simple_idct_armv6; + c->perm_type = FF_IDCT_PERM_LIBMPEG2; + } + } + c->add_pixels_clamped = ff_add_pixels_clamped_armv6; +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_neon.c b/media/ffvpx/libavcodec/arm/idctdsp_init_neon.c new file mode 100644 index 0000000000..b70c5b0d44 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_neon.c @@ -0,0 +1,51 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); + +av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!avctx->lowres && !high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->perm_type = FF_IDCT_PERM_PARTTRANS; + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_neon.S b/media/ffvpx/libavcodec/arm/idctdsp_neon.S new file mode 100644 index 0000000000..1911a33468 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_neon.S @@ -0,0 +1,128 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_put_pixels_clamped_neon, export=1 + vld1.16 {d16-d19}, [r0,:128]! + vqmovun.s16 d0, q8 + vld1.16 {d20-d23}, [r0,:128]! + vqmovun.s16 d1, q9 + vld1.16 {d24-d27}, [r0,:128]! + vqmovun.s16 d2, q10 + vld1.16 {d28-d31}, [r0,:128]! + vqmovun.s16 d3, q11 + vst1.8 {d0}, [r1,:64], r2 + vqmovun.s16 d4, q12 + vst1.8 {d1}, [r1,:64], r2 + vqmovun.s16 d5, q13 + vst1.8 {d2}, [r1,:64], r2 + vqmovun.s16 d6, q14 + vst1.8 {d3}, [r1,:64], r2 + vqmovun.s16 d7, q15 + vst1.8 {d4}, [r1,:64], r2 + vst1.8 {d5}, [r1,:64], r2 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_put_signed_pixels_clamped_neon, export=1 + vmov.u8 d31, #128 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d0, q8 + vld1.16 {d18-d19}, [r0,:128]! + vqmovn.s16 d1, q9 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d2, q8 + vld1.16 {d18-d19}, [r0,:128]! + vadd.u8 d0, d0, d31 + vld1.16 {d20-d21}, [r0,:128]! + vadd.u8 d1, d1, d31 + vld1.16 {d22-d23}, [r0,:128]! + vadd.u8 d2, d2, d31 + vst1.8 {d0}, [r1,:64], r2 + vqmovn.s16 d3, q9 + vst1.8 {d1}, [r1,:64], r2 + vqmovn.s16 d4, q10 + vst1.8 {d2}, [r1,:64], r2 + vqmovn.s16 d5, q11 + vld1.16 {d24-d25}, [r0,:128]! + vadd.u8 d3, d3, d31 + vld1.16 {d26-d27}, [r0,:128]! 
+ vadd.u8 d4, d4, d31 + vadd.u8 d5, d5, d31 + vst1.8 {d3}, [r1,:64], r2 + vqmovn.s16 d6, q12 + vst1.8 {d4}, [r1,:64], r2 + vqmovn.s16 d7, q13 + vst1.8 {d5}, [r1,:64], r2 + vadd.u8 d6, d6, d31 + vadd.u8 d7, d7, d31 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_add_pixels_clamped_neon, export=1 + mov r3, r1 + vld1.8 {d16}, [r1,:64], r2 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vaddw.u8 q1, q1, d17 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vld1.16 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vqmovun.s16 d4, q2 + vst1.8 {d2}, [r3,:64], r2 + vld1.8 {d16}, [r1,:64], r2 + vqmovun.s16 d6, q3 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vst1.8 {d4}, [r3,:64], r2 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vaddw.u8 q1, q1, d17 + vst1.8 {d6}, [r3,:64], r2 + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vqmovun.s16 d4, q2 + vld1.16 {d6-d7}, [r0,:128]! 
+ vaddw.u8 q3, q3, d19 + vst1.8 {d2}, [r3,:64], r2 + vqmovun.s16 d6, q3 + vst1.8 {d4}, [r3,:64], r2 + vst1.8 {d6}, [r3,:64], r2 + bx lr +endfunc diff --git a/media/ffvpx/libavcodec/arm/jrevdct_arm.S b/media/ffvpx/libavcodec/arm/jrevdct_arm.S new file mode 100644 index 0000000000..f951e2af34 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/jrevdct_arm.S @@ -0,0 +1,383 @@ +/* + C-like prototype : + void j_rev_dct_arm(DCTBLOCK data) + + With DCTBLOCK being a pointer to an array of 64 'signed shorts' + + Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +*/ + +#include "libavutil/arm/asm.S" + +#define FIX_0_298631336 2446 +#define FIX_0_541196100 4433 +#define FIX_0_765366865 6270 +#define FIX_1_175875602 9633 +#define FIX_1_501321110 12299 +#define FIX_2_053119869 16819 +#define FIX_3_072711026 25172 +#define FIX_M_0_390180644 -3196 +#define FIX_M_0_899976223 -7373 +#define FIX_M_1_847759065 -15137 +#define FIX_M_1_961570560 -16069 +#define FIX_M_2_562915447 -20995 +#define FIX_0xFFFF 0xFFFF + +#define FIX_0_298631336_ID 0 +#define FIX_0_541196100_ID 4 +#define FIX_0_765366865_ID 8 +#define FIX_1_175875602_ID 12 +#define FIX_1_501321110_ID 16 +#define FIX_2_053119869_ID 20 +#define FIX_3_072711026_ID 24 +#define FIX_M_0_390180644_ID 28 +#define FIX_M_0_899976223_ID 32 +#define FIX_M_1_847759065_ID 36 +#define FIX_M_1_961570560_ID 40 +#define FIX_M_2_562915447_ID 44 +#define FIX_0xFFFF_ID 48 + +function ff_j_rev_dct_arm, export=1 + push {r0, r4 - r11, lr} + + mov lr, r0 @ lr = pointer to the current row + mov r12, #8 @ r12 = row-counter + movrel r11, const_array @ r11 = base pointer to the constants array +row_loop: + ldrsh r0, [lr, # 0] @ r0 = 'd0' + ldrsh r2, [lr, # 2] @ r2 = 'd2' + + @ Optimization for row that have all items except the first set to 0 + @ (this works as the int16_t are always 4-byte aligned) + ldr r5, [lr, # 0] + ldr r6, [lr, # 4] + ldr r3, [lr, # 8] + ldr r4, [lr, #12] + orr r3, r3, r4 + orr r3, r3, r6 + orrs r5, r3, r5 + beq end_of_row_loop @ nothing to be done as ALL of them are '0' + orrs r3, r3, r2 + beq empty_row + + ldrsh r1, [lr, # 8] @ r1 = 'd1' + ldrsh r4, [lr, # 4] @ r4 = 'd4' + ldrsh r6, [lr, # 6] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r7, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r7, r3, r7 @ r7 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r7 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r7 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, 
r3, lsl #13 @ r4 = tmp11 + rsb r3, r6, r3, lsl #13 @ r3 = tmp12 + + push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11 + + ldrsh r3, [lr, #10] @ r3 = 'd3' + ldrsh r5, [lr, #12] @ r5 = 'd5' + ldrsh r7, [lr, #14] @ r7 = 'd7' + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 @ r8 = z3 + z4 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) + add r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 0] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) + sub r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #14] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) + add r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 2] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) + sub r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #12] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) + add r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 4] + 
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) + sub r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #10] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) + add r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 6] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) + sub r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 8] + + @ End of row loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + beq start_column_loop + +empty_row: + ldr r1, [r11, #FIX_0xFFFF_ID] + mov r0, r0, lsl #2 + and r0, r0, r1 + add r0, r0, r0, lsl #16 + str r0, [lr, # 0] + str r0, [lr, # 4] + str r0, [lr, # 8] + str r0, [lr, #12] + +end_of_row_loop: + @ End of loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + +start_column_loop: + @ Start of column loop + pop {lr} + mov r12, #8 +column_loop: + ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' + ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' + ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' + ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r1, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r1, r3, r1 @ r1 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r1 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r1 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r6, r6, r3, lsl #13 @ r6 = tmp12 + + ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' + ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' + ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' + ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' + + @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) + orr r9, r1, r3 + orr r10, r5, r7 + orrs r10, r9, r10 + beq empty_odd_column + + push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11 + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + 
ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + add r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 0*8)] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + sub r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + add r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 2*8)] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + sub r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + add r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 4*8)] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + sub r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + add r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 6*8)] + + @ Compute DESCALE(tmp13 - tmp0, 
CONST_BITS+PASS1_BITS+3) + sub r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + beq the_end + +empty_odd_column: + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + add r0, r0, #(1<<17) + mov r0, r0, asr #18 + strh r0, [lr, #( 0*8)] + strh r0, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + add r4, r4, #(1<<17) + mov r4, r4, asr #18 + strh r4, [lr, #( 2*8)] + strh r4, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + add r6, r6, #(1<<17) + mov r6, r6, asr #18 + strh r6, [lr, #( 4*8)] + strh r6, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + add r2, r2, #(1<<17) + mov r2, r2, asr #18 + strh r2, [lr, #( 6*8)] + strh r2, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + +the_end: + @ The end.... + pop {r4 - r11, pc} +endfunc + +const const_array + .word FIX_0_298631336 + .word FIX_0_541196100 + .word FIX_0_765366865 + .word FIX_1_175875602 + .word FIX_1_501321110 + .word FIX_2_053119869 + .word FIX_3_072711026 + .word FIX_M_0_390180644 + .word FIX_M_0_899976223 + .word FIX_M_1_847759065 + .word FIX_M_1_961570560 + .word FIX_M_2_562915447 + .word FIX_0xFFFF +endconst diff --git a/media/ffvpx/libavcodec/arm/mathops.h b/media/ffvpx/libavcodec/arm/mathops.h new file mode 100644 index 0000000000..dc57c5571c --- /dev/null +++ b/media/ffvpx/libavcodec/arm/mathops.h @@ -0,0 +1,108 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_MATHOPS_H +#define AVCODEC_ARM_MATHOPS_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/common.h" + +#if HAVE_INLINE_ASM + +#if HAVE_ARMV6_INLINE +#define MULH MULH +static inline av_const int MULH(int a, int b) +{ + int r; + __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +#define FASTDIV FASTDIV +static av_always_inline av_const int FASTDIV(int a, int b) +{ + int r; + __asm__ ("cmp %2, #2 \n\t" + "ldr %0, [%3, %2, lsl #2] \n\t" + "ite le \n\t" + "lsrle %0, %1, #1 \n\t" + "smmulgt %0, %0, %1 \n\t" + : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc"); + return r; +} + +#else /* HAVE_ARMV6_INLINE */ + +#define FASTDIV FASTDIV +static av_always_inline av_const int FASTDIV(int a, int b) +{ + int r, t; + __asm__ ("umull %1, %0, %2, %3" + : "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b])); + return r; +} +#endif + +#define MLS64(d, a, b) MAC64(d, -(a), b) + +#if HAVE_ARMV5TE_INLINE + +/* signed 16x16 -> 32 multiply add accumulate */ +# define MAC16(rt, ra, rb) \ + __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); + +/* signed 16x16 -> 32 multiply */ +# define MUL16 MUL16 +static inline av_const int MUL16(int ra, int rb) +{ + int rt; + __asm__ ("smulbb %0, %1, 
%2" : "=r"(rt) : "r"(ra), "r"(rb)); + return rt; +} + +#endif + +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) +{ + int m; + __asm__ ( + "mov %0, %2 \n\t" + "cmp %1, %2 \n\t" + "itt gt \n\t" + "movgt %0, %1 \n\t" + "movgt %1, %2 \n\t" + "cmp %1, %3 \n\t" + "it le \n\t" + "movle %1, %3 \n\t" + "cmp %0, %1 \n\t" + "it gt \n\t" + "movgt %0, %1 \n\t" + : "=&r"(m), "+r"(a) + : "r"(b), "r"(c) + : "cc"); + return m; +} + +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVCODEC_ARM_MATHOPS_H */ diff --git a/media/ffvpx/libavcodec/arm/moz.build b/media/ffvpx/libavcodec/arm/moz.build new file mode 100644 index 0000000000..dafeab21d8 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/moz.build @@ -0,0 +1,33 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +SOURCES += [ + 'fft_init_arm.c', + 'fft_neon.S', + 'fft_vfp.S', + 'flacdsp_arm.S', + 'flacdsp_init_arm.c', + 'idctdsp_arm.S', + 'idctdsp_armv6.S', + 'idctdsp_init_arm.c', + 'idctdsp_init_armv5te.c', + 'idctdsp_init_armv6.c', + 'idctdsp_init_neon.c', + 'idctdsp_neon.S', + 'jrevdct_arm.S', + 'mpegaudiodsp_fixed_armv6.S', + 'mpegaudiodsp_init_arm.c', + 'rdft_init_arm.c', + 'rdft_neon.S', + 'simple_idct_arm.S', + 'simple_idct_armv5te.S', + 'simple_idct_armv6.S', + 'simple_idct_neon.S', +] + +FINAL_LIBRARY = 'mozavcodec' + +include('/media/ffvpx/ffvpxcommon.mozbuild') diff --git a/media/ffvpx/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/media/ffvpx/libavcodec/arm/mpegaudiodsp_fixed_armv6.S new file mode 100644 index 0000000000..977abb6939 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/mpegaudiodsp_fixed_armv6.S @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro skip args:vararg +.endm + +.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0 + ldr \t1, [\w, #4*\offs] + ldr \t2, [\p, #4]! + \rsb \t1, \t1, #0 + .irpc i, 135 + ldr \t3, [\w, #4*64*\i+4*\offs] + ldr \t4, [\p, #4*64*\i] + smlal \lo, \hi, \t1, \t2 + \rsb \t3, \t3, #0 + ldr \t1, [\w, #4*64*(\i+1)+4*\offs] + ldr \t2, [\p, #4*64*(\i+1)] + smlal \lo, \hi, \t3, \t4 + \rsb \t1, \t1, #0 + .endr + ldr \t3, [\w, #4*64*7+4*\offs] + ldr \t4, [\p, #4*64*7] + smlal \lo, \hi, \t1, \t2 + \rsb \t3, \t3, #0 + smlal \lo, \hi, \t3, \t4 +.endm + +.macro round rd, lo, hi + lsr \rd, \lo, #24 + bic \lo, \lo, #0xff000000 + orr \rd, \rd, \hi, lsl #8 + mov \hi, #0 + ssat \rd, #16, \rd +.endm + +function ff_mpadsp_apply_window_fixed_armv6, export=1 + push {r2,r4-r11,lr} + + add r4, r0, #4*512 @ synth_buf + 512 + .rept 4 + ldm r0!, {r5-r12} + stm r4!, {r5-r12} + .endr + + ldr r4, [sp, #40] @ incr + sub r0, r0, #4*17 @ synth_buf + 16 + ldr r8, [r2] @ sum:low + add r2, r0, #4*32 @ synth_buf + 48 + rsb r5, r4, r4, lsl #5 @ 31 * incr + lsl r4, r4, #1 + asr r9, r8, #31 @ sum:high + add r5, r3, r5, lsl #1 @ samples2 + add r6, r1, #4*32 @ w2 + str r4, [sp, #40] + + sum8 r8, r9, r1, r0, r10, r11, r12, lr + sum8 r8, r9, r1, 
r2, r10, r11, r12, lr, rsb, 32 + round r10, r8, r9 + strh_post r10, r3, r4 + + mov lr, #15 +1: + ldr r12, [r0, #4]! + ldr r11, [r6, #-4]! + ldr r10, [r1, #4]! + .irpc i, 0246 + .if \i + ldr r11, [r6, #4*64*\i] + ldr r10, [r1, #4*64*\i] + .endif + rsb r11, r11, #0 + smlal r8, r9, r10, r12 + ldr r10, [r0, #4*64*(\i+1)] + .ifeq \i + smull r4, r7, r11, r12 + .else + smlal r4, r7, r11, r12 + .endif + ldr r11, [r6, #4*64*(\i+1)] + ldr r12, [r1, #4*64*(\i+1)] + rsb r11, r11, #0 + smlal r8, r9, r12, r10 + .iflt \i-6 + ldr r12, [r0, #4*64*(\i+2)] + .else + ldr r12, [r2, #-4]! + .endif + smlal r4, r7, r11, r10 + .endr + .irpc i, 0246 + ldr r10, [r1, #4*64*\i+4*32] + rsb r12, r12, #0 + ldr r11, [r6, #4*64*\i+4*32] + smlal r8, r9, r10, r12 + ldr r10, [r2, #4*64*(\i+1)] + smlal r4, r7, r11, r12 + ldr r12, [r1, #4*64*(\i+1)+4*32] + rsb r10, r10, #0 + ldr r11, [r6, #4*64*(\i+1)+4*32] + smlal r8, r9, r12, r10 + .iflt \i-6 + ldr r12, [r2, #4*64*(\i+2)] + .else + ldr r12, [sp, #40] + .endif + smlal r4, r7, r11, r10 + .endr + round r10, r8, r9 + adds r8, r8, r4 + adc r9, r9, r7 + strh_post r10, r3, r12 + round r11, r8, r9 + subs lr, lr, #1 + strh_dpost r11, r5, r12 + bgt 1b + + sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 + pop {r4} + round r10, r8, r9 + str r8, [r4] + strh r10, [r3] + + pop {r4-r11,pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/mpegaudiodsp_init_arm.c b/media/ffvpx/libavcodec/arm/mpegaudiodsp_init_arm.c new file mode 100644 index 0000000000..d87bd27ad8 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/mpegaudiodsp_init_arm.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/mpegaudiodsp.h" +#include "config.h" + +void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window, + int *dither, int16_t *out, ptrdiff_t incr); + +av_cold void ff_mpadsp_init_arm(MPADSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) { + s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6; + } +} diff --git a/media/ffvpx/libavcodec/arm/rdft_init_arm.c b/media/ffvpx/libavcodec/arm/rdft_init_arm.c new file mode 100644 index 0000000000..1c5d8beb61 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/rdft_init_arm.c @@ -0,0 +1,33 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" + +#include "libavcodec/rdft.h" + +void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); + +av_cold void ff_rdft_init_arm(RDFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) + s->rdft_calc = ff_rdft_calc_neon; +} diff --git a/media/ffvpx/libavcodec/arm/rdft_neon.S b/media/ffvpx/libavcodec/arm/rdft_neon.S new file mode 100644 index 0000000000..eabb92b4bd --- /dev/null +++ b/media/ffvpx/libavcodec/arm/rdft_neon.S @@ -0,0 +1,155 @@ +/* + * ARM NEON optimised RDFT + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+/* ff_rdft_calc_neon: r0 = context (kept in r4), r1 = sample buffer z (kept in r5); the FFT context at r4+24 is run before the butterfly pass when bit 0 of the inverse field is clear, and after it (as a tail call) when it is set. */
+function ff_rdft_calc_neon, export=1
+        push {r4-r8,lr}
+
+        ldr r6, [r0, #4] @ inverse
+        mov r4, r0
+        mov r5, r1
+
+        lsls r6, r6, #31 /* r6 = bit 0 of inverse moved to bit 31; Z set when that bit is clear */
+        bne 1f /* bit set: skip the leading FFT */
+        add r0, r4, #24
+        bl X(ff_fft_permute_neon)
+        add r0, r4, #24
+        mov r1, r5
+        bl X(ff_fft_calc_neon)
+1:
+        ldr r12, [r4, #0] @ nbits
+        mov r2, #1
+        ldr r8, [r4, #20] @ negative_sin
+        lsl r12, r2, r12
+        add r0, r5, #8
+        lsl r8, r8, #31
+        add r1, r5, r12, lsl #2
+        lsr r12, r12, #2
+        vdup.32 d26, r8
+        ldr r2, [r4, #12] @ tcos
+        sub r12, r12, #2
+        ldr r3, [r4, #16] @ tsin
+        mov r7, r0
+        sub r1, r1, #8
+        mov lr, r1
+        mov r8, #-8
+        vld1.32 {d0}, [r0,:64]! @ d1[0,1]
+        vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
+        vld1.32 {d4}, [r2,:64]! @ tcos[i]
+        vld1.32 {d5}, [r3,:64]! @ tsin[i]
+        vmov.f32 d18, #0.5 @ k1
+        vdup.32 d19, r6
+        veor d5, d26, d5
+        pld [r0, #32]
+        veor d19, d18, d19 @ k2
+        vmov.i32 d16, #0
+        vmov.i32 d17, #1<<31
+        pld [r1, #-32]
+        vtrn.32 d16, d17
+        pld [r2, #32]
+        vrev64.32 d16, d16 @ d16=1,0 d17=0,1
+        pld [r3, #32]
+2:
+        veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
+        vld1.32 {d24}, [r0,:64]! @ d1[0,1]
+        vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
+        vld1.32 {d25}, [r1,:64], r8 @ d2[0,1]
+        vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
+        veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1]
+        pld [r0, #32]
+        vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
+        pld [r1, #-32]
+        vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1]
+        vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1]
+        vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re
+        veor d7, d21, d16 @ -od.im, od.re
+        vrev64.32 d3, d21 @ od.re, od.im
+        veor d6, d20, d17 @ ev.re,-ev.im
+        veor d2, d3, d16 @ -od.re, od.im
+        vmla.f32 d20, d3, d4[1]
+        vmla.f32 d20, d7, d5[1]
+        vmla.f32 d6, d2, d4[1]
+        vmla.f32 d6, d21, d5[1]
+        vld1.32 {d4}, [r2,:64]! @ tcos[i]
+        veor d7, d23, d16 @ -od.im, od.re
+        vld1.32 {d5}, [r3,:64]! @ tsin[i]
+        veor d24, d22, d17 @ ev.re,-ev.im
+        vrev64.32 d3, d23 @ od.re, od.im
+        veor d5, d26, d5
+        pld [r2, #32]
+        veor d2, d3, d16 @ -od.re, od.im
+        pld [r3, #32]
+        vmla.f32 d22, d3, d4[0]
+        vmla.f32 d22, d7, d5[0]
+        vmla.f32 d24, d2, d4[0]
+        vmla.f32 d24, d23, d5[0]
+        vld1.32 {d0}, [r0,:64]! @ d1[0,1]
+        vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
+        vst1.32 {d20}, [r7,:64]!
+        vst1.32 {d6}, [lr,:64], r8
+        vst1.32 {d22}, [r7,:64]!
+        vst1.32 {d24}, [lr,:64], r8
+        subs r12, r12, #2
+        bgt 2b
+
+        veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
+        vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
+        vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
+        ldr r2, [r4, #8] @ sign_convention
+        vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
+        add r0, r0, #4
+        bfc r2, #0, #31 /* keep only bit 31 of sign_convention */
+        vld1.32 {d0[0]}, [r0,:32]
+        veor d7, d21, d16 @ -od.im, od.re
+        vrev64.32 d3, d21 @ od.re, od.im
+        veor d6, d20, d17 @ ev.re,-ev.im
+        vld1.32 {d22}, [r5,:64]
+        vdup.32 d1, r2
+        vmov d23, d22
+        veor d2, d3, d16 @ -od.re, od.im
+        vtrn.32 d22, d23
+        veor d0, d0, d1
+        veor d23, d23, d17
+        vmla.f32 d20, d3, d4[1]
+        vmla.f32 d20, d7, d5[1]
+        vmla.f32 d6, d2, d4[1]
+        vmla.f32 d6, d21, d5[1]
+        vadd.f32 d22, d22, d23
+        vst1.32 {d20}, [r7,:64]
+        vst1.32 {d6}, [lr,:64]
+        vst1.32 {d0[0]}, [r0,:32]
+        vst1.32 {d22}, [r5,:64]
+
+        cmp r6, #0 /* r6 == 0: the FFT already ran at the top, so return */
+        it eq
+        popeq {r4-r8,pc}
+
+        vmul.f32 d22, d22, d18 /* scale the first two samples by d18 (0.5) */
+        vst1.32 {d22}, [r5,:64]
+        add r0, r4, #24
+        mov r1, r5
+        bl X(ff_fft_permute_neon)
+        add r0, r4, #24
+        mov r1, r5
+        pop {r4-r8,lr}
+        b X(ff_fft_calc_neon) /* tail call: ff_fft_calc_neon returns to our caller */
+endfunc
diff --git a/media/ffvpx/libavcodec/arm/simple_idct_arm.S b/media/ffvpx/libavcodec/arm/simple_idct_arm.S
new file mode 100644
index 0000000000..42d79ab95e
--- /dev/null
+++ b/media/ffvpx/libavcodec/arm/simple_idct_arm.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2002 Frederic 'dilb' Boulay
+ *
+ * Author: Frederic Boulay <dilb@handhelds.org>
+ *
+ * The function defined in this file is derived from the simple_idct function
+ * from the libavcodec library part of the FFmpeg project.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+/* useful constants for the algorithm */
+#define W1 22725
+#define W2 21407
+#define W3 19266
+#define W4 16383
+#define W5 12873
+#define W6 8867
+#define W7 4520
+#define MASK_MSHW 0xFFFF0000
+
+#define ROW_SHIFT 11
+#define ROW_SHIFT2MSHW (16-11)
+#define COL_SHIFT 20
+#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
+#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
+
+/* In-place 8x8 inverse DCT of the int16_t block at r0: a row pass (last row first, results >> ROW_SHIFT) followed by a column pass (last column first, results >> COL_SHIFT). */
+function ff_simple_idct_arm, export=1
+        @@ void simple_idct_arm(int16_t *block)
+        @@ save stack for reg needed (take all of them),
+        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
+        @@ so it must not be overwritten, if it is not saved!!
+        @@ R12 is another scratch register, so it should not be saved too
+        @@ save all registers
+        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
+        @@ at this point, R0=block, other registers are free.
+        add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
+        @@ add 2 temporary variables in the stack: R0 and R14
+        sub sp, sp, #8 @ allow 2 local variables
+        str r0, [sp, #0] @ save block in sp[0]
+        @@ stack status
+        @@ sp+4 free
+        @@ sp+0 R0 (block)
+
+
+        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
+
+
+__row_loop:
+        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32 bits in two 16-bit words), at least it gives more usable registers :)
+        ldr r1, [r14, #0] @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
+        ldr r2, [r14, #4] @ R2=(int32)(R14)[1]=ROWr32[1]
+        ldr r3, [r14, #8] @ R3=ROWr32[2]
+        ldr r4, [r14, #12] @ R4=ROWr32[3]
+        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
+        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
+        @@ else follow the complete algorithm.
+        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+        @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
+        orr r5, r4, r3 @ R5=R4 | R3
+        orr r5, r5, r2 @ R5=R4 | R3 | R2
+        orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null)
+        beq __end_row_loop
+        mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
+        ldrsh r6, [r14, #0] @ R6=ROWr16[0]
+        orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7
+        beq __almost_empty_row
+
+@@ __b_evaluation:
+        @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
+        @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
+
+        @@ MUL16(b0, W1, row[1]);
+        @@ MUL16(b1, W3, row[1]);
+        @@ MUL16(b2, W5, row[1]);
+        @@ MUL16(b3, W7, row[1]);
+        @@ MAC16(b0, W3, row[3]);
+        @@ MAC16(b1, -W7, row[3]);
+        @@ MAC16(b2, -W1, row[3]);
+        @@ MAC16(b3, -W5, row[3]);
+        ldr r8, =W1 @ R8=W1
+        mov r2, r2, asr #16 @ R2=ROWr16[3]
+        mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r9, =W3 @ R9=W3
+        ldr r10, =W5 @ R10=W5
+        mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r11, =W7 @ R11=W7
+        mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        teq r2, #0 @ if null avoid muls
+        itttt ne
+        mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        rsbne r2, r2, #0 @ R2=-ROWr16[3]
+        mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        it ne
+        mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+        @@ if (temp != 0) {}
+        orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]
+        beq __end_b_evaluation
+
+        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ MAC16(b0, W5, row[5]);
+        @@ MAC16(b2, W7, row[5]);
+        @@ MAC16(b3, W3, row[5]);
+        @@ MAC16(b1, -W1, row[5]);
+        @@ MAC16(b0, W7, row[7]);
+        @@ MAC16(b2, W3, row[7]);
+        @@ MAC16(b3, -W1, row[7]);
+        @@ MAC16(b1, -W5, row[7]);
+        mov r3, r3, asr #16 @ R3=ROWr16[5]
+        teq r3, #0 @ if null avoid muls
+        it ne
+        mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
+        mov r4, r4, asr #16 @ R4=ROWr16[7]
+        itttt ne
+        mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
+        mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
+        rsbne r3, r3, #0 @ R3=-ROWr16[5]
+        mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
+        @@ R3 is free now
+        teq r4, #0 @ if null avoid muls
+        itttt ne
+        mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
+        mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
+        rsbne r4, r4, #0 @ R4=-ROWr16[7]
+        mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
+        it ne
+        mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
+        @@ R4 is free now
+__end_b_evaluation:
+        @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
+        @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@ R12=__const_ptr_, R14=&block[n]
+
+@@ __a_evaluation:
+        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+        @@ a1 = a0 + W6 * row[2];
+        @@ a2 = a0 - W6 * row[2];
+        @@ a3 = a0 - W2 * row[2];
+        @@ a0 = a0 + W2 * row[2];
+        ldr r9, =W4 @ R9=W4
+        mul r6, r9, r6 @ R6=W4*ROWr16[0]
+        ldr r10, =W6 @ R10=W6
+        ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
+        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
+
+        mul r11, r10, r4 @ R11=W6*ROWr16[2]
+        ldr r8, =W2 @ R8=W2
+        sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
+        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+        @@ if (temp != 0) {}
+        teq r2, #0
+        beq __end_bef_a_evaluation
+
+        add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+        mul r11, r8, r4 @ R11=W2*ROWr16[2]
+        sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+        add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+
+
+        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+        @@ R12=__const_ptr_, R14=&block[n]
+
+
+        @@ a0 += W4*row[4]
+        @@ a1 -= W4*row[4]
+        @@ a2 -= W4*row[4]
+        @@ a3 += W4*row[4]
+        ldrsh r11, [r14, #8] @ R11=ROWr16[4]
+        teq r11, #0 @ if null avoid muls
+        it ne
+        mulne r11, r9, r11 @ R11=W4*ROWr16[4]
+        @@ R9 is free now
+        ldrsh r9, [r14, #12] @ R9=ROWr16[6]
+        itttt ne
+        addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
+        subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
+        subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
+        addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
+        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
+        teq r9, #0 @ if null avoid muls
+        itttt ne
+        mulne r11, r10, r9 @ R11=W6*ROWr16[6]
+        addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
+        mulne r10, r8, r9 @ R10=W2*ROWr16[6]
+        @@ a0 += W6*row[6];
+        @@ a3 -= W6*row[6];
+        @@ a1 -= W2*row[6];
+        @@ a2 += W2*row[6];
+        subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+        itt ne
+        subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
+        addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
+
+__end_a_evaluation:
+        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
+        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
+        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
+        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
+        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
+        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
+        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
+        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
+        add r8, r6, r0 @ R8=a0+b0
+        add r9, r2, r1 @ R9=a1+b1
+        @@ put two 16-bit half-words in a 32-bit word
+        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!)
+        ldr r10, =MASK_MSHW @ R10=0xFFFF0000
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
+        mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #0]
+
+        add r8, r3, r5 @ R8=a2+b2
+        add r9, r4, r7 @ R9=a3+b3
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #4]
+
+        sub r8, r4, r7 @ R8=a3-b3
+        sub r9, r3, r5 @ R9=a2-b2
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #8]
+
+        sub r8, r2, r1 @ R8=a1-b1
+        sub r9, r6, r0 @ R9=a0-b0
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #12]
+
+        bal __end_row_loop
+
+__almost_empty_row:
+        @@ the row was empty, except ROWr16[0], now, management of this special case
+        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+        @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
+        @@ R8=0xFFFF (temp), R9-R11 free
+        mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
+        sub r8, r8, #1 @ R8 is now ready.
+        and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
+        orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16)
+        str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
+        str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
+        str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
+        str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
+
+__end_row_loop:
+        @@ at this point, R0-R11 (free)
+        @@ R12=__const_ptr_, R14=&block[n]
+        ldr r0, [sp, #0] @ R0=block
+        teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
+        sub r14, r14, #16
+        bne __row_loop
+
+
+
+        @@ at this point, R0=block, R1-R11 (free)
+        @@ R12=__const_ptr_, R14=&block[n]
+        add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
+__col_loop:
+
+@@ __b_evaluation2:
+        @@ at this point, R0=block (temp), R1-R11 (free)
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ proceed with b0-b3 first, followed by a0-a3
+        @@ MUL16(b0, W1, col[8x1]);
+        @@ MUL16(b1, W3, col[8x1]);
+        @@ MUL16(b2, W5, col[8x1]);
+        @@ MUL16(b3, W7, col[8x1]);
+        @@ MAC16(b0, W3, col[8x3]);
+        @@ MAC16(b1, -W7, col[8x3]);
+        @@ MAC16(b2, -W1, col[8x3]);
+        @@ MAC16(b3, -W5, col[8x3]);
+        ldr r8, =W1 @ R8=W1
+        ldrsh r7, [r14, #16]
+        mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r9, =W3 @ R9=W3
+        ldr r10, =W5 @ R10=W5
+        mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r11, =W7 @ R11=W7
+        mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldrsh r2, [r14, #48]
+        mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        teq r2, #0 @ if 0, then avoid muls
+        itttt ne
+        mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        rsbne r2, r2, #0 @ R2=-ROWr16[3]
+        mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        it ne
+        mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+        @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ MAC16(b0, W5, col[5x8]);
+        @@ MAC16(b2, W7, col[5x8]);
+        @@ MAC16(b3, W3, col[5x8]);
+        @@ MAC16(b1, -W1, col[5x8]);
+        @@ MAC16(b0, W7, col[7x8]);
+        @@ MAC16(b2, W3, col[7x8]);
+        @@ MAC16(b3, -W1, col[7x8]);
+        @@ MAC16(b1, -W5, col[7x8]);
+        ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
+        teq r3, #0 @ if 0 then avoid muls
+        itttt ne
+        mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
+        mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
+        mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
+        rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
+        ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
+        it ne
+        mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
+        @@ R3 is free now
+        teq r4, #0 @ if 0 then avoid muls
+        itttt ne
+        mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
+        mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
+        rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
+        mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
+        it ne
+        mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
+        @@ R4 is free now
+@@ __end_b_evaluation2:
+        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+        @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@ R12=__const_ptr_, R14=&block[n]
+
+@@ __a_evaluation2:
+        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
+        @@ a1 = a0 + W6 * row[2];
+        @@ a2 = a0 - W6 * row[2];
+        @@ a3 = a0 - W2 * row[2];
+        @@ a0 = a0 + W2 * row[2];
+        ldrsh r6, [r14, #0]
+        ldr r9, =W4 @ R9=W4
+        mul r6, r9, r6 @ R6=W4*ROWr16[0]
+        ldr r10, =W6 @ R10=W6
+        ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
+        add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
+        mul r11, r10, r4 @ R11=W6*ROWr16[2]
+        ldr r8, =W2 @ R8=W2
+        add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+        sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
+        mul r11, r8, r4 @ R11=W2*ROWr16[2]
+        sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+        add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+
+        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ a0 += W4*row[4]
+        @@ a1 -= W4*row[4]
+        @@ a2 -= W4*row[4]
+        @@ a3 += W4*row[4]
+        ldrsh r11, [r14, #64] @ R11=ROWr16[4]
+        teq r11, #0 @ if null avoid muls
+        itttt ne
+        mulne r11, r9, r11 @ R11=W4*ROWr16[4]
+        @@ R9 is free now
+        addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
+        subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
+        subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
+        ldrsh r9, [r14, #96] @ R9=ROWr16[6]
+        it ne
+        addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
+        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
+        teq r9, #0 @ if null avoid muls
+        itttt ne
+        mulne r11, r10, r9 @ R11=W6*ROWr16[6]
+        addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
+        mulne r10, r8, r9 @ R10=W2*ROWr16[6]
+        @@ a0 += W6*row[6];
+        @@ a3 -= W6*row[6];
+        @@ a1 -= W2*row[6];
+        @@ a2 += W2*row[6];
+        subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+        itt ne
+        subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
+        addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
+@@ __end_a_evaluation2:
+        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@ R12=__const_ptr_, R14=&block[n]
+        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
+        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
+        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
+        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
+        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
+        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
+        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
+        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
+        @@@@@ no optimization here @@@@@
+        add r8, r6, r0 @ R8=a0+b0
+        add r9, r2, r1 @ R9=a1+b1
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #0]
+        strh r9, [r14, #16]
+        add r8, r3, r5 @ R8=a2+b2
+        add r9, r4, r7 @ R9=a3+b3
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #32]
+        strh r9, [r14, #48]
+        sub r8, r4, r7 @ R8=a3-b3
+        sub r9, r3, r5 @ R9=a2-b2
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #64]
+        strh r9, [r14, #80]
+        sub r8, r2, r1 @ R8=a1-b1
+        sub r9, r6, r0 @ R9=a0-b0
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #96]
+        strh r9, [r14, #112]
+
+@@ __end_col_loop:
+        @@ at this point, R0-R11 (free)
+        @@ R12=__const_ptr_, R14=&block[n]
+        ldr r0, [sp, #0] @ R0=block
+        teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
+        sub r14, r14, #2
+        bne __col_loop
+
+
+
+
+@@ __end_simple_idct_arm:
+        @@ restore registers to previous status!
+        add sp, sp, #8 @@ the local variables!
+        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
+
+
+
+@@ kind of sub-function, here not to overload the common case.
+__end_bef_a_evaluation:
+        add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+        mul r11, r8, r4 @ R11=W2*ROWr16[2]
+        sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+        add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+        bal __end_a_evaluation
+endfunc
diff --git a/media/ffvpx/libavcodec/arm/simple_idct_armv5te.S b/media/ffvpx/libavcodec/arm/simple_idct_armv5te.S
new file mode 100644
index 0000000000..a8d03469ab
--- /dev/null
+++ b/media/ffvpx/libavcodec/arm/simple_idct_armv5te.S
@@ -0,0 +1,613 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define W13 (W1 | (W3 << 16))
+#define W26 (W2 | (W6 << 16))
+#define W57 (W5 | (W7 << 16))
+/* Row pass, in place, on the eight 16-bit coefficients at a1 (results >> 11); a row whose coefficients 1..7 are all zero takes the row_dc_only shortcut. */
+function idct_row_armv5te
+        str lr, [sp, #-4]!
+
+        ldrd v1, v2, [a1, #8]
+        ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */
+        orrs v1, v1, v2
+        itt eq
+        cmpeq v1, a4
+        cmpeq v1, a3, lsr #16
+        beq row_dc_only
+
+        mov v1, #(1<<(ROW_SHIFT-1))
+        mov ip, #16384
+        sub ip, ip, #1 /* ip = W4 */
+        smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
+        ldr ip, =W26 /* ip = W2 | (W6 << 16) */
+        smultb a2, ip, a4
+        smulbb lr, ip, a4
+        add v2, v1, a2
+        sub v3, v1, a2
+        sub v4, v1, lr
+        add v1, v1, lr
+
+        ldr ip, =W13 /* ip = W1 | (W3 << 16) */
+        ldr lr, =W57 /* lr = W5 | (W7 << 16) */
+        smulbt v5, ip, a3
+        smultt v6, lr, a4
+        smlatt v5, ip, a4, v5
+        smultt a2, ip, a3
+        smulbt v7, lr, a3
+        sub v6, v6, a2
+        smulbt a2, ip, a4
+        smultt fp, lr, a3
+        sub v7, v7, a2
+        smulbt a2, lr, a4
+        ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
+        sub fp, fp, a2
+
+        orrs a2, a3, a4
+        beq 1f
+
+        smlabt v5, lr, a3, v5
+        smlabt v6, ip, a3, v6
+        smlatt v5, lr, a4, v5
+        smlabt v6, lr, a4, v6
+        smlatt v7, lr, a3, v7
+        smlatt fp, ip, a3, fp
+        smulbt a2, ip, a4
+        smlatt v7, ip, a4, v7
+        sub fp, fp, a2
+
+        ldr ip, =W26 /* ip = W2 | (W6 << 16) */
+        mov a2, #16384
+        sub a2, a2, #1 /* a2 = W4 */
+        smulbb a2, a2, a3 /* a2 = W4*row[4] */
+        smultb lr, ip, a4 /* lr = W6*row[6] */
+        add v1, v1, a2 /* v1 += W4*row[4] */
+        add v1, v1, lr /* v1 += W6*row[6] */
+        add v4, v4, a2 /* v4 += W4*row[4] */
+        sub v4, v4, lr /* v4 -= W6*row[6] */
+        smulbb lr, ip, a4 /* lr = W2*row[6] */
+        sub v2, v2, a2 /* v2 -= W4*row[4] */
+        sub v2, v2, lr /* v2 -= W2*row[6] */
+        sub v3, v3, a2 /* v3 -= W4*row[4] */
+        add v3, v3, lr /* v3 += W2*row[6] */
+
+1:      add a2, v1, v5
+        mov a3, a2, lsr #11
+        bic a3, a3, #0x1f0000
+        sub a2, v2, v6
+        mov a2, a2, lsr #11
+        add a3, a3, a2, lsl #16
+        add a2, v3, v7
+        mov a4, a2, lsr #11
+        bic a4, a4, #0x1f0000
+        add a2, v4, fp
+        mov a2, a2, lsr #11
+        add a4, a4, a2, lsl #16
+        strd a3, a4, [a1]
+
+        sub a2, v4, fp
+        mov a3, a2, lsr #11
+        bic a3, a3, #0x1f0000
+        sub a2, v3, v7
+        mov a2, a2, lsr #11
+        add a3, a3, a2, lsl #16
+        add a2, v2, v6
+        mov a4, a2, lsr #11
+        bic a4, a4, #0x1f0000
+        sub a2, v1, v5
+        mov a2, a2, lsr #11
+        add a4, a4, a2, lsl #16
+        strd a3, a4, [a1, #8]
+
+        ldr pc, [sp], #4
+
+row_dc_only:
+        orr a3, a3, a3, lsl #16
+        bic a3, a3, #0xe000
+        mov a3, a3, lsl #3
+        mov a4, a3
+        strd a3, a4, [a1]
+        strd a3, a4, [a1, #8]
+
+        ldr pc, [sp], #4
+endfunc
+/* Shared column-pass core for the two adjacent columns held in each 32-bit word at a1 (rows are 16 bytes apart): pushes eight 32-bit partial sums on the stack and leaves eight more in v1-v7 and fp for the caller to add/subtract. */
+        .macro idct_col
+        ldr a4, [a1] /* a4 = col[1:0] */
+        mov ip, #16384
+        sub ip, ip, #1 /* ip = W4 */
+        mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
+        add v2, v1, a4, asr #16
+        rsb v2, v2, v2, lsl #14
+        mov a4, a4, lsl #16
+        add v1, v1, a4, asr #16
+        ldr a4, [a1, #(16*4)]
+        rsb v1, v1, v1, lsl #14
+
+        smulbb lr, ip, a4
+        smulbt a3, ip, a4
+        sub v3, v1, lr
+        sub v5, v1, lr
+        add v7, v1, lr
+        add v1, v1, lr
+        sub v4, v2, a3
+        sub v6, v2, a3
+        add fp, v2, a3
+        ldr ip, =W26
+        ldr a4, [a1, #(16*2)]
+        add v2, v2, a3
+
+        smulbb lr, ip, a4
+        smultb a3, ip, a4
+        add v1, v1, lr
+        sub v7, v7, lr
+        add v3, v3, a3
+        sub v5, v5, a3
+        smulbt lr, ip, a4
+        smultt a3, ip, a4
+        add v2, v2, lr
+        sub fp, fp, lr
+        add v4, v4, a3
+        ldr a4, [a1, #(16*6)]
+        sub v6, v6, a3
+
+        smultb lr, ip, a4
+        smulbb a3, ip, a4
+        add v1, v1, lr
+        sub v7, v7, lr
+        sub v3, v3, a3
+        add v5, v5, a3
+        smultt lr, ip, a4
+        smulbt a3, ip, a4
+        add v2, v2, lr
+        sub fp, fp, lr
+        sub v4, v4, a3
+        add v6, v6, a3
+
+        stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
+
+        ldr ip, =W13
+        ldr a4, [a1, #(16*1)]
+        ldr lr, =W57
+        smulbb v1, ip, a4
+        smultb v3, ip, a4
+        smulbb v5, lr, a4
+        smultb v7, lr, a4
+        smulbt v2, ip, a4
+        smultt v4, ip, a4
+        smulbt v6, lr, a4
+        smultt fp, lr, a4
+        rsb v4, v4, #0
+        ldr a4, [a1, #(16*3)]
+        rsb v3, v3, #0
+
+        smlatb v1, ip, a4, v1
+        smlatb v3, lr, a4, v3
+        smulbb a3, ip, a4
+        smulbb a2, lr, a4
+        sub v5, v5, a3
+        sub v7, v7, a2
+        smlatt v2, ip, a4, v2
+        smlatt v4, lr, a4, v4
+        smulbt a3, ip, a4
+        smulbt a2, lr, a4
+        sub v6, v6, a3
+        ldr a4, [a1, #(16*5)]
+        sub fp, fp, a2
+
+        smlabb v1, lr, a4, v1
+        smlabb v3, ip, a4, v3
+        smlatb v5, lr, a4, v5
+        smlatb v7, ip, a4, v7
+        smlabt v2, lr, a4, v2
+        smlabt v4, ip, a4, v4
+        smlatt v6, lr, a4, v6
+        ldr a3, [a1, #(16*7)]
+        smlatt fp, ip, a4, fp
+
+        smlatb v1, lr, a3, v1
+        smlabb v3, lr, a3, v3
+        smlatb v5, ip, a3, v5
+        smulbb a4, ip, a3
+        smlatt v2, lr, a3, v2
+        sub v7, v7, a4
+        smlabt v4, lr, a3, v4
+        smulbt a4, ip, a3
+        smlatt v6, ip, a3, v6
+        sub fp, fp, a4
+        .endm
+/* Column pass for two columns, in place: combines the idct_col partial sums, shifts them right by 20 and stores each pair back to the block at a1 as two packed 16-bit values. */
+function idct_col_armv5te
+        str lr, [sp, #-4]!
+
+        idct_col
+
+        ldmfd sp!, {a3, a4}
+        adds a2, a3, v1
+        mov a2, a2, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        add ip, a4, v2
+        mov ip, ip, asr #20
+        orr a2, a2, ip, lsl #16
+        str a2, [a1]
+        subs a3, a3, v1
+        mov a2, a3, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        sub a4, a4, v2
+        mov a4, a4, asr #20
+        orr a2, a2, a4, lsl #16
+        ldmfd sp!, {a3, a4}
+        str a2, [a1, #(16*7)]
+
+        subs a2, a3, v3
+        mov a2, a2, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        sub ip, a4, v4
+        mov ip, ip, asr #20
+        orr a2, a2, ip, lsl #16
+        str a2, [a1, #(16*1)]
+        adds a3, a3, v3
+        mov a2, a3, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        add a4, a4, v4
+        mov a4, a4, asr #20
+        orr a2, a2, a4, lsl #16
+        ldmfd sp!, {a3, a4}
+        str a2, [a1, #(16*6)]
+
+        adds a2, a3, v5
+        mov a2, a2, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        add ip, a4, v6
+        mov ip, ip, asr #20
+        orr a2, a2, ip, lsl #16
+        str a2, [a1, #(16*2)]
+        subs a3, a3, v5
+        mov a2, a3, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        sub a4, a4, v6
+        mov a4, a4, asr #20
+        orr a2, a2, a4, lsl #16
+        ldmfd sp!, {a3, a4}
+        str a2, [a1, #(16*5)]
+
+        adds a2, a3, v7
+        mov a2, a2, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        add ip, a4, fp
+        mov ip, ip, asr #20
+        orr a2, a2, ip, lsl #16
+        str a2, [a1, #(16*3)]
+        subs a3, a3, v7
+        mov a2, a3, lsr #20
+        it mi
+        orrmi a2, a2, #0xf000
+        sub a4, a4, fp
+        mov a4, a4, asr #20
+        orr a2, a2, a4, lsl #16
+        str a2, [a1, #(16*4)]
+
+        ldr pc, [sp], #4
+endfunc
+/* clip dst, src...: dst = the src operand (which may include a shift) clamped to the range 0..255. */
+.macro clip dst, src:vararg
+        movs \dst, \src
+        it mi
+        movmi \dst, #0
+        cmp \dst, #255
+        it gt
+        movgt \dst, #255
+.endm
+/* aclip dst, src...: dst = the sum formed by the src operands of an adds instruction, clamped to the range 0..255. */
+.macro aclip dst, src:vararg
+        adds \dst, \src
+        it mi
+        movmi \dst, #0
+        cmp \dst, #255
+        it gt
+        movgt \dst, #255
+.endm
+/* Column pass for two columns with byte output: results are shifted right by 20, clamped to 0..255 and stored as byte pairs; the output pointer and row stride are read from the words the exported caller pushed on the stack, and the saved pointer is advanced by 2. */
+function idct_col_put_armv5te
+        str lr, [sp, #-4]!
+
+        idct_col
+
+        ldmfd sp!, {a3, a4}
+        ldr lr, [sp, #32]
+        add a2, a3, v1
+        clip a2, a2, asr #20
+        add ip, a4, v2
+        clip ip, ip, asr #20
+        orr a2, a2, ip, lsl #8
+        sub a3, a3, v1
+        clip a3, a3, asr #20
+        sub a4, a4, v2
+        clip a4, a4, asr #20
+        ldr v1, [sp, #28]
+        strh a2, [v1]
+        add a2, v1, #2
+        str a2, [sp, #28]
+        orr a2, a3, a4, lsl #8
+        rsb v2, lr, lr, lsl #3
+        ldmfd sp!, {a3, a4}
+        strh_pre a2, v2, v1
+
+        sub a2, a3, v3
+        clip a2, a2, asr #20
+        sub ip, a4, v4
+        clip ip, ip, asr #20
+        orr a2, a2, ip, lsl #8
+        strh_pre a2, v1, lr
+        add a3, a3, v3
+        clip a2, a3, asr #20
+        add a4, a4, v4
+        clip a4, a4, asr #20
+        orr a2, a2, a4, lsl #8
+        ldmfd sp!, {a3, a4}
+        strh_dpre a2, v2, lr
+
+        add a2, a3, v5
+        clip a2, a2, asr #20
+        add ip, a4, v6
+        clip ip, ip, asr #20
+        orr a2, a2, ip, lsl #8
+        strh_pre a2, v1, lr
+        sub a3, a3, v5
+        clip a2, a3, asr #20
+        sub a4, a4, v6
+        clip a4, a4, asr #20
+        orr a2, a2, a4, lsl #8
+        ldmfd sp!, {a3, a4}
+        strh_dpre a2, v2, lr
+
+        add a2, a3, v7
+        clip a2, a2, asr #20
+        add ip, a4, fp
+        clip ip, ip, asr #20
+        orr a2, a2, ip, lsl #8
+        strh a2, [v1, lr]
+        sub a3, a3, v7
+        clip a2, a3, asr #20
+        sub a4, a4, fp
+        clip a4, a4, asr #20
+        orr a2, a2, a4, lsl #8
+        strh_dpre a2, v2, lr
+
+        ldr pc, [sp], #4
+endfunc
+/* Column pass for two columns with accumulate: each result (shifted right by 20) is added to the byte already at the output location, clamped to 0..255 and stored back; pointer and stride come from the words the exported caller pushed on the stack. */
+function idct_col_add_armv5te
+        str lr, [sp, #-4]!
+
+        idct_col
+
+        ldr lr, [sp, #36]
+
+        ldmfd sp!, {a3, a4}
+        ldrh ip, [lr]
+        add a2, a3, v1
+        sub a3, a3, v1
+        and v1, ip, #255
+        aclip a2, v1, a2, asr #20
+        add v1, a4, v2
+        mov v1, v1, asr #20
+        aclip v1, v1, ip, lsr #8
+        orr a2, a2, v1, lsl #8
+        ldr v1, [sp, #32]
+        sub a4, a4, v2
+        rsb v2, v1, v1, lsl #3
+        ldrh_pre ip, v2, lr
+        strh a2, [lr]
+        and a2, ip, #255
+        aclip a3, a2, a3, asr #20
+        mov a4, a4, asr #20
+        aclip a4, a4, ip, lsr #8
+        add a2, lr, #2
+        str a2, [sp, #28]
+        orr a2, a3, a4, lsl #8
+        strh a2, [v2]
+
+        ldmfd sp!, {a3, a4}
+        ldrh_pre ip, lr, v1
+        sub a2, a3, v3
+        add a3, a3, v3
+        and v3, ip, #255
+        aclip a2, v3, a2, asr #20
+        sub v3, a4, v4
+        mov v3, v3, asr #20
+        aclip v3, v3, ip, lsr #8
+        orr a2, a2, v3, lsl #8
+        add a4, a4, v4
+        ldrh_dpre ip, v2, v1
+        strh a2, [lr]
+        and a2, ip, #255
+        aclip a3, a2, a3, asr #20
+        mov a4, a4, asr #20
+        aclip a4, a4, ip, lsr #8
+        orr a2, a3, a4, lsl #8
+        strh a2, [v2]
+
+        ldmfd sp!, {a3, a4}
+        ldrh_pre ip, lr, v1
+        add a2, a3, v5
+        sub a3, a3, v5
+        and v3, ip, #255
+        aclip a2, v3, a2, asr #20
+        add v3, a4, v6
+        mov v3, v3, asr #20
+        aclip v3, v3, ip, lsr #8
+        orr a2, a2, v3, lsl #8
+        sub a4, a4, v6
+        ldrh_dpre ip, v2, v1
+        strh a2, [lr]
+        and a2, ip, #255
+        aclip a3, a2, a3, asr #20
+        mov a4, a4, asr #20
+        aclip a4, a4, ip, lsr #8
+        orr a2, a3, a4, lsl #8
+        strh a2, [v2]
+
+        ldmfd sp!, {a3, a4}
+        ldrh_pre ip, lr, v1
+        add a2, a3, v7
+        sub a3, a3, v7
+        and v3, ip, #255
+        aclip a2, v3, a2, asr #20
+        add v3, a4, fp
+        mov v3, v3, asr #20
+        aclip v3, v3, ip, lsr #8
+        orr a2, a2, v3, lsl #8
+        sub a4, a4, fp
+        ldrh_dpre ip, v2, v1
+        strh a2, [lr]
+        and a2, ip, #255
+        aclip a3, a2, a3, asr #20
+        mov a4, a4, asr #20
+        aclip a4, a4, ip, lsr #8
+        orr a2, a3, a4, lsl #8
+        strh a2, [v2]
+
+        ldr pc, [sp], #4
+endfunc
+/* In-place 8x8 IDCT of the 16-bit coefficient block at a1: eight row passes (16 bytes apart) followed by four two-column passes (4 bytes apart). */
+function ff_simple_idct_armv5te, export=1
+        stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+
+        sub a1, a1, #(16*7)
+
+        bl idct_col_armv5te
+        add a1, a1, #4
+        bl idct_col_armv5te
+        add a1, a1, #4
+        bl idct_col_armv5te
+        add a1, a1, #4
+        bl idct_col_armv5te
+
+        ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+endfunc
+/* IDCT of the coefficient block at a3, accumulated with clamping into the bytes at a1 using row stride a2; a1 and a2 are pushed so that idct_col_add_armv5te can read them from the stack. */
+function ff_simple_idct_add_armv5te, export=1
+        stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+        mov a1, a3
+
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+
+        sub a1, a1, #(16*7)
+
+        bl idct_col_add_armv5te
+        add a1, a1, #4
+        bl idct_col_add_armv5te
+        add a1, a1, #4
+        bl idct_col_add_armv5te
+        add a1, a1, #4
+        bl idct_col_add_armv5te
+
+        add sp, sp, #8
+        ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+endfunc
+/* IDCT of the coefficient block at a3, written with clamping to the bytes at a1 using row stride a2; a1 and a2 are pushed so that idct_col_put_armv5te can read them from the stack. */
+function ff_simple_idct_put_armv5te, export=1
+        stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+        mov a1, a3
+
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+        add a1, a1, #16
+        bl idct_row_armv5te
+
+        sub a1, a1, #(16*7)
+
+        bl idct_col_put_armv5te
+        add a1, a1, #4
+        bl idct_col_put_armv5te
+        add a1, a1, #4
+        bl idct_col_put_armv5te
+        add a1, a1, #4
+        bl idct_col_put_armv5te
+
+        add sp, sp, #8
+        ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+endfunc
diff --git a/media/ffvpx/libavcodec/arm/simple_idct_armv6.S b/media/ffvpx/libavcodec/arm/simple_idct_armv6.S
new file mode 100644
index 0000000000..f95c20d295
--- /dev/null
+++
b/media/ffvpx/libavcodec/arm/simple_idct_armv6.S @@ -0,0 +1,425 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2007 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) +#define W26 (W2 | (W6 << 16)) +#define W42 (W4 | (W2 << 16)) +#define W42n (-W4&0xffff | (-W2 << 16)) +#define W46 (W4 | (W6 << 16)) +#define W57 (W5 | (W7 << 16)) + +/* + Compute partial IDCT of single row. 
+ shift = left-shift amount + r0 = source address + r2 = row[2,0] <= 2 cycles + r3 = row[3,1] + ip = w42 <= 2 cycles + + Output in registers r4--r11 +*/ + .macro idct_row shift + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ + mov r1, #(1<<(\shift-1)) /* r1 = 1 << (shift-1), the rounding bias */ + smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */ + smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */ + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ + smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */ + smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */ + + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + ldr lr, [r0, #12] /* lr = row[7,5] */ + pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + + ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */ + smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ + ldr r2, [r0, #4] /* r2 = row[6,4] */ + smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ + ldr ip, =W46 /* ip = W4 | (W6 << 16) */ + smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ + + smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ + smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ + smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ + smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ + .endm + +/* + Compute partial IDCT of half row.
+ shift = left-shift amount + r2 = row[2,0] + r3 = row[3,1] + ip = w42 + + Output in registers r4--r11 +*/ + .macro idct_row4 shift + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ + mov r1, #(1<<(\shift-1)) /* r1 = 1 << (shift-1), the rounding bias */ + smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */ + smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */ + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */ + smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */ + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + .endm + +/* + Compute final part of IDCT single row without shift. + Input in registers r4--r11 + Output in registers ip, r4--r6, lr, r8--r10 +*/ + .macro idct_finish + add ip, r4, r8 /* ip = A0 + B0 */ + sub lr, r4, r8 /* lr = A0 - B0 */ + sub r4, r5, r9 /* r4 = A1 + B1 (r9 holds -B1) */ + add r8, r5, r9 /* r8 = A1 - B1 */ + add r5, r6, r10 /* r5 = A2 + B2 */ + sub r9, r6, r10 /* r9 = A2 - B2 */ + add r6, r7, r11 /* r6 = A3 + B3 */ + sub r10,r7, r11 /* r10 = A3 - B3 */ + .endm + +/* + Compute final part of IDCT single row. + shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub r2, r4, r8 /* r2 = A0 - B0 */ + mov r4, r3, asr #\shift + mov r8, r2, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 */ + add r2, r5, r9 /* r2 = A1 - B1 */ + mov r5, r3, asr #\shift + mov r9, r2, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub r2, r6, r10 /* r2 = A2 - B2 */ + mov r6, r3, asr #\shift + mov r10,r2, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub r2, r7, r11 /* r2 = A3 - B3 */ + mov r7, r3, asr #\shift + mov r11,r2, asr #\shift + .endm + +/* + Compute final part of IDCT single row, saturating results at 8 bits.
+ shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift_sat shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub ip, r4, r8 /* ip = A0 - B0 */ + usat r4, #8, r3, asr #\shift + usat r8, #8, ip, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 */ + add ip, r5, r9 /* ip = A1 - B1 */ + usat r5, #8, r3, asr #\shift + usat r9, #8, ip, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub ip, r6, r10 /* ip = A2 - B2 */ + usat r6, #8, r3, asr #\shift + usat r10,#8, ip, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub ip, r7, r11 /* ip = A3 - B3 */ + usat r7, #8, r3, asr #\shift + usat r11,#8, ip, asr #\shift + .endm + +/* + Compute IDCT of single row, storing as column. + r0 = source + r1 = dest +*/ +function idct_row_armv6 + push {lr} + + ldr lr, [r0, #12] /* lr = row[7,5] */ + ldr ip, [r0, #4] /* ip = row[6,4] */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + ldr r2, [r0] /* r2 = row[2,0] */ + orrs lr, lr, ip + itt eq + cmpeq lr, r3 + cmpeq lr, r2, lsr #16 + beq 1f + push {r1} + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + cmp lr, #0 + beq 2f + + idct_row ROW_SHIFT + b 3f + +2: idct_row4 ROW_SHIFT + +3: pop {r1} + idct_finish_shift ROW_SHIFT + + strh r4, [r1] + strh r5, [r1, #(16*2)] + strh r6, [r1, #(16*4)] + strh r7, [r1, #(16*6)] + strh r11,[r1, #(16*1)] + strh r10,[r1, #(16*3)] + strh r9, [r1, #(16*5)] + strh r8, [r1, #(16*7)] + + pop {pc} + +1: mov r2, r2, lsl #3 + strh r2, [r1] + strh r2, [r1, #(16*2)] + strh r2, [r1, #(16*4)] + strh r2, [r1, #(16*6)] + strh r2, [r1, #(16*1)] + strh r2, [r1, #(16*3)] + strh r2, [r1, #(16*5)] + strh r2, [r1, #(16*7)] + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row. 
+ r0 = source + r1 = dest +*/ +function idct_col_armv6 + push {r1, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1} + idct_finish_shift COL_SHIFT + + strh r4, [r1] + strh r5, [r1, #(16*1)] + strh r6, [r1, #(16*2)] + strh r7, [r1, #(16*3)] + strh r11,[r1, #(16*4)] + strh r10,[r1, #(16*5)] + strh r9, [r1, #(16*6)] + strh r8, [r1, #(16*7)] + + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row, store saturated 8-bit. + r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_put_armv6 + push {r1, r2, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} + idct_finish_shift_sat COL_SHIFT + + strb_post r4, r1, r2 + strb_post r5, r1, r2 + strb_post r6, r1, r2 + strb_post r7, r1, r2 + strb_post r11,r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 + + sub r1, r1, r2, lsl #3 + + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row, add/store saturated 8-bit. 
+ r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_add_armv6 + push {r1, r2, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} + idct_finish + + ldrb r3, [r1] + ldrb r7, [r1, r2] + ldrb r11,[r1, r2, lsl #2] + add ip, r3, ip, asr #COL_SHIFT + usat ip, #8, ip + add r4, r7, r4, asr #COL_SHIFT + strb_post ip, r1, r2 + ldrb ip, [r1, r2] + usat r4, #8, r4 + ldrb r11,[r1, r2, lsl #2] + add r5, ip, r5, asr #COL_SHIFT + usat r5, #8, r5 + strb_post r4, r1, r2 + ldrb r3, [r1, r2] + ldrb ip, [r1, r2, lsl #2] + strb_post r5, r1, r2 + ldrb r7, [r1, r2] + ldrb r4, [r1, r2, lsl #2] + add r6, r3, r6, asr #COL_SHIFT + usat r6, #8, r6 + add r10,r7, r10,asr #COL_SHIFT + usat r10,#8, r10 + add r9, r11,r9, asr #COL_SHIFT + usat r9, #8, r9 + add r8, ip, r8, asr #COL_SHIFT + usat r8, #8, r8 + add lr, r4, lr, asr #COL_SHIFT + usat lr, #8, lr + strb_post r6, r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 + strb_post lr, r1, r2 + + sub r1, r1, r2, lsl #3 + + pop {pc} +endfunc + +/* + Compute 8 IDCT row transforms. 
+ func = IDCT row->col function + width = width of columns in bytes +*/ + .macro idct_rows func width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + sub r0, r0, #(16*5) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + + sub r0, r0, #(16*7) + .endm + +/* void ff_simple_idct_armv6(int16_t *data); */ +function ff_simple_idct_armv6, export=1 + push {r4-r11, lr} + sub sp, sp, #128 + + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r1, r0 + mov r0, sp + idct_rows idct_col_armv6, 2 + + add sp, sp, #128 + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_add_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_add_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_put_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_put_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/simple_idct_neon.S b/media/ffvpx/libavcodec/arm/simple_idct_neon.S new file mode 100644 index 0000000000..726d4cbefa --- /dev/null +++ b/media/ffvpx/libavcodec/arm/simple_idct_neon.S @@ -0,0 +1,375 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This 
file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4c ((1<<(COL_SHIFT-1))/W4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define w1 d0[0] +#define w2 d0[1] +#define w3 d0[2] +#define w4 d0[3] +#define w5 d1[0] +#define w6 d1[1] +#define w7 d1[2] +#define w4c d1[3] + + .macro idct_col4_top + vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ + vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ + vadd.i32 q11, q15, q7 /* q11 = q15 + W2 * col[2] */ + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ + vadd.i32 q12, q15, q8 /* q12 = q15 + W6 * col[2] */ + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ + vsub.i32 q13, q15, q8 /* q13 = q15 - W6 * col[2] */ + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ + vsub.i32 q14, q15, q7 /* q14 = q15 - W2 * col[2] */ + + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ + vmlsl.s16 q6, d8, w5 /*
q6 -= W5 * col[3] */ + .endm + + .text + .align 6 + +function idct_row4_pld_neon + pld [r0] + add r3, r0, r1, lsl #2 + pld [r0, r1] + pld [r0, r1, lsl #1] +A pld [r3, -r1] + pld [r3] + pld [r3, r1] + add r3, r3, r1, lsl #1 + pld [r3] + pld [r3, r1] +endfunc + +function idct_row4_neon + vmov.i32 q15, #(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ + vld1.64 {d6,d7}, [r2,:128]! + vorr d10, d3, d5 + vld1.64 {d8,d9}, [r2,:128]! + add r2, r2, #-64 + + vorr d11, d7, d9 + vorr d10, d10, d11 + vmov r3, r4, d10 + + idct_col4_top + + orrs r3, r3, r4 + beq 1f + + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + vsub.i32 q14, q14, q7 + +1: vadd.i32 q3, q11, q9 + vadd.i32 q4, q12, q10 + vshrn.i32 d2, q3, #ROW_SHIFT + vshrn.i32 d4, q4, #ROW_SHIFT + vadd.i32 q7, q13, q5 + vadd.i32 q8, q14, q6 + vtrn.16 d2, d4 + vshrn.i32 d6, q7, #ROW_SHIFT + vshrn.i32 d8, q8, #ROW_SHIFT + vsub.i32 q14, q14, q6 + vsub.i32 q11, q11, q9 + vtrn.16 d6, d8 + vsub.i32 q13, q13, q5 + vshrn.i32 d3, q14, #ROW_SHIFT + vtrn.32 d2, d6 + vsub.i32 q12, q12, q10 + vtrn.32 d4, d8 + vshrn.i32 d5, q13, #ROW_SHIFT + vshrn.i32 d7, q12, #ROW_SHIFT + vshrn.i32 d9, q11, #ROW_SHIFT + + vtrn.16 d3, d5 + vtrn.16 d7, d9 + vtrn.32 d3, d7 + vtrn.32 d5, d9 + + vst1.64 {d2-d5}, [r2,:128]! + vst1.64 {d6-d9}, [r2,:128]! 
+ + bx lr +endfunc + +function idct_col4_neon + mov ip, #16 /* ip = post-increment applied by each load below */ + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ + vdup.16 d30, w4c + vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */ + vadd.i16 d30, d30, d2 + vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */ + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ + vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */ + + ldrd r4, r5, [r2] /* r4:r5 = col[4], fetched only to test it for zero */ + ldrd r6, r7, [r2, #16] /* r6:r7 = col[5], likewise */ + orrs r4, r4, r5 /* Z set when col[4] is all zero */ + + idct_col4_top + it eq + addeq r2, r2, #16 /* col[4] is zero: step over it instead of loading */ + beq 1f + + vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */ + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + +1: orrs r6, r6, r7 /* Z set when col[5] is all zero */ + ldrd r4, r5, [r2, #16] /* r4:r5 = col[6] */ + it eq + addeq r2, r2, #16 + beq 2f + + vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + +2: orrs r4, r4, r5 /* Z set when col[6] is all zero */ + ldrd r4, r5, [r2, #16] /* r4:r5 = col[7] */ + it eq + addeq r2, r2, #16 + beq 3f + + vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vadd.i32 q11, q11, q7 + vsub.i32 q14, q14, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + +3: orrs r4, r4, r5 /* Z set when col[7] is all zero */ + it eq + addeq r2, r2, #16 + beq 4f + + vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ + vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ + vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ + vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ + vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ + +4: vaddhn.i32 d2, q11, q9 /* narrowing add/sub keep the high 16 bits of each 32-bit lane */ + vaddhn.i32 d3, q12, q10 + vaddhn.i32 d4, q13, q5 + vaddhn.i32 d5, q14, q6 + vsubhn.i32 d9, q11, q9 + vsubhn.i32 d8, q12, q10 + vsubhn.i32 d7, q13, q5 + vsubhn.i32 d6, q14, q6 + + bx lr +endfunc + + .align 6 + +function idct_col4_st8_neon + vqshrun.s16 d2, q1, #COL_SHIFT-16 + vqshrun.s16 d3, q2, #COL_SHIFT-16 + vqshrun.s16 d4, q3, #COL_SHIFT-16 + vqshrun.s16 d5, q4, #COL_SHIFT-16 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 +
vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + vst1.32 {d5[0]}, [r0,:32], r1 + vst1.32 {d5[1]}, [r0,:32], r1 + + bx lr +endfunc + +const idct_coeff_neon, align=4 + .short W1, W2, W3, W4, W5, W6, W7, W4c +endconst + + .macro idct_start data + push {r4-r7, lr} + pld [\data] + pld [\data, #64] + vpush {d8-d15} + movrel r3, idct_coeff_neon + vld1.64 {d0,d1}, [r3,:128] + .endm + + .macro idct_end + vpop {d8-d15} + pop {r4-r7, pc} + .endm + +/* void ff_simple_idct_put_neon(uint8_t *dst, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_put_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl idct_col4_st8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_st8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_add8_neon + mov ip, r0 + + vld1.32 {d10[0]}, [r0,:32], r1 + vshr.s16 q1, q1, #COL_SHIFT-16 + vld1.32 {d10[1]}, [r0,:32], r1 + vshr.s16 q2, q2, #COL_SHIFT-16 + vld1.32 {d11[0]}, [r0,:32], r1 + vshr.s16 q3, q3, #COL_SHIFT-16 + vld1.32 {d11[1]}, [r0,:32], r1 + vshr.s16 q4, q4, #COL_SHIFT-16 + vld1.32 {d12[0]}, [r0,:32], r1 + vaddw.u8 q1, q1, d10 + vld1.32 {d12[1]}, [r0,:32], r1 + vaddw.u8 q2, q2, d11 + vld1.32 {d13[0]}, [r0,:32], r1 + vqmovun.s16 d2, q1 + vld1.32 {d13[1]}, [r0,:32], r1 + vaddw.u8 q3, q3, d12 + vst1.32 {d2[0]}, [ip,:32], r1 + vqmovun.s16 d3, q2 + vst1.32 {d2[1]}, [ip,:32], r1 + vaddw.u8 q4, q4, d13 + vst1.32 {d3[0]}, [ip,:32], r1 + vqmovun.s16 d4, q3 + vst1.32 {d3[1]}, [ip,:32], r1 + vqmovun.s16 d5, q4 + vst1.32 {d4[0]}, [ip,:32], r1 + vst1.32 {d4[1]}, [ip,:32], r1 + vst1.32 {d5[0]}, [ip,:32], r1 + vst1.32 {d5[1]}, [ip,:32], r1 + + bx lr +endfunc + +/* void ff_simple_idct_add_neon(uint8_t *dst, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_add_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl 
idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl idct_col4_add8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_add8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_st16_neon + mov ip, #16 + + vshr.s16 q1, q1, #COL_SHIFT-16 + vshr.s16 q2, q2, #COL_SHIFT-16 + vst1.64 {d2}, [r2,:64], ip + vshr.s16 q3, q3, #COL_SHIFT-16 + vst1.64 {d3}, [r2,:64], ip + vshr.s16 q4, q4, #COL_SHIFT-16 + vst1.64 {d4}, [r2,:64], ip + vst1.64 {d5}, [r2,:64], ip + vst1.64 {d6}, [r2,:64], ip + vst1.64 {d7}, [r2,:64], ip + vst1.64 {d8}, [r2,:64], ip + vst1.64 {d9}, [r2,:64], ip + + bx lr +endfunc + +/* void ff_simple_idct_neon(int16_t *data); */ +function ff_simple_idct_neon, export=1 + idct_start r0 + + mov r2, r0 + bl idct_row4_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + add r2, r2, #-120 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + + idct_end +endfunc diff --git a/media/ffvpx/libavcodec/atsc_a53.c b/media/ffvpx/libavcodec/atsc_a53.c new file mode 100644 index 0000000000..29ec71bc5f --- /dev/null +++ b/media/ffvpx/libavcodec/atsc_a53.c @@ -0,0 +1,119 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> + +#include "atsc_a53.h" +#include "get_bits.h" + +/* Serialise the frame's AV_FRAME_DATA_A53_CC side data into a newly av_mallocz()ed buffer laid out as: prefix_len bytes, a 10-byte header, the caption bytes, one trailing 0xFF byte. *sei_size receives the length without the prefix. Returns 0 with *data == NULL (and *sei_size left untouched) when frame is NULL or carries no such side data; returns AVERROR(ENOMEM) with *data == NULL when the allocation fails. NOTE(review): ownership of *data presumably passes to the caller - confirm. */ int ff_alloc_a53_sei(const AVFrame *frame, size_t prefix_len, + void **data, size_t *sei_size) +{ + AVFrameSideData *side_data = NULL; + uint8_t *sei_data; + + if (frame) + side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_A53_CC); + + if (!side_data) { + *data = NULL; + return 0; + } + + *sei_size = side_data->size + 11; /* 10 header bytes + payload + 1 trailing marker byte */ + *data = av_mallocz(*sei_size + prefix_len); + if (!*data) + return AVERROR(ENOMEM); + sei_data = (uint8_t*)*data + prefix_len; /* the message starts after the caller's prefix */ + + // country code + sei_data[0] = 181; + sei_data[1] = 0; + sei_data[2] = 49; /* NOTE(review): presumably a one-byte country code (181) followed by a two-byte provider code (0, 49) - confirm against the ATSC A/53 syntax */ + + /** + * 'GA94' is standard in North America for ATSC, but hard coding + * this style may not be the right thing to do -- other formats + * do exist. This information is not available in the side_data + * so we are going with this right now. + */ + AV_WL32(sei_data + 3, MKTAG('G', 'A', '9', '4')); + sei_data[7] = 3; /* user_data_type_code, the value ff_parse_a53_cc() checks for */ + sei_data[8] = ((side_data->size/3) & 0x1f) | 0x40; /* low 5 bits: count of 3-byte captions; 0x40: process_cc_data_flag. NOTE(review): the count is masked to 5 bits while the memcpy below still copies every payload byte - confirm callers never supply more than 31 captions */ + sei_data[9] = 0; /* byte the parser skips as reserved */ + + memcpy(sei_data + 10, side_data->data, side_data->size); + + sei_data[side_data->size+10] = 255; /* trailing marker_bits byte */ + + return 0; +} + +int ff_parse_a53_cc(AVBufferRef **pbuf, const uint8_t *data, int size) +{ + AVBufferRef *buf = *pbuf; + GetBitContext gb; + size_t new_size, old_size = buf ?
buf->size : 0; /* old_size = bytes already in *pbuf, 0 when *pbuf is NULL */ + int ret, cc_count; + + if (size < 3) /* the fixed fields read below occupy 3 bytes */ + return AVERROR_INVALIDDATA; + + ret = init_get_bits8(&gb, data, size); + if (ret < 0) + return ret; + + if (get_bits(&gb, 8) != 0x3) // user_data_type_code + return 0; /* not caption data: nothing appended */ + + skip_bits(&gb, 1); // reserved + if (!get_bits(&gb, 1)) // process_cc_data_flag + return 0; + + skip_bits(&gb, 1); // zero bit + cc_count = get_bits(&gb, 5); + if (!cc_count) + return 0; + + skip_bits(&gb, 8); // reserved + + /* 3 bytes per CC plus one byte marker_bits at the end */ + if (cc_count * 3 >= (get_bits_left(&gb) >> 3)) /* so the bytes left must strictly exceed cc_count * 3 */ + return AVERROR_INVALIDDATA; + + new_size = (old_size + cc_count * 3); /* existing captions plus the new 3-byte entries */ + + if (new_size > INT_MAX) + return AVERROR_INVALIDDATA; + + /* Allow merging of the cc data from two fields. */ + ret = av_buffer_realloc(pbuf, new_size); + if (ret < 0) + return ret; + + buf = *pbuf; /* re-read: the call above may have allocated or replaced the buffer */ + /* Use of av_buffer_realloc assumes buffer is writeable */ + for (int i = 0; i < cc_count; i++) { /* append each caption; old_size doubles as the write cursor */ + buf->data[old_size++] = get_bits(&gb, 8); + buf->data[old_size++] = get_bits(&gb, 8); + buf->data[old_size++] = get_bits(&gb, 8); + } + + return cc_count; +} diff --git a/media/ffvpx/libavcodec/atsc_a53.h b/media/ffvpx/libavcodec/atsc_a53.h new file mode 100644 index 0000000000..0622a55549 --- /dev/null +++ b/media/ffvpx/libavcodec/atsc_a53.h @@ -0,0 +1,56 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ATSC_A53_H +#define AVCODEC_ATSC_A53_H + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/buffer.h" +#include "libavutil/frame.h" + +/** + * Check AVFrame for A53 side data and allocate and fill SEI message with A53 info + * + * @param frame Raw frame to get A53 side data from + * @param prefix_len Number of bytes to allocate before SEI message + * @param data Pointer to a variable to store allocated memory + * Upon return the variable will hold NULL on error or if frame has no A53 info. + * Otherwise it will point to prefix_len uninitialized bytes followed by + * *sei_size SEI message + * @param sei_size Pointer to a variable to store generated SEI message length + * @return Zero on success, negative error code on failure + */ +int ff_alloc_a53_sei(const AVFrame *frame, size_t prefix_len, + void **data, size_t *sei_size); + +/** + * Parse a data array for ATSC A53 Part 4 Closed Captions and store them in an AVBufferRef. + * + * @param pbuf Pointer to an AVBufferRef to append the closed captions. *pbuf may be NULL, in + * which case a new buffer will be allocated and put in it. + * @param data The data array containing the raw A53 data. + * @param size Size of the data array in bytes. + * + * @return Number of closed captions parsed on success, negative error code on failure. + * If no Closed Captions are parsed, *pbuf is untouched. + */ +int ff_parse_a53_cc(AVBufferRef **pbuf, const uint8_t *data, int size); + +#endif /* AVCODEC_ATSC_A53_H */ diff --git a/media/ffvpx/libavcodec/av1.h b/media/ffvpx/libavcodec/av1.h new file mode 100644 index 0000000000..384f7cddc7 --- /dev/null +++ b/media/ffvpx/libavcodec/av1.h @@ -0,0 +1,184 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * AV1 common definitions + */ + +#ifndef AVCODEC_AV1_H +#define AVCODEC_AV1_H + +// OBU types (section 6.2.2). +typedef enum { + // 0 reserved. + AV1_OBU_SEQUENCE_HEADER = 1, + AV1_OBU_TEMPORAL_DELIMITER = 2, + AV1_OBU_FRAME_HEADER = 3, + AV1_OBU_TILE_GROUP = 4, + AV1_OBU_METADATA = 5, + AV1_OBU_FRAME = 6, + AV1_OBU_REDUNDANT_FRAME_HEADER = 7, + AV1_OBU_TILE_LIST = 8, + // 9-14 reserved. + AV1_OBU_PADDING = 15, +} AV1_OBU_Type; + +// Metadata types (section 6.7.1). +enum { + AV1_METADATA_TYPE_HDR_CLL = 1, + AV1_METADATA_TYPE_HDR_MDCV = 2, + AV1_METADATA_TYPE_SCALABILITY = 3, + AV1_METADATA_TYPE_ITUT_T35 = 4, + AV1_METADATA_TYPE_TIMECODE = 5, +}; + +// Frame types (section 6.8.2). +enum { + AV1_FRAME_KEY = 0, + AV1_FRAME_INTER = 1, + AV1_FRAME_INTRA_ONLY = 2, + AV1_FRAME_SWITCH = 3, +}; + +// Reference frames (section 6.10.24). +enum { + AV1_REF_FRAME_INTRA = 0, + AV1_REF_FRAME_LAST = 1, + AV1_REF_FRAME_LAST2 = 2, + AV1_REF_FRAME_LAST3 = 3, + AV1_REF_FRAME_GOLDEN = 4, + AV1_REF_FRAME_BWDREF = 5, + AV1_REF_FRAME_ALTREF2 = 6, + AV1_REF_FRAME_ALTREF = 7, +}; + +// Constants (section 3). 
+enum { + AV1_MAX_OPERATING_POINTS = 32, + + AV1_MAX_SB_SIZE = 128, + AV1_MI_SIZE = 4, + + AV1_MAX_TILE_WIDTH = 4096, + AV1_MAX_TILE_AREA = 4096 * 2304, + AV1_MAX_TILE_ROWS = 64, + AV1_MAX_TILE_COLS = 64, + + AV1_NUM_REF_FRAMES = 8, + AV1_REFS_PER_FRAME = 7, + AV1_TOTAL_REFS_PER_FRAME = 8, + AV1_PRIMARY_REF_NONE = 7, + + AV1_MAX_SEGMENTS = 8, + AV1_SEG_LVL_MAX = 8, + + AV1_SEG_LVL_ALT_Q = 0, + AV1_SEG_LVL_ALT_LF_Y_V = 1, + AV1_SEG_LVL_REF_FRAME = 5, + AV1_SEG_LVL_SKIP = 6, + AV1_SEG_LVL_GLOBAL_MV = 7, + + AV1_SELECT_SCREEN_CONTENT_TOOLS = 2, + AV1_SELECT_INTEGER_MV = 2, + + AV1_SUPERRES_NUM = 8, + AV1_SUPERRES_DENOM_MIN = 9, + + AV1_INTERPOLATION_FILTER_SWITCHABLE = 4, + + AV1_GM_ABS_ALPHA_BITS = 12, + AV1_GM_ALPHA_PREC_BITS = 15, + AV1_GM_ABS_TRANS_ONLY_BITS = 9, + AV1_GM_TRANS_ONLY_PREC_BITS = 3, + AV1_GM_ABS_TRANS_BITS = 12, + AV1_GM_TRANS_PREC_BITS = 6, + AV1_WARPEDMODEL_PREC_BITS = 16, + + AV1_WARP_MODEL_IDENTITY = 0, + AV1_WARP_MODEL_TRANSLATION = 1, + AV1_WARP_MODEL_ROTZOOM = 2, + AV1_WARP_MODEL_AFFINE = 3, + AV1_WARP_PARAM_REDUCE_BITS = 6, + + AV1_DIV_LUT_BITS = 8, + AV1_DIV_LUT_PREC_BITS = 14, + AV1_DIV_LUT_NUM = 257, + + AV1_MAX_LOOP_FILTER = 63, +}; + + +// The main colour configuration information uses the same ISO/IEC 23001-8 +// (H.273) enums as FFmpeg does, so separate definitions are not required. + +// Chroma sample position. +enum { + AV1_CSP_UNKNOWN = 0, + AV1_CSP_VERTICAL = 1, // -> AVCHROMA_LOC_LEFT. + AV1_CSP_COLOCATED = 2, // -> AVCHROMA_LOC_TOPLEFT. 
+}; + +// Scalability modes (section 6.7.5) +enum { + AV1_SCALABILITY_L1T2 = 0, + AV1_SCALABILITY_L1T3 = 1, + AV1_SCALABILITY_L2T1 = 2, + AV1_SCALABILITY_L2T2 = 3, + AV1_SCALABILITY_L2T3 = 4, + AV1_SCALABILITY_S2T1 = 5, + AV1_SCALABILITY_S2T2 = 6, + AV1_SCALABILITY_S2T3 = 7, + AV1_SCALABILITY_L2T1h = 8, + AV1_SCALABILITY_L2T2h = 9, + AV1_SCALABILITY_L2T3h = 10, + AV1_SCALABILITY_S2T1h = 11, + AV1_SCALABILITY_S2T2h = 12, + AV1_SCALABILITY_S2T3h = 13, + AV1_SCALABILITY_SS = 14, + AV1_SCALABILITY_L3T1 = 15, + AV1_SCALABILITY_L3T2 = 16, + AV1_SCALABILITY_L3T3 = 17, + AV1_SCALABILITY_S3T1 = 18, + AV1_SCALABILITY_S3T2 = 19, + AV1_SCALABILITY_S3T3 = 20, + AV1_SCALABILITY_L3T2_KEY = 21, + AV1_SCALABILITY_L3T3_KEY = 22, + AV1_SCALABILITY_L4T5_KEY = 23, + AV1_SCALABILITY_L4T7_KEY = 24, + AV1_SCALABILITY_L3T2_KEY_SHIFT = 25, + AV1_SCALABILITY_L3T3_KEY_SHIFT = 26, + AV1_SCALABILITY_L4T5_KEY_SHIFT = 27, + AV1_SCALABILITY_L4T7_KEY_SHIFT = 28, +}; + +// Frame Restoration types (section 6.10.15) +enum { + AV1_RESTORE_NONE = 0, + AV1_RESTORE_WIENER = 1, + AV1_RESTORE_SGRPROJ = 2, + AV1_RESTORE_SWITCHABLE = 3, +}; + +// Sequence Headers are actually unbounded because one can use +// an arbitrary number of leading zeroes when encoding via uvlc. +// The following estimate is based around using the lowest number +// of bits for uvlc encoding. +#define AV1_SANE_SEQUENCE_HEADER_MAX_BITS 3138 + +#endif /* AVCODEC_AV1_H */ diff --git a/media/ffvpx/libavcodec/av1_frame_split_bsf.c b/media/ffvpx/libavcodec/av1_frame_split_bsf.c new file mode 100644 index 0000000000..5f6a40316c --- /dev/null +++ b/media/ffvpx/libavcodec/av1_frame_split_bsf.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2019 James Almer <jamrial@gmail.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * This bitstream filter splits AV1 Temporal Units into packets containing + * just one frame, plus any leading and trailing OBUs that may be present at + * the beginning or end, respectively. + * + * Temporal Units already containing only one frame will be passed through + * unchanged. When splitting can't be performed, the Temporal Unit will be + * passed through containing only the remaining OBUs starting from the first + * one after the last successfully split frame. 
#include "libavutil/avassert.h"

#include "bsf.h"
#include "bsf_internal.h"
#include "cbs.h"
#include "cbs_av1.h"

/** Private state of the av1_frame_split bitstream filter. */
typedef struct AV1FSplitContext {
    AVPacket *buffer_pkt;                 // input packet currently being split; data == NULL when idle
    CodedBitstreamContext *cbc;
    CodedBitstreamFragment temporal_unit; // parsed units of buffer_pkt

    int nb_frames;      // number of Frame / Frame Header OBUs counted in the temporal unit
    int cur_frame;      // number of those already consumed by the splitter
    int cur_frame_idx;  // unit index at which the next output packet starts scanning
    int last_frame_idx; // unit index whose data pointer starts the packet being emitted
} AV1FSplitContext;

/**
 * Produce the next output packet.
 *
 * When no packet is buffered, a new one is fetched and parsed. If it holds
 * more than one Frame / Frame Header OBU the filter enters split mode and
 * emits one packet per call; otherwise (or on any parse/ordering problem)
 * whatever is left in the buffered packet is moved to @p out unchanged.
 *
 * @return 0 on success, a negative error code from ff_bsf_get_packet_ref()
 *         or av_packet_ref() otherwise.
 */
static int av1_frame_split_filter(AVBSFContext *ctx, AVPacket *out)
{
    AV1FSplitContext *s = ctx->priv_data;
    CodedBitstreamFragment *td = &s->temporal_unit;
    int i, ret;
    // A still-populated buffer packet means a previous call left frames to emit.
    int split = !!s->buffer_pkt->data;

    if (!s->buffer_pkt->data) {
        int nb_frames = 0;

        ret = ff_bsf_get_packet_ref(ctx, s->buffer_pkt);
        if (ret < 0)
            return ret;

        ret = ff_cbs_read_packet(s->cbc, td, s->buffer_pkt);
        if (ret < 0) {
            av_log(ctx, AV_LOG_WARNING, "Failed to parse temporal unit.\n");
            goto passthrough;
        }

        // Count frame-starting OBUs; a tile list anywhere disables splitting.
        for (i = 0; i < td->nb_units; i++) {
            CodedBitstreamUnit *unit = &td->units[i];

            if (unit->type == AV1_OBU_FRAME ||
                unit->type == AV1_OBU_FRAME_HEADER)
                nb_frames++;
            else if (unit->type == AV1_OBU_TILE_LIST) {
                av_log(ctx, AV_LOG_VERBOSE, "Large scale tiles are unsupported.\n");
                goto passthrough;
            }
        }
        if (nb_frames > 1) {
            s->cur_frame = 0;
            s->cur_frame_idx = s->last_frame_idx = 0;
            s->nb_frames = nb_frames;
            split = 1;
        }
    }

    if (split) {
        AV1RawFrameHeader *frame = NULL;
        int cur_frame_type = -1, size = 0;

        // Accumulate unit sizes from cur_frame_idx until the current frame
        // is complete; "size" becomes the byte length of the output packet.
        for (i = s->cur_frame_idx; i < td->nb_units; i++) {
            CodedBitstreamUnit *unit = &td->units[i];

            size += unit->data_size;
            if (unit->type == AV1_OBU_FRAME) {
                AV1RawOBU *obu = unit->content;

                if (frame) {
                    av_log(ctx, AV_LOG_WARNING, "Frame OBU found when Tile data for a "
                                                "previous frame was expected.\n");
                    goto passthrough;
                }

                frame = &obu->obu.frame.header;
                cur_frame_type = obu->header.obu_type;
                s->last_frame_idx = s->cur_frame_idx;
                s->cur_frame_idx  = i + 1;
                s->cur_frame++;

                // split here unless it's the last frame, in which case
                // include every trailing OBU
                if (s->cur_frame < s->nb_frames)
                    break;
            } else if (unit->type == AV1_OBU_FRAME_HEADER) {
                AV1RawOBU *obu = unit->content;

                if (frame) {
                    av_log(ctx, AV_LOG_WARNING, "Frame Header OBU found when Tile data for a "
                                                "previous frame was expected.\n");
                    goto passthrough;
                }

                frame = &obu->obu.frame_header;
                cur_frame_type = obu->header.obu_type;
                s->last_frame_idx = s->cur_frame_idx;
                s->cur_frame++;

                // split here if show_existing_frame unless it's the last
                // frame, in which case include every trailing OBU
                if (frame->show_existing_frame &&
                    s->cur_frame < s->nb_frames) {
                    s->cur_frame_idx = i + 1;
                    break;
                }
            } else if (unit->type == AV1_OBU_TILE_GROUP) {
                AV1RawOBU *obu = unit->content;
                AV1RawTileGroup *group = &obu->obu.tile_group;

                // Tile groups are only accepted after a standalone Frame Header OBU.
                if (!frame || cur_frame_type != AV1_OBU_FRAME_HEADER) {
                    av_log(ctx, AV_LOG_WARNING, "Unexpected Tile Group OBU found before a "
                                                "Frame Header.\n");
                    goto passthrough;
                }

                // tg_end equal to the last tile index marks the final tile
                // group of this frame.
                if ((group->tg_end == (frame->tile_cols * frame->tile_rows) - 1) &&
                    // include every trailing OBU with the last frame
                    s->cur_frame < s->nb_frames) {
                    s->cur_frame_idx = i + 1;
                    break;
                }
            }
        }
        av_assert0(frame && s->cur_frame <= s->nb_frames);

        // The output packet shares the buffered packet's reference and is
        // then narrowed to the byte range of the units just scanned.
        ret = av_packet_ref(out, s->buffer_pkt);
        if (ret < 0)
            goto fail;

        out->data = (uint8_t *)td->units[s->last_frame_idx].data;
        out->size = size;

        // skip the frame in the buffer packet if it's split successfully, so it's not present
        // if the packet is passed through in case of failure when splitting another frame.
        s->buffer_pkt->data += size;
        s->buffer_pkt->size -= size;

        // Frames that are neither shown nor a show_existing_frame get no pts.
        if (!frame->show_existing_frame && !frame->show_frame)
            out->pts = AV_NOPTS_VALUE;

        if (s->cur_frame == s->nb_frames) {
            av_packet_unref(s->buffer_pkt);
            ff_cbs_fragment_reset(td);
        }

        return 0;
    }

passthrough:
    // Hand over whatever remains of the buffered packet untouched.
    av_packet_move_ref(out, s->buffer_pkt);

    ret = 0;
fail:
    if (ret < 0) {
        av_packet_unref(out);
        av_packet_unref(s->buffer_pkt);
    }
    ff_cbs_fragment_reset(td);

    return ret;
}

/* OBU types the CBS context is asked to decompose for this filter. */
static const CodedBitstreamUnitType decompose_unit_types[] = {
    AV1_OBU_TEMPORAL_DELIMITER,
    AV1_OBU_SEQUENCE_HEADER,
    AV1_OBU_FRAME_HEADER,
    AV1_OBU_TILE_GROUP,
    AV1_OBU_FRAME,
};

/**
 * Allocate the buffer packet, create the CBS context and, if extradata is
 * present, run it through the CBS reader. A failure to parse extradata is
 * only logged; it does not fail initialisation.
 */
static int av1_frame_split_init(AVBSFContext *ctx)
{
    AV1FSplitContext *s = ctx->priv_data;
    CodedBitstreamFragment *td = &s->temporal_unit;
    int ret;

    s->buffer_pkt = av_packet_alloc();
    if (!s->buffer_pkt)
        return AVERROR(ENOMEM);

    ret = ff_cbs_init(&s->cbc, AV_CODEC_ID_AV1, ctx);
    if (ret < 0)
        return ret;

    s->cbc->decompose_unit_types    = decompose_unit_types;
    s->cbc->nb_decompose_unit_types = FF_ARRAY_ELEMS(decompose_unit_types);

    if (!ctx->par_in->extradata_size)
        return 0;

    ret = ff_cbs_read_extradata(s->cbc, td, ctx->par_in);
    if (ret < 0)
        av_log(ctx, AV_LOG_WARNING, "Failed to parse extradata.\n");

    ff_cbs_fragment_reset(td);

    return 0;
}

/** Drop any partially split packet and its parsed fragment. */
static void av1_frame_split_flush(AVBSFContext *ctx)
{
    AV1FSplitContext *s = ctx->priv_data;

    av_packet_unref(s->buffer_pkt);
    ff_cbs_fragment_reset(&s->temporal_unit);
}

/** Release everything allocated by av1_frame_split_init(). */
static void av1_frame_split_close(AVBSFContext *ctx)
{
    AV1FSplitContext *s = ctx->priv_data;

    av_packet_free(&s->buffer_pkt);
    ff_cbs_fragment_free(&s->temporal_unit);
    ff_cbs_close(&s->cbc);
}

static const enum AVCodecID av1_frame_split_codec_ids[] = {
    AV_CODEC_ID_AV1, AV_CODEC_ID_NONE,
};
// OBU header fields + max leb128 length
// (leb128() in this header consumes at most 8 bytes.)
#define MAX_OBU_HEADER_SIZE (2 + 8)

/** A single OBU located inside an input buffer. */
typedef struct AV1OBU {
    /** Size of payload */
    int size;
    const uint8_t *data;

    /**
     * Size, in bits, of just the data, excluding the trailing_one_bit and
     * any trailing padding.
     */
    int size_bits;

    /** Size of entire OBU, including header */
    int raw_size;
    const uint8_t *raw_data;

    /** GetBitContext initialized to the start of the payload */
    GetBitContext gb;

    // NOTE(review): presumably the values parse_obu_header() reports for
    // this OBU - confirm against ff_av1_extract_obu().
    int type;

    int temporal_id;
    int spatial_id;
} AV1OBU;

/** An input packet split into OBUs */
typedef struct AV1Packet {
    AV1OBU *obus;
    int nb_obus;
    // NOTE(review): capacity bookkeeping for the obus array; the exact units
    // (elements vs. bytes) are not visible here - confirm in av1_parse.c.
    int obus_allocated;
    unsigned obus_allocated_size;
} AV1Packet;

/**
 * Extract an OBU from a raw bitstream.
 *
 * @note This function does not copy or store any bitstream data. All
 *       the pointers in the AV1OBU structure will be valid as long
 *       as the input buffer also is.
 */
int ff_av1_extract_obu(AV1OBU *obu, const uint8_t *buf, int length,
                       void *logctx);

/**
 * Split an input packet into OBUs.
 *
 * @note This function does not copy or store any bitstream data. All
 *       the pointers in the AV1Packet structure will be valid as
 *       long as the input buffer also is.
 */
int ff_av1_packet_split(AV1Packet *pkt, const uint8_t *buf, int length,
                        void *logctx);
+ */ +void ff_av1_packet_uninit(AV1Packet *pkt); + +static inline int64_t leb128(GetBitContext *gb) { + int64_t ret = 0; + int i; + + for (i = 0; i < 8; i++) { + int byte = get_bits(gb, 8); + ret |= (int64_t)(byte & 0x7f) << (i * 7); + if (!(byte & 0x80)) + break; + } + return ret; +} + +static inline int parse_obu_header(const uint8_t *buf, int buf_size, + int64_t *obu_size, int *start_pos, int *type, + int *temporal_id, int *spatial_id) +{ + GetBitContext gb; + int ret, extension_flag, has_size_flag; + int64_t size; + + ret = init_get_bits8(&gb, buf, FFMIN(buf_size, MAX_OBU_HEADER_SIZE)); + if (ret < 0) + return ret; + + if (get_bits1(&gb) != 0) // obu_forbidden_bit + return AVERROR_INVALIDDATA; + + *type = get_bits(&gb, 4); + extension_flag = get_bits1(&gb); + has_size_flag = get_bits1(&gb); + skip_bits1(&gb); // obu_reserved_1bit + + if (extension_flag) { + *temporal_id = get_bits(&gb, 3); + *spatial_id = get_bits(&gb, 2); + skip_bits(&gb, 3); // extension_header_reserved_3bits + } else { + *temporal_id = *spatial_id = 0; + } + + *obu_size = has_size_flag ? 
leb128(&gb) + : buf_size - 1 - extension_flag; + + if (get_bits_left(&gb) < 0) + return AVERROR_INVALIDDATA; + + *start_pos = get_bits_count(&gb) / 8; + + size = *obu_size + *start_pos; + + if (size > buf_size) + return AVERROR_INVALIDDATA; + + return size; +} + +static inline int get_obu_bit_length(const uint8_t *buf, int size, int type) +{ + int v; + + /* There are no trailing bits on these */ + if (type == AV1_OBU_TILE_GROUP || + type == AV1_OBU_TILE_LIST || + type == AV1_OBU_FRAME) { + if (size > INT_MAX / 8) + return AVERROR(ERANGE); + else + return size * 8; + } + + while (size > 0 && buf[size - 1] == 0) + size--; + + if (!size) + return 0; + + v = buf[size - 1]; + + if (size > INT_MAX / 8) + return AVERROR(ERANGE); + size *= 8; + + /* Remove the trailing_one_bit and following trailing zeros */ + if (v) + size -= ff_ctz(v) + 1; + + return size; +} + +#endif /* AVCODEC_AV1_PARSE_H */ diff --git a/media/ffvpx/libavcodec/av1_parser.c b/media/ffvpx/libavcodec/av1_parser.c new file mode 100644 index 0000000000..14dae92fe9 --- /dev/null +++ b/media/ffvpx/libavcodec/av1_parser.c @@ -0,0 +1,216 @@ +/* + * AV1 parser + * + * Copyright (C) 2018 James Almer <jamrial@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
#include "libavutil/avassert.h"
#include "cbs.h"
#include "cbs_av1.h"
#include "parser.h"

/** Private state of the AV1 parser. */
typedef struct AV1ParseContext {
    CodedBitstreamContext *cbc;
    CodedBitstreamFragment temporal_unit;
    int parsed_extradata; // set once extradata has been fed to the CBS reader
} AV1ParseContext;

/* Pixel format lookup tables, indexed as [subsampling_x][subsampling_y]. */
static const enum AVPixelFormat pix_fmts_8bit[2][2] = {
    { AV_PIX_FMT_YUV444P, AV_PIX_FMT_NONE },
    { AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV420P },
};
static const enum AVPixelFormat pix_fmts_10bit[2][2] = {
    { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_NONE },
    { AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV420P10 },
};
static const enum AVPixelFormat pix_fmts_12bit[2][2] = {
    { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_NONE },
    { AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV420P12 },
};

/* Indexed by high_bitdepth + twelve_bit (0, 1 or 2). */
static const enum AVPixelFormat pix_fmts_rgb[3] = {
    AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12,
};

/**
 * Parse one buffer and export stream properties.
 *
 * The input is always passed through unchanged (*out_data / *out_size are
 * set to data / size) and @p size is always returned, including when the
 * CBS read fails or no sequence header is known; in those cases only the
 * property export is skipped.
 */
static int av1_parser_parse(AVCodecParserContext *ctx,
                            AVCodecContext *avctx,
                            const uint8_t **out_data, int *out_size,
                            const uint8_t *data, int size)
{
    AV1ParseContext *s = ctx->priv_data;
    CodedBitstreamFragment *td = &s->temporal_unit;
    const CodedBitstreamAV1Context *av1 = s->cbc->priv_data;
    const AV1RawSequenceHeader *seq;
    const AV1RawColorConfig *color;
    int ret;

    *out_data = data;
    *out_size = size;

    // Defaults, kept when no shown frame is found below.
    ctx->key_frame         = -1;
    ctx->pict_type         = AV_PICTURE_TYPE_NONE;
    ctx->picture_structure = AV_PICTURE_STRUCTURE_UNKNOWN;

    // Route CBS log messages to the codec context for the duration of the call.
    s->cbc->log_ctx = avctx;

    if (avctx->extradata_size && !s->parsed_extradata) {
        s->parsed_extradata = 1;

        ret = ff_cbs_read_extradata_from_codec(s->cbc, td, avctx);
        if (ret < 0) {
            av_log(avctx, AV_LOG_WARNING, "Failed to parse extradata.\n");
        }

        ff_cbs_fragment_reset(td);
    }

    ret = ff_cbs_read(s->cbc, td, data, size);
    if (ret < 0) {
        av_log(avctx, AV_LOG_ERROR, "Failed to parse temporal unit.\n");
        goto end;
    }

    if (!av1->sequence_header) {
        av_log(avctx, AV_LOG_ERROR, "No sequence header available\n");
        goto end;
    }

    seq = av1->sequence_header;
    color = &seq->color_config;

    // Later matching units overwrite earlier ones, so the values of the last
    // shown frame with spatial_id 0 win.
    for (int i = 0; i < td->nb_units; i++) {
        const CodedBitstreamUnit *unit = &td->units[i];
        const AV1RawOBU *obu = unit->content;
        const AV1RawFrameHeader *frame;

        if (unit->type == AV1_OBU_FRAME)
            frame = &obu->obu.frame.header;
        else if (unit->type == AV1_OBU_FRAME_HEADER)
            frame = &obu->obu.frame_header;
        else
            continue;

        if (obu->header.spatial_id > 0)
            continue;

        if (!frame->show_frame && !frame->show_existing_frame)
            continue;

        ctx->width  = frame->frame_width_minus_1 + 1;
        ctx->height = frame->frame_height_minus_1 + 1;

        ctx->key_frame = frame->frame_type == AV1_FRAME_KEY && !frame->show_existing_frame;

        switch (frame->frame_type) {
        case AV1_FRAME_KEY:
        case AV1_FRAME_INTRA_ONLY:
            ctx->pict_type = AV_PICTURE_TYPE_I;
            break;
        case AV1_FRAME_INTER:
            ctx->pict_type = AV_PICTURE_TYPE_P;
            break;
        case AV1_FRAME_SWITCH:
            ctx->pict_type = AV_PICTURE_TYPE_SP;
            break;
        }
        ctx->picture_structure = AV_PICTURE_STRUCTURE_FRAME;
    }

    // Any bit depth other than 8/10/12 leaves ctx->format untouched.
    switch (av1->bit_depth) {
    case 8:
        ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY8
                                         : pix_fmts_8bit [color->subsampling_x][color->subsampling_y];
        break;
    case 10:
        ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY10
                                         : pix_fmts_10bit[color->subsampling_x][color->subsampling_y];
        break;
    case 12:
        ctx->format = color->mono_chrome ? AV_PIX_FMT_GRAY12
                                         : pix_fmts_12bit[color->subsampling_x][color->subsampling_y];
        break;
    }
    av_assert2(ctx->format != AV_PIX_FMT_NONE);

    // 4:4:4 content flagged as RGB matrix + BT.709 primaries + sRGB transfer
    // is exported as planar GBR instead of YUV.
    if (!color->subsampling_x && !color->subsampling_y &&
        color->matrix_coefficients      == AVCOL_SPC_RGB &&
        color->color_primaries          == AVCOL_PRI_BT709 &&
        color->transfer_characteristics == AVCOL_TRC_IEC61966_2_1)
        ctx->format = pix_fmts_rgb[color->high_bitdepth + color->twelve_bit];

    avctx->profile = seq->seq_profile;
    avctx->level   = seq->seq_level_idx[0];

    avctx->colorspace = (enum AVColorSpace) color->matrix_coefficients;
    avctx->color_primaries = (enum AVColorPrimaries) color->color_primaries;
    avctx->color_trc = (enum AVColorTransferCharacteristic) color->transfer_characteristics;
    avctx->color_range = color->color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;

    if (seq->timing_info_present_flag) {
        const AV1RawTimingInfo *timing = &seq->timing_info;
        // The first two arguments are passed as den, num on purpose: the
        // reduced num_units_in_display_tick lands in framerate.den and
        // time_scale in framerate.num.
        av_reduce(&avctx->framerate.den, &avctx->framerate.num,
                  timing->num_units_in_display_tick, timing->time_scale, INT_MAX);
    }

end:
    ff_cbs_fragment_reset(td);

    s->cbc->log_ctx = NULL;

    return size;
}

/* OBU types the CBS context is asked to decompose for the parser. */
static const CodedBitstreamUnitType decompose_unit_types[] = {
    AV1_OBU_TEMPORAL_DELIMITER,
    AV1_OBU_SEQUENCE_HEADER,
    AV1_OBU_FRAME_HEADER,
    AV1_OBU_TILE_GROUP,
    AV1_OBU_FRAME,
};

/** Create the CBS context and select which OBU types it decomposes. */
static av_cold int av1_parser_init(AVCodecParserContext *ctx)
{
    AV1ParseContext *s = ctx->priv_data;
    int ret;

    ret = ff_cbs_init(&s->cbc, AV_CODEC_ID_AV1, NULL);
    if (ret < 0)
        return ret;

    s->cbc->decompose_unit_types    = decompose_unit_types;
    s->cbc->nb_decompose_unit_types = FF_ARRAY_ELEMS(decompose_unit_types);

    return 0;
}

/** Free the parsed fragment and the CBS context. */
static void av1_parser_close(AVCodecParserContext *ctx)
{
    AV1ParseContext *s = ctx->priv_data;

    ff_cbs_fragment_free(&s->temporal_unit);
    ff_cbs_close(&s->cbc);
}
av1_parser_init, + .parser_close = av1_parser_close, + .parser_parse = av1_parser_parse, +}; diff --git a/media/ffvpx/libavcodec/av1dec.c b/media/ffvpx/libavcodec/av1dec.c new file mode 100644 index 0000000000..d83c902f1f --- /dev/null +++ b/media/ffvpx/libavcodec/av1dec.c @@ -0,0 +1,1286 @@ +/* + * AV1 video decoder + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "libavutil/film_grain_params.h" +#include "libavutil/pixdesc.h" +#include "libavutil/opt.h" +#include "avcodec.h" +#include "av1dec.h" +#include "bytestream.h" +#include "codec_internal.h" +#include "decode.h" +#include "hwconfig.h" +#include "profiles.h" +#include "thread.h" + +/**< same with Div_Lut defined in spec 7.11.3.7 */ +static const uint16_t div_lut[AV1_DIV_LUT_NUM] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 
13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192 +}; + +static uint32_t inverse_recenter(int r, uint32_t v) +{ + if (v > 2 * r) + return v; + else if (v & 1) + return r - ((v + 1) >> 1); + else + return r + (v >> 1); +} + +static uint32_t decode_unsigned_subexp_with_ref(uint32_t sub_exp, + int mx, int r) +{ + if ((r << 1) <= mx) { + return inverse_recenter(r, sub_exp); + } else { + return mx - 1 - inverse_recenter(mx - 1 - r, sub_exp); + } +} + +static int32_t decode_signed_subexp_with_ref(uint32_t sub_exp, int low, + int high, int r) +{ + int32_t x = decode_unsigned_subexp_with_ref(sub_exp, high - low, r - low); + return x + low; +} + +static void read_global_param(AV1DecContext *s, int type, int ref, int idx) +{ + uint8_t primary_frame, prev_frame; + 
uint32_t abs_bits, prec_bits, round, prec_diff, sub, mx; + int32_t r, prev_gm_param; + + primary_frame = s->raw_frame_header->primary_ref_frame; + prev_frame = s->raw_frame_header->ref_frame_idx[primary_frame]; + abs_bits = AV1_GM_ABS_ALPHA_BITS; + prec_bits = AV1_GM_ALPHA_PREC_BITS; + + /* setup_past_independence() sets PrevGmParams to default values. We can + * simply point to the current's frame gm_params as they will be initialized + * with defaults at this point. + */ + if (s->raw_frame_header->primary_ref_frame == AV1_PRIMARY_REF_NONE) + prev_gm_param = s->cur_frame.gm_params[ref][idx]; + else + prev_gm_param = s->ref[prev_frame].gm_params[ref][idx]; + + if (idx < 2) { + if (type == AV1_WARP_MODEL_TRANSLATION) { + abs_bits = AV1_GM_ABS_TRANS_ONLY_BITS - + !s->raw_frame_header->allow_high_precision_mv; + prec_bits = AV1_GM_TRANS_ONLY_PREC_BITS - + !s->raw_frame_header->allow_high_precision_mv; + } else { + abs_bits = AV1_GM_ABS_TRANS_BITS; + prec_bits = AV1_GM_TRANS_PREC_BITS; + } + } + round = (idx % 3) == 2 ? (1 << AV1_WARPEDMODEL_PREC_BITS) : 0; + prec_diff = AV1_WARPEDMODEL_PREC_BITS - prec_bits; + sub = (idx % 3) == 2 ? (1 << prec_bits) : 0; + mx = 1 << abs_bits; + r = (prev_gm_param >> prec_diff) - sub; + + s->cur_frame.gm_params[ref][idx] = + (decode_signed_subexp_with_ref(s->raw_frame_header->gm_params[ref][idx], + -mx, mx + 1, r) << prec_diff) + round; +} + +static uint64_t round_two(uint64_t x, uint16_t n) +{ + if (n == 0) + return x; + return ((x + ((uint64_t)1 << (n - 1))) >> n); +} + +static int64_t round_two_signed(int64_t x, uint16_t n) +{ + return ((x<0) ? -((int64_t)round_two(-x, n)) : (int64_t)round_two(x, n)); +} + +/** + * Resolve divisor process. 
+ * see spec 7.11.3.7 + */ +static int16_t resolve_divisor(uint32_t d, uint16_t *shift) +{ + int32_t e, f; + + *shift = av_log2(d); + e = d - (1 << (*shift)); + if (*shift > AV1_DIV_LUT_BITS) + f = round_two(e, *shift - AV1_DIV_LUT_BITS); + else + f = e << (AV1_DIV_LUT_BITS - (*shift)); + + *shift += AV1_DIV_LUT_PREC_BITS; + + return div_lut[f]; +} + +/** + * check if global motion params is valid. + * see spec 7.11.3.6 + */ +static uint8_t get_shear_params_valid(AV1DecContext *s, int idx) +{ + int16_t alpha, beta, gamma, delta, divf, divs; + int64_t v, w; + int32_t *param = &s->cur_frame.gm_params[idx][0]; + if (param[2] < 0) + return 0; + + alpha = av_clip_int16(param[2] - (1 << AV1_WARPEDMODEL_PREC_BITS)); + beta = av_clip_int16(param[3]); + divf = resolve_divisor(abs(param[2]), &divs); + v = (int64_t)param[4] * (1 << AV1_WARPEDMODEL_PREC_BITS); + w = (int64_t)param[3] * param[4]; + gamma = av_clip_int16((int)round_two_signed((v * divf), divs)); + delta = av_clip_int16(param[5] - (int)round_two_signed((w * divf), divs) - (1 << AV1_WARPEDMODEL_PREC_BITS)); + + alpha = round_two_signed(alpha, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; + beta = round_two_signed(beta, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; + gamma = round_two_signed(gamma, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; + delta = round_two_signed(delta, AV1_WARP_PARAM_REDUCE_BITS) << AV1_WARP_PARAM_REDUCE_BITS; + + if ((4 * abs(alpha) + 7 * abs(beta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS) || + (4 * abs(gamma) + 4 * abs(delta)) >= (1 << AV1_WARPEDMODEL_PREC_BITS)) + return 0; + + return 1; +} + +/** +* update gm type/params, since cbs already implemented part of this function, +* so we don't need to full implement spec. 
+*/ +static void global_motion_params(AV1DecContext *s) +{ + const AV1RawFrameHeader *header = s->raw_frame_header; + int type, ref; + + for (ref = AV1_REF_FRAME_LAST; ref <= AV1_REF_FRAME_ALTREF; ref++) { + s->cur_frame.gm_type[ref] = AV1_WARP_MODEL_IDENTITY; + for (int i = 0; i < 6; i++) + s->cur_frame.gm_params[ref][i] = (i % 3 == 2) ? + 1 << AV1_WARPEDMODEL_PREC_BITS : 0; + } + if (header->frame_type == AV1_FRAME_KEY || + header->frame_type == AV1_FRAME_INTRA_ONLY) + return; + + for (ref = AV1_REF_FRAME_LAST; ref <= AV1_REF_FRAME_ALTREF; ref++) { + if (header->is_global[ref]) { + if (header->is_rot_zoom[ref]) { + type = AV1_WARP_MODEL_ROTZOOM; + } else { + type = header->is_translation[ref] ? AV1_WARP_MODEL_TRANSLATION + : AV1_WARP_MODEL_AFFINE; + } + } else { + type = AV1_WARP_MODEL_IDENTITY; + } + s->cur_frame.gm_type[ref] = type; + + if (type >= AV1_WARP_MODEL_ROTZOOM) { + read_global_param(s, type, ref, 2); + read_global_param(s, type, ref, 3); + if (type == AV1_WARP_MODEL_AFFINE) { + read_global_param(s, type, ref, 4); + read_global_param(s, type, ref, 5); + } else { + s->cur_frame.gm_params[ref][4] = -s->cur_frame.gm_params[ref][3]; + s->cur_frame.gm_params[ref][5] = s->cur_frame.gm_params[ref][2]; + } + } + if (type >= AV1_WARP_MODEL_TRANSLATION) { + read_global_param(s, type, ref, 0); + read_global_param(s, type, ref, 1); + } + if (type <= AV1_WARP_MODEL_AFFINE) { + s->cur_frame.gm_invalid[ref] = !get_shear_params_valid(s, ref); + } + } +} + +static int get_relative_dist(const AV1RawSequenceHeader *seq, + unsigned int a, unsigned int b) +{ + unsigned int diff = a - b; + unsigned int m = 1 << seq->order_hint_bits_minus_1; + return (diff & (m - 1)) - (diff & m); +} + +static void skip_mode_params(AV1DecContext *s) +{ + const AV1RawFrameHeader *header = s->raw_frame_header; + const AV1RawSequenceHeader *seq = s->raw_seq; + + int forward_idx, backward_idx; + int forward_hint, backward_hint; + int second_forward_idx, second_forward_hint; + int ref_hint, 
dist, i; + + if (!header->skip_mode_present) + return; + + forward_idx = -1; + backward_idx = -1; + for (i = 0; i < AV1_REFS_PER_FRAME; i++) { + ref_hint = s->ref[header->ref_frame_idx[i]].raw_frame_header->order_hint; + dist = get_relative_dist(seq, ref_hint, header->order_hint); + if (dist < 0) { + if (forward_idx < 0 || + get_relative_dist(seq, ref_hint, forward_hint) > 0) { + forward_idx = i; + forward_hint = ref_hint; + } + } else if (dist > 0) { + if (backward_idx < 0 || + get_relative_dist(seq, ref_hint, backward_hint) < 0) { + backward_idx = i; + backward_hint = ref_hint; + } + } + } + + if (forward_idx < 0) { + return; + } else if (backward_idx >= 0) { + s->cur_frame.skip_mode_frame_idx[0] = + AV1_REF_FRAME_LAST + FFMIN(forward_idx, backward_idx); + s->cur_frame.skip_mode_frame_idx[1] = + AV1_REF_FRAME_LAST + FFMAX(forward_idx, backward_idx); + return; + } + + second_forward_idx = -1; + for (i = 0; i < AV1_REFS_PER_FRAME; i++) { + ref_hint = s->ref[header->ref_frame_idx[i]].raw_frame_header->order_hint; + if (get_relative_dist(seq, ref_hint, forward_hint) < 0) { + if (second_forward_idx < 0 || + get_relative_dist(seq, ref_hint, second_forward_hint) > 0) { + second_forward_idx = i; + second_forward_hint = ref_hint; + } + } + } + + if (second_forward_idx < 0) + return; + + s->cur_frame.skip_mode_frame_idx[0] = + AV1_REF_FRAME_LAST + FFMIN(forward_idx, second_forward_idx); + s->cur_frame.skip_mode_frame_idx[1] = + AV1_REF_FRAME_LAST + FFMAX(forward_idx, second_forward_idx); +} + +static void coded_lossless_param(AV1DecContext *s) +{ + const AV1RawFrameHeader *header = s->raw_frame_header; + int i; + + if (header->delta_q_y_dc || header->delta_q_u_ac || + header->delta_q_u_dc || header->delta_q_v_ac || + header->delta_q_v_dc) { + s->cur_frame.coded_lossless = 0; + return; + } + + s->cur_frame.coded_lossless = 1; + for (i = 0; i < AV1_MAX_SEGMENTS; i++) { + int qindex; + if (header->feature_enabled[i][AV1_SEG_LVL_ALT_Q]) { + qindex = (header->base_q_idx + + 
header->feature_value[i][AV1_SEG_LVL_ALT_Q]); + } else { + qindex = header->base_q_idx; + } + qindex = av_clip_uintp2(qindex, 8); + + if (qindex) { + s->cur_frame.coded_lossless = 0; + return; + } + } +} + +static void load_grain_params(AV1DecContext *s) +{ + const AV1RawFrameHeader *header = s->raw_frame_header; + const AV1RawFilmGrainParams *film_grain = &header->film_grain, *src; + AV1RawFilmGrainParams *dst = &s->cur_frame.film_grain; + + if (!film_grain->apply_grain) + return; + + if (film_grain->update_grain) { + memcpy(dst, film_grain, sizeof(*dst)); + return; + } + + src = &s->ref[film_grain->film_grain_params_ref_idx].film_grain; + + memcpy(dst, src, sizeof(*dst)); + dst->grain_seed = film_grain->grain_seed; +} + +static int init_tile_data(AV1DecContext *s) + +{ + int cur_tile_num = + s->raw_frame_header->tile_cols * s->raw_frame_header->tile_rows; + if (s->tile_num < cur_tile_num) { + int ret = av_reallocp_array(&s->tile_group_info, cur_tile_num, + sizeof(TileGroupInfo)); + if (ret < 0) { + s->tile_num = 0; + return ret; + } + } + s->tile_num = cur_tile_num; + + return 0; +} + +static int get_tiles_info(AVCodecContext *avctx, const AV1RawTileGroup *tile_group) +{ + AV1DecContext *s = avctx->priv_data; + GetByteContext gb; + uint16_t tile_num, tile_row, tile_col; + uint32_t size = 0, size_bytes = 0; + + bytestream2_init(&gb, tile_group->tile_data.data, + tile_group->tile_data.data_size); + s->tg_start = tile_group->tg_start; + s->tg_end = tile_group->tg_end; + + for (tile_num = tile_group->tg_start; tile_num <= tile_group->tg_end; tile_num++) { + tile_row = tile_num / s->raw_frame_header->tile_cols; + tile_col = tile_num % s->raw_frame_header->tile_cols; + + if (tile_num == tile_group->tg_end) { + s->tile_group_info[tile_num].tile_size = bytestream2_get_bytes_left(&gb); + s->tile_group_info[tile_num].tile_offset = bytestream2_tell(&gb); + s->tile_group_info[tile_num].tile_row = tile_row; + s->tile_group_info[tile_num].tile_column = tile_col; + return 0; + 
} + size_bytes = s->raw_frame_header->tile_size_bytes_minus1 + 1; + if (bytestream2_get_bytes_left(&gb) < size_bytes) + return AVERROR_INVALIDDATA; + size = 0; + for (int i = 0; i < size_bytes; i++) + size |= bytestream2_get_byteu(&gb) << 8 * i; + if (bytestream2_get_bytes_left(&gb) <= size) + return AVERROR_INVALIDDATA; + size++; + + s->tile_group_info[tile_num].tile_size = size; + s->tile_group_info[tile_num].tile_offset = bytestream2_tell(&gb); + s->tile_group_info[tile_num].tile_row = tile_row; + s->tile_group_info[tile_num].tile_column = tile_col; + + bytestream2_skipu(&gb, size); + } + + return 0; + +} + +static int get_pixel_format(AVCodecContext *avctx) +{ + AV1DecContext *s = avctx->priv_data; + const AV1RawSequenceHeader *seq = s->raw_seq; + uint8_t bit_depth; + int ret; + enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE; +#define HWACCEL_MAX (CONFIG_AV1_DXVA2_HWACCEL + \ + CONFIG_AV1_D3D11VA_HWACCEL * 2 + \ + CONFIG_AV1_NVDEC_HWACCEL + \ + CONFIG_AV1_VAAPI_HWACCEL + \ + CONFIG_AV1_VDPAU_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; + + if (seq->seq_profile == 2 && seq->color_config.high_bitdepth) + bit_depth = seq->color_config.twelve_bit ? 12 : 10; + else if (seq->seq_profile <= 2) + bit_depth = seq->color_config.high_bitdepth ? 
10 : 8; + else { + av_log(avctx, AV_LOG_ERROR, + "Unknown AV1 profile %d.\n", seq->seq_profile); + return -1; + } + + if (!seq->color_config.mono_chrome) { + // 4:4:4 x:0 y:0, 4:2:2 x:1 y:0, 4:2:0 x:1 y:1 + if (seq->color_config.subsampling_x == 0 && + seq->color_config.subsampling_y == 0) { + if (bit_depth == 8) + pix_fmt = AV_PIX_FMT_YUV444P; + else if (bit_depth == 10) + pix_fmt = AV_PIX_FMT_YUV444P10; + else if (bit_depth == 12) + pix_fmt = AV_PIX_FMT_YUV444P12; + else + av_log(avctx, AV_LOG_WARNING, "Unknown AV1 pixel format.\n"); + } else if (seq->color_config.subsampling_x == 1 && + seq->color_config.subsampling_y == 0) { + if (bit_depth == 8) + pix_fmt = AV_PIX_FMT_YUV422P; + else if (bit_depth == 10) + pix_fmt = AV_PIX_FMT_YUV422P10; + else if (bit_depth == 12) + pix_fmt = AV_PIX_FMT_YUV422P12; + else + av_log(avctx, AV_LOG_WARNING, "Unknown AV1 pixel format.\n"); + } else if (seq->color_config.subsampling_x == 1 && + seq->color_config.subsampling_y == 1) { + if (bit_depth == 8) + pix_fmt = AV_PIX_FMT_YUV420P; + else if (bit_depth == 10) + pix_fmt = AV_PIX_FMT_YUV420P10; + else if (bit_depth == 12) + pix_fmt = AV_PIX_FMT_YUV420P12; + else + av_log(avctx, AV_LOG_WARNING, "Unknown AV1 pixel format.\n"); + } + } else { + if (bit_depth == 8) + pix_fmt = AV_PIX_FMT_GRAY8; + else if (bit_depth == 10) + pix_fmt = AV_PIX_FMT_GRAY10; + else if (bit_depth == 12) + pix_fmt = AV_PIX_FMT_GRAY12; + else + av_log(avctx, AV_LOG_WARNING, "Unknown AV1 pixel format.\n"); + } + + av_log(avctx, AV_LOG_DEBUG, "AV1 decode get format: %s.\n", + av_get_pix_fmt_name(pix_fmt)); + + if (pix_fmt == AV_PIX_FMT_NONE) + return -1; + + switch (pix_fmt) { + case AV_PIX_FMT_YUV420P: +#if CONFIG_AV1_DXVA2_HWACCEL + *fmtp++ = AV_PIX_FMT_DXVA2_VLD; +#endif +#if CONFIG_AV1_D3D11VA_HWACCEL + *fmtp++ = AV_PIX_FMT_D3D11VA_VLD; + *fmtp++ = AV_PIX_FMT_D3D11; +#endif +#if CONFIG_AV1_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; +#endif +#if CONFIG_AV1_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; +#endif 
+#if CONFIG_AV1_VDPAU_HWACCEL + *fmtp++ = AV_PIX_FMT_VDPAU; +#endif + break; + case AV_PIX_FMT_YUV420P10: +#if CONFIG_AV1_DXVA2_HWACCEL + *fmtp++ = AV_PIX_FMT_DXVA2_VLD; +#endif +#if CONFIG_AV1_D3D11VA_HWACCEL + *fmtp++ = AV_PIX_FMT_D3D11VA_VLD; + *fmtp++ = AV_PIX_FMT_D3D11; +#endif +#if CONFIG_AV1_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; +#endif +#if CONFIG_AV1_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; +#endif +#if CONFIG_AV1_VDPAU_HWACCEL + *fmtp++ = AV_PIX_FMT_VDPAU; +#endif + break; + case AV_PIX_FMT_GRAY8: +#if CONFIG_AV1_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; +#endif + break; + case AV_PIX_FMT_GRAY10: +#if CONFIG_AV1_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; +#endif + break; + } + + *fmtp++ = pix_fmt; + *fmtp = AV_PIX_FMT_NONE; + + ret = ff_thread_get_format(avctx, pix_fmts); + if (ret < 0) + return ret; + + /** + * check if the HW accel is inited correctly. If not, return un-implemented. + * Since now the av1 decoder doesn't support native decode, if it will be + * implemented in the future, need remove this check. 
+ */ + if (!avctx->hwaccel) { + av_log(avctx, AV_LOG_ERROR, "Your platform doesn't support" + " hardware accelerated AV1 decoding.\n"); + return AVERROR(ENOSYS); + } + + s->pix_fmt = pix_fmt; + avctx->pix_fmt = ret; + + return 0; +} + +static void av1_frame_unref(AVCodecContext *avctx, AV1Frame *f) +{ + ff_thread_release_buffer(avctx, f->f); + av_buffer_unref(&f->hwaccel_priv_buf); + f->hwaccel_picture_private = NULL; + av_buffer_unref(&f->header_ref); + f->raw_frame_header = NULL; + f->spatial_id = f->temporal_id = 0; + memset(f->skip_mode_frame_idx, 0, + 2 * sizeof(uint8_t)); + memset(&f->film_grain, 0, sizeof(f->film_grain)); + f->coded_lossless = 0; +} + +static int av1_frame_ref(AVCodecContext *avctx, AV1Frame *dst, const AV1Frame *src) +{ + int ret; + + ret = av_buffer_replace(&dst->header_ref, src->header_ref); + if (ret < 0) + return ret; + + dst->raw_frame_header = src->raw_frame_header; + + if (!src->f->buf[0]) + return 0; + + ret = av_frame_ref(dst->f, src->f); + if (ret < 0) + goto fail; + + if (src->hwaccel_picture_private) { + dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf); + if (!dst->hwaccel_priv_buf) + goto fail; + dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data; + } + + dst->spatial_id = src->spatial_id; + dst->temporal_id = src->temporal_id; + memcpy(dst->gm_invalid, + src->gm_invalid, + AV1_NUM_REF_FRAMES * sizeof(uint8_t)); + memcpy(dst->gm_type, + src->gm_type, + AV1_NUM_REF_FRAMES * sizeof(uint8_t)); + memcpy(dst->gm_params, + src->gm_params, + AV1_NUM_REF_FRAMES * 6 * sizeof(int32_t)); + memcpy(dst->skip_mode_frame_idx, + src->skip_mode_frame_idx, + 2 * sizeof(uint8_t)); + memcpy(&dst->film_grain, + &src->film_grain, + sizeof(dst->film_grain)); + dst->coded_lossless = src->coded_lossless; + + return 0; + +fail: + av1_frame_unref(avctx, dst); + return AVERROR(ENOMEM); +} + +static av_cold int av1_decode_free(AVCodecContext *avctx) +{ + AV1DecContext *s = avctx->priv_data; + + for (int i = 0; i < 
FF_ARRAY_ELEMS(s->ref); i++) { + av1_frame_unref(avctx, &s->ref[i]); + av_frame_free(&s->ref[i].f); + } + av1_frame_unref(avctx, &s->cur_frame); + av_frame_free(&s->cur_frame.f); + + av_buffer_unref(&s->seq_ref); + av_buffer_unref(&s->header_ref); + av_freep(&s->tile_group_info); + + ff_cbs_fragment_free(&s->current_obu); + ff_cbs_close(&s->cbc); + + return 0; +} + +static int set_context_with_sequence(AVCodecContext *avctx, + const AV1RawSequenceHeader *seq) +{ + int width = seq->max_frame_width_minus_1 + 1; + int height = seq->max_frame_height_minus_1 + 1; + + avctx->profile = seq->seq_profile; + avctx->level = seq->seq_level_idx[0]; + + avctx->color_range = + seq->color_config.color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; + avctx->color_primaries = seq->color_config.color_primaries; + avctx->colorspace = seq->color_config.color_primaries; + avctx->color_trc = seq->color_config.transfer_characteristics; + + switch (seq->color_config.chroma_sample_position) { + case AV1_CSP_VERTICAL: + avctx->chroma_sample_location = AVCHROMA_LOC_LEFT; + break; + case AV1_CSP_COLOCATED: + avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; + break; + } + + if (seq->film_grain_params_present) + avctx->properties |= FF_CODEC_PROPERTY_FILM_GRAIN; + else + avctx->properties &= ~FF_CODEC_PROPERTY_FILM_GRAIN; + + if (avctx->width != width || avctx->height != height) { + int ret = ff_set_dimensions(avctx, width, height); + if (ret < 0) + return ret; + } + avctx->sample_aspect_ratio = (AVRational) { 1, 1 }; + + if (seq->timing_info.num_units_in_display_tick && + seq->timing_info.time_scale) { + av_reduce(&avctx->framerate.den, &avctx->framerate.num, + seq->timing_info.num_units_in_display_tick, + seq->timing_info.time_scale, + INT_MAX); + if (seq->timing_info.equal_picture_interval) + avctx->ticks_per_frame = seq->timing_info.num_ticks_per_picture_minus_1 + 1; + } + + return 0; +} + +static int update_context_with_frame_header(AVCodecContext *avctx, + const AV1RawFrameHeader 
*header) +{ + AVRational aspect_ratio; + int width = header->frame_width_minus_1 + 1; + int height = header->frame_height_minus_1 + 1; + int r_width = header->render_width_minus_1 + 1; + int r_height = header->render_height_minus_1 + 1; + int ret; + + if (avctx->width != width || avctx->height != height) { + ret = ff_set_dimensions(avctx, width, height); + if (ret < 0) + return ret; + } + + av_reduce(&aspect_ratio.num, &aspect_ratio.den, + (int64_t)height * r_width, + (int64_t)width * r_height, + INT_MAX); + + if (av_cmp_q(avctx->sample_aspect_ratio, aspect_ratio)) { + ret = ff_set_sar(avctx, aspect_ratio); + if (ret < 0) + return ret; + } + + return 0; +} + +static av_cold int av1_decode_init(AVCodecContext *avctx) +{ + AV1DecContext *s = avctx->priv_data; + AV1RawSequenceHeader *seq; + int ret; + + s->avctx = avctx; + s->pix_fmt = AV_PIX_FMT_NONE; + + for (int i = 0; i < FF_ARRAY_ELEMS(s->ref); i++) { + s->ref[i].f = av_frame_alloc(); + if (!s->ref[i].f) { + av_log(avctx, AV_LOG_ERROR, + "Failed to allocate reference frame buffer %d.\n", i); + return AVERROR(ENOMEM); + } + } + + s->cur_frame.f = av_frame_alloc(); + if (!s->cur_frame.f) { + av_log(avctx, AV_LOG_ERROR, + "Failed to allocate current frame buffer.\n"); + return AVERROR(ENOMEM); + } + + ret = ff_cbs_init(&s->cbc, AV_CODEC_ID_AV1, avctx); + if (ret < 0) + return ret; + + av_opt_set_int(s->cbc->priv_data, "operating_point", s->operating_point, 0); + + if (avctx->extradata && avctx->extradata_size) { + ret = ff_cbs_read_extradata_from_codec(s->cbc, + &s->current_obu, + avctx); + if (ret < 0) { + av_log(avctx, AV_LOG_WARNING, "Failed to read extradata.\n"); + return ret; + } + + seq = ((CodedBitstreamAV1Context *)(s->cbc->priv_data))->sequence_header; + if (!seq) { + av_log(avctx, AV_LOG_WARNING, "No sequence header available.\n"); + goto end; + } + + ret = set_context_with_sequence(avctx, seq); + if (ret < 0) { + av_log(avctx, AV_LOG_WARNING, "Failed to set decoder context.\n"); + goto end; + } + + end: 
+ ff_cbs_fragment_reset(&s->current_obu); + } + + return ret; +} + +static int av1_frame_alloc(AVCodecContext *avctx, AV1Frame *f) +{ + AV1DecContext *s = avctx->priv_data; + AV1RawFrameHeader *header= s->raw_frame_header; + AVFrame *frame; + int ret; + + ret = update_context_with_frame_header(avctx, header); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to update context with frame header\n"); + return ret; + } + + if ((ret = ff_thread_get_buffer(avctx, f->f, AV_GET_BUFFER_FLAG_REF)) < 0) + goto fail; + + frame = f->f; + frame->key_frame = header->frame_type == AV1_FRAME_KEY; + + switch (header->frame_type) { + case AV1_FRAME_KEY: + case AV1_FRAME_INTRA_ONLY: + frame->pict_type = AV_PICTURE_TYPE_I; + break; + case AV1_FRAME_INTER: + frame->pict_type = AV_PICTURE_TYPE_P; + break; + case AV1_FRAME_SWITCH: + frame->pict_type = AV_PICTURE_TYPE_SP; + break; + } + + if (avctx->hwaccel) { + const AVHWAccel *hwaccel = avctx->hwaccel; + if (hwaccel->frame_priv_data_size) { + f->hwaccel_priv_buf = + av_buffer_allocz(hwaccel->frame_priv_data_size); + if (!f->hwaccel_priv_buf) { + ret = AVERROR(ENOMEM); + goto fail; + } + f->hwaccel_picture_private = f->hwaccel_priv_buf->data; + } + } + return 0; + +fail: + av1_frame_unref(avctx, f); + return ret; +} + +static int export_film_grain(AVCodecContext *avctx, AVFrame *frame) +{ + AV1DecContext *s = avctx->priv_data; + const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain; + AVFilmGrainParams *fgp; + AVFilmGrainAOMParams *aom; + + if (!film_grain->apply_grain) + return 0; + + fgp = av_film_grain_params_create_side_data(frame); + if (!fgp) + return AVERROR(ENOMEM); + + fgp->type = AV_FILM_GRAIN_PARAMS_AV1; + fgp->seed = film_grain->grain_seed; + + aom = &fgp->codec.aom; + aom->chroma_scaling_from_luma = film_grain->chroma_scaling_from_luma; + aom->scaling_shift = film_grain->grain_scaling_minus_8 + 8; + aom->ar_coeff_lag = film_grain->ar_coeff_lag; + aom->ar_coeff_shift = film_grain->ar_coeff_shift_minus_6 + 6; 
+ aom->grain_scale_shift = film_grain->grain_scale_shift; + aom->overlap_flag = film_grain->overlap_flag; + aom->limit_output_range = film_grain->clip_to_restricted_range; + + aom->num_y_points = film_grain->num_y_points; + for (int i = 0; i < film_grain->num_y_points; i++) { + aom->y_points[i][0] = film_grain->point_y_value[i]; + aom->y_points[i][1] = film_grain->point_y_scaling[i]; + } + aom->num_uv_points[0] = film_grain->num_cb_points; + for (int i = 0; i < film_grain->num_cb_points; i++) { + aom->uv_points[0][i][0] = film_grain->point_cb_value[i]; + aom->uv_points[0][i][1] = film_grain->point_cb_scaling[i]; + } + aom->num_uv_points[1] = film_grain->num_cr_points; + for (int i = 0; i < film_grain->num_cr_points; i++) { + aom->uv_points[1][i][0] = film_grain->point_cr_value[i]; + aom->uv_points[1][i][1] = film_grain->point_cr_scaling[i]; + } + + for (int i = 0; i < 24; i++) { + aom->ar_coeffs_y[i] = film_grain->ar_coeffs_y_plus_128[i] - 128; + } + for (int i = 0; i < 25; i++) { + aom->ar_coeffs_uv[0][i] = film_grain->ar_coeffs_cb_plus_128[i] - 128; + aom->ar_coeffs_uv[1][i] = film_grain->ar_coeffs_cr_plus_128[i] - 128; + } + + aom->uv_mult[0] = film_grain->cb_mult; + aom->uv_mult[1] = film_grain->cr_mult; + aom->uv_mult_luma[0] = film_grain->cb_luma_mult; + aom->uv_mult_luma[1] = film_grain->cr_luma_mult; + aom->uv_offset[0] = film_grain->cb_offset; + aom->uv_offset[1] = film_grain->cr_offset; + + return 0; +} + +static int set_output_frame(AVCodecContext *avctx, AVFrame *frame, + const AVPacket *pkt, int *got_frame) +{ + AV1DecContext *s = avctx->priv_data; + const AVFrame *srcframe = s->cur_frame.f; + int ret; + + // TODO: all layers + if (s->operating_point_idc && + av_log2(s->operating_point_idc >> 8) > s->cur_frame.spatial_id) + return 0; + + ret = av_frame_ref(frame, srcframe); + if (ret < 0) + return ret; + + if (avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) { + ret = export_film_grain(avctx, frame); + if (ret < 0) { + av_frame_unref(frame); 
+ return ret; + } + } + + frame->pts = pkt->pts; + frame->pkt_dts = pkt->dts; + frame->pkt_size = pkt->size; + + *got_frame = 1; + + return 0; +} + +static int update_reference_list(AVCodecContext *avctx) +{ + AV1DecContext *s = avctx->priv_data; + const AV1RawFrameHeader *header = s->raw_frame_header; + int ret; + + for (int i = 0; i < AV1_NUM_REF_FRAMES; i++) { + if (header->refresh_frame_flags & (1 << i)) { + av1_frame_unref(avctx, &s->ref[i]); + if ((ret = av1_frame_ref(avctx, &s->ref[i], &s->cur_frame)) < 0) { + av_log(avctx, AV_LOG_ERROR, + "Failed to update frame %d in reference list\n", i); + return ret; + } + } + } + return 0; +} + +static int get_current_frame(AVCodecContext *avctx) +{ + AV1DecContext *s = avctx->priv_data; + int ret; + + av1_frame_unref(avctx, &s->cur_frame); + + s->cur_frame.header_ref = av_buffer_ref(s->header_ref); + if (!s->cur_frame.header_ref) + return AVERROR(ENOMEM); + + s->cur_frame.raw_frame_header = s->raw_frame_header; + + ret = init_tile_data(s); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to init tile data.\n"); + return ret; + } + + if ((avctx->skip_frame >= AVDISCARD_NONINTRA && + (s->raw_frame_header->frame_type != AV1_FRAME_KEY && + s->raw_frame_header->frame_type != AV1_FRAME_INTRA_ONLY)) || + (avctx->skip_frame >= AVDISCARD_NONKEY && + s->raw_frame_header->frame_type != AV1_FRAME_KEY) || + avctx->skip_frame >= AVDISCARD_ALL) + return 0; + + ret = av1_frame_alloc(avctx, &s->cur_frame); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, + "Failed to allocate space for current frame.\n"); + return ret; + } + + global_motion_params(s); + skip_mode_params(s); + coded_lossless_param(s); + load_grain_params(s); + + return ret; +} + +static int av1_decode_frame(AVCodecContext *avctx, AVFrame *frame, + int *got_frame, AVPacket *pkt) +{ + AV1DecContext *s = avctx->priv_data; + AV1RawTileGroup *raw_tile_group = NULL; + int ret; + + ret = ff_cbs_read_packet(s->cbc, &s->current_obu, pkt); + if (ret < 0) { + av_log(avctx, 
AV_LOG_ERROR, "Failed to read packet.\n"); + goto end; + } + av_log(avctx, AV_LOG_DEBUG, "Total obu for this frame:%d.\n", + s->current_obu.nb_units); + + for (int i = 0; i < s->current_obu.nb_units; i++) { + CodedBitstreamUnit *unit = &s->current_obu.units[i]; + AV1RawOBU *obu = unit->content; + const AV1RawOBUHeader *header; + + if (!obu) + continue; + + header = &obu->header; + av_log(avctx, AV_LOG_DEBUG, "Obu idx:%d, obu type:%d.\n", i, unit->type); + + switch (unit->type) { + case AV1_OBU_SEQUENCE_HEADER: + av_buffer_unref(&s->seq_ref); + s->seq_ref = av_buffer_ref(unit->content_ref); + if (!s->seq_ref) { + ret = AVERROR(ENOMEM); + goto end; + } + + s->raw_seq = &obu->obu.sequence_header; + + ret = set_context_with_sequence(avctx, s->raw_seq); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to set context.\n"); + s->raw_seq = NULL; + goto end; + } + + s->operating_point_idc = s->raw_seq->operating_point_idc[s->operating_point]; + + if (s->pix_fmt == AV_PIX_FMT_NONE) { + ret = get_pixel_format(avctx); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, + "Failed to get pixel format.\n"); + s->raw_seq = NULL; + goto end; + } + } + + if (avctx->hwaccel && avctx->hwaccel->decode_params) { + ret = avctx->hwaccel->decode_params(avctx, unit->type, unit->data, + unit->data_size); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "HW accel decode params fail.\n"); + s->raw_seq = NULL; + goto end; + } + } + break; + case AV1_OBU_REDUNDANT_FRAME_HEADER: + if (s->raw_frame_header) + break; + // fall-through + case AV1_OBU_FRAME: + case AV1_OBU_FRAME_HEADER: + if (!s->raw_seq) { + av_log(avctx, AV_LOG_ERROR, "Missing Sequence Header.\n"); + ret = AVERROR_INVALIDDATA; + goto end; + } + + av_buffer_unref(&s->header_ref); + s->header_ref = av_buffer_ref(unit->content_ref); + if (!s->header_ref) { + ret = AVERROR(ENOMEM); + goto end; + } + + if (unit->type == AV1_OBU_FRAME) + s->raw_frame_header = &obu->obu.frame.header; + else + s->raw_frame_header = &obu->obu.frame_header; + 
+ if (s->raw_frame_header->show_existing_frame) { + av1_frame_unref(avctx, &s->cur_frame); + + ret = av1_frame_ref(avctx, &s->cur_frame, + &s->ref[s->raw_frame_header->frame_to_show_map_idx]); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to get reference frame.\n"); + goto end; + } + + ret = update_reference_list(avctx); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to update reference list.\n"); + goto end; + } + + if (s->cur_frame.f->buf[0]) { + ret = set_output_frame(avctx, frame, pkt, got_frame); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "Set output frame error.\n"); + } + + s->raw_frame_header = NULL; + + goto end; + } + + ret = get_current_frame(avctx); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Get current frame error\n"); + goto end; + } + + s->cur_frame.spatial_id = header->spatial_id; + s->cur_frame.temporal_id = header->temporal_id; + + if (avctx->hwaccel && s->cur_frame.f->buf[0]) { + ret = avctx->hwaccel->start_frame(avctx, unit->data, + unit->data_size); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "HW accel start frame fail.\n"); + goto end; + } + } + if (unit->type != AV1_OBU_FRAME) + break; + // fall-through + case AV1_OBU_TILE_GROUP: + if (!s->raw_frame_header) { + av_log(avctx, AV_LOG_ERROR, "Missing Frame Header.\n"); + ret = AVERROR_INVALIDDATA; + goto end; + } + + if (unit->type == AV1_OBU_FRAME) + raw_tile_group = &obu->obu.frame.tile_group; + else + raw_tile_group = &obu->obu.tile_group; + + ret = get_tiles_info(avctx, raw_tile_group); + if (ret < 0) + goto end; + + if (avctx->hwaccel && s->cur_frame.f->buf[0]) { + ret = avctx->hwaccel->decode_slice(avctx, + raw_tile_group->tile_data.data, + raw_tile_group->tile_data.data_size); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, + "HW accel decode slice fail.\n"); + goto end; + } + } + break; + case AV1_OBU_TILE_LIST: + case AV1_OBU_TEMPORAL_DELIMITER: + case AV1_OBU_PADDING: + case AV1_OBU_METADATA: + break; + default: + av_log(avctx, AV_LOG_DEBUG, + "Unknown obu 
type: %d (%"SIZE_SPECIFIER" bits).\n", + unit->type, unit->data_size); + } + + if (raw_tile_group && (s->tile_num == raw_tile_group->tg_end + 1)) { + if (avctx->hwaccel && s->cur_frame.f->buf[0]) { + ret = avctx->hwaccel->end_frame(avctx); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "HW accel end frame fail.\n"); + goto end; + } + } + + ret = update_reference_list(avctx); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to update reference list.\n"); + goto end; + } + + if (s->raw_frame_header->show_frame && s->cur_frame.f->buf[0]) { + ret = set_output_frame(avctx, frame, pkt, got_frame); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Set output frame error\n"); + goto end; + } + } + raw_tile_group = NULL; + s->raw_frame_header = NULL; + } + } + +end: + ff_cbs_fragment_reset(&s->current_obu); + if (ret < 0) + s->raw_frame_header = NULL; + return ret; +} + +static void av1_decode_flush(AVCodecContext *avctx) +{ + AV1DecContext *s = avctx->priv_data; + + for (int i = 0; i < FF_ARRAY_ELEMS(s->ref); i++) + av1_frame_unref(avctx, &s->ref[i]); + + av1_frame_unref(avctx, &s->cur_frame); + s->operating_point_idc = 0; + s->raw_frame_header = NULL; + s->raw_seq = NULL; + + ff_cbs_flush(s->cbc); +} + +#define OFFSET(x) offsetof(AV1DecContext, x) +#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM +static const AVOption av1_options[] = { + { "operating_point", "Select an operating point of the scalable bitstream", + OFFSET(operating_point), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, AV1_MAX_OPERATING_POINTS - 1, VD }, + { NULL } +}; + +static const AVClass av1_class = { + .class_name = "AV1 decoder", + .item_name = av_default_item_name, + .option = av1_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const FFCodec ff_av1_decoder = { + .p.name = "av1", + CODEC_LONG_NAME("Alliance for Open Media AV1"), + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_AV1, + .priv_data_size = sizeof(AV1DecContext), + .init = av1_decode_init, + .close = av1_decode_free, + 
FF_CODEC_DECODE_CB(av1_decode_frame), + .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_AVOID_PROBING, + .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | + FF_CODEC_CAP_SETS_PKT_DTS, + .flush = av1_decode_flush, + .p.profiles = NULL_IF_CONFIG_SMALL(ff_av1_profiles), + .p.priv_class = &av1_class, + .bsfs = "av1_frame_split", + .hw_configs = (const AVCodecHWConfigInternal *const []) { +#if CONFIG_AV1_DXVA2_HWACCEL + HWACCEL_DXVA2(av1), +#endif +#if CONFIG_AV1_D3D11VA_HWACCEL + HWACCEL_D3D11VA(av1), +#endif +#if CONFIG_AV1_D3D11VA2_HWACCEL + HWACCEL_D3D11VA2(av1), +#endif +#if CONFIG_AV1_NVDEC_HWACCEL + HWACCEL_NVDEC(av1), +#endif +#if CONFIG_AV1_VAAPI_HWACCEL + HWACCEL_VAAPI(av1), +#endif +#if CONFIG_AV1_VDPAU_HWACCEL + HWACCEL_VDPAU(av1), +#endif + + NULL + }, +}; diff --git a/media/ffvpx/libavcodec/av1dec.h b/media/ffvpx/libavcodec/av1dec.h new file mode 100644 index 0000000000..82c7084e99 --- /dev/null +++ b/media/ffvpx/libavcodec/av1dec.h @@ -0,0 +1,89 @@ +/* + * AV1 video decoder + * * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AV1DEC_H +#define AVCODEC_AV1DEC_H + +#include <stdint.h> + +#include "libavutil/buffer.h" +#include "libavutil/frame.h" +#include "libavutil/pixfmt.h" +#include "avcodec.h" +#include "cbs.h" +#include "cbs_av1.h" + +typedef struct AV1Frame { + AVFrame *f; + + AVBufferRef *hwaccel_priv_buf; + void *hwaccel_picture_private; + + AVBufferRef *header_ref; + AV1RawFrameHeader *raw_frame_header; + + int temporal_id; + int spatial_id; + + uint8_t gm_invalid[AV1_NUM_REF_FRAMES]; + uint8_t gm_type[AV1_NUM_REF_FRAMES]; + int32_t gm_params[AV1_NUM_REF_FRAMES][6]; + + uint8_t skip_mode_frame_idx[2]; + + AV1RawFilmGrainParams film_grain; + + uint8_t coded_lossless; +} AV1Frame; + +typedef struct TileGroupInfo { + uint32_t tile_offset; + uint32_t tile_size; + uint16_t tile_row; + uint16_t tile_column; +} TileGroupInfo; + +typedef struct AV1DecContext { + const AVClass *class; + AVCodecContext *avctx; + + enum AVPixelFormat pix_fmt; + CodedBitstreamContext *cbc; + CodedBitstreamFragment current_obu; + + AVBufferRef *seq_ref; + AV1RawSequenceHeader *raw_seq; + AVBufferRef *header_ref; + AV1RawFrameHeader *raw_frame_header; + TileGroupInfo *tile_group_info; + uint16_t tile_num; + uint16_t tg_start; + uint16_t tg_end; + + int operating_point_idc; + + AV1Frame ref[AV1_NUM_REF_FRAMES]; + AV1Frame cur_frame; + + // AVOptions + int operating_point; +} AV1DecContext; + +#endif /* AVCODEC_AV1DEC_H */ diff --git a/media/ffvpx/libavcodec/avcodec.c b/media/ffvpx/libavcodec/avcodec.c new file mode 100644 index 0000000000..fb1362290f --- /dev/null +++ b/media/ffvpx/libavcodec/avcodec.c @@ -0,0 +1,716 @@ +/* + * AVCodecContext functions for libavcodec + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * AVCodecContext functions for libavcodec + */ + +#include "config.h" +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" +#include "libavutil/bprint.h" +#include "libavutil/channel_layout.h" +#include "libavutil/fifo.h" +#include "libavutil/imgutils.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/thread.h" +#include "avcodec.h" +#include "bsf.h" +#include "codec_internal.h" +#include "decode.h" +#include "encode.h" +#include "frame_thread_encoder.h" +#include "internal.h" +#include "thread.h" + +int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2), void *arg, int *ret, int count, int size) +{ + int i; + + for (i = 0; i < count; i++) { + int r = func(c, (char *)arg + i * size); + if (ret) + ret[i] = r; + } + emms_c(); + return 0; +} + +int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2, int jobnr, int threadnr), void *arg, int *ret, int count) +{ + int i; + + for (i = 0; i < count; i++) { + int r = func(c, arg, i, 0); + if (ret) + ret[i] = r; + } + emms_c(); + return 0; +} + +static AVMutex codec_mutex = AV_MUTEX_INITIALIZER; + +static void lock_avcodec(const FFCodec *codec) +{ + if 
(codec->caps_internal & FF_CODEC_CAP_NOT_INIT_THREADSAFE && codec->init) + ff_mutex_lock(&codec_mutex); +} + +static void unlock_avcodec(const FFCodec *codec) +{ + if (codec->caps_internal & FF_CODEC_CAP_NOT_INIT_THREADSAFE && codec->init) + ff_mutex_unlock(&codec_mutex); +} + +static int64_t get_bit_rate(AVCodecContext *ctx) +{ + int64_t bit_rate; + int bits_per_sample; + + switch (ctx->codec_type) { + case AVMEDIA_TYPE_VIDEO: + case AVMEDIA_TYPE_DATA: + case AVMEDIA_TYPE_SUBTITLE: + case AVMEDIA_TYPE_ATTACHMENT: + bit_rate = ctx->bit_rate; + break; + case AVMEDIA_TYPE_AUDIO: + bits_per_sample = av_get_bits_per_sample(ctx->codec_id); + if (bits_per_sample) { + bit_rate = ctx->sample_rate * (int64_t)ctx->ch_layout.nb_channels; + if (bit_rate > INT64_MAX / bits_per_sample) { + bit_rate = 0; + } else + bit_rate *= bits_per_sample; + } else + bit_rate = ctx->bit_rate; + break; + default: + bit_rate = 0; + break; + } + return bit_rate; +} + +int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options) +{ + int ret = 0; + AVCodecInternal *avci; + const FFCodec *codec2; + + if (avcodec_is_open(avctx)) + return 0; + + if (!codec && !avctx->codec) { + av_log(avctx, AV_LOG_ERROR, "No codec provided to avcodec_open2()\n"); + return AVERROR(EINVAL); + } + if (codec && avctx->codec && codec != avctx->codec) { + av_log(avctx, AV_LOG_ERROR, "This AVCodecContext was allocated for %s, " + "but %s passed to avcodec_open2()\n", avctx->codec->name, codec->name); + return AVERROR(EINVAL); + } + if (!codec) + codec = avctx->codec; + codec2 = ffcodec(codec); + + if ((avctx->codec_type != AVMEDIA_TYPE_UNKNOWN && avctx->codec_type != codec->type) || + (avctx->codec_id != AV_CODEC_ID_NONE && avctx->codec_id != codec->id)) { + av_log(avctx, AV_LOG_ERROR, "Codec type or id mismatches\n"); + return AVERROR(EINVAL); + } + + avctx->codec_type = codec->type; + avctx->codec_id = codec->id; + avctx->codec = codec; + + if (avctx->extradata_size < 0 || 
avctx->extradata_size >= FF_MAX_EXTRADATA_SIZE) + return AVERROR(EINVAL); + + avci = av_mallocz(sizeof(*avci)); + if (!avci) { + ret = AVERROR(ENOMEM); + goto end; + } + avctx->internal = avci; + + avci->buffer_frame = av_frame_alloc(); + avci->buffer_pkt = av_packet_alloc(); + if (!avci->buffer_frame || !avci->buffer_pkt) { + ret = AVERROR(ENOMEM); + goto free_and_end; + } + + if (codec2->priv_data_size > 0) { + if (!avctx->priv_data) { + avctx->priv_data = av_mallocz(codec2->priv_data_size); + if (!avctx->priv_data) { + ret = AVERROR(ENOMEM); + goto free_and_end; + } + if (codec->priv_class) { + *(const AVClass **)avctx->priv_data = codec->priv_class; + av_opt_set_defaults(avctx->priv_data); + } + } + if (codec->priv_class && (ret = av_opt_set_dict(avctx->priv_data, options)) < 0) + goto free_and_end; + } else { + avctx->priv_data = NULL; + } + if ((ret = av_opt_set_dict(avctx, options)) < 0) + goto free_and_end; + + if (avctx->codec_whitelist && av_match_list(codec->name, avctx->codec_whitelist, ',') <= 0) { + av_log(avctx, AV_LOG_ERROR, "Codec (%s) not on whitelist \'%s\'\n", codec->name, avctx->codec_whitelist); + ret = AVERROR(EINVAL); + goto free_and_end; + } + + // only call ff_set_dimensions() for non H.264/VP6F/DXV codecs so as not to overwrite previously setup dimensions + if (!(avctx->coded_width && avctx->coded_height && avctx->width && avctx->height && + (avctx->codec_id == AV_CODEC_ID_H264 || avctx->codec_id == AV_CODEC_ID_VP6F || avctx->codec_id == AV_CODEC_ID_DXV))) { + if (avctx->coded_width && avctx->coded_height) + ret = ff_set_dimensions(avctx, avctx->coded_width, avctx->coded_height); + else if (avctx->width && avctx->height) + ret = ff_set_dimensions(avctx, avctx->width, avctx->height); + if (ret < 0) + goto free_and_end; + } + + if ((avctx->coded_width || avctx->coded_height || avctx->width || avctx->height) + && ( av_image_check_size2(avctx->coded_width, avctx->coded_height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx) < 0 + || 
av_image_check_size2(avctx->width, avctx->height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx) < 0)) { + av_log(avctx, AV_LOG_WARNING, "Ignoring invalid width/height values\n"); + ff_set_dimensions(avctx, 0, 0); + } + + if (avctx->width > 0 && avctx->height > 0) { + if (av_image_check_sar(avctx->width, avctx->height, + avctx->sample_aspect_ratio) < 0) { + av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n", + avctx->sample_aspect_ratio.num, + avctx->sample_aspect_ratio.den); + avctx->sample_aspect_ratio = (AVRational){ 0, 1 }; + } + } + + if (avctx->sample_rate < 0) { + av_log(avctx, AV_LOG_ERROR, "Invalid sample rate: %d\n", avctx->sample_rate); + ret = AVERROR(EINVAL); + goto free_and_end; + } + if (avctx->block_align < 0) { + av_log(avctx, AV_LOG_ERROR, "Invalid block align: %d\n", avctx->block_align); + ret = AVERROR(EINVAL); + goto free_and_end; + } + +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + /* compat wrapper for old-style callers */ + if (avctx->channel_layout && !avctx->channels) + avctx->channels = av_popcount64(avctx->channel_layout); + + if ((avctx->channels && avctx->ch_layout.nb_channels != avctx->channels) || + (avctx->channel_layout && (avctx->ch_layout.order != AV_CHANNEL_ORDER_NATIVE || + avctx->ch_layout.u.mask != avctx->channel_layout))) { + av_channel_layout_uninit(&avctx->ch_layout); + if (avctx->channel_layout) { + av_channel_layout_from_mask(&avctx->ch_layout, avctx->channel_layout); + } else { + avctx->ch_layout.order = AV_CHANNEL_ORDER_UNSPEC; + } + avctx->ch_layout.nb_channels = avctx->channels; + } +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + /* AV_CODEC_CAP_CHANNEL_CONF is a decoder-only flag; so the code below + * in particular checks that nb_channels is set for all audio encoders. 
*/ + if (avctx->codec_type == AVMEDIA_TYPE_AUDIO && !avctx->ch_layout.nb_channels + && !(codec->capabilities & AV_CODEC_CAP_CHANNEL_CONF)) { + av_log(avctx, AV_LOG_ERROR, "%s requires channel layout to be set\n", + av_codec_is_decoder(codec) ? "Decoder" : "Encoder"); + ret = AVERROR(EINVAL); + goto free_and_end; + } + if (avctx->ch_layout.nb_channels && !av_channel_layout_check(&avctx->ch_layout)) { + av_log(avctx, AV_LOG_ERROR, "Invalid channel layout\n"); + ret = AVERROR(EINVAL); + goto free_and_end; + } + if (avctx->ch_layout.nb_channels > FF_SANE_NB_CHANNELS) { + av_log(avctx, AV_LOG_ERROR, "Too many channels: %d\n", avctx->ch_layout.nb_channels); + ret = AVERROR(EINVAL); + goto free_and_end; + } + + avctx->frame_num = 0; +#if FF_API_AVCTX_FRAME_NUMBER +FF_DISABLE_DEPRECATION_WARNINGS + avctx->frame_number = avctx->frame_num; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + avctx->codec_descriptor = avcodec_descriptor_get(avctx->codec_id); + + if ((avctx->codec->capabilities & AV_CODEC_CAP_EXPERIMENTAL) && + avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) { + const char *codec_string = av_codec_is_encoder(codec) ? "encoder" : "decoder"; + const AVCodec *codec2; + av_log(avctx, AV_LOG_ERROR, + "The %s '%s' is experimental but experimental codecs are not enabled, " + "add '-strict %d' if you want to use it.\n", + codec_string, codec->name, FF_COMPLIANCE_EXPERIMENTAL); + codec2 = av_codec_is_encoder(codec) ? 
avcodec_find_encoder(codec->id) : avcodec_find_decoder(codec->id); + if (!(codec2->capabilities & AV_CODEC_CAP_EXPERIMENTAL)) + av_log(avctx, AV_LOG_ERROR, "Alternatively use the non experimental %s '%s'.\n", + codec_string, codec2->name); + ret = AVERROR_EXPERIMENTAL; + goto free_and_end; + } + + if (avctx->codec_type == AVMEDIA_TYPE_AUDIO && + (!avctx->time_base.num || !avctx->time_base.den)) { + avctx->time_base.num = 1; + avctx->time_base.den = avctx->sample_rate; + } + + if (av_codec_is_encoder(avctx->codec)) + ret = ff_encode_preinit(avctx); + else + ret = ff_decode_preinit(avctx); + if (ret < 0) + goto free_and_end; + + if (HAVE_THREADS && !avci->frame_thread_encoder) { + /* Frame-threaded decoders call FFCodec.init for their child contexts. */ + lock_avcodec(codec2); + ret = ff_thread_init(avctx); + unlock_avcodec(codec2); + if (ret < 0) { + goto free_and_end; + } + } + if (!HAVE_THREADS && !(codec2->caps_internal & FF_CODEC_CAP_AUTO_THREADS)) + avctx->thread_count = 1; + + if (!(avctx->active_thread_type & FF_THREAD_FRAME) || + avci->frame_thread_encoder) { + if (codec2->init) { + lock_avcodec(codec2); + ret = codec2->init(avctx); + unlock_avcodec(codec2); + if (ret < 0) { + avci->needs_close = codec2->caps_internal & FF_CODEC_CAP_INIT_CLEANUP; + goto free_and_end; + } + } + avci->needs_close = 1; + } + + ret=0; + + if (av_codec_is_decoder(avctx->codec)) { + if (!avctx->bit_rate) + avctx->bit_rate = get_bit_rate(avctx); + +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + /* update the deprecated fields for old-style callers */ + avctx->channels = avctx->ch_layout.nb_channels; + avctx->channel_layout = avctx->ch_layout.order == AV_CHANNEL_ORDER_NATIVE ? 
+ avctx->ch_layout.u.mask : 0; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + /* validate channel layout from the decoder */ + if ((avctx->ch_layout.nb_channels && !av_channel_layout_check(&avctx->ch_layout)) || + avctx->ch_layout.nb_channels > FF_SANE_NB_CHANNELS) { + ret = AVERROR(EINVAL); + goto free_and_end; + } + if (avctx->bits_per_coded_sample < 0) { + ret = AVERROR(EINVAL); + goto free_and_end; + } + } + if (codec->priv_class) + av_assert0(*(const AVClass **)avctx->priv_data == codec->priv_class); + +end: + + return ret; +free_and_end: + avcodec_close(avctx); + goto end; +} + +/* Drop the frames and packets buffered in the context, clear the draining state, then call the frame-thread or codec flush hook. Encoders lacking AV_CODEC_CAP_ENCODER_FLUSH only get a warning and are left untouched. */ +void avcodec_flush_buffers(AVCodecContext *avctx) +{ + AVCodecInternal *avci = avctx->internal; + + if (av_codec_is_encoder(avctx->codec)) { + int caps = avctx->codec->capabilities; + + if (!(caps & AV_CODEC_CAP_ENCODER_FLUSH)) { + // Only encoders that explicitly declare support for it can be + // flushed. Otherwise, this is a no-op. + av_log(avctx, AV_LOG_WARNING, "Ignoring attempt to flush encoder " + "that doesn't support it\n"); + return; + } + if (avci->in_frame) + av_frame_unref(avci->in_frame); + if (avci->recon_frame) + av_frame_unref(avci->recon_frame); + } else { + av_packet_unref(avci->last_pkt_props); + av_packet_unref(avci->in_pkt); + + avctx->pts_correction_last_pts = + avctx->pts_correction_last_dts = INT64_MIN; + + av_bsf_flush(avci->bsf); + } + + avci->draining = 0; + avci->draining_done = 0; + avci->nb_draining_errors = 0; + av_frame_unref(avci->buffer_frame); + av_packet_unref(avci->buffer_pkt); + + if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME) + ff_thread_flush(avctx); + else if (ffcodec(avctx->codec)->flush) + ffcodec(avctx->codec)->flush(avctx); +} + +/* Free the data planes, text and ass string of every rect, then the rects array, and finally zero the whole AVSubtitle. */ +void avsubtitle_free(AVSubtitle *sub) +{ + int i; + + for (i = 0; i < sub->num_rects; i++) { + AVSubtitleRect *const rect = sub->rects[i]; + + av_freep(&rect->data[0]); + av_freep(&rect->data[1]); + av_freep(&rect->data[2]); + av_freep(&rect->data[3]); + av_freep(&rect->text); +
av_freep(&rect->ass); + + av_freep(&sub->rects[i]); + } + + av_freep(&sub->rects); + + memset(sub, 0, sizeof(*sub)); +} + +/* Release everything the context owns. When the context is open this first tears down the internal state (threads, codec close hook, buffered frames and packets, hwaccel, bsf); afterwards coded side data, hw contexts, options and priv_data are freed. A NULL avctx is accepted and the return value is always 0. */ +av_cold int avcodec_close(AVCodecContext *avctx) +{ + int i; + + if (!avctx) + return 0; + + if (avcodec_is_open(avctx)) { + AVCodecInternal *avci = avctx->internal; + + if (CONFIG_FRAME_THREAD_ENCODER && + avci->frame_thread_encoder && avctx->thread_count > 1) { + ff_frame_thread_encoder_free(avctx); + } + if (HAVE_THREADS && avci->thread_ctx) + ff_thread_free(avctx); + if (avci->needs_close && ffcodec(avctx->codec)->close) + ffcodec(avctx->codec)->close(avctx); + avci->byte_buffer_size = 0; + av_freep(&avci->byte_buffer); + av_frame_free(&avci->buffer_frame); + av_packet_free(&avci->buffer_pkt); + av_packet_free(&avci->last_pkt_props); + + av_packet_free(&avci->in_pkt); + av_frame_free(&avci->in_frame); + av_frame_free(&avci->recon_frame); + + av_buffer_unref(&avci->pool); + + if (avctx->hwaccel && avctx->hwaccel->uninit) + avctx->hwaccel->uninit(avctx); + av_freep(&avci->hwaccel_priv_data); + + av_bsf_free(&avci->bsf); + + av_channel_layout_uninit(&avci->initial_ch_layout); + +#if CONFIG_LCMS2 + ff_icc_context_uninit(&avci->icc); +#endif + + av_freep(&avctx->internal); + } + + for (i = 0; i < avctx->nb_coded_side_data; i++) + av_freep(&avctx->coded_side_data[i].data); + av_freep(&avctx->coded_side_data); + avctx->nb_coded_side_data = 0; + + av_buffer_unref(&avctx->hw_frames_ctx); + av_buffer_unref(&avctx->hw_device_ctx); + + if (avctx->priv_data && avctx->codec && avctx->codec->priv_class) + av_opt_free(avctx->priv_data); + av_opt_free(avctx); + av_freep(&avctx->priv_data); + if (av_codec_is_encoder(avctx->codec)) { + av_freep(&avctx->extradata); + avctx->extradata_size = 0; + } else if (av_codec_is_decoder(avctx->codec)) + av_freep(&avctx->subtitle_header); + + avctx->codec = NULL; + avctx->active_thread_type = 0; + + return 0; +} + +/* Substitute the literal "unknown" for a NULL string. */ +static const char *unknown_if_null(const char *str) +{ + return str ?
str : "unknown"; +} + +/* Format a one-line, human-readable summary of enc into buf (capacity buf_size); does nothing when buf is NULL or buf_size <= 0. A non-zero encode prints encoder fields (q range, pass flags) instead of the decoder stream properties. */ +void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode) +{ + const char *codec_type; + const char *codec_name; + const char *profile = NULL; + AVBPrint bprint; + int64_t bitrate; + int new_line = 0; + AVRational display_aspect_ratio; + const char *separator = enc->dump_separator ? (const char *)enc->dump_separator : ", "; + const char *str; + + if (!buf || buf_size <= 0) + return; + av_bprint_init_for_buffer(&bprint, buf, buf_size); + codec_type = av_get_media_type_string(enc->codec_type); + codec_name = avcodec_get_name(enc->codec_id); + profile = avcodec_profile_name(enc->codec_id, enc->profile); + + av_bprintf(&bprint, "%s: %s", codec_type ? codec_type : "unknown", + codec_name); + buf[0] ^= 'a' ^ 'A'; /* first letter in uppercase */ + + if (enc->codec && strcmp(enc->codec->name, codec_name)) + av_bprintf(&bprint, " (%s)", enc->codec->name); + + if (profile) + av_bprintf(&bprint, " (%s)", profile); + if ( enc->codec_type == AVMEDIA_TYPE_VIDEO + && av_log_get_level() >= AV_LOG_VERBOSE + && enc->refs) + av_bprintf(&bprint, ", %d reference frame%s", + enc->refs, enc->refs > 1 ? "s" : ""); + + if (enc->codec_tag) + av_bprintf(&bprint, " (%s / 0x%04X)", + av_fourcc2str(enc->codec_tag), enc->codec_tag); + + switch (enc->codec_type) { + case AVMEDIA_TYPE_VIDEO: + { + unsigned len; + + av_bprintf(&bprint, "%s%s", separator, + enc->pix_fmt == AV_PIX_FMT_NONE ? "none" : + unknown_if_null(av_get_pix_fmt_name(enc->pix_fmt))); + + av_bprint_chars(&bprint, '(', 1); + len = bprint.len; + + /* The following check ensures that '(' has been written + * and therefore allows us to erase it if it turns out + * to be unnecessary.
*/ + if (!av_bprint_is_complete(&bprint)) + return; + + if (enc->bits_per_raw_sample && enc->pix_fmt != AV_PIX_FMT_NONE && + enc->bits_per_raw_sample < av_pix_fmt_desc_get(enc->pix_fmt)->comp[0].depth) + av_bprintf(&bprint, "%d bpc, ", enc->bits_per_raw_sample); + if (enc->color_range != AVCOL_RANGE_UNSPECIFIED && + (str = av_color_range_name(enc->color_range))) + av_bprintf(&bprint, "%s, ", str); + + if (enc->colorspace != AVCOL_SPC_UNSPECIFIED || + enc->color_primaries != AVCOL_PRI_UNSPECIFIED || + enc->color_trc != AVCOL_TRC_UNSPECIFIED) { + const char *col = unknown_if_null(av_color_space_name(enc->colorspace)); + const char *pri = unknown_if_null(av_color_primaries_name(enc->color_primaries)); + const char *trc = unknown_if_null(av_color_transfer_name(enc->color_trc)); + if (strcmp(col, pri) || strcmp(col, trc)) { + new_line = 1; + av_bprintf(&bprint, "%s/%s/%s, ", col, pri, trc); + } else + av_bprintf(&bprint, "%s, ", col); + } + + if (enc->field_order != AV_FIELD_UNKNOWN) { + const char *field_order = "progressive"; + if (enc->field_order == AV_FIELD_TT) + field_order = "top first"; + else if (enc->field_order == AV_FIELD_BB) + field_order = "bottom first"; + else if (enc->field_order == AV_FIELD_TB) + field_order = "top coded first (swapped)"; + else if (enc->field_order == AV_FIELD_BT) + field_order = "bottom coded first (swapped)"; + + av_bprintf(&bprint, "%s, ", field_order); + } + + if (av_log_get_level() >= AV_LOG_VERBOSE && + enc->chroma_sample_location != AVCHROMA_LOC_UNSPECIFIED && + (str = av_chroma_location_name(enc->chroma_sample_location))) + av_bprintf(&bprint, "%s, ", str); + + if (len == bprint.len) { + bprint.str[len - 1] = '\0'; + bprint.len--; + } else { + if (bprint.len - 2 < bprint.size) { + /* Erase the last ", " */ + bprint.len -= 2; + bprint.str[bprint.len] = '\0'; + } + av_bprint_chars(&bprint, ')', 1); + } + } + + if (enc->width) { + av_bprintf(&bprint, "%s%dx%d", new_line ?
separator : ", ", + enc->width, enc->height); + + if (av_log_get_level() >= AV_LOG_VERBOSE && + (enc->width != enc->coded_width || + enc->height != enc->coded_height)) + av_bprintf(&bprint, " (%dx%d)", + enc->coded_width, enc->coded_height); + + if (enc->sample_aspect_ratio.num) { + av_reduce(&display_aspect_ratio.num, &display_aspect_ratio.den, + enc->width * (int64_t)enc->sample_aspect_ratio.num, + enc->height * (int64_t)enc->sample_aspect_ratio.den, + 1024 * 1024); + av_bprintf(&bprint, " [SAR %d:%d DAR %d:%d]", + enc->sample_aspect_ratio.num, enc->sample_aspect_ratio.den, + display_aspect_ratio.num, display_aspect_ratio.den); + } + if (av_log_get_level() >= AV_LOG_DEBUG) { + int g = av_gcd(enc->time_base.num, enc->time_base.den); + /* g is 0 when both time base fields are 0; skip the output rather than divide by zero (same guard as the DATA case) */ + if (g) + av_bprintf(&bprint, ", %d/%d", + enc->time_base.num / g, enc->time_base.den / g); + } + } + if (encode) { + av_bprintf(&bprint, ", q=%d-%d", enc->qmin, enc->qmax); + } else { + if (enc->properties & FF_CODEC_PROPERTY_CLOSED_CAPTIONS) + av_bprintf(&bprint, ", Closed Captions"); + if (enc->properties & FF_CODEC_PROPERTY_FILM_GRAIN) + av_bprintf(&bprint, ", Film Grain"); + if (enc->properties & FF_CODEC_PROPERTY_LOSSLESS) + av_bprintf(&bprint, ", lossless"); + } + break; + case AVMEDIA_TYPE_AUDIO: + av_bprintf(&bprint, "%s", separator); + + if (enc->sample_rate) { + av_bprintf(&bprint, "%d Hz, ", enc->sample_rate); + } + { + char buf[512]; + int ret = av_channel_layout_describe(&enc->ch_layout, buf, sizeof(buf)); + if (ret >= 0) + av_bprintf(&bprint, "%s", buf); + } + if (enc->sample_fmt != AV_SAMPLE_FMT_NONE && + (str = av_get_sample_fmt_name(enc->sample_fmt))) { + av_bprintf(&bprint, ", %s", str); + } + if ( enc->bits_per_raw_sample > 0 + && enc->bits_per_raw_sample != av_get_bytes_per_sample(enc->sample_fmt) * 8) + av_bprintf(&bprint, " (%d bit)", enc->bits_per_raw_sample); + if (av_log_get_level() >= AV_LOG_VERBOSE) { + if (enc->initial_padding) + av_bprintf(&bprint, ", delay %d", enc->initial_padding); + if (enc->trailing_padding)
+ av_bprintf(&bprint, ", padding %d", enc->trailing_padding); + } + break; + case AVMEDIA_TYPE_DATA: + if (av_log_get_level() >= AV_LOG_DEBUG) { + int g = av_gcd(enc->time_base.num, enc->time_base.den); + if (g) + av_bprintf(&bprint, ", %d/%d", + enc->time_base.num / g, enc->time_base.den / g); + } + break; + case AVMEDIA_TYPE_SUBTITLE: + if (enc->width) + av_bprintf(&bprint, ", %dx%d", enc->width, enc->height); + break; + default: + return; + } + if (encode) { + if (enc->flags & AV_CODEC_FLAG_PASS1) + av_bprintf(&bprint, ", pass 1"); + if (enc->flags & AV_CODEC_FLAG_PASS2) + av_bprintf(&bprint, ", pass 2"); + } + bitrate = get_bit_rate(enc); + if (bitrate != 0) { + av_bprintf(&bprint, ", %"PRId64" kb/s", bitrate / 1000); + } else if (enc->rc_max_rate > 0) { + av_bprintf(&bprint, ", max. %"PRId64" kb/s", enc->rc_max_rate / 1000); + } +} + +/* Non-zero once the context has internal state attached, zero otherwise. */ +int avcodec_is_open(AVCodecContext *s) +{ + return !!s->internal; +} + +/* Unreference frame, then pull output from the decoder or the encoder side depending on the context's codec. */ +int attribute_align_arg avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + av_frame_unref(frame); + + if (av_codec_is_decoder(avctx->codec)) + return ff_decode_receive_frame(avctx, frame); + return ff_encode_receive_frame(avctx, frame); +} diff --git a/media/ffvpx/libavcodec/avcodec.h b/media/ffvpx/libavcodec/avcodec.h new file mode 100644 index 0000000000..9a0fe97cad --- /dev/null +++ b/media/ffvpx/libavcodec/avcodec.h @@ -0,0 +1,3193 @@ +/* + * copyright (c) 2001 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AVCODEC_H +#define AVCODEC_AVCODEC_H + +/** + * @file + * @ingroup libavc + * Libavcodec external API header + */ + +#include "libavutil/samplefmt.h" +#include "libavutil/attributes.h" +#include "libavutil/avutil.h" +#include "libavutil/buffer.h" +#include "libavutil/dict.h" +#include "libavutil/frame.h" +#include "libavutil/log.h" +#include "libavutil/pixfmt.h" +#include "libavutil/rational.h" + +#include "codec.h" +#include "codec_desc.h" +#include "codec_par.h" +#include "codec_id.h" +#include "defs.h" +#include "packet.h" +#include "version_major.h" +#ifndef HAVE_AV_CONFIG_H +/* When included as part of the ffmpeg build, only include the major version + * to avoid unnecessary rebuilds. When included externally, keep including + * the full version information. */ +#include "version.h" +#endif + +/** + * @defgroup libavc libavcodec + * Encoding/Decoding Library + * + * @{ + * + * @defgroup lavc_decoding Decoding + * @{ + * @} + * + * @defgroup lavc_encoding Encoding + * @{ + * @} + * + * @defgroup lavc_codec Codecs + * @{ + * @defgroup lavc_codec_native Native Codecs + * @{ + * @} + * @defgroup lavc_codec_wrappers External library wrappers + * @{ + * @} + * @defgroup lavc_codec_hwaccel Hardware Accelerators bridge + * @{ + * @} + * @} + * @defgroup lavc_internal Internal + * @{ + * @} + * @} + */ + +/** + * @ingroup libavc + * @defgroup lavc_encdec send/receive encoding and decoding API overview + * @{ + * + * The avcodec_send_packet()/avcodec_receive_frame()/avcodec_send_frame()/ + * avcodec_receive_packet() functions provide an encode/decode API, which + * decouples input and output. 
+ * + * The API is very similar for encoding/decoding and audio/video, and works as + * follows: + * - Set up and open the AVCodecContext as usual. + * - Send valid input: + * - For decoding, call avcodec_send_packet() to give the decoder raw + * compressed data in an AVPacket. + * - For encoding, call avcodec_send_frame() to give the encoder an AVFrame + * containing uncompressed audio or video. + * + * In both cases, it is recommended that AVPackets and AVFrames are + * refcounted, or libavcodec might have to copy the input data. (libavformat + * always returns refcounted AVPackets, and av_frame_get_buffer() allocates + * refcounted AVFrames.) + * - Receive output in a loop. Periodically call one of the avcodec_receive_*() + * functions and process their output: + * - For decoding, call avcodec_receive_frame(). On success, it will return + * an AVFrame containing uncompressed audio or video data. + * - For encoding, call avcodec_receive_packet(). On success, it will return + * an AVPacket with a compressed frame. + * + * Repeat this call until it returns AVERROR(EAGAIN) or an error. The + * AVERROR(EAGAIN) return value means that new input data is required to + * return new output. In this case, continue with sending input. For each + * input frame/packet, the codec will typically return 1 output frame/packet, + * but it can also be 0 or more than 1. + * + * At the beginning of decoding or encoding, the codec might accept multiple + * input frames/packets without returning a frame, until its internal buffers + * are filled. This situation is handled transparently if you follow the steps + * outlined above. + * + * In theory, sending input can result in EAGAIN - this should happen only if + * not all output was received. You can use this to structure alternative decode + * or encode loops other than the one suggested above. For example, you could + * try sending new input on each iteration, and try to receive output if that + * returns EAGAIN. 
+ * + * End of stream situations. These require "flushing" (aka draining) the codec, + * as the codec might buffer multiple frames or packets internally for + * performance or out of necessity (consider B-frames). + * This is handled as follows: + * - Instead of valid input, send NULL to the avcodec_send_packet() (decoding) + * or avcodec_send_frame() (encoding) functions. This will enter draining + * mode. + * - Call avcodec_receive_frame() (decoding) or avcodec_receive_packet() + * (encoding) in a loop until AVERROR_EOF is returned. The functions will + * not return AVERROR(EAGAIN), unless you forgot to enter draining mode. + * - Before decoding can be resumed again, the codec has to be reset with + * avcodec_flush_buffers(). + * + * Using the API as outlined above is highly recommended. But it is also + * possible to call functions outside of this rigid schema. For example, you can + * call avcodec_send_packet() repeatedly without calling + * avcodec_receive_frame(). In this case, avcodec_send_packet() will succeed + * until the codec's internal buffer has been filled up (which is typically of + * size 1 per output frame, after initial input), and then reject input with + * AVERROR(EAGAIN). Once it starts rejecting input, you have no choice but to + * read at least some output. + * + * Not all codecs will follow a rigid and predictable dataflow; the only + * guarantee is that an AVERROR(EAGAIN) return value on a send/receive call on + * one end implies that a receive/send call on the other end will succeed, or + * at least will not fail with AVERROR(EAGAIN). In general, no codec will + * permit unlimited buffering of input or output. + * + * A codec is not allowed to return AVERROR(EAGAIN) for both sending and receiving. This + * would be an invalid state, which could put the codec user into an endless + * loop. 
The API has no concept of time either: it cannot happen that trying to + * do avcodec_send_packet() results in AVERROR(EAGAIN), but a repeated call 1 second + * later accepts the packet (with no other receive/flush API calls involved). + * The API is a strict state machine, and the passage of time is not supposed + * to influence it. Some timing-dependent behavior might still be deemed + * acceptable in certain cases. But it must never result in both send/receive + * returning EAGAIN at the same time at any point. It must also absolutely be + * avoided that the current state is "unstable" and can "flip-flop" between + * the send/receive APIs allowing progress. For example, it's not allowed that + * the codec randomly decides that it actually wants to consume a packet now + * instead of returning a frame, after it just returned AVERROR(EAGAIN) on an + * avcodec_send_packet() call. + * @} + */ + +/** + * @defgroup lavc_core Core functions/structures. + * @ingroup libavc + * + * Basic definitions, functions for querying libavcodec capabilities, + * allocating core structures, etc. + * @{ + */ + +/** + * @ingroup lavc_encoding + * minimum encoding buffer size + * Used to avoid some checks during header writing. + */ +#define AV_INPUT_BUFFER_MIN_SIZE 16384 + +/** + * @ingroup lavc_encoding + */ +typedef struct RcOverride{ + int start_frame; + int end_frame; + int qscale; // If this is 0 then quality_factor will be used instead. + float quality_factor; +} RcOverride; + +/* encoding support + These flags can be passed in AVCodecContext.flags before initialization. + Note: Not everything is supported yet. +*/ + +/** + * Allow decoders to produce frames with data planes that are not aligned + * to CPU requirements (e.g. due to cropping). + */ +#define AV_CODEC_FLAG_UNALIGNED (1 << 0) +/** + * Use fixed qscale. + */ +#define AV_CODEC_FLAG_QSCALE (1 << 1) +/** + * 4 MV per MB allowed / advanced prediction for H.263. 
+ */ +#define AV_CODEC_FLAG_4MV (1 << 2) +/** + * Output even those frames that might be corrupted. + */ +#define AV_CODEC_FLAG_OUTPUT_CORRUPT (1 << 3) +/** + * Use qpel MC. + */ +#define AV_CODEC_FLAG_QPEL (1 << 4) +/** + * Don't output frames whose parameters differ from first + * decoded frame in stream. + */ +#define AV_CODEC_FLAG_DROPCHANGED (1 << 5) +/** + * Request the encoder to output reconstructed frames, i.e.\ frames that would + * be produced by decoding the encoded bitstream. These frames may be retrieved + * by calling avcodec_receive_frame() immediately after a successful call to + * avcodec_receive_packet(). + * + * Should only be used with encoders flagged with the + * @ref AV_CODEC_CAP_ENCODER_RECON_FRAME capability. + */ +#define AV_CODEC_FLAG_RECON_FRAME (1 << 6) +/** + * @par decoding + * Request the decoder to propagate each packet's AVPacket.opaque and + * AVPacket.opaque_ref to its corresponding output AVFrame. + * + * @par encoding: + * Request the encoder to propagate each frame's AVFrame.opaque and + * AVFrame.opaque_ref values to its corresponding output AVPacket. + * + * @par + * May only be set on encoders that have the + * @ref AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE capability flag. + * + * @note + * While in typical cases one input frame produces exactly one output packet + * (perhaps after a delay), in general the mapping of frames to packets is + * M-to-N, so + * - Any number of input frames may be associated with any given output packet. + * This includes zero - e.g. some encoders may output packets that carry only + * metadata about the whole stream. + * - A given input frame may be associated with any number of output packets. + * Again this includes zero - e.g. some encoders may drop frames under certain + * conditions. + * .
+ * This implies that when using this flag, the caller must NOT assume that + * - a given input frame's opaques will necessarily appear on some output packet; + * - every output packet will have some non-NULL opaque value. + * . + * When an output packet contains multiple frames, the opaque values will be + * taken from the first of those. + * + * @note + * The converse holds for decoders, with frames and packets switched. + */ +#define AV_CODEC_FLAG_COPY_OPAQUE (1 << 7) +/** + * Signal to the encoder that the values of AVFrame.duration are valid and + * should be used (typically for transferring them to output packets). + * + * If this flag is not set, frame durations are ignored. + */ +#define AV_CODEC_FLAG_FRAME_DURATION (1 << 8) +/** + * Use internal 2pass ratecontrol in first pass mode. + */ +#define AV_CODEC_FLAG_PASS1 (1 << 9) +/** + * Use internal 2pass ratecontrol in second pass mode. + */ +#define AV_CODEC_FLAG_PASS2 (1 << 10) +/** + * loop filter. + */ +#define AV_CODEC_FLAG_LOOP_FILTER (1 << 11) +/** + * Only decode/encode grayscale. + */ +#define AV_CODEC_FLAG_GRAY (1 << 13) +/** + * error[?] variables will be set during encoding. + */ +#define AV_CODEC_FLAG_PSNR (1 << 15) +/** + * Use interlaced DCT. + */ +#define AV_CODEC_FLAG_INTERLACED_DCT (1 << 18) +/** + * Force low delay. + */ +#define AV_CODEC_FLAG_LOW_DELAY (1 << 19) +/** + * Place global headers in extradata instead of every keyframe. + */ +#define AV_CODEC_FLAG_GLOBAL_HEADER (1 << 22) +/** + * Use only bitexact stuff (except (I)DCT). + */ +#define AV_CODEC_FLAG_BITEXACT (1 << 23) +/* Fx : Flag for H.263+ extra options */ +/** + * H.263 advanced intra coding / MPEG-4 AC prediction + */ +#define AV_CODEC_FLAG_AC_PRED (1 << 24) +/** + * interlaced motion estimation + */ +#define AV_CODEC_FLAG_INTERLACED_ME (1 << 29) +#define AV_CODEC_FLAG_CLOSED_GOP (1U << 31) + +/** + * Allow non spec compliant speedup tricks. + */ +#define AV_CODEC_FLAG2_FAST (1 << 0) +/** + * Skip bitstream encoding. 
+ */ +#define AV_CODEC_FLAG2_NO_OUTPUT (1 << 2) +/** + * Place global headers at every keyframe instead of in extradata. + */ +#define AV_CODEC_FLAG2_LOCAL_HEADER (1 << 3) + +/** + * Input bitstream might be truncated at packet boundaries + * instead of only at frame boundaries. + */ +#define AV_CODEC_FLAG2_CHUNKS (1 << 15) +/** + * Discard cropping information from SPS. + */ +#define AV_CODEC_FLAG2_IGNORE_CROP (1 << 16) + +/** + * Show all frames before the first keyframe + */ +#define AV_CODEC_FLAG2_SHOW_ALL (1 << 22) +/** + * Export motion vectors through frame side data + */ +#define AV_CODEC_FLAG2_EXPORT_MVS (1 << 28) +/** + * Do not skip samples and export skip information as frame side data + */ +#define AV_CODEC_FLAG2_SKIP_MANUAL (1 << 29) +/** + * Do not reset ASS ReadOrder field on flush (subtitles decoding) + */ +#define AV_CODEC_FLAG2_RO_FLUSH_NOOP (1 << 30) +/** + * Generate/parse ICC profiles on encode/decode, as appropriate for the type of + * file. No effect on codecs which cannot contain embedded ICC profiles, or + * when compiled without support for lcms2. + */ +#define AV_CODEC_FLAG2_ICC_PROFILES (1U << 31) + +/* Exported side data. + These flags can be passed in AVCodecContext.export_side_data before initialization. +*/ +/** + * Export motion vectors through frame side data + */ +#define AV_CODEC_EXPORT_DATA_MVS (1 << 0) +/** + * Export encoder Producer Reference Time through packet side data + */ +#define AV_CODEC_EXPORT_DATA_PRFT (1 << 1) +/** + * Decoding only. + * Export the AVVideoEncParams structure through frame side data. + */ +#define AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS (1 << 2) +/** + * Decoding only. + * Do not apply film grain, export it instead. + */ +#define AV_CODEC_EXPORT_DATA_FILM_GRAIN (1 << 3) + +/** + * The decoder will keep a reference to the frame and may reuse it later. + */ +#define AV_GET_BUFFER_FLAG_REF (1 << 0) + +/** + * The encoder will keep a reference to the packet and may reuse it later.
+ */ +#define AV_GET_ENCODE_BUFFER_FLAG_REF (1 << 0) + +struct AVCodecInternal; + +/** + * main external API structure. + * New fields can be added to the end with minor version bumps. + * Removal, reordering and changes to existing fields require a major + * version bump. + * You can use AVOptions (av_opt* / av_set/get*()) to access these fields from user + * applications. + * The name string for AVOptions options matches the associated command line + * parameter name and can be found in libavcodec/options_table.h + * The AVOption/command line parameter names differ in some cases from the C + * structure field names for historic reasons or brevity. + * sizeof(AVCodecContext) must not be used outside libav*. + */ +typedef struct AVCodecContext { + /** + * information on struct for av_log + * - set by avcodec_alloc_context3 + */ + const AVClass *av_class; + int log_level_offset; + + enum AVMediaType codec_type; /* see AVMEDIA_TYPE_xxx */ + const struct AVCodec *codec; + enum AVCodecID codec_id; /* see AV_CODEC_ID_xxx */ + + /** + * fourcc (LSB first, so "ABCD" -> ('D'<<24) + ('C'<<16) + ('B'<<8) + 'A'). + * This is used to work around some encoder bugs. + * A demuxer should set this to what is stored in the field used to identify the codec. + * If there are multiple such fields in a container then the demuxer should choose the one + * which maximizes the information about the used codec. + * If the codec tag field in a container is larger than 32 bits then the demuxer should + * remap the longer ID to 32 bits with a table or other structure. Alternatively a new + * extra_codec_tag + size could be added but for this a clear advantage must be demonstrated + * first. + * - encoding: Set by user, if not then the default based on codec_id will be used. + * - decoding: Set by user, will be converted to uppercase by libavcodec during init. + */ + unsigned int codec_tag; + + void *priv_data; + + /** + * Private context used for internal data. 
+ * + * Unlike priv_data, this is not codec-specific. It is used in general + * libavcodec functions. + */ + struct AVCodecInternal *internal; + + /** + * Private data of the user, can be used to carry app specific stuff. + * - encoding: Set by user. + * - decoding: Set by user. + */ + void *opaque; + + /** + * the average bitrate + * - encoding: Set by user; unused for constant quantizer encoding. + * - decoding: Set by user, may be overwritten by libavcodec + * if this info is available in the stream + */ + int64_t bit_rate; + + /** + * number of bits the bitstream is allowed to diverge from the reference. + * the reference can be CBR (for CBR pass1) or VBR (for pass2) + * - encoding: Set by user; unused for constant quantizer encoding. + * - decoding: unused + */ + int bit_rate_tolerance; + + /** + * Global quality for codecs which cannot change it per frame. + * This should be proportional to MPEG-1/2/4 qscale. + * - encoding: Set by user. + * - decoding: unused + */ + int global_quality; + + /** + * - encoding: Set by user. + * - decoding: unused + */ + int compression_level; +#define FF_COMPRESSION_DEFAULT -1 + + /** + * AV_CODEC_FLAG_*. + * - encoding: Set by user. + * - decoding: Set by user. + */ + int flags; + + /** + * AV_CODEC_FLAG2_* + * - encoding: Set by user. + * - decoding: Set by user. + */ + int flags2; + + /** + * some codecs need / can use extradata like Huffman tables. + * MJPEG: Huffman tables + * rv10: additional flags + * MPEG-4: global headers (they can be in the bitstream or here) + * The allocated memory should be AV_INPUT_BUFFER_PADDING_SIZE bytes larger + * than extradata_size to avoid problems if it is read with the bitstream reader. + * The bytewise contents of extradata must not depend on the architecture or CPU endianness. + * Must be allocated with the av_malloc() family of functions. + * - encoding: Set/allocated/freed by libavcodec. + * - decoding: Set/allocated/freed by user. 
+ */ + uint8_t *extradata; + int extradata_size; + + /** + * This is the fundamental unit of time (in seconds) in terms + * of which frame timestamps are represented. For fixed-fps content, + * timebase should be 1/framerate and timestamp increments should be + * identically 1. + * This often, but not always is the inverse of the frame rate or field rate + * for video. 1/time_base is not the average frame rate if the frame rate is not + * constant. + * + * Like containers, elementary streams also can store timestamps, 1/time_base + * is the unit in which these timestamps are specified. + * As example of such codec time base see ISO/IEC 14496-2:2001(E) + * vop_time_increment_resolution and fixed_vop_rate + * (fixed_vop_rate == 0 implies that it is different from the framerate) + * + * - encoding: MUST be set by user. + * - decoding: unused. + */ + AVRational time_base; + + /** + * For some codecs, the time base is closer to the field rate than the frame rate. + * Most notably, H.264 and MPEG-2 specify time_base as half of frame duration + * if no telecine is used ... + * + * Set to time_base ticks per frame. Default 1, e.g., H.264/MPEG-2 set it to 2. + */ + int ticks_per_frame; + + /** + * Codec delay. + * + * Encoding: Number of frames delay there will be from the encoder input to + * the decoder output. (we assume the decoder matches the spec) + * Decoding: Number of frames delay in addition to what a standard decoder + * as specified in the spec would produce. + * + * Video: + * Number of frames the decoded output will be delayed relative to the + * encoded input. + * + * Audio: + * For encoding, this field is unused (see initial_padding). + * + * For decoding, this is the number of samples the decoder needs to + * output before the decoder's output is valid. When seeking, you should + * start decoding this many samples prior to your desired seek point. + * + * - encoding: Set by libavcodec. + * - decoding: Set by libavcodec. 
+ */ + int delay; + + + /* video only */ + /** + * picture width / height. + * + * @note Those fields may not match the values of the last + * AVFrame output by avcodec_receive_frame() due frame + * reordering. + * + * - encoding: MUST be set by user. + * - decoding: May be set by the user before opening the decoder if known e.g. + * from the container. Some decoders will require the dimensions + * to be set by the caller. During decoding, the decoder may + * overwrite those values as required while parsing the data. + */ + int width, height; + + /** + * Bitstream width / height, may be different from width/height e.g. when + * the decoded frame is cropped before being output or lowres is enabled. + * + * @note Those field may not match the value of the last + * AVFrame output by avcodec_receive_frame() due frame + * reordering. + * + * - encoding: unused + * - decoding: May be set by the user before opening the decoder if known + * e.g. from the container. During decoding, the decoder may + * overwrite those values as required while parsing the data. + */ + int coded_width, coded_height; + + /** + * the number of pictures in a group of pictures, or 0 for intra_only + * - encoding: Set by user. + * - decoding: unused + */ + int gop_size; + + /** + * Pixel format, see AV_PIX_FMT_xxx. + * May be set by the demuxer if known from headers. + * May be overridden by the decoder if it knows better. + * + * @note This field may not match the value of the last + * AVFrame output by avcodec_receive_frame() due frame + * reordering. + * + * - encoding: Set by user. + * - decoding: Set by user if known, overridden by libavcodec while + * parsing the data. + */ + enum AVPixelFormat pix_fmt; + + /** + * If non NULL, 'draw_horiz_band' is called by the libavcodec + * decoder to draw a horizontal band. It improves cache usage. Not + * all codecs can do that. You must check the codec capabilities + * beforehand. 
+ * When multithreading is used, it may be called from multiple threads + * at the same time; threads might draw different parts of the same AVFrame, + * or multiple AVFrames, and there is no guarantee that slices will be drawn + * in order. + * The function is also used by hardware acceleration APIs. + * It is called at least once during frame decoding to pass + * the data needed for hardware render. + * In that mode instead of pixel data, AVFrame points to + * a structure specific to the acceleration API. The application + * reads the structure and can change some fields to indicate progress + * or mark state. + * - encoding: unused + * - decoding: Set by user. + * @param height the height of the slice + * @param y the y position of the slice + * @param type 1->top field, 2->bottom field, 3->frame + * @param offset offset into the AVFrame.data from which the slice should be read + */ + void (*draw_horiz_band)(struct AVCodecContext *s, + const AVFrame *src, int offset[AV_NUM_DATA_POINTERS], + int y, int type, int height); + + /** + * Callback to negotiate the pixel format. Decoding only, may be set by the + * caller before avcodec_open2(). + * + * Called by some decoders to select the pixel format that will be used for + * the output frames. This is mainly used to set up hardware acceleration, + * then the provided format list contains the corresponding hwaccel pixel + * formats alongside the "software" one. The software pixel format may also + * be retrieved from \ref sw_pix_fmt. + * + * This callback will be called when the coded frame properties (such as + * resolution, pixel format, etc.) change and more than one output format is + * supported for those new properties. If a hardware pixel format is chosen + * and initialization for it fails, the callback may be called again + * immediately. + * + * This callback may be called from different threads if the decoder is + * multi-threaded, but not from more than one thread simultaneously. 
+ * + * @param fmt list of formats which may be used in the current + * configuration, terminated by AV_PIX_FMT_NONE. + * @warning Behavior is undefined if the callback returns a value other + * than one of the formats in fmt or AV_PIX_FMT_NONE. + * @return the chosen format or AV_PIX_FMT_NONE + */ + enum AVPixelFormat (*get_format)(struct AVCodecContext *s, const enum AVPixelFormat * fmt); + + /** + * maximum number of B-frames between non-B-frames + * Note: The output will be delayed by max_b_frames+1 relative to the input. + * - encoding: Set by user. + * - decoding: unused + */ + int max_b_frames; + + /** + * qscale factor between IP and B-frames + * If > 0 then the last P-frame quantizer will be used (q= lastp_q*factor+offset). + * If < 0 then normal ratecontrol will be done (q= -normal_q*factor+offset). + * - encoding: Set by user. + * - decoding: unused + */ + float b_quant_factor; + + /** + * qscale offset between IP and B-frames + * - encoding: Set by user. + * - decoding: unused + */ + float b_quant_offset; + + /** + * Size of the frame reordering buffer in the decoder. + * For MPEG-2 it is 1 IPB or 0 low delay IP. + * - encoding: Set by libavcodec. + * - decoding: Set by libavcodec. + */ + int has_b_frames; + + /** + * qscale factor between P- and I-frames + * If > 0 then the last P-frame quantizer will be used (q = lastp_q * factor + offset). + * If < 0 then normal ratecontrol will be done (q= -normal_q*factor+offset). + * - encoding: Set by user. + * - decoding: unused + */ + float i_quant_factor; + + /** + * qscale offset between P and I-frames + * - encoding: Set by user. + * - decoding: unused + */ + float i_quant_offset; + + /** + * luminance masking (0-> disabled) + * - encoding: Set by user. + * - decoding: unused + */ + float lumi_masking; + + /** + * temporary complexity masking (0-> disabled) + * - encoding: Set by user. 
+ * - decoding: unused + */ + float temporal_cplx_masking; + + /** + * spatial complexity masking (0-> disabled) + * - encoding: Set by user. + * - decoding: unused + */ + float spatial_cplx_masking; + + /** + * p block masking (0-> disabled) + * - encoding: Set by user. + * - decoding: unused + */ + float p_masking; + + /** + * darkness masking (0-> disabled) + * - encoding: Set by user. + * - decoding: unused + */ + float dark_masking; + + /** + * slice count + * - encoding: Set by libavcodec. + * - decoding: Set by user (or 0). + */ + int slice_count; + + /** + * slice offsets in the frame in bytes + * - encoding: Set/allocated by libavcodec. + * - decoding: Set/allocated by user (or NULL). + */ + int *slice_offset; + + /** + * sample aspect ratio (0 if unknown) + * That is the width of a pixel divided by the height of the pixel. + * Numerator and denominator must be relatively prime and smaller than 256 for some video standards. + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + AVRational sample_aspect_ratio; + + /** + * motion estimation comparison function + * - encoding: Set by user. + * - decoding: unused + */ + int me_cmp; + /** + * subpixel motion estimation comparison function + * - encoding: Set by user. + * - decoding: unused + */ + int me_sub_cmp; + /** + * macroblock comparison function (not supported yet) + * - encoding: Set by user. + * - decoding: unused + */ + int mb_cmp; + /** + * interlaced DCT comparison function + * - encoding: Set by user. 
+ * - decoding: unused + */ + int ildct_cmp; +#define FF_CMP_SAD 0 +#define FF_CMP_SSE 1 +#define FF_CMP_SATD 2 +#define FF_CMP_DCT 3 +#define FF_CMP_PSNR 4 +#define FF_CMP_BIT 5 +#define FF_CMP_RD 6 +#define FF_CMP_ZERO 7 +#define FF_CMP_VSAD 8 +#define FF_CMP_VSSE 9 +#define FF_CMP_NSSE 10 +#define FF_CMP_W53 11 +#define FF_CMP_W97 12 +#define FF_CMP_DCTMAX 13 +#define FF_CMP_DCT264 14 +#define FF_CMP_MEDIAN_SAD 15 +#define FF_CMP_CHROMA 256 + + /** + * ME diamond size & shape + * - encoding: Set by user. + * - decoding: unused + */ + int dia_size; + + /** + * amount of previous MV predictors (2a+1 x 2a+1 square) + * - encoding: Set by user. + * - decoding: unused + */ + int last_predictor_count; + + /** + * motion estimation prepass comparison function + * - encoding: Set by user. + * - decoding: unused + */ + int me_pre_cmp; + + /** + * ME prepass diamond size & shape + * - encoding: Set by user. + * - decoding: unused + */ + int pre_dia_size; + + /** + * subpel ME quality + * - encoding: Set by user. + * - decoding: unused + */ + int me_subpel_quality; + + /** + * maximum motion estimation search range in subpel units + * If 0 then no limit. + * + * - encoding: Set by user. + * - decoding: unused + */ + int me_range; + + /** + * slice flags + * - encoding: unused + * - decoding: Set by user. + */ + int slice_flags; +#define SLICE_FLAG_CODED_ORDER 0x0001 ///< draw_horiz_band() is called in coded order instead of display +#define SLICE_FLAG_ALLOW_FIELD 0x0002 ///< allow draw_horiz_band() with field slices (MPEG-2 field pics) +#define SLICE_FLAG_ALLOW_PLANE 0x0004 ///< allow draw_horiz_band() with 1 component at a time (SVQ1) + + /** + * macroblock decision mode + * - encoding: Set by user. 
+ * - decoding: unused + */ + int mb_decision; +#define FF_MB_DECISION_SIMPLE 0 ///< uses mb_cmp +#define FF_MB_DECISION_BITS 1 ///< chooses the one which needs the fewest bits +#define FF_MB_DECISION_RD 2 ///< rate distortion + + /** + * custom intra quantization matrix + * Must be allocated with the av_malloc() family of functions, and will be freed in + * avcodec_free_context(). + * - encoding: Set/allocated by user, freed by libavcodec. Can be NULL. + * - decoding: Set/allocated/freed by libavcodec. + */ + uint16_t *intra_matrix; + + /** + * custom inter quantization matrix + * Must be allocated with the av_malloc() family of functions, and will be freed in + * avcodec_free_context(). + * - encoding: Set/allocated by user, freed by libavcodec. Can be NULL. + * - decoding: Set/allocated/freed by libavcodec. + */ + uint16_t *inter_matrix; + + /** + * precision of the intra DC coefficient - 8 + * - encoding: Set by user. + * - decoding: Set by libavcodec + */ + int intra_dc_precision; + + /** + * Number of macroblock rows at the top which are skipped. + * - encoding: unused + * - decoding: Set by user. + */ + int skip_top; + + /** + * Number of macroblock rows at the bottom which are skipped. + * - encoding: unused + * - decoding: Set by user. + */ + int skip_bottom; + + /** + * minimum MB Lagrange multiplier + * - encoding: Set by user. + * - decoding: unused + */ + int mb_lmin; + + /** + * maximum MB Lagrange multiplier + * - encoding: Set by user. + * - decoding: unused + */ + int mb_lmax; + + /** + * - encoding: Set by user. + * - decoding: unused + */ + int bidir_refine; + + /** + * minimum GOP size + * - encoding: Set by user. + * - decoding: unused + */ + int keyint_min; + + /** + * number of reference frames + * - encoding: Set by user. + * - decoding: Set by lavc. + */ + int refs; + + /** + * Note: Value depends upon the compare function used for fullpel ME. + * - encoding: Set by user. 
+ * - decoding: unused + */ + int mv0_threshold; + + /** + * Chromaticity coordinates of the source primaries. + * - encoding: Set by user + * - decoding: Set by libavcodec + */ + enum AVColorPrimaries color_primaries; + + /** + * Color Transfer Characteristic. + * - encoding: Set by user + * - decoding: Set by libavcodec + */ + enum AVColorTransferCharacteristic color_trc; + + /** + * YUV colorspace type. + * - encoding: Set by user + * - decoding: Set by libavcodec + */ + enum AVColorSpace colorspace; + + /** + * MPEG vs JPEG YUV range. + * - encoding: Set by user + * - decoding: Set by libavcodec + */ + enum AVColorRange color_range; + + /** + * This defines the location of chroma samples. + * - encoding: Set by user + * - decoding: Set by libavcodec + */ + enum AVChromaLocation chroma_sample_location; + + /** + * Number of slices. + * Indicates number of picture subdivisions. Used for parallelized + * decoding. + * - encoding: Set by user + * - decoding: unused + */ + int slices; + + /** Field order + * - encoding: set by libavcodec + * - decoding: Set by user. + */ + enum AVFieldOrder field_order; + + /* audio only */ + int sample_rate; ///< samples per second + +#if FF_API_OLD_CHANNEL_LAYOUT + /** + * number of audio channels + * @deprecated use ch_layout.nb_channels + */ + attribute_deprecated + int channels; +#endif + + /** + * audio sample format + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + enum AVSampleFormat sample_fmt; ///< sample format + + /* The following data should not be initialized. */ + /** + * Number of samples per channel in an audio frame. + * + * - encoding: set by libavcodec in avcodec_open2(). Each submitted frame + * except the last must contain exactly frame_size samples per channel. + * May be 0 when the codec has AV_CODEC_CAP_VARIABLE_FRAME_SIZE set, then the + * frame size is not restricted. 
+ * - decoding: may be set by some decoders to indicate constant frame size + */ + int frame_size; + +#if FF_API_AVCTX_FRAME_NUMBER + /** + * Frame counter, set by libavcodec. + * + * - decoding: total number of frames returned from the decoder so far. + * - encoding: total number of frames passed to the encoder so far. + * + * @note the counter is not incremented if encoding/decoding resulted in + * an error. + * @deprecated use frame_num instead + */ + attribute_deprecated + int frame_number; +#endif + + /** + * number of bytes per packet if constant and known or 0 + * Used by some WAV based audio codecs. + */ + int block_align; + + /** + * Audio cutoff bandwidth (0 means "automatic") + * - encoding: Set by user. + * - decoding: unused + */ + int cutoff; + +#if FF_API_OLD_CHANNEL_LAYOUT + /** + * Audio channel layout. + * - encoding: set by user. + * - decoding: set by user, may be overwritten by libavcodec. + * @deprecated use ch_layout + */ + attribute_deprecated + uint64_t channel_layout; + + /** + * Request decoder to use this channel layout if it can (0 for default) + * - encoding: unused + * - decoding: Set by user. + * @deprecated use "downmix" codec private option + */ + attribute_deprecated + uint64_t request_channel_layout; +#endif + + /** + * Type of service that the audio stream conveys. + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + enum AVAudioServiceType audio_service_type; + + /** + * desired sample format + * - encoding: Not used. + * - decoding: Set by user. + * Decoder will decode to this format if it can. + */ + enum AVSampleFormat request_sample_fmt; + + /** + * This callback is called at the beginning of each frame to get data + * buffer(s) for it. There may be one contiguous buffer for all the data or + * there may be a buffer per each data plane or anything in between. What + * this means is, you may set however many entries in buf[] you feel necessary. 
+ * Each buffer must be reference-counted using the AVBuffer API (see description + * of buf[] below). + * + * The following fields will be set in the frame before this callback is + * called: + * - format + * - width, height (video only) + * - sample_rate, channel_layout, nb_samples (audio only) + * Their values may differ from the corresponding values in + * AVCodecContext. This callback must use the frame values, not the codec + * context values, to calculate the required buffer size. + * + * This callback must fill the following fields in the frame: + * - data[] + * - linesize[] + * - extended_data: + * * if the data is planar audio with more than 8 channels, then this + * callback must allocate and fill extended_data to contain all pointers + * to all data planes. data[] must hold as many pointers as it can. + * extended_data must be allocated with av_malloc() and will be freed in + * av_frame_unref(). + * * otherwise extended_data must point to data + * - buf[] must contain one or more pointers to AVBufferRef structures. Each of + * the frame's data and extended_data pointers must be contained in these. That + * is, one AVBufferRef for each allocated chunk of memory, not necessarily one + * AVBufferRef per data[] entry. See: av_buffer_create(), av_buffer_alloc(), + * and av_buffer_ref(). + * - extended_buf and nb_extended_buf must be allocated with av_malloc() by + * this callback and filled with the extra buffers if there are more + * buffers than buf[] can hold. extended_buf will be freed in + * av_frame_unref(). + * + * If AV_CODEC_CAP_DR1 is not set then get_buffer2() must call + * avcodec_default_get_buffer2() instead of providing buffers allocated by + * some other means. + * + * Each data plane must be aligned to the maximum required by the target + * CPU. 
+ * + * @see avcodec_default_get_buffer2() + * + * Video: + * + * If AV_GET_BUFFER_FLAG_REF is set in flags then the frame may be reused + * (read and/or written to if it is writable) later by libavcodec. + * + * avcodec_align_dimensions2() should be used to find the required width and + * height, as they normally need to be rounded up to the next multiple of 16. + * + * Some decoders do not support linesizes changing between frames. + * + * If frame multithreading is used, this callback may be called from a + * different thread, but not from more than one at once. Does not need to be + * reentrant. + * + * @see avcodec_align_dimensions2() + * + * Audio: + * + * Decoders request a buffer of a particular size by setting + * AVFrame.nb_samples prior to calling get_buffer2(). The decoder may, + * however, utilize only part of the buffer by setting AVFrame.nb_samples + * to a smaller value in the output frame. + * + * As a convenience, av_samples_get_buffer_size() and + * av_samples_fill_arrays() in libavutil may be used by custom get_buffer2() + * functions to find the required data size and to fill data pointers and + * linesize. In AVFrame.linesize, only linesize[0] may be set for audio + * since all planes must be the same size. + * + * @see av_samples_get_buffer_size(), av_samples_fill_arrays() + * + * - encoding: unused + * - decoding: Set by libavcodec, user can override. + */ + int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); + + /* - encoding parameters */ + float qcompress; ///< amount of qscale change between easy & hard scenes (0.0-1.0) + float qblur; ///< amount of qscale smoothing over time (0.0-1.0) + + /** + * minimum quantizer + * - encoding: Set by user. + * - decoding: unused + */ + int qmin; + + /** + * maximum quantizer + * - encoding: Set by user. + * - decoding: unused + */ + int qmax; + + /** + * maximum quantizer difference between frames + * - encoding: Set by user. 
+ * - decoding: unused + */ + int max_qdiff; + + /** + * decoder bitstream buffer size + * - encoding: Set by user. + * - decoding: unused + */ + int rc_buffer_size; + + /** + * ratecontrol override, see RcOverride + * - encoding: Allocated/set/freed by user. + * - decoding: unused + */ + int rc_override_count; + RcOverride *rc_override; + + /** + * maximum bitrate + * - encoding: Set by user. + * - decoding: Set by user, may be overwritten by libavcodec. + */ + int64_t rc_max_rate; + + /** + * minimum bitrate + * - encoding: Set by user. + * - decoding: unused + */ + int64_t rc_min_rate; + + /** + * Ratecontrol attempt to use, at maximum, <value> of what can be used without an underflow. + * - encoding: Set by user. + * - decoding: unused. + */ + float rc_max_available_vbv_use; + + /** + * Ratecontrol attempt to use, at least, <value> times the amount needed to prevent a vbv overflow. + * - encoding: Set by user. + * - decoding: unused. + */ + float rc_min_vbv_overflow_use; + + /** + * Number of bits which should be loaded into the rc buffer before decoding starts. + * - encoding: Set by user. + * - decoding: unused + */ + int rc_initial_buffer_occupancy; + + /** + * trellis RD quantization + * - encoding: Set by user. + * - decoding: unused + */ + int trellis; + + /** + * pass1 encoding statistics output buffer + * - encoding: Set by libavcodec. + * - decoding: unused + */ + char *stats_out; + + /** + * pass2 encoding statistics input buffer + * Concatenated stuff from stats_out of pass1 should be placed here. + * - encoding: Allocated/set/freed by user. + * - decoding: unused + */ + char *stats_in; + + /** + * Work around bugs in encoders which sometimes cannot be detected automatically. 
+ * - encoding: Set by user + * - decoding: Set by user + */ + int workaround_bugs; +#define FF_BUG_AUTODETECT 1 ///< autodetection +#define FF_BUG_XVID_ILACE 4 +#define FF_BUG_UMP4 8 +#define FF_BUG_NO_PADDING 16 +#define FF_BUG_AMV 32 +#define FF_BUG_QPEL_CHROMA 64 +#define FF_BUG_STD_QPEL 128 +#define FF_BUG_QPEL_CHROMA2 256 +#define FF_BUG_DIRECT_BLOCKSIZE 512 +#define FF_BUG_EDGE 1024 +#define FF_BUG_HPEL_CHROMA 2048 +#define FF_BUG_DC_CLIP 4096 +#define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. +#define FF_BUG_TRUNCATED 16384 +#define FF_BUG_IEDGE 32768 + + /** + * strictly follow the standard (MPEG-4, ...). + * - encoding: Set by user. + * - decoding: Set by user. + * Setting this to STRICT or higher means the encoder and decoder will + * generally do stupid things, whereas setting it to unofficial or lower + * will mean the encoder might produce output that is not supported by all + * spec-compliant decoders. Decoders don't differentiate between normal, + * unofficial and experimental (that is, they always try to decode things + * when they can) unless they are explicitly asked to behave stupidly + * (=strictly conform to the specs) + * This may only be set to one of the FF_COMPLIANCE_* values in defs.h. + */ + int strict_std_compliance; + + /** + * error concealment flags + * - encoding: unused + * - decoding: Set by user. + */ + int error_concealment; +#define FF_EC_GUESS_MVS 1 +#define FF_EC_DEBLOCK 2 +#define FF_EC_FAVOR_INTER 256 + + /** + * debug + * - encoding: Set by user. + * - decoding: Set by user. 
+ */ + int debug; +#define FF_DEBUG_PICT_INFO 1 +#define FF_DEBUG_RC 2 +#define FF_DEBUG_BITSTREAM 4 +#define FF_DEBUG_MB_TYPE 8 +#define FF_DEBUG_QP 16 +#define FF_DEBUG_DCT_COEFF 0x00000040 +#define FF_DEBUG_SKIP 0x00000080 +#define FF_DEBUG_STARTCODE 0x00000100 +#define FF_DEBUG_ER 0x00000400 +#define FF_DEBUG_MMCO 0x00000800 +#define FF_DEBUG_BUGS 0x00001000 +#define FF_DEBUG_BUFFERS 0x00008000 +#define FF_DEBUG_THREADS 0x00010000 +#define FF_DEBUG_GREEN_MD 0x00800000 +#define FF_DEBUG_NOMC 0x01000000 + + /** + * Error recognition; may misdetect some more or less valid parts as errors. + * This is a bitfield of the AV_EF_* values defined in defs.h. + * + * - encoding: Set by user. + * - decoding: Set by user. + */ + int err_recognition; + +#if FF_API_REORDERED_OPAQUE + /** + * opaque 64-bit number (generally a PTS) that will be reordered and + * output in AVFrame.reordered_opaque + * - encoding: Set by libavcodec to the reordered_opaque of the input + * frame corresponding to the last returned packet. Only + * supported by encoders with the + * AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE capability. + * - decoding: Set by user. + * + * @deprecated Use AV_CODEC_FLAG_COPY_OPAQUE instead + */ + attribute_deprecated + int64_t reordered_opaque; +#endif + + /** + * Hardware accelerator in use + * - encoding: unused. + * - decoding: Set by libavcodec + */ + const struct AVHWAccel *hwaccel; + + /** + * Legacy hardware accelerator context. + * + * For some hardware acceleration methods, the caller may use this field to + * signal hwaccel-specific data to the codec. The struct pointed to by this + * pointer is hwaccel-dependent and defined in the respective header. Please + * refer to the FFmpeg HW accelerator documentation to know how to fill + * this. + * + * In most cases this field is optional - the necessary information may also + * be provided to libavcodec through @ref hw_frames_ctx or @ref + * hw_device_ctx (see avcodec_get_hw_config()). 
However, in some cases it + * may be the only method of signalling some (optional) information. + * + * The struct and its contents are owned by the caller. + * + * - encoding: May be set by the caller before avcodec_open2(). Must remain + * valid until avcodec_free_context(). + * - decoding: May be set by the caller in the get_format() callback. + * Must remain valid until the next get_format() call, + * or avcodec_free_context() (whichever comes first). + */ + void *hwaccel_context; + + /** + * error + * - encoding: Set by libavcodec if flags & AV_CODEC_FLAG_PSNR. + * - decoding: unused + */ + uint64_t error[AV_NUM_DATA_POINTERS]; + + /** + * DCT algorithm, see FF_DCT_* below + * - encoding: Set by user. + * - decoding: unused + */ + int dct_algo; +#define FF_DCT_AUTO 0 +#define FF_DCT_FASTINT 1 +#define FF_DCT_INT 2 +#define FF_DCT_MMX 3 +#define FF_DCT_ALTIVEC 5 +#define FF_DCT_FAAN 6 + + /** + * IDCT algorithm, see FF_IDCT_* below. + * - encoding: Set by user. + * - decoding: Set by user. + */ + int idct_algo; +#define FF_IDCT_AUTO 0 +#define FF_IDCT_INT 1 +#define FF_IDCT_SIMPLE 2 +#define FF_IDCT_SIMPLEMMX 3 +#define FF_IDCT_ARM 7 +#define FF_IDCT_ALTIVEC 8 +#define FF_IDCT_SIMPLEARM 10 +#define FF_IDCT_XVID 14 +#define FF_IDCT_SIMPLEARMV5TE 16 +#define FF_IDCT_SIMPLEARMV6 17 +#define FF_IDCT_FAAN 20 +#define FF_IDCT_SIMPLENEON 22 +#if FF_API_IDCT_NONE +// formerly used by xvmc +#define FF_IDCT_NONE 24 +#endif +#define FF_IDCT_SIMPLEAUTO 128 + + /** + * bits per sample/pixel from the demuxer (needed for huffyuv). + * - encoding: Set by libavcodec. + * - decoding: Set by user. + */ + int bits_per_coded_sample; + + /** + * Bits per sample/pixel of internal libavcodec pixel/sample format. + * - encoding: set by user. + * - decoding: set by libavcodec. + */ + int bits_per_raw_sample; + + /** + * low resolution decoding, 1-> 1/2 size, 2->1/4 size + * - encoding: unused + * - decoding: Set by user. 
+ */ + int lowres; + + /** + * thread count + * is used to decide how many independent tasks should be passed to execute() + * - encoding: Set by user. + * - decoding: Set by user. + */ + int thread_count; + + /** + * Which multithreading methods to use. + * Use of FF_THREAD_FRAME will increase decoding delay by one frame per thread, + * so clients which cannot provide future frames should not use it. + * + * - encoding: Set by user, otherwise the default is used. + * - decoding: Set by user, otherwise the default is used. + */ + int thread_type; +#define FF_THREAD_FRAME 1 ///< Decode more than one frame at once +#define FF_THREAD_SLICE 2 ///< Decode more than one part of a single frame at once + + /** + * Which multithreading methods are in use by the codec. + * - encoding: Set by libavcodec. + * - decoding: Set by libavcodec. + */ + int active_thread_type; + + /** + * The codec may call this to execute several independent things. + * It will return only after finishing all tasks. + * The user may replace this with some multithreaded implementation, + * the default implementation will execute the parts serially. + * @param count the number of things to execute + * - encoding: Set by libavcodec, user can override. + * - decoding: Set by libavcodec, user can override. + */ + int (*execute)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg), void *arg2, int *ret, int count, int size); + + /** + * The codec may call this to execute several independent things. + * It will return only after finishing all tasks. + * The user may replace this with some multithreaded implementation, + * the default implementation will execute the parts serially. + * @param c context passed also to func + * @param count the number of things to execute + * @param arg2 argument passed unchanged to func + * @param ret return values of executed functions, must have space for "count" values. May be NULL. 
+ * @param func function that will be called count times, with jobnr from 0 to count-1. + * threadnr will be in the range 0 to c->thread_count-1 < MAX_THREADS and so that no + * two instances of func executing at the same time will have the same threadnr. + * @return always 0 currently, but code should handle a future improvement where when any call to func + * returns < 0 no further calls to func may be done and < 0 is returned. + * - encoding: Set by libavcodec, user can override. + * - decoding: Set by libavcodec, user can override. + */ + int (*execute2)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg, int jobnr, int threadnr), void *arg2, int *ret, int count); + + /** + * noise vs. sse weight for the nsse comparison function + * - encoding: Set by user. + * - decoding: unused + */ + int nsse_weight; + + /** + * profile + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + int profile; +#define FF_PROFILE_UNKNOWN -99 +#define FF_PROFILE_RESERVED -100 + +#define FF_PROFILE_AAC_MAIN 0 +#define FF_PROFILE_AAC_LOW 1 +#define FF_PROFILE_AAC_SSR 2 +#define FF_PROFILE_AAC_LTP 3 +#define FF_PROFILE_AAC_HE 4 +#define FF_PROFILE_AAC_HE_V2 28 +#define FF_PROFILE_AAC_LD 22 +#define FF_PROFILE_AAC_ELD 38 +#define FF_PROFILE_MPEG2_AAC_LOW 128 +#define FF_PROFILE_MPEG2_AAC_HE 131 + +#define FF_PROFILE_DNXHD 0 +#define FF_PROFILE_DNXHR_LB 1 +#define FF_PROFILE_DNXHR_SQ 2 +#define FF_PROFILE_DNXHR_HQ 3 +#define FF_PROFILE_DNXHR_HQX 4 +#define FF_PROFILE_DNXHR_444 5 + +#define FF_PROFILE_DTS 20 +#define FF_PROFILE_DTS_ES 30 +#define FF_PROFILE_DTS_96_24 40 +#define FF_PROFILE_DTS_HD_HRA 50 +#define FF_PROFILE_DTS_HD_MA 60 +#define FF_PROFILE_DTS_EXPRESS 70 + +#define FF_PROFILE_MPEG2_422 0 +#define FF_PROFILE_MPEG2_HIGH 1 +#define FF_PROFILE_MPEG2_SS 2 +#define FF_PROFILE_MPEG2_SNR_SCALABLE 3 +#define FF_PROFILE_MPEG2_MAIN 4 +#define FF_PROFILE_MPEG2_SIMPLE 5 + +#define FF_PROFILE_H264_CONSTRAINED (1<<9) // 8+1; constraint_set1_flag 
+#define FF_PROFILE_H264_INTRA (1<<11) // 8+3; constraint_set3_flag + +#define FF_PROFILE_H264_BASELINE 66 +#define FF_PROFILE_H264_CONSTRAINED_BASELINE (66|FF_PROFILE_H264_CONSTRAINED) +#define FF_PROFILE_H264_MAIN 77 +#define FF_PROFILE_H264_EXTENDED 88 +#define FF_PROFILE_H264_HIGH 100 +#define FF_PROFILE_H264_HIGH_10 110 +#define FF_PROFILE_H264_HIGH_10_INTRA (110|FF_PROFILE_H264_INTRA) +#define FF_PROFILE_H264_MULTIVIEW_HIGH 118 +#define FF_PROFILE_H264_HIGH_422 122 +#define FF_PROFILE_H264_HIGH_422_INTRA (122|FF_PROFILE_H264_INTRA) +#define FF_PROFILE_H264_STEREO_HIGH 128 +#define FF_PROFILE_H264_HIGH_444 144 +#define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 +#define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) +#define FF_PROFILE_H264_CAVLC_444 44 + +#define FF_PROFILE_VC1_SIMPLE 0 +#define FF_PROFILE_VC1_MAIN 1 +#define FF_PROFILE_VC1_COMPLEX 2 +#define FF_PROFILE_VC1_ADVANCED 3 + +#define FF_PROFILE_MPEG4_SIMPLE 0 +#define FF_PROFILE_MPEG4_SIMPLE_SCALABLE 1 +#define FF_PROFILE_MPEG4_CORE 2 +#define FF_PROFILE_MPEG4_MAIN 3 +#define FF_PROFILE_MPEG4_N_BIT 4 +#define FF_PROFILE_MPEG4_SCALABLE_TEXTURE 5 +#define FF_PROFILE_MPEG4_SIMPLE_FACE_ANIMATION 6 +#define FF_PROFILE_MPEG4_BASIC_ANIMATED_TEXTURE 7 +#define FF_PROFILE_MPEG4_HYBRID 8 +#define FF_PROFILE_MPEG4_ADVANCED_REAL_TIME 9 +#define FF_PROFILE_MPEG4_CORE_SCALABLE 10 +#define FF_PROFILE_MPEG4_ADVANCED_CODING 11 +#define FF_PROFILE_MPEG4_ADVANCED_CORE 12 +#define FF_PROFILE_MPEG4_ADVANCED_SCALABLE_TEXTURE 13 +#define FF_PROFILE_MPEG4_SIMPLE_STUDIO 14 +#define FF_PROFILE_MPEG4_ADVANCED_SIMPLE 15 + +#define FF_PROFILE_JPEG2000_CSTREAM_RESTRICTION_0 1 +#define FF_PROFILE_JPEG2000_CSTREAM_RESTRICTION_1 2 +#define FF_PROFILE_JPEG2000_CSTREAM_NO_RESTRICTION 32768 +#define FF_PROFILE_JPEG2000_DCINEMA_2K 3 +#define FF_PROFILE_JPEG2000_DCINEMA_4K 4 + +#define FF_PROFILE_VP9_0 0 +#define FF_PROFILE_VP9_1 1 +#define FF_PROFILE_VP9_2 2 +#define FF_PROFILE_VP9_3 3 + +#define FF_PROFILE_HEVC_MAIN 1 
+#define FF_PROFILE_HEVC_MAIN_10 2 +#define FF_PROFILE_HEVC_MAIN_STILL_PICTURE 3 +#define FF_PROFILE_HEVC_REXT 4 +#define FF_PROFILE_HEVC_SCC 9 + +#define FF_PROFILE_VVC_MAIN_10 1 +#define FF_PROFILE_VVC_MAIN_10_444 33 + +#define FF_PROFILE_AV1_MAIN 0 +#define FF_PROFILE_AV1_HIGH 1 +#define FF_PROFILE_AV1_PROFESSIONAL 2 + +#define FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT 0xc0 +#define FF_PROFILE_MJPEG_HUFFMAN_EXTENDED_SEQUENTIAL_DCT 0xc1 +#define FF_PROFILE_MJPEG_HUFFMAN_PROGRESSIVE_DCT 0xc2 +#define FF_PROFILE_MJPEG_HUFFMAN_LOSSLESS 0xc3 +#define FF_PROFILE_MJPEG_JPEG_LS 0xf7 + +#define FF_PROFILE_SBC_MSBC 1 + +#define FF_PROFILE_PRORES_PROXY 0 +#define FF_PROFILE_PRORES_LT 1 +#define FF_PROFILE_PRORES_STANDARD 2 +#define FF_PROFILE_PRORES_HQ 3 +#define FF_PROFILE_PRORES_4444 4 +#define FF_PROFILE_PRORES_XQ 5 + +#define FF_PROFILE_ARIB_PROFILE_A 0 +#define FF_PROFILE_ARIB_PROFILE_C 1 + +#define FF_PROFILE_KLVA_SYNC 0 +#define FF_PROFILE_KLVA_ASYNC 1 + + /** + * level + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + int level; +#define FF_LEVEL_UNKNOWN -99 + + /** + * Skip loop filtering for selected frames. + * - encoding: unused + * - decoding: Set by user. + */ + enum AVDiscard skip_loop_filter; + + /** + * Skip IDCT/dequantization for selected frames. + * - encoding: unused + * - decoding: Set by user. + */ + enum AVDiscard skip_idct; + + /** + * Skip decoding for selected frames. + * - encoding: unused + * - decoding: Set by user. + */ + enum AVDiscard skip_frame; + + /** + * Header containing style information for text subtitles. + * For SUBTITLE_ASS subtitle type, it should contain the whole ASS + * [Script Info] and [V4+ Styles] section, plus the [Events] line and + * the Format line following. It shouldn't include any Dialogue line. 
+ * - encoding: Set/allocated/freed by user (before avcodec_open2()) + * - decoding: Set/allocated/freed by libavcodec (by avcodec_open2()) + */ + uint8_t *subtitle_header; + int subtitle_header_size; + + /** + * Audio only. The number of "priming" samples (padding) inserted by the + * encoder at the beginning of the audio. I.e. this number of leading + * decoded samples must be discarded by the caller to get the original audio + * without leading padding. + * + * - decoding: unused + * - encoding: Set by libavcodec. The timestamps on the output packets are + * adjusted by the encoder so that they always refer to the + * first sample of the data actually contained in the packet, + * including any added padding. E.g. if the timebase is + * 1/samplerate and the timestamp of the first input sample is + * 0, the timestamp of the first output packet will be + * -initial_padding. + */ + int initial_padding; + + /** + * - decoding: For codecs that store a framerate value in the compressed + * bitstream, the decoder may export it here. { 0, 1} when + * unknown. + * - encoding: May be used to signal the framerate of CFR content to an + * encoder. + */ + AVRational framerate; + + /** + * Nominal unaccelerated pixel format, see AV_PIX_FMT_xxx. + * - encoding: unused. + * - decoding: Set by libavcodec before calling get_format() + */ + enum AVPixelFormat sw_pix_fmt; + + /** + * Timebase in which pkt_dts/pts and AVPacket.dts/pts are expressed. + * - encoding: unused. + * - decoding: set by user. + */ + AVRational pkt_timebase; + + /** + * AVCodecDescriptor + * - encoding: unused. + * - decoding: set by libavcodec. + */ + const AVCodecDescriptor *codec_descriptor; + + /** + * Current statistics for PTS correction. 
+ * - decoding: maintained and used by libavcodec, not intended to be used by user apps + * - encoding: unused + */ + int64_t pts_correction_num_faulty_pts; ///< Number of incorrect PTS values so far + int64_t pts_correction_num_faulty_dts; ///< Number of incorrect DTS values so far + int64_t pts_correction_last_pts; ///< PTS of the last frame + int64_t pts_correction_last_dts; ///< DTS of the last frame + + /** + * Character encoding of the input subtitles file. + * - decoding: set by user + * - encoding: unused + */ + char *sub_charenc; + + /** + * Subtitles character encoding mode. Formats or codecs might be adjusting + * this setting (if they are doing the conversion themselves for instance). + * - decoding: set by libavcodec + * - encoding: unused + */ + int sub_charenc_mode; +#define FF_SUB_CHARENC_MODE_DO_NOTHING -1 ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance) +#define FF_SUB_CHARENC_MODE_AUTOMATIC 0 ///< libavcodec will select the mode itself +#define FF_SUB_CHARENC_MODE_PRE_DECODER 1 ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv +#define FF_SUB_CHARENC_MODE_IGNORE 2 ///< neither convert the subtitles, nor check them for valid UTF-8 + + /** + * Skip processing alpha if supported by codec. + * Note that if the format uses pre-multiplied alpha (common with VP6, + * and recommended due to better video quality/compression) + * the image will look as if alpha-blended onto a black background. + * However for formats that do not use pre-multiplied alpha + * there might be serious artefacts (though e.g. libswscale currently + * assumes pre-multiplied alpha anyway). 
+ * + * - decoding: set by user + * - encoding: unused + */ + int skip_alpha; + + /** + * Number of samples to skip after a discontinuity + * - decoding: unused + * - encoding: set by libavcodec + */ + int seek_preroll; + + /** + * custom intra quantization matrix + * - encoding: Set by user, can be NULL. + * - decoding: unused. + */ + uint16_t *chroma_intra_matrix; + + /** + * dump format separator. + * can be ", " or "\n " or anything else + * - encoding: Set by user. + * - decoding: Set by user. + */ + uint8_t *dump_separator; + + /** + * ',' separated list of allowed decoders. + * If NULL then all are allowed + * - encoding: unused + * - decoding: set by user + */ + char *codec_whitelist; + + /** + * Properties of the stream that gets decoded + * - encoding: unused + * - decoding: set by libavcodec + */ + unsigned properties; +#define FF_CODEC_PROPERTY_LOSSLESS 0x00000001 +#define FF_CODEC_PROPERTY_CLOSED_CAPTIONS 0x00000002 +#define FF_CODEC_PROPERTY_FILM_GRAIN 0x00000004 + + /** + * Additional data associated with the entire coded stream. + * + * - decoding: unused + * - encoding: may be set by libavcodec after avcodec_open2(). + */ + AVPacketSideData *coded_side_data; + int nb_coded_side_data; + + /** + * A reference to the AVHWFramesContext describing the input (for encoding) + * or output (decoding) frames. The reference is set by the caller and + * afterwards owned (and freed) by libavcodec - it should never be read by + * the caller after being set. + * + * - decoding: This field should be set by the caller from the get_format() + * callback. The previous reference (if any) will always be + * unreffed by libavcodec before the get_format() call. + * + * If the default get_buffer2() is used with a hwaccel pixel + * format, then this AVHWFramesContext will be used for + * allocating the frame buffers. 
+ * + * - encoding: For hardware encoders configured to use a hwaccel pixel + * format, this field should be set by the caller to a reference + * to the AVHWFramesContext describing input frames. + * AVHWFramesContext.format must be equal to + * AVCodecContext.pix_fmt. + * + * This field should be set before avcodec_open2() is called. + */ + AVBufferRef *hw_frames_ctx; + + /** + * Audio only. The amount of padding (in samples) appended by the encoder to + * the end of the audio. I.e. this number of decoded samples must be + * discarded by the caller from the end of the stream to get the original + * audio without any trailing padding. + * + * - decoding: unused + * - encoding: unused + */ + int trailing_padding; + + /** + * The number of pixels per image to maximally accept. + * + * - decoding: set by user + * - encoding: set by user + */ + int64_t max_pixels; + + /** + * A reference to the AVHWDeviceContext describing the device which will + * be used by a hardware encoder/decoder. The reference is set by the + * caller and afterwards owned (and freed) by libavcodec. + * + * This should be used if either the codec device does not require + * hardware frames or any that are used are to be allocated internally by + * libavcodec. If the user wishes to supply any of the frames used as + * encoder input or decoder output then hw_frames_ctx should be used + * instead. When hw_frames_ctx is set in get_format() for a decoder, this + * field will be ignored while decoding the associated stream segment, but + * may again be used on a following one after another get_format() call. + * + * For both encoders and decoders this field should be set before + * avcodec_open2() is called and must not be written to thereafter. + * + * Note that some decoders may require this field to be set initially in + * order to support hw_frames_ctx at all - in that case, all frames + * contexts used must be created on the same device. 
+ */ + AVBufferRef *hw_device_ctx; + + /** + * Bit set of AV_HWACCEL_FLAG_* flags, which affect hardware accelerated + * decoding (if active). + * - encoding: unused + * - decoding: Set by user (either before avcodec_open2(), or in the + * AVCodecContext.get_format callback) + */ + int hwaccel_flags; + + /** + * Video decoding only. Certain video codecs support cropping, meaning that + * only a sub-rectangle of the decoded frame is intended for display. This + * option controls how cropping is handled by libavcodec. + * + * When set to 1 (the default), libavcodec will apply cropping internally. + * I.e. it will modify the output frame width/height fields and offset the + * data pointers (only by as much as possible while preserving alignment, or + * by the full amount if the AV_CODEC_FLAG_UNALIGNED flag is set) so that + * the frames output by the decoder refer only to the cropped area. The + * crop_* fields of the output frames will be zero. + * + * When set to 0, the width/height fields of the output frames will be set + * to the coded dimensions and the crop_* fields will describe the cropping + * rectangle. Applying the cropping is left to the caller. + * + * @warning When hardware acceleration with opaque output frames is used, + * libavcodec is unable to apply cropping from the top/left border. + * + * @note when this option is set to zero, the width/height fields of the + * AVCodecContext and output AVFrames have different meanings. The codec + * context fields store display dimensions (with the coded dimensions in + * coded_width/height), while the frame fields store the coded dimensions + * (with the display dimensions being determined by the crop_* fields). + */ + int apply_cropping; + + /** + * Video decoding only. Sets the number of extra hardware frames which + * the decoder will allocate for use by the caller. This must be set + * before avcodec_open2() is called. 
+ * + * Some hardware decoders require all frames that they will use for + * output to be defined in advance before decoding starts. For such + * decoders, the hardware frame pool must therefore be of a fixed size. + * The extra frames set here are on top of any number that the decoder + * needs internally in order to operate normally (for example, frames + * used as reference pictures). + */ + int extra_hw_frames; + + /** + * The percentage of damaged samples to discard a frame. + * + * - decoding: set by user + * - encoding: unused + */ + int discard_damaged_percentage; + + /** + * The number of samples per frame to maximally accept. + * + * - decoding: set by user + * - encoding: set by user + */ + int64_t max_samples; + + /** + * Bit set of AV_CODEC_EXPORT_DATA_* flags, which affects the kind of + * metadata exported in frame, packet, or coded stream side data by + * decoders and encoders. + * + * - decoding: set by user + * - encoding: set by user + */ + int export_side_data; + + /** + * This callback is called at the beginning of each packet to get a data + * buffer for it. + * + * The following field will be set in the packet before this callback is + * called: + * - size + * This callback must use the above value to calculate the required buffer size, + * which must be padded by at least AV_INPUT_BUFFER_PADDING_SIZE bytes. + * + * In some specific cases, the encoder may not use the entire buffer allocated by this + * callback. This will be reflected in the size value in the packet once returned by + * avcodec_receive_packet(). + * + * This callback must fill the following fields in the packet: + * - data: alignment requirements for AVPacket apply, if any. Some architectures and + * encoders may benefit from having aligned data. + * - buf: must contain a pointer to an AVBufferRef structure. The packet's + * data pointer must be contained in it. See: av_buffer_create(), av_buffer_alloc(), + * and av_buffer_ref(). 
+ * + * If AV_CODEC_CAP_DR1 is not set then get_encode_buffer() must call + * avcodec_default_get_encode_buffer() instead of providing a buffer allocated by + * some other means. + * + * The flags field may contain a combination of AV_GET_ENCODE_BUFFER_FLAG_ flags. + * They may be used for example to hint what use the buffer may get after being + * created. + * Implementations of this callback may ignore flags they don't understand. + * If AV_GET_ENCODE_BUFFER_FLAG_REF is set in flags then the packet may be reused + * (read and/or written to if it is writable) later by libavcodec. + * + * This callback must be thread-safe, as when frame threading is used, it may + * be called from multiple threads simultaneously. + * + * @see avcodec_default_get_encode_buffer() + * + * - encoding: Set by libavcodec, user can override. + * - decoding: unused + */ + int (*get_encode_buffer)(struct AVCodecContext *s, AVPacket *pkt, int flags); + + /** + * Audio channel layout. + * - encoding: must be set by the caller, to one of AVCodec.ch_layouts. + * - decoding: may be set by the caller if known e.g. from the container. + * The decoder can then override during decoding as needed. + */ + AVChannelLayout ch_layout; + + /** + * Frame counter, set by libavcodec. + * + * - decoding: total number of frames returned from the decoder so far. + * - encoding: total number of frames passed to the encoder so far. + * + * @note the counter is not incremented if encoding/decoding resulted in + * an error. + */ + int64_t frame_num; +} AVCodecContext; + +/** + * @defgroup lavc_hwaccel AVHWAccel + * + * @note Nothing in this structure should be accessed by the user. At some + * point in future it will not be externally visible at all. + * + * @{ + */ +typedef struct AVHWAccel { + /** + * Name of the hardware accelerated codec. + * The name is globally unique among encoders and among decoders (but an + * encoder and a decoder can share the same name). 
+ */ + const char *name; + + /** + * Type of codec implemented by the hardware accelerator. + * + * See AVMEDIA_TYPE_xxx + */ + enum AVMediaType type; + + /** + * Codec implemented by the hardware accelerator. + * + * See AV_CODEC_ID_xxx + */ + enum AVCodecID id; + + /** + * Supported pixel format. + * + * Only hardware accelerated formats are supported here. + */ + enum AVPixelFormat pix_fmt; + + /** + * Hardware accelerated codec capabilities. + * see AV_HWACCEL_CODEC_CAP_* + */ + int capabilities; + + /***************************************************************** + * No fields below this line are part of the public API. They + * may not be used outside of libavcodec and can be changed and + * removed at will. + * New public fields should be added right above. + ***************************************************************** + */ + + /** + * Allocate a custom buffer + */ + int (*alloc_frame)(AVCodecContext *avctx, AVFrame *frame); + + /** + * Called at the beginning of each frame or field picture. + * + * Meaningful frame information (codec specific) is guaranteed to + * be parsed at this point. This function is mandatory. + * + * Note that buf can be NULL along with buf_size set to 0. + * Otherwise, this means the whole frame is available at this point. + * + * @param avctx the codec context + * @param buf the frame data buffer base + * @param buf_size the size of the frame in bytes + * @return zero if successful, a negative value otherwise + */ + int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); + + /** + * Callback for parameter data (SPS/PPS/VPS etc). + * + * Useful for hardware decoders which keep persistent state about the + * video parameters, and need to receive any changes to update that state. 
+ * + * @param avctx the codec context + * @param type the nal unit type + * @param buf the nal unit data buffer + * @param buf_size the size of the nal unit in bytes + * @return zero if successful, a negative value otherwise + */ + int (*decode_params)(AVCodecContext *avctx, int type, const uint8_t *buf, uint32_t buf_size); + + /** + * Callback for each slice. + * + * Meaningful slice information (codec specific) is guaranteed to + * be parsed at this point. This function is mandatory. + * + * @param avctx the codec context + * @param buf the slice data buffer base + * @param buf_size the size of the slice in bytes + * @return zero if successful, a negative value otherwise + */ + int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size); + + /** + * Called at the end of each frame or field picture. + * + * The whole picture is parsed at this point and can now be sent + * to the hardware accelerator. This function is mandatory. + * + * @param avctx the codec context + * @return zero if successful, a negative value otherwise + */ + int (*end_frame)(AVCodecContext *avctx); + + /** + * Size of per-frame hardware accelerator private data. + * + * Private data is allocated with av_mallocz() before + * AVCodecContext.get_buffer() and deallocated after + * AVCodecContext.release_buffer(). + */ + int frame_priv_data_size; + + /** + * Initialize the hwaccel private data. + * + * This will be called from ff_get_format(), after hwaccel and + * hwaccel_context are set and the hwaccel private data in AVCodecInternal + * is allocated. + */ + int (*init)(AVCodecContext *avctx); + + /** + * Uninitialize the hwaccel private data. + * + * This will be called from get_format() or avcodec_close(), after hwaccel + * and hwaccel_context are already uninitialized. + */ + int (*uninit)(AVCodecContext *avctx); + + /** + * Size of the private data to allocate in + * AVCodecInternal.hwaccel_priv_data. 
+ */ + int priv_data_size; + + /** + * Internal hwaccel capabilities. + */ + int caps_internal; + + /** + * Fill the given hw_frames context with current codec parameters. Called + * from get_format. Refer to avcodec_get_hw_frames_parameters() for + * details. + * + * This CAN be called before AVHWAccel.init is called, and you must assume + * that avctx->hwaccel_priv_data is invalid. + */ + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); +} AVHWAccel; + +/** + * HWAccel is experimental and is thus avoided in favor of non-experimental + * codecs + */ +#define AV_HWACCEL_CODEC_CAP_EXPERIMENTAL 0x0200 + +/** + * Hardware acceleration should be used for decoding even if the codec level + * used is unknown or higher than the maximum supported level reported by the + * hardware driver. + * + * It's generally a good idea to pass this flag unless you have a specific + * reason not to, as hardware tends to under-report supported levels. + */ +#define AV_HWACCEL_FLAG_IGNORE_LEVEL (1 << 0) + +/** + * Hardware acceleration can output YUV pixel formats with a different chroma + * sampling than 4:2:0 and/or other than 8 bits per component. + */ +#define AV_HWACCEL_FLAG_ALLOW_HIGH_DEPTH (1 << 1) + +/** + * Hardware acceleration should still be attempted for decoding when the + * codec profile does not match the reported capabilities of the hardware. + * + * For example, this can be used to try to decode baseline profile H.264 + * streams in hardware - it will often succeed, because many streams marked + * as baseline profile actually conform to constrained baseline profile. + * + * @warning If the stream is actually not supported then the behaviour is + * undefined, and may include returning entirely incorrect output + * while indicating success. + */ +#define AV_HWACCEL_FLAG_ALLOW_PROFILE_MISMATCH (1 << 2) + +/** + * Some hardware decoders (namely nvdec) can either output direct decoder + * surfaces, or make an on-device copy and return said copy. 
+ * There is a hard limit on how many decoder surfaces there can be, and it + * cannot be accurately guessed ahead of time. + * For some processing chains, this can be okay, but others will run into the + * limit and in turn produce very confusing errors that require fine tuning of + * more or less obscure options by the user, or in extreme cases cannot be + * resolved at all without inserting an avfilter that forces a copy. + * + * Thus, the hwaccel will by default make a copy for safety and resilience. + * If a user really wants to minimize the number of copies, they can set this + * flag and ensure their processing chain does not exhaust the surface pool. + */ +#define AV_HWACCEL_FLAG_UNSAFE_OUTPUT (1 << 3) + +/** + * @} + */ + +enum AVSubtitleType { + SUBTITLE_NONE, + + SUBTITLE_BITMAP, ///< A bitmap, pict will be set + + /** + * Plain text, the text field must be set by the decoder and is + * authoritative. ass and pict fields may contain approximations. + */ + SUBTITLE_TEXT, + + /** + * Formatted text, the ass field must be set by the decoder and is + * authoritative. pict and text fields may contain approximations. + */ + SUBTITLE_ASS, +}; + +#define AV_SUBTITLE_FLAG_FORCED 0x00000001 + +typedef struct AVSubtitleRect { + int x; ///< x coordinate of the top left corner of pict, undefined when pict is not set + int y; ///< y coordinate of the top left corner of pict, undefined when pict is not set + int w; ///< width of pict, undefined when pict is not set + int h; ///< height of pict, undefined when pict is not set + int nb_colors; ///< number of colors in pict, undefined when pict is not set + + /** + * data+linesize for the bitmap of this subtitle. + * Can be set for text/ass as well once they are rendered. + */ + uint8_t *data[4]; + int linesize[4]; + + enum AVSubtitleType type; + + char *text; ///< 0 terminated plain UTF-8 text + + /** + * 0 terminated ASS/SSA compatible event line. + * The presentation of this is unaffected by the other values in this + * struct. 
+ */ + char *ass; + + int flags; +} AVSubtitleRect; + +typedef struct AVSubtitle { + uint16_t format; /* 0 = graphics */ + uint32_t start_display_time; /* relative to packet pts, in ms */ + uint32_t end_display_time; /* relative to packet pts, in ms */ + unsigned num_rects; + AVSubtitleRect **rects; + int64_t pts; ///< Same as packet pts, in AV_TIME_BASE +} AVSubtitle; + +/** + * Return the LIBAVCODEC_VERSION_INT constant. + */ +unsigned avcodec_version(void); + +/** + * Return the libavcodec build-time configuration. + */ +const char *avcodec_configuration(void); + +/** + * Return the libavcodec license. + */ +const char *avcodec_license(void); + +/** + * Allocate an AVCodecContext and set its fields to default values. The + * resulting struct should be freed with avcodec_free_context(). + * + * @param codec if non-NULL, allocate private data and initialize defaults + * for the given codec. It is illegal to then call avcodec_open2() + * with a different codec. + * If NULL, then the codec-specific defaults won't be initialized, + * which may result in suboptimal default settings (this is + * important mainly for encoders, e.g. libx264). + * + * @return An AVCodecContext filled with default values or NULL on failure. + */ +AVCodecContext *avcodec_alloc_context3(const AVCodec *codec); + +/** + * Free the codec context and everything associated with it and write NULL to + * the provided pointer. + */ +void avcodec_free_context(AVCodecContext **avctx); + +/** + * Get the AVClass for AVCodecContext. It can be used in combination with + * AV_OPT_SEARCH_FAKE_OBJ for examining options. + * + * @see av_opt_find(). + */ +const AVClass *avcodec_get_class(void); + +/** + * Get the AVClass for AVSubtitleRect. It can be used in combination with + * AV_OPT_SEARCH_FAKE_OBJ for examining options. + * + * @see av_opt_find(). + */ +const AVClass *avcodec_get_subtitle_rect_class(void); + +/** + * Fill the parameters struct based on the values from the supplied codec + * context. 
Any allocated fields in par are freed and replaced with duplicates + * of the corresponding fields in codec. + * + * @return >= 0 on success, a negative AVERROR code on failure + */ +int avcodec_parameters_from_context(AVCodecParameters *par, + const AVCodecContext *codec); + +/** + * Fill the codec context based on the values from the supplied codec + * parameters. Any allocated fields in codec that have a corresponding field in + * par are freed and replaced with duplicates of the corresponding field in par. + * Fields in codec that do not have a counterpart in par are not touched. + * + * @return >= 0 on success, a negative AVERROR code on failure. + */ +int avcodec_parameters_to_context(AVCodecContext *codec, + const AVCodecParameters *par); + +/** + * Initialize the AVCodecContext to use the given AVCodec. Prior to using this + * function the context has to be allocated with avcodec_alloc_context3(). + * + * The functions avcodec_find_decoder_by_name(), avcodec_find_encoder_by_name(), + * avcodec_find_decoder() and avcodec_find_encoder() provide an easy way for + * retrieving a codec. + * + * @note Always call this function before using decoding routines (such as + * @ref avcodec_receive_frame()). + * + * @code + * av_dict_set(&opts, "b", "2.5M", 0); + * codec = avcodec_find_decoder(AV_CODEC_ID_H264); + * if (!codec) + * exit(1); + * + * context = avcodec_alloc_context3(codec); + * + * if (avcodec_open2(context, codec, &opts) < 0) + * exit(1); + * @endcode + * + * @param avctx The context to initialize. + * @param codec The codec to open this context for. If a non-NULL codec has been + * previously passed to avcodec_alloc_context3() + * for this context, then this parameter MUST be either NULL or + * equal to the previously passed codec. + * @param options A dictionary filled with AVCodecContext and codec-private options. + * On return this object will be filled with options that were not found. 
+ * + * @return zero on success, a negative value on error + * @see avcodec_alloc_context3(), avcodec_find_decoder(), avcodec_find_encoder(), + * av_dict_set(), av_opt_find(). + */ +int avcodec_open2(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options); + +/** + * Close a given AVCodecContext and free all the data associated with it + * (but not the AVCodecContext itself). + * + * Calling this function on an AVCodecContext that hasn't been opened will free + * the codec-specific data allocated in avcodec_alloc_context3() with a non-NULL + * codec. Subsequent calls will do nothing. + * + * @note Do not use this function. Use avcodec_free_context() to destroy a + * codec context (either open or closed). Opening and closing a codec context + * multiple times is not supported anymore -- use multiple codec contexts + * instead. + */ +int avcodec_close(AVCodecContext *avctx); + +/** + * Free all allocated data in the given subtitle struct. + * + * @param sub AVSubtitle to free. + */ +void avsubtitle_free(AVSubtitle *sub); + +/** + * @} + */ + +/** + * @addtogroup lavc_decoding + * @{ + */ + +/** + * The default callback for AVCodecContext.get_buffer2(). It is made public so + * it can be called by custom get_buffer2() implementations for decoders without + * AV_CODEC_CAP_DR1 set. + */ +int avcodec_default_get_buffer2(AVCodecContext *s, AVFrame *frame, int flags); + +/** + * The default callback for AVCodecContext.get_encode_buffer(). It is made public so + * it can be called by custom get_encode_buffer() implementations for encoders without + * AV_CODEC_CAP_DR1 set. + */ +int avcodec_default_get_encode_buffer(AVCodecContext *s, AVPacket *pkt, int flags); + +/** + * Modify width and height values so that they will result in a memory + * buffer that is acceptable for the codec if you do not use any horizontal + * padding. + * + * May only be used if a codec with AV_CODEC_CAP_DR1 has been opened. 
+ */ +void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height); + +/** + * Modify width and height values so that they will result in a memory + * buffer that is acceptable for the codec if you also ensure that all + * line sizes are a multiple of the respective linesize_align[i]. + * + * May only be used if a codec with AV_CODEC_CAP_DR1 has been opened. + */ +void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, + int linesize_align[AV_NUM_DATA_POINTERS]); + +#ifdef FF_API_AVCODEC_CHROMA_POS +/** + * Converts AVChromaLocation to swscale x/y chroma position. + * + * The positions represent the chroma (0,0) position in a coordinates system + * with luma (0,0) representing the origin and luma(1,1) representing 256,256 + * + * @param xpos horizontal chroma sample position + * @param ypos vertical chroma sample position + * @deprecated Use av_chroma_location_enum_to_pos() instead. + */ + attribute_deprecated +int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos); + +/** + * Converts swscale x/y chroma position to AVChromaLocation. + * + * The positions represent the chroma (0,0) position in a coordinates system + * with luma (0,0) representing the origin and luma(1,1) representing 256,256 + * + * @param xpos horizontal chroma sample position + * @param ypos vertical chroma sample position + * @deprecated Use av_chroma_location_pos_to_enum() instead. + */ + attribute_deprecated +enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos); +#endif + +/** + * Decode a subtitle message. + * Return a negative value on error, otherwise return the number of bytes used. + * If no subtitle could be decompressed, got_sub_ptr is zero. + * Otherwise, the subtitle is stored in *sub. + * Note that AV_CODEC_CAP_DR1 is not available for subtitle codecs. 
This is for + * simplicity, because the performance difference is expected to be negligible + * and reusing a get_buffer written for video codecs would probably perform badly + * due to a potentially very different allocation pattern. + * + * Some decoders (those marked with AV_CODEC_CAP_DELAY) have a delay between input + * and output. This means that for some packets they will not immediately + * produce decoded output and need to be flushed at the end of decoding to get + * all the decoded data. Flushing is done by calling this function with packets + * with avpkt->data set to NULL and avpkt->size set to 0 until it stops + * returning subtitles. It is safe to flush even those decoders that are not + * marked with AV_CODEC_CAP_DELAY, then no subtitles will be returned. + * + * @note The AVCodecContext MUST have been opened with @ref avcodec_open2() + * before packets may be fed to the decoder. + * + * @param avctx the codec context + * @param[out] sub The preallocated AVSubtitle in which the decoded subtitle will be stored, + * must be freed with avsubtitle_free if *got_sub_ptr is set. + * @param[in,out] got_sub_ptr Zero if no subtitle could be decompressed, otherwise, it is nonzero. + * @param[in] avpkt The input AVPacket containing the input buffer. + */ +int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub, + int *got_sub_ptr, const AVPacket *avpkt); + +/** + * Supply raw packet data as input to a decoder. + * + * Internally, this call will copy relevant AVCodecContext fields, which can + * influence decoding per-packet, and apply them when the packet is actually + * decoded. (For example AVCodecContext.skip_frame, which might direct the + * decoder to drop the frame contained by the packet sent with this function.) + * + * @warning The input buffer, avpkt->data must be AV_INPUT_BUFFER_PADDING_SIZE + * larger than the actual read bytes because some optimized bitstream + * readers read 32 or 64 bits at once and could read over the end. 
+ * + * @note The AVCodecContext MUST have been opened with @ref avcodec_open2() + * before packets may be fed to the decoder. + * + * @param avctx codec context + * @param[in] avpkt The input AVPacket. Usually, this will be a single video + * frame, or several complete audio frames. + * Ownership of the packet remains with the caller, and the + * decoder will not write to the packet. The decoder may create + * a reference to the packet data (or copy it if the packet is + * not reference-counted). + * Unlike with older APIs, the packet is always fully consumed, + * and if it contains multiple frames (e.g. some audio codecs), + * will require you to call avcodec_receive_frame() multiple + * times afterwards before you can send a new packet. + * It can be NULL (or an AVPacket with data set to NULL and + * size set to 0); in this case, it is considered a flush + * packet, which signals the end of the stream. Sending the + * first flush packet will return success. Subsequent ones are + * unnecessary and will return AVERROR_EOF. If the decoder + * still has frames buffered, it will return them after sending + * a flush packet. + * + * @retval 0 success + * @retval AVERROR(EAGAIN) input is not accepted in the current state - user + * must read output with avcodec_receive_frame() (once + * all output is read, the packet should be resent, + * and the call will not fail with EAGAIN). + * @retval AVERROR_EOF the decoder has been flushed, and no new packets can be + * sent to it (also returned if more than 1 flush + * packet is sent) + * @retval AVERROR(EINVAL) codec not opened, it is an encoder, or requires flush + * @retval AVERROR(ENOMEM) failed to add packet to internal queue, or similar + * @retval "another negative error code" legitimate decoding errors + */ +int avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt); + +/** + * Return decoded output data from a decoder or encoder (when the + * AV_CODEC_FLAG_RECON_FRAME flag is used). 
+ * + * @param avctx codec context + * @param frame This will be set to a reference-counted video or audio + * frame (depending on the decoder type) allocated by the + * codec. Note that the function will always call + * av_frame_unref(frame) before doing anything else. + * + * @retval 0 success, a frame was returned + * @retval AVERROR(EAGAIN) output is not available in this state - user must + * try to send new input + * @retval AVERROR_EOF the codec has been fully flushed, and there will be + * no more output frames + * @retval AVERROR(EINVAL) codec not opened, or it is an encoder without the + * AV_CODEC_FLAG_RECON_FRAME flag enabled + * @retval AVERROR_INPUT_CHANGED current decoded frame has changed parameters with + * respect to first decoded frame. Applicable when flag + * AV_CODEC_FLAG_DROPCHANGED is set. + * @retval "other negative error code" legitimate decoding errors + */ +int avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame); + +/** + * Supply a raw video or audio frame to the encoder. Use avcodec_receive_packet() + * to retrieve buffered output packets. + * + * @param avctx codec context + * @param[in] frame AVFrame containing the raw audio or video frame to be encoded. + * Ownership of the frame remains with the caller, and the + * encoder will not write to the frame. The encoder may create + * a reference to the frame data (or copy it if the frame is + * not reference-counted). + * It can be NULL, in which case it is considered a flush + * packet. This signals the end of the stream. If the encoder + * still has packets buffered, it will return them after this + * call. Once flushing mode has been entered, additional flush + * packets are ignored, and sending frames will return + * AVERROR_EOF. + * + * For audio: + * If AV_CODEC_CAP_VARIABLE_FRAME_SIZE is set, then each frame + * can have any number of samples. + * If it is not set, frame->nb_samples must be equal to + * avctx->frame_size for all frames except the last. 
+ * The final frame may be smaller than avctx->frame_size. + * @retval 0 success + * @retval AVERROR(EAGAIN) input is not accepted in the current state - user must + * read output with avcodec_receive_packet() (once all + * output is read, the packet should be resent, and the + * call will not fail with EAGAIN). + * @retval AVERROR_EOF the encoder has been flushed, and no new frames can + * be sent to it + * @retval AVERROR(EINVAL) codec not opened, it is a decoder, or requires flush + * @retval AVERROR(ENOMEM) failed to add packet to internal queue, or similar + * @retval "another negative error code" legitimate encoding errors + */ +int avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame); + +/** + * Read encoded data from the encoder. + * + * @param avctx codec context + * @param avpkt This will be set to a reference-counted packet allocated by the + * encoder. Note that the function will always call + * av_packet_unref(avpkt) before doing anything else. + * @retval 0 success + * @retval AVERROR(EAGAIN) output is not available in the current state - user must + * try to send input + * @retval AVERROR_EOF the encoder has been fully flushed, and there will be no + * more output packets + * @retval AVERROR(EINVAL) codec not opened, or it is a decoder + * @retval "another negative error code" legitimate encoding errors + */ +int avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt); + +/** + * Create and return a AVHWFramesContext with values adequate for hardware + * decoding. This is meant to get called from the get_format callback, and is + * a helper for preparing a AVHWFramesContext for AVCodecContext.hw_frames_ctx. + * This API is for decoding with certain hardware acceleration modes/APIs only. + * + * The returned AVHWFramesContext is not initialized. The caller must do this + * with av_hwframe_ctx_init(). 
+ * + * Calling this function is not a requirement, but makes it simpler to avoid + * codec or hardware API specific details when manually allocating frames. + * + * Alternatively to this, an API user can set AVCodecContext.hw_device_ctx, + * which sets up AVCodecContext.hw_frames_ctx fully automatically, and makes + * it unnecessary to call this function or to care about + * AVHWFramesContext initialization at all. + * + * There are a number of requirements for calling this function: + * + * - It must be called from get_format with the same avctx parameter that was + * passed to get_format. Calling it outside of get_format is not allowed, and + * can trigger undefined behavior. + * - The function is not always supported (see description of return values). + * Even if this function returns successfully, hwaccel initialization could + * fail later. (The degree to which implementations check whether the stream + * is actually supported varies. Some do this check only after the user's + * get_format callback returns.) + * - The hw_pix_fmt must be one of the choices suggested by get_format. If the + * user decides to use an AVHWFramesContext prepared with this API function, + * the user must return the same hw_pix_fmt from get_format. + * - The device_ref passed to this function must support the given hw_pix_fmt. + * - After calling this API function, it is the user's responsibility to + * initialize the AVHWFramesContext (returned by the out_frames_ref parameter), + * and to set AVCodecContext.hw_frames_ctx to it. If done, this must be done + * before returning from get_format (this is implied by the normal + * AVCodecContext.hw_frames_ctx API rules). + * - The AVHWFramesContext parameters may change every time get_format is + * called. Also, AVCodecContext.hw_frames_ctx is reset before get_format. So + * you are inherently required to go through this process again on every + * get_format call. 
+ * - It is perfectly possible to call this function without actually using + * the resulting AVHWFramesContext. One use-case might be trying to reuse a + * previously initialized AVHWFramesContext, and calling this API function + * only to test whether the required frame parameters have changed. + * - Fields that use dynamically allocated values of any kind must not be set + * by the user unless setting them is explicitly allowed by the documentation. + * If the user sets AVHWFramesContext.free and AVHWFramesContext.user_opaque, + * the new free callback must call the potentially set previous free callback. + * This API call may set any dynamically allocated fields, including the free + * callback. + * + * The function will set at least the following fields on AVHWFramesContext + * (potentially more, depending on hwaccel API): + * + * - All fields set by av_hwframe_ctx_alloc(). + * - Set the format field to hw_pix_fmt. + * - Set the sw_format field to the most suited and most versatile format. (An + * implication is that this will prefer generic formats over opaque formats + * with arbitrary restrictions, if possible.) + * - Set the width/height fields to the coded frame size, rounded up to the + * API-specific minimum alignment. + * - Only _if_ the hwaccel requires a pre-allocated pool: set the initial_pool_size + * field to the number of maximum reference surfaces possible with the codec, + * plus 1 surface for the user to work (meaning the user can safely reference + * at most 1 decoded surface at a time), plus additional buffering introduced + * by frame threading. If the hwaccel does not require pre-allocation, the + * field is left to 0, and the decoder will allocate new surfaces on demand + * during decoding. + * - Possibly AVHWFramesContext.hwctx fields, depending on the underlying + * hardware API. + * + * Essentially, out_frames_ref returns the same as av_hwframe_ctx_alloc(), but + * with basic frame parameters set. 
+ * + * The function is stateless, and does not change the AVCodecContext or the + * device_ref AVHWDeviceContext. + * + * @param avctx The context which is currently calling get_format, and which + * implicitly contains all state needed for filling the returned + * AVHWFramesContext properly. + * @param device_ref A reference to the AVHWDeviceContext describing the device + * which will be used by the hardware decoder. + * @param hw_pix_fmt The hwaccel format you are going to return from get_format. + * @param out_frames_ref On success, set to a reference to an _uninitialized_ + * AVHWFramesContext, created from the given device_ref. + * Fields will be set to values required for decoding. + * Not changed if an error is returned. + * @return zero on success, a negative value on error. The following error codes + * have special semantics: + * AVERROR(ENOENT): the decoder does not support this functionality. Setup + * is always manual, or it is a decoder which does not + * support setting AVCodecContext.hw_frames_ctx at all, + * or it is a software format. + * AVERROR(EINVAL): it is known that hardware decoding is not supported for + * this configuration, or the device_ref is not supported + * for the hwaccel referenced by hw_pix_fmt. 
+ */ +int avcodec_get_hw_frames_parameters(AVCodecContext *avctx, + AVBufferRef *device_ref, + enum AVPixelFormat hw_pix_fmt, + AVBufferRef **out_frames_ref); + + + +/** + * @defgroup lavc_parsing Frame parsing + * @{ + */ + +enum AVPictureStructure { + AV_PICTURE_STRUCTURE_UNKNOWN, ///< unknown + AV_PICTURE_STRUCTURE_TOP_FIELD, ///< coded as top field + AV_PICTURE_STRUCTURE_BOTTOM_FIELD, ///< coded as bottom field + AV_PICTURE_STRUCTURE_FRAME, ///< coded as frame +}; + +typedef struct AVCodecParserContext { + void *priv_data; + const struct AVCodecParser *parser; + int64_t frame_offset; /* offset of the current frame */ + int64_t cur_offset; /* current offset + (incremented by each av_parser_parse()) */ + int64_t next_frame_offset; /* offset of the next frame */ + /* video info */ + int pict_type; /* XXX: Put it back in AVCodecContext. */ + /** + * This field is used for proper frame duration computation in lavf. + * It signals, how much longer the frame duration of the current frame + * is compared to normal frame duration. + * + * frame_duration = (1 + repeat_pict) * time_base + * + * It is used by codecs like H.264 to display telecined material. + */ + int repeat_pict; /* XXX: Put it back in AVCodecContext. 
*/ + int64_t pts; /* pts of the current frame */ + int64_t dts; /* dts of the current frame */ + + /* private data */ + int64_t last_pts; + int64_t last_dts; + int fetch_timestamp; + +#define AV_PARSER_PTS_NB 4 + int cur_frame_start_index; + int64_t cur_frame_offset[AV_PARSER_PTS_NB]; + int64_t cur_frame_pts[AV_PARSER_PTS_NB]; + int64_t cur_frame_dts[AV_PARSER_PTS_NB]; + + int flags; +#define PARSER_FLAG_COMPLETE_FRAMES 0x0001 +#define PARSER_FLAG_ONCE 0x0002 +/// Set if the parser has a valid file offset +#define PARSER_FLAG_FETCHED_OFFSET 0x0004 +#define PARSER_FLAG_USE_CODEC_TS 0x1000 + + int64_t offset; ///< byte offset from starting packet start + int64_t cur_frame_end[AV_PARSER_PTS_NB]; + + /** + * Set by parser to 1 for key frames and 0 for non-key frames. + * It is initialized to -1, so if the parser doesn't set this flag, + * old-style fallback using AV_PICTURE_TYPE_I picture type as key frames + * will be used. + */ + int key_frame; + + // Timestamp generation support: + /** + * Synchronization point for start of timestamp generation. + * + * Set to >0 for sync point, 0 for no sync point and <0 for undefined + * (default). + * + * For example, this corresponds to presence of H.264 buffering period + * SEI message. + */ + int dts_sync_point; + + /** + * Offset of the current timestamp against last timestamp sync point in + * units of AVCodecContext.time_base. + * + * Set to INT_MIN when dts_sync_point unused. Otherwise, it must + * contain a valid timestamp offset. + * + * Note that the timestamp of sync point has usually a nonzero + * dts_ref_dts_delta, which refers to the previous sync point. Offset of + * the next frame after timestamp sync point will be usually 1. + * + * For example, this corresponds to H.264 cpb_removal_delay. + */ + int dts_ref_dts_delta; + + /** + * Presentation delay of current frame in units of AVCodecContext.time_base. + * + * Set to INT_MIN when dts_sync_point unused. 
Otherwise, it must + * contain valid non-negative timestamp delta (presentation time of a frame + * must not lie in the past). + * + * This delay represents the difference between decoding and presentation + * time of the frame. + * + * For example, this corresponds to H.264 dpb_output_delay. + */ + int pts_dts_delta; + + /** + * Position of the packet in file. + * + * Analogous to cur_frame_pts/dts + */ + int64_t cur_frame_pos[AV_PARSER_PTS_NB]; + + /** + * Byte position of currently parsed frame in stream. + */ + int64_t pos; + + /** + * Previous frame byte position. + */ + int64_t last_pos; + + /** + * Duration of the current frame. + * For audio, this is in units of 1 / AVCodecContext.sample_rate. + * For all other types, this is in units of AVCodecContext.time_base. + */ + int duration; + + enum AVFieldOrder field_order; + + /** + * Indicate whether a picture is coded as a frame, top field or bottom field. + * + * For example, H.264 field_pic_flag equal to 0 corresponds to + * AV_PICTURE_STRUCTURE_FRAME. An H.264 picture with field_pic_flag + * equal to 1 and bottom_field_flag equal to 0 corresponds to + * AV_PICTURE_STRUCTURE_TOP_FIELD. + */ + enum AVPictureStructure picture_structure; + + /** + * Picture number incremented in presentation or output order. + * This field may be reinitialized at the first picture of a new sequence. + * + * For example, this corresponds to H.264 PicOrderCnt. + */ + int output_picture_number; + + /** + * Dimensions of the decoded video intended for presentation. + */ + int width; + int height; + + /** + * Dimensions of the coded video. + */ + int coded_width; + int coded_height; + + /** + * The format of the coded data, corresponds to enum AVPixelFormat for video + * and for enum AVSampleFormat for audio. + * + * Note that a decoder can have considerable freedom in how exactly it + * decodes the data, so the format reported here might be different from the + * one returned by a decoder. 
+ */ + int format; +} AVCodecParserContext; + +typedef struct AVCodecParser { + int codec_ids[7]; /* several codec IDs are permitted */ + int priv_data_size; + int (*parser_init)(AVCodecParserContext *s); + /* This callback never returns an error, a negative value means that + * the frame start was in a previous packet. */ + int (*parser_parse)(AVCodecParserContext *s, + AVCodecContext *avctx, + const uint8_t **poutbuf, int *poutbuf_size, + const uint8_t *buf, int buf_size); + void (*parser_close)(AVCodecParserContext *s); + int (*split)(AVCodecContext *avctx, const uint8_t *buf, int buf_size); +} AVCodecParser; + +/** + * Iterate over all registered codec parsers. + * + * @param opaque a pointer where libavcodec will store the iteration state. Must + * point to NULL to start the iteration. + * + * @return the next registered codec parser or NULL when the iteration is + * finished + */ +const AVCodecParser *av_parser_iterate(void **opaque); + +AVCodecParserContext *av_parser_init(int codec_id); + +/** + * Parse a packet. + * + * @param s parser context. + * @param avctx codec context. + * @param poutbuf set to pointer to parsed buffer or NULL if not yet finished. + * @param poutbuf_size set to size of parsed buffer or zero if not yet finished. + * @param buf input buffer. + * @param buf_size buffer size in bytes without the padding. I.e. the full buffer + size is assumed to be buf_size + AV_INPUT_BUFFER_PADDING_SIZE. + To signal EOF, this should be 0 (so that the last frame + can be output). + * @param pts input presentation timestamp. + * @param dts input decoding timestamp. + * @param pos input byte position in stream. + * @return the number of bytes of the input bitstream used. 
+ * + * Example: + * @code + * while(in_len){ + * len = av_parser_parse2(myparser, AVCodecContext, &data, &size, + * in_data, in_len, + * pts, dts, pos); + * in_data += len; + * in_len -= len; + * + * if(size) + * decode_frame(data, size); + * } + * @endcode + */ +int av_parser_parse2(AVCodecParserContext *s, + AVCodecContext *avctx, + uint8_t **poutbuf, int *poutbuf_size, + const uint8_t *buf, int buf_size, + int64_t pts, int64_t dts, + int64_t pos); + +void av_parser_close(AVCodecParserContext *s); + +/** + * @} + * @} + */ + +/** + * @addtogroup lavc_encoding + * @{ + */ + +int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size, + const AVSubtitle *sub); + + +/** + * @} + */ + +/** + * @defgroup lavc_misc Utility functions + * @ingroup libavc + * + * Miscellaneous utility functions related to both encoding and decoding + * (or neither). + * @{ + */ + +/** + * @defgroup lavc_misc_pixfmt Pixel formats + * + * Functions for working with pixel formats. + * @{ + */ + +/** + * Return a value representing the fourCC code associated to the + * pixel format pix_fmt, or 0 if no associated fourCC code can be + * found. + */ +unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat pix_fmt); + +/** + * Find the best pixel format to convert to given a certain source pixel + * format. When converting from one pixel format to another, information loss + * may occur. For example, when converting from RGB24 to GRAY, the color + * information will be lost. Similarly, other losses occur when converting from + * some formats to other formats. avcodec_find_best_pix_fmt_of_2() searches which of + * the given pixel formats should be used to suffer the least amount of loss. + * The pixel formats from which it chooses one, are determined by the + * pix_fmt_list parameter. 
+ * + * + * @param[in] pix_fmt_list AV_PIX_FMT_NONE terminated array of pixel formats to choose from + * @param[in] src_pix_fmt source pixel format + * @param[in] has_alpha Whether the source pixel format alpha channel is used. + * @param[out] loss_ptr Combination of flags informing you what kind of losses will occur. + * @return The best pixel format to convert to or -1 if none was found. + */ +enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list, + enum AVPixelFormat src_pix_fmt, + int has_alpha, int *loss_ptr); + +enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const enum AVPixelFormat * fmt); + +/** + * @} + */ + +void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode); + +int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2),void *arg, int *ret, int count, int size); +int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2, int, int),void *arg, int *ret, int count); +//FIXME func typedef + +/** + * Fill AVFrame audio data and linesize pointers. + * + * The buffer buf must be a preallocated buffer with a size big enough + * to contain the specified samples amount. The filled AVFrame data + * pointers will point to this buffer. + * + * AVFrame extended_data channel pointers are allocated if necessary for + * planar audio. + * + * @param frame the AVFrame + * frame->nb_samples must be set prior to calling the + * function. This function fills in frame->data, + * frame->extended_data, frame->linesize[0]. 
+ * @param nb_channels channel count + * @param sample_fmt sample format + * @param buf buffer to use for frame data + * @param buf_size size of buffer + * @param align plane size sample alignment (0 = default) + * @return >=0 on success, negative error code on failure + * @todo return the size in bytes required to store the samples in + * case of success, at the next libavutil bump + */ +int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, + enum AVSampleFormat sample_fmt, const uint8_t *buf, + int buf_size, int align); + +/** + * Reset the internal codec state / flush internal buffers. Should be called + * e.g. when seeking or when switching to a different stream. + * + * @note for decoders, this function just releases any references the decoder + * might keep internally, but the caller's references remain valid. + * + * @note for encoders, this function will only do something if the encoder + * declares support for AV_CODEC_CAP_ENCODER_FLUSH. When called, the encoder + * will drain any remaining packets, and can then be re-used for a different + * stream (as opposed to sending a null frame which will leave the encoder + * in a permanent EOF state after draining). This can be desirable if the + * cost of tearing down and replacing the encoder instance is high. + */ +void avcodec_flush_buffers(AVCodecContext *avctx); + +/** + * Return audio frame duration. + * + * @param avctx codec context + * @param frame_bytes size of the frame, or 0 if unknown + * @return frame duration, in samples, if known. 0 if not able to + * determine. + */ +int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes); + +/* memory */ + +/** + * Same behaviour av_fast_malloc but the buffer has additional + * AV_INPUT_BUFFER_PADDING_SIZE at the end which will always be 0. + * + * In addition the whole buffer will initially and after resizes + * be 0-initialized so that no uninitialized data will ever appear. 
+ */ +void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size); + +/** + * Same behaviour av_fast_padded_malloc except that buffer will always + * be 0-initialized after call. + */ +void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size); + +/** + * @return a positive value if s is open (i.e. avcodec_open2() was called on it + * with no corresponding avcodec_close()), 0 otherwise. + */ +int avcodec_is_open(AVCodecContext *s); + +/** + * @} + */ + +#endif /* AVCODEC_AVCODEC_H */ diff --git a/media/ffvpx/libavcodec/avcodec.symbols b/media/ffvpx/libavcodec/avcodec.symbols new file mode 100644 index 0000000000..b15862fa50 --- /dev/null +++ b/media/ffvpx/libavcodec/avcodec.symbols @@ -0,0 +1,81 @@ +av_codec_ffversion +av_codec_is_decoder +av_codec_is_encoder +av_codec_iterate +av_fast_padded_malloc +av_fast_padded_mallocz +av_get_audio_frame_duration +av_get_bits_per_sample +av_get_exact_bits_per_sample +av_get_pcm_codec +av_get_profile_name +av_grow_packet +av_init_packet +av_new_packet +av_packet_copy_props +av_packet_free_side_data +av_packet_from_data +av_packet_get_side_data +av_packet_move_ref +av_packet_new_side_data +av_packet_pack_dictionary +av_packet_ref +av_packet_rescale_ts +av_packet_shrink_side_data +av_packet_side_data_name +av_packet_unpack_dictionary +av_packet_unref +av_parser_close +av_parser_init +av_parser_parse2 +#ifdef MOZ_LIBAV_FFT +av_rdft_calc +av_rdft_end +av_rdft_init +#endif +av_shrink_packet +av_vorbis_parse_frame +av_vorbis_parse_frame_flags +av_vorbis_parse_free +av_vorbis_parse_init +av_vorbis_parse_reset +av_xiphlacing +avcodec_align_dimensions +avcodec_align_dimensions2 +avcodec_alloc_context3 +avcodec_chroma_pos_to_enum +avcodec_close +avcodec_configuration +avcodec_decode_subtitle2 +avcodec_default_execute +avcodec_default_execute2 +avcodec_default_get_buffer2 +avcodec_default_get_format +avcodec_descriptor_get +avcodec_descriptor_get_by_name +avcodec_descriptor_next +avcodec_enum_to_chroma_pos 
+avcodec_fill_audio_frame +avcodec_find_decoder +avcodec_find_decoder_by_name +avcodec_find_encoder +avcodec_find_encoder_by_name +avcodec_flush_buffers +avcodec_free_context +avcodec_get_class +avcodec_get_hw_config +avcodec_get_name +avcodec_get_subtitle_rect_class +avcodec_get_type +avcodec_is_open +avcodec_license +avcodec_open2 +avcodec_string +avcodec_version +avsubtitle_free +avcodec_send_packet +avcodec_receive_frame +ff_init_vlc_from_lengths +ff_init_vlc_sparse +ff_mpa_freq_tab +ff_mpa_bitrate_tab diff --git a/media/ffvpx/libavcodec/avdct.c b/media/ffvpx/libavcodec/avdct.c new file mode 100644 index 0000000000..e8fa41f73b --- /dev/null +++ b/media/ffvpx/libavcodec/avdct.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "avcodec.h" +#include "idctdsp.h" +#include "fdctdsp.h" +#include "pixblockdsp.h" +#include "avdct.h" + +#define OFFSET(x) offsetof(AVDCT,x) +#define DEFAULT 0 //should be NAN but it does not work as it is not a constant in glibc as required by ANSI/ISO C +//these names are too long to be readable +#define V AV_OPT_FLAG_VIDEO_PARAM +#define A AV_OPT_FLAG_AUDIO_PARAM +#define E AV_OPT_FLAG_ENCODING_PARAM +#define D AV_OPT_FLAG_DECODING_PARAM + +static const AVOption avdct_options[] = { +{"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"}, +{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"}, +{"fastint", "fast integer (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"}, +{"int", "accurate integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_INT }, INT_MIN, INT_MAX, V|E, "dct"}, +{"mmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"}, +{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, "dct"}, +{"faan", "floating point AAN DCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, "dct"}, + +{"idct", "select IDCT implementation", OFFSET(idct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E|D, "idct"}, +{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_AUTO }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"int", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_INT }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simple", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, 
{.i64 = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplemmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"arm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"altivec", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ALTIVEC }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplearm", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARM }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplearmv5te", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV5TE }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplearmv6", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV6 }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simpleneon", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLENEON }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"xvid", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"xvidmmx", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"faani", "floating point AAN IDCT (experimental / for debugging)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"}, +{"simpleauto", "experimental / for debugging", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"}, + +{"bits_per_sample", "", OFFSET(bits_per_sample), AV_OPT_TYPE_INT, {.i64 = 8 }, 0, 14, 0,}, +{NULL}, +}; + +static const AVClass avdct_class = { + .class_name = "AVDCT", + .option = avdct_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const AVClass *avcodec_dct_get_class(void) +{ + return &avdct_class; +} + +AVDCT *avcodec_dct_alloc(void) +{ + AVDCT *dsp = av_mallocz(sizeof(AVDCT)); + + if (!dsp) + return NULL; + + dsp->av_class = &avdct_class; + av_opt_set_defaults(dsp); + + 
return dsp; +} + +int avcodec_dct_init(AVDCT *dsp) +{ + AVCodecContext *avctx = avcodec_alloc_context3(NULL); + + if (!avctx) + return AVERROR(ENOMEM); + + avctx->idct_algo = dsp->idct_algo; + avctx->dct_algo = dsp->dct_algo; + avctx->bits_per_raw_sample = dsp->bits_per_sample; + +#define COPY(src, name) memcpy(&dsp->name, &src.name, sizeof(dsp->name)) + +#if CONFIG_IDCTDSP + { + IDCTDSPContext idsp = {0}; + ff_idctdsp_init(&idsp, avctx); + COPY(idsp, idct); + COPY(idsp, idct_permutation); + } +#endif + +#if CONFIG_FDCTDSP + { + FDCTDSPContext fdsp; + ff_fdctdsp_init(&fdsp, avctx); + COPY(fdsp, fdct); + } +#endif + +#if CONFIG_PIXBLOCKDSP + { + PixblockDSPContext pdsp; + ff_pixblockdsp_init(&pdsp, avctx); + COPY(pdsp, get_pixels); + COPY(pdsp, get_pixels_unaligned); + } +#endif + + avcodec_free_context(&avctx); + + return 0; +} diff --git a/media/ffvpx/libavcodec/avdct.h b/media/ffvpx/libavcodec/avdct.h new file mode 100644 index 0000000000..6411fab6f6 --- /dev/null +++ b/media/ffvpx/libavcodec/avdct.h @@ -0,0 +1,88 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AVDCT_H +#define AVCODEC_AVDCT_H + +#include "libavutil/opt.h" + +/** + * AVDCT context. 
+ * @note function pointers can be NULL if the specific features have been + * disabled at build time. + */ +typedef struct AVDCT { + const AVClass *av_class; + + void (*idct)(int16_t *block /* align 16 */); + + /** + * IDCT input permutation. + * Several optimized IDCTs need a permutated input (relative to the + * normal order of the reference IDCT). + * This permutation must be performed before the idct_put/add. + * Note, normally this can be merged with the zigzag/alternate scan<br> + * An example to avoid confusion: + * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...) + * - (x -> reference DCT -> reference IDCT -> x) + * - (x -> reference DCT -> simple_mmx_perm = idct_permutation + * -> simple_idct_mmx -> x) + * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant + * -> simple_idct_mmx -> ...) + */ + uint8_t idct_permutation[64]; + + void (*fdct)(int16_t *block /* align 16 */); + + + /** + * DCT algorithm. + * must use AVOptions to set this field. + */ + int dct_algo; + + /** + * IDCT algorithm. + * must use AVOptions to set this field. + */ + int idct_algo; + + void (*get_pixels)(int16_t *block /* align 16 */, + const uint8_t *pixels /* align 8 */, + ptrdiff_t line_size); + + int bits_per_sample; + + void (*get_pixels_unaligned)(int16_t *block /* align 16 */, + const uint8_t *pixels, + ptrdiff_t line_size); +} AVDCT; + +/** + * Allocates a AVDCT context. + * This needs to be initialized with avcodec_dct_init() after optionally + * configuring it with AVOptions. + * + * To free it use av_free() + */ +AVDCT *avcodec_dct_alloc(void); +int avcodec_dct_init(AVDCT *); + +const AVClass *avcodec_dct_get_class(void); + +#endif /* AVCODEC_AVDCT_H */ diff --git a/media/ffvpx/libavcodec/avfft.c b/media/ffvpx/libavcodec/avfft.c new file mode 100644 index 0000000000..2200f37708 --- /dev/null +++ b/media/ffvpx/libavcodec/avfft.c @@ -0,0 +1,145 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/mem.h" +#include "avfft.h" +#include "fft.h" +#include "rdft.h" +#include "dct.h" + +/* FFT */ + +FFTContext *av_fft_init(int nbits, int inverse) +{ + FFTContext *s = av_mallocz(sizeof(*s)); + + if (s && ff_fft_init(s, nbits, inverse)) + av_freep(&s); + + return s; +} + +void av_fft_permute(FFTContext *s, FFTComplex *z) +{ + s->fft_permute(s, z); +} + +void av_fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} + +av_cold void av_fft_end(FFTContext *s) +{ + if (s) { + ff_fft_end(s); + av_free(s); + } +} + +#if CONFIG_MDCT + +FFTContext *av_mdct_init(int nbits, int inverse, double scale) +{ + FFTContext *s = av_malloc(sizeof(*s)); + + if (s && ff_mdct_init(s, nbits, inverse, scale)) + av_freep(&s); + + return s; +} + +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_calc(s, output, input); +} + +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_half(s, output, input); +} + +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->mdct_calc(s, output, input); +} + +av_cold void av_mdct_end(FFTContext *s) +{ + if (s) { + ff_mdct_end(s); + av_free(s); + } +} + 
+#endif /* CONFIG_MDCT */ + +#if CONFIG_RDFT + +RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans) +{ + RDFTContext *s = av_malloc(sizeof(*s)); + + if (s && ff_rdft_init(s, nbits, trans)) + av_freep(&s); + + return s; +} + +void av_rdft_calc(RDFTContext *s, FFTSample *data) +{ + s->rdft_calc(s, data); +} + +av_cold void av_rdft_end(RDFTContext *s) +{ + if (s) { + ff_rdft_end(s); + av_free(s); + } +} + +#endif /* CONFIG_RDFT */ + +#if CONFIG_DCT + +DCTContext *av_dct_init(int nbits, enum DCTTransformType inverse) +{ + DCTContext *s = av_malloc(sizeof(*s)); + + if (s && ff_dct_init(s, nbits, inverse)) + av_freep(&s); + + return s; +} + +void av_dct_calc(DCTContext *s, FFTSample *data) +{ + s->dct_calc(s, data); +} + +av_cold void av_dct_end(DCTContext *s) +{ + if (s) { + ff_dct_end(s); + av_free(s); + } +} + +#endif /* CONFIG_DCT */ diff --git a/media/ffvpx/libavcodec/avfft.h b/media/ffvpx/libavcodec/avfft.h new file mode 100644 index 0000000000..0c0f9b8d8d --- /dev/null +++ b/media/ffvpx/libavcodec/avfft.h @@ -0,0 +1,118 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AVFFT_H +#define AVCODEC_AVFFT_H + +/** + * @file + * @ingroup lavc_fft + * FFT functions + */ + +/** + * @defgroup lavc_fft FFT functions + * @ingroup lavc_misc + * + * @{ + */ + +typedef float FFTSample; + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +typedef struct FFTContext FFTContext; + +/** + * Set up a complex FFT. + * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +FFTContext *av_fft_init(int nbits, int inverse); + +/** + * Do the permutation needed BEFORE calling ff_fft_calc(). + */ +void av_fft_permute(FFTContext *s, FFTComplex *z); + +/** + * Do a complex FFT with the parameters defined in av_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. + */ +void av_fft_calc(FFTContext *s, FFTComplex *z); + +void av_fft_end(FFTContext *s); + +FFTContext *av_mdct_init(int nbits, int inverse, double scale); +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_end(FFTContext *s); + +/* Real Discrete Fourier Transform */ + +enum RDFTransformType { + DFT_R2C, + IDFT_C2R, + IDFT_R2C, + DFT_C2R, +}; + +typedef struct RDFTContext RDFTContext; + +/** + * Set up a real FFT. 
+ * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans); +void av_rdft_calc(RDFTContext *s, FFTSample *data); +void av_rdft_end(RDFTContext *s); + +/* Discrete Cosine Transform */ + +typedef struct DCTContext DCTContext; + +enum DCTTransformType { + DCT_II = 0, + DCT_III, + DCT_I, + DST_I, +}; + +/** + * Set up DCT. + * + * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * @param type the type of transform + * + * @note the first element of the input of DST-I is ignored + */ +DCTContext *av_dct_init(int nbits, enum DCTTransformType type); +void av_dct_calc(DCTContext *s, FFTSample *data); +void av_dct_end (DCTContext *s); + +/** + * @} + */ + +#endif /* AVCODEC_AVFFT_H */ diff --git a/media/ffvpx/libavcodec/avpacket.c b/media/ffvpx/libavcodec/avpacket.c new file mode 100644 index 0000000000..5fef65e97a --- /dev/null +++ b/media/ffvpx/libavcodec/avpacket.c @@ -0,0 +1,647 @@ +/* + * AVPacket functions for libavcodec + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <string.h> + +#include "libavutil/avassert.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mathematics.h" +#include "libavutil/mem.h" +#include "libavutil/rational.h" + +#include "defs.h" +#include "packet.h" +#include "packet_internal.h" + +#if FF_API_INIT_PACKET +void av_init_packet(AVPacket *pkt) +{ + pkt->pts = AV_NOPTS_VALUE; + pkt->dts = AV_NOPTS_VALUE; + pkt->pos = -1; + pkt->duration = 0; + pkt->flags = 0; + pkt->stream_index = 0; + pkt->buf = NULL; + pkt->side_data = NULL; + pkt->side_data_elems = 0; + pkt->opaque = NULL; + pkt->opaque_ref = NULL; + pkt->time_base = av_make_q(0, 1); +} +#endif + +static void get_packet_defaults(AVPacket *pkt) +{ + memset(pkt, 0, sizeof(*pkt)); + + pkt->pts = AV_NOPTS_VALUE; + pkt->dts = AV_NOPTS_VALUE; + pkt->pos = -1; + pkt->time_base = av_make_q(0, 1); +} + +AVPacket *av_packet_alloc(void) +{ + AVPacket *pkt = av_malloc(sizeof(AVPacket)); + if (!pkt) + return pkt; + + get_packet_defaults(pkt); + + return pkt; +} + +void av_packet_free(AVPacket **pkt) +{ + if (!pkt || !*pkt) + return; + + av_packet_unref(*pkt); + av_freep(pkt); +} + +static int packet_alloc(AVBufferRef **buf, int size) +{ + int ret; + if (size < 0 || size >= INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) + return AVERROR(EINVAL); + + ret = av_buffer_realloc(buf, size + AV_INPUT_BUFFER_PADDING_SIZE); + if (ret < 0) + return ret; + + memset((*buf)->data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + + return 0; +} + +int av_new_packet(AVPacket *pkt, int size) +{ + AVBufferRef *buf = NULL; + int ret = packet_alloc(&buf, size); + if (ret < 0) + return ret; + + get_packet_defaults(pkt); + pkt->buf = buf; + pkt->data = buf->data; + pkt->size = size; + + return 0; +} + +void av_shrink_packet(AVPacket *pkt, int size) +{ + if 
(pkt->size <= size) + return; + pkt->size = size; + memset(pkt->data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE); +} + +int av_grow_packet(AVPacket *pkt, int grow_by) +{ + int new_size; + av_assert0((unsigned)pkt->size <= INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE); + if ((unsigned)grow_by > + INT_MAX - (pkt->size + AV_INPUT_BUFFER_PADDING_SIZE)) + return AVERROR(ENOMEM); + + new_size = pkt->size + grow_by + AV_INPUT_BUFFER_PADDING_SIZE; + if (pkt->buf) { + size_t data_offset; + uint8_t *old_data = pkt->data; + if (pkt->data == NULL) { + data_offset = 0; + pkt->data = pkt->buf->data; + } else { + data_offset = pkt->data - pkt->buf->data; + if (data_offset > INT_MAX - new_size) + return AVERROR(ENOMEM); + } + + if (new_size + data_offset > pkt->buf->size || + !av_buffer_is_writable(pkt->buf)) { + int ret; + + // allocate slightly more than requested to avoid excessive + // reallocations + if (new_size + data_offset < INT_MAX - new_size/16) + new_size += new_size/16; + + ret = av_buffer_realloc(&pkt->buf, new_size + data_offset); + if (ret < 0) { + pkt->data = old_data; + return ret; + } + pkt->data = pkt->buf->data + data_offset; + } + } else { + pkt->buf = av_buffer_alloc(new_size); + if (!pkt->buf) + return AVERROR(ENOMEM); + if (pkt->size > 0) + memcpy(pkt->buf->data, pkt->data, pkt->size); + pkt->data = pkt->buf->data; + } + pkt->size += grow_by; + memset(pkt->data + pkt->size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + + return 0; +} + +int av_packet_from_data(AVPacket *pkt, uint8_t *data, int size) +{ + if (size >= INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) + return AVERROR(EINVAL); + + pkt->buf = av_buffer_create(data, size + AV_INPUT_BUFFER_PADDING_SIZE, + av_buffer_default_free, NULL, 0); + if (!pkt->buf) + return AVERROR(ENOMEM); + + pkt->data = data; + pkt->size = size; + + return 0; +} + +void av_packet_free_side_data(AVPacket *pkt) +{ + int i; + for (i = 0; i < pkt->side_data_elems; i++) + av_freep(&pkt->side_data[i].data); + av_freep(&pkt->side_data); + 
pkt->side_data_elems = 0; +} + +int av_packet_add_side_data(AVPacket *pkt, enum AVPacketSideDataType type, + uint8_t *data, size_t size) +{ + AVPacketSideData *tmp; + int i, elems = pkt->side_data_elems; + + for (i = 0; i < elems; i++) { + AVPacketSideData *sd = &pkt->side_data[i]; + + if (sd->type == type) { + av_free(sd->data); + sd->data = data; + sd->size = size; + return 0; + } + } + + if ((unsigned)elems + 1 > AV_PKT_DATA_NB) + return AVERROR(ERANGE); + + tmp = av_realloc(pkt->side_data, (elems + 1) * sizeof(*tmp)); + if (!tmp) + return AVERROR(ENOMEM); + + pkt->side_data = tmp; + pkt->side_data[elems].data = data; + pkt->side_data[elems].size = size; + pkt->side_data[elems].type = type; + pkt->side_data_elems++; + + return 0; +} + + +uint8_t *av_packet_new_side_data(AVPacket *pkt, enum AVPacketSideDataType type, + size_t size) +{ + int ret; + uint8_t *data; + + if (size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) + return NULL; + data = av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!data) + return NULL; + + ret = av_packet_add_side_data(pkt, type, data, size); + if (ret < 0) { + av_freep(&data); + return NULL; + } + + return data; +} + +uint8_t *av_packet_get_side_data(const AVPacket *pkt, enum AVPacketSideDataType type, + size_t *size) +{ + int i; + + for (i = 0; i < pkt->side_data_elems; i++) { + if (pkt->side_data[i].type == type) { + if (size) + *size = pkt->side_data[i].size; + return pkt->side_data[i].data; + } + } + if (size) + *size = 0; + return NULL; +} + +const char *av_packet_side_data_name(enum AVPacketSideDataType type) +{ + switch(type) { + case AV_PKT_DATA_PALETTE: return "Palette"; + case AV_PKT_DATA_NEW_EXTRADATA: return "New Extradata"; + case AV_PKT_DATA_PARAM_CHANGE: return "Param Change"; + case AV_PKT_DATA_H263_MB_INFO: return "H263 MB Info"; + case AV_PKT_DATA_REPLAYGAIN: return "Replay Gain"; + case AV_PKT_DATA_DISPLAYMATRIX: return "Display Matrix"; + case AV_PKT_DATA_STEREO3D: return "Stereo 3D"; + case 
AV_PKT_DATA_AUDIO_SERVICE_TYPE: return "Audio Service Type"; + case AV_PKT_DATA_QUALITY_STATS: return "Quality stats"; + case AV_PKT_DATA_FALLBACK_TRACK: return "Fallback track"; + case AV_PKT_DATA_CPB_PROPERTIES: return "CPB properties"; + case AV_PKT_DATA_SKIP_SAMPLES: return "Skip Samples"; + case AV_PKT_DATA_JP_DUALMONO: return "JP Dual Mono"; + case AV_PKT_DATA_STRINGS_METADATA: return "Strings Metadata"; + case AV_PKT_DATA_SUBTITLE_POSITION: return "Subtitle Position"; + case AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL: return "Matroska BlockAdditional"; + case AV_PKT_DATA_WEBVTT_IDENTIFIER: return "WebVTT ID"; + case AV_PKT_DATA_WEBVTT_SETTINGS: return "WebVTT Settings"; + case AV_PKT_DATA_METADATA_UPDATE: return "Metadata Update"; + case AV_PKT_DATA_MPEGTS_STREAM_ID: return "MPEGTS Stream ID"; + case AV_PKT_DATA_MASTERING_DISPLAY_METADATA: return "Mastering display metadata"; + case AV_PKT_DATA_CONTENT_LIGHT_LEVEL: return "Content light level metadata"; + case AV_PKT_DATA_SPHERICAL: return "Spherical Mapping"; + case AV_PKT_DATA_A53_CC: return "A53 Closed Captions"; + case AV_PKT_DATA_ENCRYPTION_INIT_INFO: return "Encryption initialization data"; + case AV_PKT_DATA_ENCRYPTION_INFO: return "Encryption info"; + case AV_PKT_DATA_AFD: return "Active Format Description data"; + case AV_PKT_DATA_PRFT: return "Producer Reference Time"; + case AV_PKT_DATA_ICC_PROFILE: return "ICC Profile"; + case AV_PKT_DATA_DOVI_CONF: return "DOVI configuration record"; + case AV_PKT_DATA_S12M_TIMECODE: return "SMPTE ST 12-1:2014 timecode"; + case AV_PKT_DATA_DYNAMIC_HDR10_PLUS: return "HDR10+ Dynamic Metadata (SMPTE 2094-40)"; + } + return NULL; +} + +uint8_t *av_packet_pack_dictionary(AVDictionary *dict, size_t *size) +{ + uint8_t *data = NULL; + *size = 0; + + if (!dict) + return NULL; + + for (int pass = 0; pass < 2; pass++) { + const AVDictionaryEntry *t = NULL; + size_t total_length = 0; + + while ((t = av_dict_iterate(dict, t))) { + for (int i = 0; i < 2; i++) { + const char *str 
= i ? t->value : t->key; + const size_t len = strlen(str) + 1; + + if (pass) + memcpy(data + total_length, str, len); + else if (len > SIZE_MAX - total_length) + return NULL; + total_length += len; + } + } + if (pass) + break; + data = av_malloc(total_length); + if (!data) + return NULL; + *size = total_length; + } + + return data; +} + +int av_packet_unpack_dictionary(const uint8_t *data, size_t size, + AVDictionary **dict) +{ + const uint8_t *end; + int ret; + + if (!dict || !data || !size) + return 0; + end = data + size; + if (size && end[-1]) + return AVERROR_INVALIDDATA; + while (data < end) { + const uint8_t *key = data; + const uint8_t *val = data + strlen(key) + 1; + + if (val >= end || !*key) + return AVERROR_INVALIDDATA; + + ret = av_dict_set(dict, key, val, 0); + if (ret < 0) + return ret; + data = val + strlen(val) + 1; + } + + return 0; +} + +int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type, + size_t size) +{ + int i; + + for (i = 0; i < pkt->side_data_elems; i++) { + if (pkt->side_data[i].type == type) { + if (size > pkt->side_data[i].size) + return AVERROR(ENOMEM); + pkt->side_data[i].size = size; + return 0; + } + } + return AVERROR(ENOENT); +} + +int av_packet_copy_props(AVPacket *dst, const AVPacket *src) +{ + int i, ret; + + dst->pts = src->pts; + dst->dts = src->dts; + dst->pos = src->pos; + dst->duration = src->duration; + dst->flags = src->flags; + dst->stream_index = src->stream_index; + dst->opaque = src->opaque; + dst->time_base = src->time_base; + dst->opaque_ref = NULL; + dst->side_data = NULL; + dst->side_data_elems = 0; + + ret = av_buffer_replace(&dst->opaque_ref, src->opaque_ref); + if (ret < 0) + return ret; + + for (i = 0; i < src->side_data_elems; i++) { + enum AVPacketSideDataType type = src->side_data[i].type; + size_t size = src->side_data[i].size; + uint8_t *src_data = src->side_data[i].data; + uint8_t *dst_data = av_packet_new_side_data(dst, type, size); + + if (!dst_data) { + 
av_buffer_unref(&dst->opaque_ref); + av_packet_free_side_data(dst); + return AVERROR(ENOMEM); + } + memcpy(dst_data, src_data, size); + } + + return 0; +} + +void av_packet_unref(AVPacket *pkt) +{ + av_packet_free_side_data(pkt); + av_buffer_unref(&pkt->opaque_ref); + av_buffer_unref(&pkt->buf); + get_packet_defaults(pkt); +} + +int av_packet_ref(AVPacket *dst, const AVPacket *src) +{ + int ret; + + dst->buf = NULL; + + ret = av_packet_copy_props(dst, src); + if (ret < 0) + goto fail; + + if (!src->buf) { + ret = packet_alloc(&dst->buf, src->size); + if (ret < 0) + goto fail; + av_assert1(!src->size || src->data); + if (src->size) + memcpy(dst->buf->data, src->data, src->size); + + dst->data = dst->buf->data; + } else { + dst->buf = av_buffer_ref(src->buf); + if (!dst->buf) { + ret = AVERROR(ENOMEM); + goto fail; + } + dst->data = src->data; + } + + dst->size = src->size; + + return 0; +fail: + av_packet_unref(dst); + return ret; +} + +AVPacket *av_packet_clone(const AVPacket *src) +{ + AVPacket *ret = av_packet_alloc(); + + if (!ret) + return ret; + + if (av_packet_ref(ret, src)) + av_packet_free(&ret); + + return ret; +} + +void av_packet_move_ref(AVPacket *dst, AVPacket *src) +{ + *dst = *src; + get_packet_defaults(src); +} + +int av_packet_make_refcounted(AVPacket *pkt) +{ + int ret; + + if (pkt->buf) + return 0; + + ret = packet_alloc(&pkt->buf, pkt->size); + if (ret < 0) + return ret; + av_assert1(!pkt->size || pkt->data); + if (pkt->size) + memcpy(pkt->buf->data, pkt->data, pkt->size); + + pkt->data = pkt->buf->data; + + return 0; +} + +int av_packet_make_writable(AVPacket *pkt) +{ + AVBufferRef *buf = NULL; + int ret; + + if (pkt->buf && av_buffer_is_writable(pkt->buf)) + return 0; + + ret = packet_alloc(&buf, pkt->size); + if (ret < 0) + return ret; + av_assert1(!pkt->size || pkt->data); + if (pkt->size) + memcpy(buf->data, pkt->data, pkt->size); + + av_buffer_unref(&pkt->buf); + pkt->buf = buf; + pkt->data = buf->data; + + return 0; +} + +void 
av_packet_rescale_ts(AVPacket *pkt, AVRational src_tb, AVRational dst_tb) +{ + if (pkt->pts != AV_NOPTS_VALUE) + pkt->pts = av_rescale_q(pkt->pts, src_tb, dst_tb); + if (pkt->dts != AV_NOPTS_VALUE) + pkt->dts = av_rescale_q(pkt->dts, src_tb, dst_tb); + if (pkt->duration > 0) + pkt->duration = av_rescale_q(pkt->duration, src_tb, dst_tb); +} + +int avpriv_packet_list_put(PacketList *packet_buffer, + AVPacket *pkt, + int (*copy)(AVPacket *dst, const AVPacket *src), + int flags) +{ + PacketListEntry *pktl = av_malloc(sizeof(*pktl)); + int ret; + + if (!pktl) + return AVERROR(ENOMEM); + + if (copy) { + get_packet_defaults(&pktl->pkt); + ret = copy(&pktl->pkt, pkt); + if (ret < 0) { + av_free(pktl); + return ret; + } + } else { + ret = av_packet_make_refcounted(pkt); + if (ret < 0) { + av_free(pktl); + return ret; + } + av_packet_move_ref(&pktl->pkt, pkt); + } + + pktl->next = NULL; + + if (packet_buffer->head) + packet_buffer->tail->next = pktl; + else + packet_buffer->head = pktl; + + /* Add the packet in the buffered packet list. 
*/ + packet_buffer->tail = pktl; + return 0; +} + +int avpriv_packet_list_get(PacketList *pkt_buffer, + AVPacket *pkt) +{ + PacketListEntry *pktl = pkt_buffer->head; + if (!pktl) + return AVERROR(EAGAIN); + *pkt = pktl->pkt; + pkt_buffer->head = pktl->next; + if (!pkt_buffer->head) + pkt_buffer->tail = NULL; + av_freep(&pktl); + return 0; +} + +void avpriv_packet_list_free(PacketList *pkt_buf) +{ + PacketListEntry *tmp = pkt_buf->head; + + while (tmp) { + PacketListEntry *pktl = tmp; + tmp = pktl->next; + av_packet_unref(&pktl->pkt); + av_freep(&pktl); + } + pkt_buf->head = pkt_buf->tail = NULL; +} + +int ff_side_data_set_encoder_stats(AVPacket *pkt, int quality, int64_t *error, int error_count, int pict_type) +{ + uint8_t *side_data; + size_t side_data_size; + int i; + + side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_QUALITY_STATS, &side_data_size); + if (!side_data) { + side_data_size = 4+4+8*error_count; + side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_QUALITY_STATS, + side_data_size); + } + + if (!side_data || side_data_size < 4+4+8*error_count) + return AVERROR(ENOMEM); + + AV_WL32(side_data , quality ); + side_data[4] = pict_type; + side_data[5] = error_count; + for (i = 0; i<error_count; i++) + AV_WL64(side_data+8 + 8*i , error[i]); + + return 0; +} + +int ff_side_data_set_prft(AVPacket *pkt, int64_t timestamp) +{ + AVProducerReferenceTime *prft; + uint8_t *side_data; + size_t side_data_size; + + side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_PRFT, &side_data_size); + if (!side_data) { + side_data_size = sizeof(AVProducerReferenceTime); + side_data = av_packet_new_side_data(pkt, AV_PKT_DATA_PRFT, side_data_size); + } + + if (!side_data || side_data_size < sizeof(AVProducerReferenceTime)) + return AVERROR(ENOMEM); + + prft = (AVProducerReferenceTime *)side_data; + prft->wallclock = timestamp; + prft->flags = 0; + + return 0; +} diff --git a/media/ffvpx/libavcodec/avpicture.c b/media/ffvpx/libavcodec/avpicture.c new file mode 100644 index 
0000000000..56435f4fc9 --- /dev/null +++ b/media/ffvpx/libavcodec/avpicture.c @@ -0,0 +1,82 @@ +/* + * AVPicture management routines + * Copyright (c) 2001, 2002, 2003 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * AVPicture management routines + */ + +#include "avcodec.h" +#include "internal.h" +#include "libavutil/common.h" +#include "libavutil/pixdesc.h" +#include "libavutil/imgutils.h" +#include "libavutil/internal.h" +#include "libavutil/colorspace.h" + +#if FF_API_AVPICTURE +FF_DISABLE_DEPRECATION_WARNINGS +int avpicture_fill(AVPicture *picture, const uint8_t *ptr, + enum AVPixelFormat pix_fmt, int width, int height) +{ + return av_image_fill_arrays(picture->data, picture->linesize, + ptr, pix_fmt, width, height, 1); +} + +int avpicture_layout(const AVPicture* src, enum AVPixelFormat pix_fmt, int width, int height, + unsigned char *dest, int dest_size) +{ + return av_image_copy_to_buffer(dest, dest_size, + (const uint8_t * const*)src->data, src->linesize, + pix_fmt, width, height, 1); +} + +int avpicture_get_size(enum AVPixelFormat pix_fmt, int width, int height) +{ + return av_image_get_buffer_size(pix_fmt, width, height, 1); +} + +int avpicture_alloc(AVPicture *picture, + enum AVPixelFormat pix_fmt, int 
width, int height) +{ + int ret = av_image_alloc(picture->data, picture->linesize, + width, height, pix_fmt, 1); + if (ret < 0) { + memset(picture, 0, sizeof(AVPicture)); + return ret; + } + + return 0; +} + +void avpicture_free(AVPicture *picture) +{ + av_freep(&picture->data[0]); +} + +void av_picture_copy(AVPicture *dst, const AVPicture *src, + enum AVPixelFormat pix_fmt, int width, int height) +{ + av_image_copy(dst->data, dst->linesize, (const uint8_t **)src->data, + src->linesize, pix_fmt, width, height); +} +FF_ENABLE_DEPRECATION_WARNINGS +#endif /* FF_API_AVPICTURE */ diff --git a/media/ffvpx/libavcodec/bit_depth_template.c b/media/ffvpx/libavcodec/bit_depth_template.c new file mode 100644 index 0000000000..d44d47ea45 --- /dev/null +++ b/media/ffvpx/libavcodec/bit_depth_template.c @@ -0,0 +1,108 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "mathops.h" +#include "rnd_avg.h" +#include "libavutil/intreadwrite.h" + +#ifndef BIT_DEPTH +#define BIT_DEPTH 8 +#endif + +#ifdef AVCODEC_BIT_DEPTH_TEMPLATE_C +# undef pixel +# undef pixel2 +# undef pixel4 +# undef dctcoef +# undef idctin +# undef INIT_CLIP +# undef no_rnd_avg_pixel4 +# undef rnd_avg_pixel4 +# undef AV_RN2P +# undef AV_RN4P +# undef AV_RN4PA +# undef AV_WN2P +# undef AV_WN4P +# undef AV_WN4PA +# undef CLIP +# undef FUNC +# undef FUNCC +# undef av_clip_pixel +# undef PIXEL_SPLAT_X4 +#else +# define AVCODEC_BIT_DEPTH_TEMPLATE_C +#endif + +#if BIT_DEPTH > 8 +# define pixel uint16_t +# define pixel2 uint32_t +# define pixel4 uint64_t +# define dctcoef int32_t + +#ifdef IN_IDCT_DEPTH +#if IN_IDCT_DEPTH == 32 +# define idctin int32_t +#else +# define idctin int16_t +#endif +#else +# define idctin int16_t +#endif + +# define INIT_CLIP +# define no_rnd_avg_pixel4 no_rnd_avg64 +# define rnd_avg_pixel4 rnd_avg64 +# define AV_RN2P AV_RN32 +# define AV_RN4P AV_RN64 +# define AV_RN4PA AV_RN64A +# define AV_WN2P AV_WN32 +# define AV_WN4P AV_WN64 +# define AV_WN4PA AV_WN64A +# define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL) + +# define av_clip_pixel(a) av_clip_uintp2(a, BIT_DEPTH) +# define CLIP(a) av_clip_uintp2(a, BIT_DEPTH) +#else +# define pixel uint8_t +# define pixel2 uint16_t +# define pixel4 uint32_t +# define dctcoef int16_t +# define idctin int16_t + +# define INIT_CLIP +# define no_rnd_avg_pixel4 no_rnd_avg32 +# define rnd_avg_pixel4 rnd_avg32 +# define AV_RN2P AV_RN16 +# define AV_RN4P AV_RN32 +# define AV_RN4PA AV_RN32A +# define AV_WN2P AV_WN16 +# define AV_WN4P AV_WN32 +# define AV_WN4PA AV_WN32A +# define PIXEL_SPLAT_X4(x) ((x)*0x01010101U) + +# define av_clip_pixel(a) av_clip_uint8(a) +# define CLIP(a) 
av_clip_uint8(a) +#endif + +#define FUNC3(a, b, c) a ## _ ## b ## c +#define FUNC2(a, b, c) FUNC3(a, b, c) +#define FUNC(a) FUNC2(a, BIT_DEPTH,) +#define FUNCC(a) FUNC2(a, BIT_DEPTH, _c) +#define FUNC4(a, b, c) a ## _int ## b ## _ ## c ## bit +#define FUNC5(a, b, c) FUNC4(a, b, c) +#define FUNC6(a) FUNC5(a, IN_IDCT_DEPTH, BIT_DEPTH) diff --git a/media/ffvpx/libavcodec/bitstream.c b/media/ffvpx/libavcodec/bitstream.c new file mode 100644 index 0000000000..3606575055 --- /dev/null +++ b/media/ffvpx/libavcodec/bitstream.c @@ -0,0 +1,72 @@ +/* + * Common bit i/o utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2010 Loren Merritt + * + * alternative bitstream reader & writer by Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * bitstream api. 
+ */ + +#include <stdint.h> +#include <string.h> + +#include "config.h" +#include "libavutil/avassert.h" +#include "libavutil/intreadwrite.h" +#include "put_bits.h" + +void ff_put_string(PutBitContext *pb, const char *string, int terminate_string) +{ + while (*string) { + put_bits(pb, 8, *string); + string++; + } + if (terminate_string) + put_bits(pb, 8, 0); +} + +void ff_copy_bits(PutBitContext *pb, const uint8_t *src, int length) +{ + int words = length >> 4; + int bits = length & 15; + int i; + + if (length == 0) + return; + + av_assert0(length <= put_bits_left(pb)); + + if (CONFIG_SMALL || words < 16 || put_bits_count(pb) & 7) { + for (i = 0; i < words; i++) + put_bits(pb, 16, AV_RB16(src + 2 * i)); + } else { + for (i = 0; put_bits_count(pb) & 31; i++) + put_bits(pb, 8, src[i]); + flush_put_bits(pb); + memcpy(put_bits_ptr(pb), src + i, 2 * words - i); + skip_put_bytes(pb, 2 * words - i); + } + + put_bits(pb, bits, AV_RB16(src + 2 * words) >> (16 - bits)); +} diff --git a/media/ffvpx/libavcodec/bitstream_filters.c b/media/ffvpx/libavcodec/bitstream_filters.c new file mode 100644 index 0000000000..e8216819ca --- /dev/null +++ b/media/ffvpx/libavcodec/bitstream_filters.c @@ -0,0 +1,109 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include <string.h> + +#include "libavutil/log.h" + +#include "bsf.h" +#include "bsf_internal.h" + +extern const FFBitStreamFilter ff_aac_adtstoasc_bsf; +extern const FFBitStreamFilter ff_av1_frame_merge_bsf; +extern const FFBitStreamFilter ff_av1_frame_split_bsf; +extern const FFBitStreamFilter ff_av1_metadata_bsf; +extern const FFBitStreamFilter ff_chomp_bsf; +extern const FFBitStreamFilter ff_dump_extradata_bsf; +extern const FFBitStreamFilter ff_dca_core_bsf; +extern const FFBitStreamFilter ff_dts2pts_bsf; +extern const FFBitStreamFilter ff_dv_error_marker_bsf; +extern const FFBitStreamFilter ff_eac3_core_bsf; +extern const FFBitStreamFilter ff_extract_extradata_bsf; +extern const FFBitStreamFilter ff_filter_units_bsf; +extern const FFBitStreamFilter ff_h264_metadata_bsf; +extern const FFBitStreamFilter ff_h264_mp4toannexb_bsf; +extern const FFBitStreamFilter ff_h264_redundant_pps_bsf; +extern const FFBitStreamFilter ff_hapqa_extract_bsf; +extern const FFBitStreamFilter ff_hevc_metadata_bsf; +extern const FFBitStreamFilter ff_hevc_mp4toannexb_bsf; +extern const FFBitStreamFilter ff_imx_dump_header_bsf; +extern const FFBitStreamFilter ff_media100_to_mjpegb_bsf; +extern const FFBitStreamFilter ff_mjpeg2jpeg_bsf; +extern const FFBitStreamFilter ff_mjpega_dump_header_bsf; +extern const FFBitStreamFilter ff_mp3_header_decompress_bsf; +extern const FFBitStreamFilter ff_mpeg2_metadata_bsf; +extern const FFBitStreamFilter ff_mpeg4_unpack_bframes_bsf; +extern const FFBitStreamFilter ff_mov2textsub_bsf; +extern const FFBitStreamFilter ff_noise_bsf; +extern const FFBitStreamFilter ff_null_bsf; +extern const FFBitStreamFilter ff_opus_metadata_bsf; +extern const FFBitStreamFilter ff_pcm_rechunk_bsf; +extern const 
FFBitStreamFilter ff_pgs_frame_merge_bsf; +extern const FFBitStreamFilter ff_prores_metadata_bsf; +extern const FFBitStreamFilter ff_remove_extradata_bsf; +extern const FFBitStreamFilter ff_setts_bsf; +extern const FFBitStreamFilter ff_text2movsub_bsf; +extern const FFBitStreamFilter ff_trace_headers_bsf; +extern const FFBitStreamFilter ff_truehd_core_bsf; +extern const FFBitStreamFilter ff_vp9_metadata_bsf; +extern const FFBitStreamFilter ff_vp9_raw_reorder_bsf; +extern const FFBitStreamFilter ff_vp9_superframe_bsf; +extern const FFBitStreamFilter ff_vp9_superframe_split_bsf; + +#include "libavcodec/bsf_list.c" + +const AVBitStreamFilter *av_bsf_iterate(void **opaque) +{ + uintptr_t i = (uintptr_t)*opaque; + const FFBitStreamFilter *f = bitstream_filters[i]; + + if (f) { + *opaque = (void*)(i + 1); + return &f->p; + } + return NULL; +} + +const AVBitStreamFilter *av_bsf_get_by_name(const char *name) +{ + const AVBitStreamFilter *f = NULL; + void *i = 0; + + if (!name) + return NULL; + + while ((f = av_bsf_iterate(&i))) { + if (!strcmp(f->name, name)) + return f; + } + + return NULL; +} + +const AVClass *ff_bsf_child_class_iterate(void **opaque) +{ + const AVBitStreamFilter *f; + + /* find next filter with priv options */ + while ((f = av_bsf_iterate(opaque))) { + if (f->priv_class) + return f->priv_class; + } + return NULL; +} diff --git a/media/ffvpx/libavcodec/blockdsp.h b/media/ffvpx/libavcodec/blockdsp.h new file mode 100644 index 0000000000..d853adada2 --- /dev/null +++ b/media/ffvpx/libavcodec/blockdsp.h @@ -0,0 +1,47 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_BLOCKDSP_H +#define AVCODEC_BLOCKDSP_H + +#include <stddef.h> +#include <stdint.h> + +/* add and put pixel (decoding) + * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16. + * h for op_pixels_func is limited to { width / 2, width }, + * but never larger than 16 and never smaller than 4. */ +typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */, + uint8_t value, ptrdiff_t line_size, int h); + +typedef struct BlockDSPContext { + void (*clear_block)(int16_t *block /* align 32 */); + void (*clear_blocks)(int16_t *blocks /* align 32 */); + + op_fill_func fill_block_tab[2]; +} BlockDSPContext; + +void ff_blockdsp_init(BlockDSPContext *c); + +void ff_blockdsp_init_alpha(BlockDSPContext *c); +void ff_blockdsp_init_arm(BlockDSPContext *c); +void ff_blockdsp_init_ppc(BlockDSPContext *c); +void ff_blockdsp_init_x86(BlockDSPContext *c); +void ff_blockdsp_init_mips(BlockDSPContext *c); + +#endif /* AVCODEC_BLOCKDSP_H */ diff --git a/media/ffvpx/libavcodec/bsf.c b/media/ffvpx/libavcodec/bsf.c new file mode 100644 index 0000000000..42cc1b5ab0 --- /dev/null +++ b/media/ffvpx/libavcodec/bsf.c @@ -0,0 +1,562 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <string.h> + +#include "config_components.h" + +#include "libavutil/avassert.h" +#include "libavutil/log.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/avstring.h" +#include "libavutil/bprint.h" + +#include "bsf.h" +#include "bsf_internal.h" +#include "codec_desc.h" +#include "codec_par.h" + +#define IS_EMPTY(pkt) (!(pkt)->data && !(pkt)->side_data_elems) + +static av_always_inline const FFBitStreamFilter *ff_bsf(const AVBitStreamFilter *bsf) +{ + return (const FFBitStreamFilter*)bsf; +} + +typedef struct FFBSFContext { + AVBSFContext pub; + AVPacket *buffer_pkt; + int eof; +} FFBSFContext; + +static av_always_inline FFBSFContext *ffbsfcontext(AVBSFContext *ctx) +{ + return (FFBSFContext *)ctx; +} + +void av_bsf_free(AVBSFContext **pctx) +{ + AVBSFContext *ctx; + FFBSFContext *bsfi; + + if (!pctx || !*pctx) + return; + ctx = *pctx; + bsfi = ffbsfcontext(ctx); + + if (ctx->priv_data) { + if (ff_bsf(ctx->filter)->close) + ff_bsf(ctx->filter)->close(ctx); + if (ctx->filter->priv_class) + av_opt_free(ctx->priv_data); + av_freep(&ctx->priv_data); + } + av_packet_free(&bsfi->buffer_pkt); + + avcodec_parameters_free(&ctx->par_in); + avcodec_parameters_free(&ctx->par_out); + + av_freep(pctx); +} + +static void *bsf_child_next(void *obj, void *prev) +{ + AVBSFContext *ctx = obj; + if (!prev && ctx->filter->priv_class) + return ctx->priv_data; + return NULL; +} + +static const char *bsf_to_name(void *bsf) +{ + return ((AVBSFContext *)bsf)->filter->name; +} + 
+static const AVClass bsf_class = { + .class_name = "AVBSFContext", + .item_name = bsf_to_name, + .version = LIBAVUTIL_VERSION_INT, + .child_next = bsf_child_next, + .child_class_iterate = ff_bsf_child_class_iterate, + .category = AV_CLASS_CATEGORY_BITSTREAM_FILTER, +}; + +const AVClass *av_bsf_get_class(void) +{ + return &bsf_class; +} + +int av_bsf_alloc(const AVBitStreamFilter *filter, AVBSFContext **pctx) +{ + AVBSFContext *ctx; + FFBSFContext *bsfi; + int ret; + + bsfi = av_mallocz(sizeof(*bsfi)); + if (!bsfi) + return AVERROR(ENOMEM); + ctx = &bsfi->pub; + + ctx->av_class = &bsf_class; + ctx->filter = filter; + + ctx->par_in = avcodec_parameters_alloc(); + ctx->par_out = avcodec_parameters_alloc(); + if (!ctx->par_in || !ctx->par_out) { + ret = AVERROR(ENOMEM); + goto fail; + } + /* allocate priv data and init private options */ + if (ff_bsf(filter)->priv_data_size) { + ctx->priv_data = av_mallocz(ff_bsf(filter)->priv_data_size); + if (!ctx->priv_data) { + ret = AVERROR(ENOMEM); + goto fail; + } + if (filter->priv_class) { + *(const AVClass **)ctx->priv_data = filter->priv_class; + av_opt_set_defaults(ctx->priv_data); + } + } + bsfi->buffer_pkt = av_packet_alloc(); + if (!bsfi->buffer_pkt) { + ret = AVERROR(ENOMEM); + goto fail; + } + + *pctx = ctx; + return 0; +fail: + av_bsf_free(&ctx); + return ret; +} + +int av_bsf_init(AVBSFContext *ctx) +{ + int ret, i; + + /* check that the codec is supported */ + if (ctx->filter->codec_ids) { + for (i = 0; ctx->filter->codec_ids[i] != AV_CODEC_ID_NONE; i++) + if (ctx->par_in->codec_id == ctx->filter->codec_ids[i]) + break; + if (ctx->filter->codec_ids[i] == AV_CODEC_ID_NONE) { + const AVCodecDescriptor *desc = avcodec_descriptor_get(ctx->par_in->codec_id); + av_log(ctx, AV_LOG_ERROR, "Codec '%s' (%d) is not supported by the " + "bitstream filter '%s'. Supported codecs are: ", + desc ? 
desc->name : "unknown", ctx->par_in->codec_id, ctx->filter->name); + for (i = 0; ctx->filter->codec_ids[i] != AV_CODEC_ID_NONE; i++) { + enum AVCodecID codec_id = ctx->filter->codec_ids[i]; + av_log(ctx, AV_LOG_ERROR, "%s (%d) ", + avcodec_get_name(codec_id), codec_id); + } + av_log(ctx, AV_LOG_ERROR, "\n"); + return AVERROR(EINVAL); + } + } + + /* initialize output parameters to be the same as input + * init below might overwrite that */ + ret = avcodec_parameters_copy(ctx->par_out, ctx->par_in); + if (ret < 0) + return ret; + + ctx->time_base_out = ctx->time_base_in; + + if (ff_bsf(ctx->filter)->init) { + ret = ff_bsf(ctx->filter)->init(ctx); + if (ret < 0) + return ret; + } + + return 0; +} + +void av_bsf_flush(AVBSFContext *ctx) +{ + FFBSFContext *const bsfi = ffbsfcontext(ctx); + + bsfi->eof = 0; + + av_packet_unref(bsfi->buffer_pkt); + + if (ff_bsf(ctx->filter)->flush) + ff_bsf(ctx->filter)->flush(ctx); +} + +int av_bsf_send_packet(AVBSFContext *ctx, AVPacket *pkt) +{ + FFBSFContext *const bsfi = ffbsfcontext(ctx); + int ret; + + if (!pkt || IS_EMPTY(pkt)) { + if (pkt) + av_packet_unref(pkt); + bsfi->eof = 1; + return 0; + } + + if (bsfi->eof) { + av_log(ctx, AV_LOG_ERROR, "A non-NULL packet sent after an EOF.\n"); + return AVERROR(EINVAL); + } + + if (!IS_EMPTY(bsfi->buffer_pkt)) + return AVERROR(EAGAIN); + + ret = av_packet_make_refcounted(pkt); + if (ret < 0) + return ret; + av_packet_move_ref(bsfi->buffer_pkt, pkt); + + return 0; +} + +int av_bsf_receive_packet(AVBSFContext *ctx, AVPacket *pkt) +{ + return ff_bsf(ctx->filter)->filter(ctx, pkt); +} + +int ff_bsf_get_packet(AVBSFContext *ctx, AVPacket **pkt) +{ + FFBSFContext *const bsfi = ffbsfcontext(ctx); + AVPacket *tmp_pkt; + + if (bsfi->eof) + return AVERROR_EOF; + + if (IS_EMPTY(bsfi->buffer_pkt)) + return AVERROR(EAGAIN); + + tmp_pkt = av_packet_alloc(); + if (!tmp_pkt) + return AVERROR(ENOMEM); + + *pkt = bsfi->buffer_pkt; + bsfi->buffer_pkt = tmp_pkt; + + return 0; +} + +int 
/**
 * Initialize every filter of the chain in order.
 *
 * Each filter receives as input the codec parameters and time base produced
 * by the filter before it (the list's own input for the first one). The
 * output of the last filter becomes the output of the list; with an empty
 * chain the list's input is copied straight to its output.
 *
 * @return the first negative error from avcodec_parameters_copy() or
 *         av_bsf_init(), otherwise the result of the final parameter copy
 */
static int bsf_list_init(AVBSFContext *bsf)
{
    BSFListContext *const chain = bsf->priv_data;
    const AVCodecParameters *upstream_par = bsf->par_in;
    AVRational upstream_tb = bsf->time_base_in;

    for (int n = 0; n < chain->nb_bsfs; n++) {
        AVBSFContext *const cur = chain->bsfs[n];
        int err = avcodec_parameters_copy(cur->par_in, upstream_par);

        if (err < 0)
            return err;

        cur->time_base_in = upstream_tb;

        err = av_bsf_init(cur);
        if (err < 0)
            return err;

        /* This filter's output feeds the next one. */
        upstream_par = cur->par_out;
        upstream_tb  = cur->time_base_out;
    }

    bsf->time_base_out = upstream_tb;
    return avcodec_parameters_copy(bsf->par_out, upstream_par);
}
NULL : out); + av_assert1(ret != AVERROR(EAGAIN)); + if (ret < 0) { + av_packet_unref(out); + return ret; + } + lst->idx++; + eof = 0; + } else if (eof) { + return ret; + } else { + return 0; + } + } +} + +static void bsf_list_flush(AVBSFContext *bsf) +{ + BSFListContext *lst = bsf->priv_data; + + for (int i = 0; i < lst->nb_bsfs; i++) + av_bsf_flush(lst->bsfs[i]); + lst->idx = 0; +} + +static void bsf_list_close(AVBSFContext *bsf) +{ + BSFListContext *lst = bsf->priv_data; + int i; + + for (i = 0; i < lst->nb_bsfs; ++i) + av_bsf_free(&lst->bsfs[i]); + av_freep(&lst->bsfs); + av_freep(&lst->item_name); +} + +static const char *bsf_list_item_name(void *ctx) +{ + static const char *null_filter_name = "null"; + AVBSFContext *bsf_ctx = ctx; + BSFListContext *lst = bsf_ctx->priv_data; + + if (!lst->nb_bsfs) + return null_filter_name; + + if (!lst->item_name) { + int i; + AVBPrint bp; + av_bprint_init(&bp, 16, 128); + + av_bprintf(&bp, "bsf_list("); + for (i = 0; i < lst->nb_bsfs; i++) + av_bprintf(&bp, i ? 
",%s" : "%s", lst->bsfs[i]->filter->name); + av_bprintf(&bp, ")"); + + av_bprint_finalize(&bp, &lst->item_name); + } + + return lst->item_name; +} + +static const AVClass bsf_list_class = { + .class_name = "bsf_list", + .item_name = bsf_list_item_name, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const FFBitStreamFilter list_bsf = { + .p.name = "bsf_list", + .p.priv_class = &bsf_list_class, + .priv_data_size = sizeof(BSFListContext), + .init = bsf_list_init, + .filter = bsf_list_filter, + .flush = bsf_list_flush, + .close = bsf_list_close, +}; + +struct AVBSFList { + AVBSFContext **bsfs; + int nb_bsfs; +}; + +AVBSFList *av_bsf_list_alloc(void) +{ + return av_mallocz(sizeof(AVBSFList)); +} + +void av_bsf_list_free(AVBSFList **lst) +{ + int i; + + if (!*lst) + return; + + for (i = 0; i < (*lst)->nb_bsfs; ++i) + av_bsf_free(&(*lst)->bsfs[i]); + av_free((*lst)->bsfs); + av_freep(lst); +} + +int av_bsf_list_append(AVBSFList *lst, AVBSFContext *bsf) +{ + return av_dynarray_add_nofree(&lst->bsfs, &lst->nb_bsfs, bsf); +} + +static int bsf_list_append_internal(AVBSFList *lst, const char *bsf_name, const char *options, AVDictionary ** options_dict) +{ + int ret; + const AVBitStreamFilter *filter; + AVBSFContext *bsf; + + filter = av_bsf_get_by_name(bsf_name); + if (!filter) + return AVERROR_BSF_NOT_FOUND; + + ret = av_bsf_alloc(filter, &bsf); + if (ret < 0) + return ret; + + if (options && filter->priv_class) { + const AVOption *opt = av_opt_next(bsf->priv_data, NULL); + const char * shorthand[2] = {NULL}; + + if (opt) + shorthand[0] = opt->name; + + ret = av_opt_set_from_string(bsf->priv_data, options, shorthand, "=", ":"); + if (ret < 0) + goto end; + } + + if (options_dict) { + ret = av_opt_set_dict2(bsf, options_dict, AV_OPT_SEARCH_CHILDREN); + if (ret < 0) + goto end; + } + + ret = av_bsf_list_append(lst, bsf); + +end: + if (ret < 0) + av_bsf_free(&bsf); + + return ret; +} + +int av_bsf_list_append2(AVBSFList *lst, const char *bsf_name, AVDictionary ** options) +{ 
+ return bsf_list_append_internal(lst, bsf_name, NULL, options); +} + +int av_bsf_list_finalize(AVBSFList **lst, AVBSFContext **bsf) +{ + int ret = 0; + BSFListContext *ctx; + + if ((*lst)->nb_bsfs == 1) { + *bsf = (*lst)->bsfs[0]; + av_freep(&(*lst)->bsfs); + (*lst)->nb_bsfs = 0; + goto end; + } + + ret = av_bsf_alloc(&list_bsf.p, bsf); + if (ret < 0) + return ret; + + ctx = (*bsf)->priv_data; + + ctx->bsfs = (*lst)->bsfs; + ctx->nb_bsfs = (*lst)->nb_bsfs; + +end: + av_freep(lst); + return ret; +} + +static int bsf_parse_single(char *str, AVBSFList *bsf_lst) +{ + char *bsf_name, *bsf_options_str; + + bsf_name = av_strtok(str, "=", &bsf_options_str); + if (!bsf_name) + return AVERROR(EINVAL); + + return bsf_list_append_internal(bsf_lst, bsf_name, bsf_options_str, NULL); +} + +int av_bsf_list_parse_str(const char *str, AVBSFContext **bsf_lst) +{ + AVBSFList *lst; + int ret; + + if (!str) + return av_bsf_get_null_filter(bsf_lst); + + lst = av_bsf_list_alloc(); + if (!lst) + return AVERROR(ENOMEM); + + do { + char *bsf_str = av_get_token(&str, ","); + ret = bsf_parse_single(bsf_str, lst); + av_free(bsf_str); + if (ret < 0) + goto end; + } while (*str && *++str); + + ret = av_bsf_list_finalize(&lst, bsf_lst); +end: + if (ret < 0) + av_bsf_list_free(&lst); + return ret; +} + +int av_bsf_get_null_filter(AVBSFContext **bsf) +{ +#if CONFIG_NULL_BSF + extern const FFBitStreamFilter ff_null_bsf; + return av_bsf_alloc(&ff_null_bsf.p, bsf); +#else + return av_bsf_alloc(&list_bsf.p, bsf); +#endif +} diff --git a/media/ffvpx/libavcodec/bsf.h b/media/ffvpx/libavcodec/bsf.h new file mode 100644 index 0000000000..a09c69f242 --- /dev/null +++ b/media/ffvpx/libavcodec/bsf.h @@ -0,0 +1,332 @@ +/* + * Bitstream filters public API + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_BSF_H +#define AVCODEC_BSF_H + +#include "libavutil/dict.h" +#include "libavutil/log.h" +#include "libavutil/rational.h" + +#include "codec_id.h" +#include "codec_par.h" +#include "packet.h" + +/** + * @defgroup lavc_bsf Bitstream filters + * @ingroup libavc + * + * Bitstream filters transform encoded media data without decoding it. This + * allows e.g. manipulating various header values. Bitstream filters operate on + * @ref AVPacket "AVPackets". + * + * The bitstream filtering API is centered around two structures: + * AVBitStreamFilter and AVBSFContext. The former represents a bitstream filter + * in abstract, the latter a specific filtering process. Obtain an + * AVBitStreamFilter using av_bsf_get_by_name() or av_bsf_iterate(), then pass + * it to av_bsf_alloc() to create an AVBSFContext. Fill in the user-settable + * AVBSFContext fields, as described in its documentation, then call + * av_bsf_init() to prepare the filter context for use. + * + * Submit packets for filtering using av_bsf_send_packet(), obtain filtered + * results with av_bsf_receive_packet(). When no more input packets will be + * sent, submit a NULL AVPacket to signal the end of the stream to the filter. 
+ * av_bsf_receive_packet() will then return trailing packets, if any are + * produced by the filter. + * + * Finally, free the filter context with av_bsf_free(). + * @{ + */ + +/** + * The bitstream filter state. + * + * This struct must be allocated with av_bsf_alloc() and freed with + * av_bsf_free(). + * + * The fields in the struct will only be changed (by the caller or by the + * filter) as described in their documentation, and are to be considered + * immutable otherwise. + */ +typedef struct AVBSFContext { + /** + * A class for logging and AVOptions + */ + const AVClass *av_class; + + /** + * The bitstream filter this context is an instance of. + */ + const struct AVBitStreamFilter *filter; + + /** + * Opaque filter-specific private data. If filter->priv_class is non-NULL, + * this is an AVOptions-enabled struct. + */ + void *priv_data; + + /** + * Parameters of the input stream. This field is allocated in + * av_bsf_alloc(), it needs to be filled by the caller before + * av_bsf_init(). + */ + AVCodecParameters *par_in; + + /** + * Parameters of the output stream. This field is allocated in + * av_bsf_alloc(), it is set by the filter in av_bsf_init(). + */ + AVCodecParameters *par_out; + + /** + * The timebase used for the timestamps of the input packets. Set by the + * caller before av_bsf_init(). + */ + AVRational time_base_in; + + /** + * The timebase used for the timestamps of the output packets. Set by the + * filter in av_bsf_init(). + */ + AVRational time_base_out; +} AVBSFContext; + +typedef struct AVBitStreamFilter { + const char *name; + + /** + * A list of codec ids supported by the filter, terminated by + * AV_CODEC_ID_NONE. + * May be NULL, in that case the bitstream filter works with any codec id. + */ + const enum AVCodecID *codec_ids; + + /** + * A class for the private data, used to declare bitstream filter private + * AVOptions. This field is NULL for bitstream filters that do not declare + * any options. 
+ * + * If this field is non-NULL, the first member of the filter private data + * must be a pointer to AVClass, which will be set by libavcodec generic + * code to this class. + */ + const AVClass *priv_class; +} AVBitStreamFilter; + +/** + * @return a bitstream filter with the specified name or NULL if no such + * bitstream filter exists. + */ +const AVBitStreamFilter *av_bsf_get_by_name(const char *name); + +/** + * Iterate over all registered bitstream filters. + * + * @param opaque a pointer where libavcodec will store the iteration state. Must + * point to NULL to start the iteration. + * + * @return the next registered bitstream filter or NULL when the iteration is + * finished + */ +const AVBitStreamFilter *av_bsf_iterate(void **opaque); + +/** + * Allocate a context for a given bitstream filter. The caller must fill in the + * context parameters as described in the documentation and then call + * av_bsf_init() before sending any data to the filter. + * + * @param filter the filter for which to allocate an instance. + * @param[out] ctx a pointer into which the pointer to the newly-allocated context + * will be written. It must be freed with av_bsf_free() after the + * filtering is done. + * + * @return 0 on success, a negative AVERROR code on failure + */ +int av_bsf_alloc(const AVBitStreamFilter *filter, AVBSFContext **ctx); + +/** + * Prepare the filter for use, after all the parameters and options have been + * set. + * + * @param ctx a AVBSFContext previously allocated with av_bsf_alloc() + */ +int av_bsf_init(AVBSFContext *ctx); + +/** + * Submit a packet for filtering. + * + * After sending each packet, the filter must be completely drained by calling + * av_bsf_receive_packet() repeatedly until it returns AVERROR(EAGAIN) or + * AVERROR_EOF. + * + * @param ctx an initialized AVBSFContext + * @param pkt the packet to filter. The bitstream filter will take ownership of + * the packet and reset the contents of pkt. pkt is not touched if an error occurs. 
+ * If pkt is empty (i.e. NULL, or pkt->data is NULL and pkt->side_data_elems zero), + * it signals the end of the stream (i.e. no more non-empty packets will be sent; + * sending more empty packets does nothing) and will cause the filter to output + * any packets it may have buffered internally. + * + * @return + * - 0 on success. + * - AVERROR(EAGAIN) if packets need to be retrieved from the filter (using + * av_bsf_receive_packet()) before new input can be consumed. + * - Another negative AVERROR value if an error occurs. + */ +int av_bsf_send_packet(AVBSFContext *ctx, AVPacket *pkt); + +/** + * Retrieve a filtered packet. + * + * @param ctx an initialized AVBSFContext + * @param[out] pkt this struct will be filled with the contents of the filtered + * packet. It is owned by the caller and must be freed using + * av_packet_unref() when it is no longer needed. + * This parameter should be "clean" (i.e. freshly allocated + * with av_packet_alloc() or unreffed with av_packet_unref()) + * when this function is called. If this function returns + * successfully, the contents of pkt will be completely + * overwritten by the returned data. On failure, pkt is not + * touched. + * + * @return + * - 0 on success. + * - AVERROR(EAGAIN) if more packets need to be sent to the filter (using + * av_bsf_send_packet()) to get more output. + * - AVERROR_EOF if there will be no further output from the filter. + * - Another negative AVERROR value if an error occurs. + * + * @note one input packet may result in several output packets, so after sending + * a packet with av_bsf_send_packet(), this function needs to be called + * repeatedly until it stops returning 0. It is also possible for a filter to + * output fewer packets than were sent to it, so this function may return + * AVERROR(EAGAIN) immediately after a successful av_bsf_send_packet() call. + */ +int av_bsf_receive_packet(AVBSFContext *ctx, AVPacket *pkt); + +/** + * Reset the internal bitstream filter state. 
Should be called e.g. when seeking. + */ +void av_bsf_flush(AVBSFContext *ctx); + +/** + * Free a bitstream filter context and everything associated with it; write NULL + * into the supplied pointer. + */ +void av_bsf_free(AVBSFContext **ctx); + +/** + * Get the AVClass for AVBSFContext. It can be used in combination with + * AV_OPT_SEARCH_FAKE_OBJ for examining options. + * + * @see av_opt_find(). + */ +const AVClass *av_bsf_get_class(void); + +/** + * Structure for chain/list of bitstream filters. + * Empty list can be allocated by av_bsf_list_alloc(). + */ +typedef struct AVBSFList AVBSFList; + +/** + * Allocate empty list of bitstream filters. + * The list must be later freed by av_bsf_list_free() + * or finalized by av_bsf_list_finalize(). + * + * @return Pointer to @ref AVBSFList on success, NULL in case of failure + */ +AVBSFList *av_bsf_list_alloc(void); + +/** + * Free list of bitstream filters. + * + * @param lst Pointer to pointer returned by av_bsf_list_alloc() + */ +void av_bsf_list_free(AVBSFList **lst); + +/** + * Append bitstream filter to the list of bitstream filters. + * + * @param lst List to append to + * @param bsf Filter context to be appended + * + * @return >=0 on success, negative AVERROR in case of failure + */ +int av_bsf_list_append(AVBSFList *lst, AVBSFContext *bsf); + +/** + * Construct new bitstream filter context given it's name and options + * and append it to the list of bitstream filters. + * + * @param lst List to append to + * @param bsf_name Name of the bitstream filter + * @param options Options for the bitstream filter, can be set to NULL + * + * @return >=0 on success, negative AVERROR in case of failure + */ +int av_bsf_list_append2(AVBSFList *lst, const char * bsf_name, AVDictionary **options); +/** + * Finalize list of bitstream filters. 
+ * + * This function will transform @ref AVBSFList to single @ref AVBSFContext, + * so the whole chain of bitstream filters can be treated as single filter + * freshly allocated by av_bsf_alloc(). + * If the call is successful, @ref AVBSFList structure is freed and lst + * will be set to NULL. In case of failure, caller is responsible for + * freeing the structure by av_bsf_list_free() + * + * @param lst Filter list structure to be transformed + * @param[out] bsf Pointer to be set to newly created @ref AVBSFContext structure + * representing the chain of bitstream filters + * + * @return >=0 on success, negative AVERROR in case of failure + */ +int av_bsf_list_finalize(AVBSFList **lst, AVBSFContext **bsf); + +/** + * Parse string describing list of bitstream filters and create single + * @ref AVBSFContext describing the whole chain of bitstream filters. + * Resulting @ref AVBSFContext can be treated as any other @ref AVBSFContext freshly + * allocated by av_bsf_alloc(). + * + * @param str String describing chain of bitstream filters in format + * `bsf1[=opt1=val1:opt2=val2][,bsf2]` + * @param[out] bsf Pointer to be set to newly created @ref AVBSFContext structure + * representing the chain of bitstream filters + * + * @return >=0 on success, negative AVERROR in case of failure + */ +int av_bsf_list_parse_str(const char *str, AVBSFContext **bsf); + +/** + * Get null/pass-through bitstream filter. + * + * @param[out] bsf Pointer to be set to new instance of pass-through bitstream filter + * + * @return + */ +int av_bsf_get_null_filter(AVBSFContext **bsf); + +/** + * @} + */ + +#endif // AVCODEC_BSF_H diff --git a/media/ffvpx/libavcodec/bsf_internal.h b/media/ffvpx/libavcodec/bsf_internal.h new file mode 100644 index 0000000000..922b03c01b --- /dev/null +++ b/media/ffvpx/libavcodec/bsf_internal.h @@ -0,0 +1,60 @@ +/* + * This file is part of FFmpeg. 
/**
 * Internal description of a bitstream filter: the public AVBitStreamFilter
 * together with the callbacks that implement it.
 */
typedef struct FFBitStreamFilter {
    /**
     * The public AVBitStreamFilter. See bsf.h for it.
     *
     * Generic code in bsf.c converts a pointer to this member back into a
     * pointer to the enclosing FFBitStreamFilter with a plain cast, so it
     * has to stay the first member.
     */
    AVBitStreamFilter p;

    /**
     * Size in bytes of the filter's private data. When non-zero,
     * av_bsf_alloc() allocates that many zeroed bytes as
     * AVBSFContext.priv_data.
     */
    int priv_data_size;
    /**
     * Optional. Called by av_bsf_init() after the output parameters and
     * output time base have been set to copies of the input ones; a negative
     * return value is passed back to the caller of av_bsf_init().
     */
    int (*init)(AVBSFContext *ctx);
    /**
     * Called by av_bsf_receive_packet() without a NULL check, so every
     * filter needs to provide it.
     */
    int (*filter)(AVBSFContext *ctx, AVPacket *pkt);
    /**
     * Optional. Called by av_bsf_free() before the private data is freed,
     * and only when private data exists.
     */
    void (*close)(AVBSFContext *ctx);
    /**
     * Optional. Called by av_bsf_flush() after the EOF flag has been cleared
     * and the buffered packet unreferenced.
     */
    void (*flush)(AVBSFContext *ctx);
} FFBitStreamFilter;
+ * + * @param ctx pointer to AVBSFContext of filter + * @param pkt pointer to packet to move reference to + * + * @return 0 on success, negative AVERROR in case of failure + */ +int ff_bsf_get_packet_ref(AVBSFContext *ctx, AVPacket *pkt); + +const AVClass *ff_bsf_child_class_iterate(void **opaque); + +#endif /* AVCODEC_BSF_INTERNAL_H */ diff --git a/media/ffvpx/libavcodec/bsf_list.c b/media/ffvpx/libavcodec/bsf_list.c new file mode 100644 index 0000000000..4050b41fde --- /dev/null +++ b/media/ffvpx/libavcodec/bsf_list.c @@ -0,0 +1,11 @@ +#include "config_components.h" + +static const FFBitStreamFilter * const bitstream_filters[] = { +#if CONFIG_VP9_SUPERFRAME_SPLIT_BSF + &ff_vp9_superframe_split_bsf, +#endif +#if CONFIG_AV1_VAAPI_HWACCEL + &ff_av1_frame_split_bsf, +#endif + &ff_null_bsf, + NULL }; diff --git a/media/ffvpx/libavcodec/bytestream.h b/media/ffvpx/libavcodec/bytestream.h new file mode 100644 index 0000000000..d0033f14f3 --- /dev/null +++ b/media/ffvpx/libavcodec/bytestream.h @@ -0,0 +1,380 @@ +/* + * Bytestream functions + * copyright (c) 2006 Baptiste Coudurier <baptiste.coudurier@free.fr> + * Copyright (c) 2012 Aneesh Dogra (lionaneesh) <lionaneesh@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_BYTESTREAM_H +#define AVCODEC_BYTESTREAM_H + +#include <stdint.h> +#include <string.h> + +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" + +typedef struct GetByteContext { + const uint8_t *buffer, *buffer_end, *buffer_start; +} GetByteContext; + +typedef struct PutByteContext { + uint8_t *buffer, *buffer_end, *buffer_start; + int eof; +} PutByteContext; + +#define DEF(type, name, bytes, read, write) \ +static av_always_inline type bytestream_get_ ## name(const uint8_t **b) \ +{ \ + (*b) += bytes; \ + return read(*b - bytes); \ +} \ +static av_always_inline void bytestream_put_ ## name(uint8_t **b, \ + const type value) \ +{ \ + write(*b, value); \ + (*b) += bytes; \ +} \ +static av_always_inline void bytestream2_put_ ## name ## u(PutByteContext *p, \ + const type value) \ +{ \ + bytestream_put_ ## name(&p->buffer, value); \ +} \ +static av_always_inline void bytestream2_put_ ## name(PutByteContext *p, \ + const type value) \ +{ \ + if (!p->eof && (p->buffer_end - p->buffer >= bytes)) { \ + write(p->buffer, value); \ + p->buffer += bytes; \ + } else \ + p->eof = 1; \ +} \ +static av_always_inline type bytestream2_get_ ## name ## u(GetByteContext *g) \ +{ \ + return bytestream_get_ ## name(&g->buffer); \ +} \ +static av_always_inline type bytestream2_get_ ## name(GetByteContext *g) \ +{ \ + if (g->buffer_end - g->buffer < bytes) { \ + g->buffer = g->buffer_end; \ + return 0; \ + } \ + return bytestream2_get_ ## name ## u(g); \ +} \ +static av_always_inline type bytestream2_peek_ ## name ## u(GetByteContext *g) \ +{ \ + return read(g->buffer); \ +} \ +static av_always_inline type bytestream2_peek_ ## name(GetByteContext *g) \ +{ \ + if (g->buffer_end - g->buffer < bytes) \ + 
return 0; \ + return bytestream2_peek_ ## name ## u(g); \ +} + +DEF(uint64_t, le64, 8, AV_RL64, AV_WL64) +DEF(unsigned int, le32, 4, AV_RL32, AV_WL32) +DEF(unsigned int, le24, 3, AV_RL24, AV_WL24) +DEF(unsigned int, le16, 2, AV_RL16, AV_WL16) +DEF(uint64_t, be64, 8, AV_RB64, AV_WB64) +DEF(unsigned int, be32, 4, AV_RB32, AV_WB32) +DEF(unsigned int, be24, 3, AV_RB24, AV_WB24) +DEF(unsigned int, be16, 2, AV_RB16, AV_WB16) +DEF(unsigned int, byte, 1, AV_RB8 , AV_WB8) + +#if AV_HAVE_BIGENDIAN +# define bytestream2_get_ne16 bytestream2_get_be16 +# define bytestream2_get_ne24 bytestream2_get_be24 +# define bytestream2_get_ne32 bytestream2_get_be32 +# define bytestream2_get_ne64 bytestream2_get_be64 +# define bytestream2_get_ne16u bytestream2_get_be16u +# define bytestream2_get_ne24u bytestream2_get_be24u +# define bytestream2_get_ne32u bytestream2_get_be32u +# define bytestream2_get_ne64u bytestream2_get_be64u +# define bytestream2_put_ne16 bytestream2_put_be16 +# define bytestream2_put_ne24 bytestream2_put_be24 +# define bytestream2_put_ne32 bytestream2_put_be32 +# define bytestream2_put_ne64 bytestream2_put_be64 +# define bytestream2_peek_ne16 bytestream2_peek_be16 +# define bytestream2_peek_ne24 bytestream2_peek_be24 +# define bytestream2_peek_ne32 bytestream2_peek_be32 +# define bytestream2_peek_ne64 bytestream2_peek_be64 +#else +# define bytestream2_get_ne16 bytestream2_get_le16 +# define bytestream2_get_ne24 bytestream2_get_le24 +# define bytestream2_get_ne32 bytestream2_get_le32 +# define bytestream2_get_ne64 bytestream2_get_le64 +# define bytestream2_get_ne16u bytestream2_get_le16u +# define bytestream2_get_ne24u bytestream2_get_le24u +# define bytestream2_get_ne32u bytestream2_get_le32u +# define bytestream2_get_ne64u bytestream2_get_le64u +# define bytestream2_put_ne16 bytestream2_put_le16 +# define bytestream2_put_ne24 bytestream2_put_le24 +# define bytestream2_put_ne32 bytestream2_put_le32 +# define bytestream2_put_ne64 bytestream2_put_le64 +# define 
bytestream2_peek_ne16 bytestream2_peek_le16 +# define bytestream2_peek_ne24 bytestream2_peek_le24 +# define bytestream2_peek_ne32 bytestream2_peek_le32 +# define bytestream2_peek_ne64 bytestream2_peek_le64 +#endif + +static av_always_inline void bytestream2_init(GetByteContext *g, + const uint8_t *buf, + int buf_size) +{ + av_assert0(buf_size >= 0); + g->buffer = buf; + g->buffer_start = buf; + g->buffer_end = buf + buf_size; +} + +static av_always_inline void bytestream2_init_writer(PutByteContext *p, + uint8_t *buf, + int buf_size) +{ + av_assert0(buf_size >= 0); + p->buffer = buf; + p->buffer_start = buf; + p->buffer_end = buf + buf_size; + p->eof = 0; +} + +static av_always_inline int bytestream2_get_bytes_left(GetByteContext *g) +{ + return g->buffer_end - g->buffer; +} + +static av_always_inline int bytestream2_get_bytes_left_p(PutByteContext *p) +{ + return p->buffer_end - p->buffer; +} + +static av_always_inline void bytestream2_skip(GetByteContext *g, + unsigned int size) +{ + g->buffer += FFMIN(g->buffer_end - g->buffer, size); +} + +static av_always_inline void bytestream2_skipu(GetByteContext *g, + unsigned int size) +{ + g->buffer += size; +} + +static av_always_inline void bytestream2_skip_p(PutByteContext *p, + unsigned int size) +{ + int size2; + if (p->eof) + return; + size2 = FFMIN(p->buffer_end - p->buffer, size); + if (size2 != size) + p->eof = 1; + p->buffer += size2; +} + +static av_always_inline int bytestream2_tell(GetByteContext *g) +{ + return (int)(g->buffer - g->buffer_start); +} + +static av_always_inline int bytestream2_tell_p(PutByteContext *p) +{ + return (int)(p->buffer - p->buffer_start); +} + +static av_always_inline int bytestream2_size(GetByteContext *g) +{ + return (int)(g->buffer_end - g->buffer_start); +} + +static av_always_inline int bytestream2_size_p(PutByteContext *p) +{ + return (int)(p->buffer_end - p->buffer_start); +} + +static av_always_inline int bytestream2_seek(GetByteContext *g, + int offset, + int whence) +{ + 
switch (whence) { + case SEEK_CUR: + offset = av_clip(offset, -(g->buffer - g->buffer_start), + g->buffer_end - g->buffer); + g->buffer += offset; + break; + case SEEK_END: + offset = av_clip(offset, -(g->buffer_end - g->buffer_start), 0); + g->buffer = g->buffer_end + offset; + break; + case SEEK_SET: + offset = av_clip(offset, 0, g->buffer_end - g->buffer_start); + g->buffer = g->buffer_start + offset; + break; + default: + return AVERROR(EINVAL); + } + return bytestream2_tell(g); +} + +static av_always_inline int bytestream2_seek_p(PutByteContext *p, + int offset, + int whence) +{ + p->eof = 0; + switch (whence) { + case SEEK_CUR: + if (p->buffer_end - p->buffer < offset) + p->eof = 1; + offset = av_clip(offset, -(p->buffer - p->buffer_start), + p->buffer_end - p->buffer); + p->buffer += offset; + break; + case SEEK_END: + if (offset > 0) + p->eof = 1; + offset = av_clip(offset, -(p->buffer_end - p->buffer_start), 0); + p->buffer = p->buffer_end + offset; + break; + case SEEK_SET: + if (p->buffer_end - p->buffer_start < offset) + p->eof = 1; + offset = av_clip(offset, 0, p->buffer_end - p->buffer_start); + p->buffer = p->buffer_start + offset; + break; + default: + return AVERROR(EINVAL); + } + return bytestream2_tell_p(p); +} + +static av_always_inline unsigned int bytestream2_get_buffer(GetByteContext *g, + uint8_t *dst, + unsigned int size) +{ + int size2 = FFMIN(g->buffer_end - g->buffer, size); + memcpy(dst, g->buffer, size2); + g->buffer += size2; + return size2; +} + +static av_always_inline unsigned int bytestream2_get_bufferu(GetByteContext *g, + uint8_t *dst, + unsigned int size) +{ + memcpy(dst, g->buffer, size); + g->buffer += size; + return size; +} + +static av_always_inline unsigned int bytestream2_put_buffer(PutByteContext *p, + const uint8_t *src, + unsigned int size) +{ + int size2; + if (p->eof) + return 0; + size2 = FFMIN(p->buffer_end - p->buffer, size); + if (size2 != size) + p->eof = 1; + memcpy(p->buffer, src, size2); + p->buffer += 
size2; + return size2; +} + +static av_always_inline unsigned int bytestream2_put_bufferu(PutByteContext *p, + const uint8_t *src, + unsigned int size) +{ + memcpy(p->buffer, src, size); + p->buffer += size; + return size; +} + +static av_always_inline void bytestream2_set_buffer(PutByteContext *p, + const uint8_t c, + unsigned int size) +{ + int size2; + if (p->eof) + return; + size2 = FFMIN(p->buffer_end - p->buffer, size); + if (size2 != size) + p->eof = 1; + memset(p->buffer, c, size2); + p->buffer += size2; +} + +static av_always_inline void bytestream2_set_bufferu(PutByteContext *p, + const uint8_t c, + unsigned int size) +{ + memset(p->buffer, c, size); + p->buffer += size; +} + +static av_always_inline unsigned int bytestream2_get_eof(PutByteContext *p) +{ + return p->eof; +} + +static av_always_inline unsigned int bytestream2_copy_bufferu(PutByteContext *p, + GetByteContext *g, + unsigned int size) +{ + memcpy(p->buffer, g->buffer, size); + p->buffer += size; + g->buffer += size; + return size; +} + +static av_always_inline unsigned int bytestream2_copy_buffer(PutByteContext *p, + GetByteContext *g, + unsigned int size) +{ + int size2; + + if (p->eof) + return 0; + size = FFMIN(g->buffer_end - g->buffer, size); + size2 = FFMIN(p->buffer_end - p->buffer, size); + if (size2 != size) + p->eof = 1; + + return bytestream2_copy_bufferu(p, g, size2); +} + +static av_always_inline unsigned int bytestream_get_buffer(const uint8_t **b, + uint8_t *dst, + unsigned int size) +{ + memcpy(dst, *b, size); + (*b) += size; + return size; +} + +static av_always_inline void bytestream_put_buffer(uint8_t **b, + const uint8_t *src, + unsigned int size) +{ + memcpy(*b, src, size); + (*b) += size; +} + +#endif /* AVCODEC_BYTESTREAM_H */ diff --git a/media/ffvpx/libavcodec/cbs.c b/media/ffvpx/libavcodec/cbs.c new file mode 100644 index 0000000000..504197e06d --- /dev/null +++ b/media/ffvpx/libavcodec/cbs.c @@ -0,0 +1,1028 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <string.h> + +#include "config.h" + +#include "libavutil/avassert.h" +#include "libavutil/buffer.h" +#include "libavutil/common.h" +#include "libavutil/opt.h" + +#include "avcodec.h" +#include "cbs.h" +#include "cbs_internal.h" + + +static const CodedBitstreamType *const cbs_type_table[] = { +#if CONFIG_CBS_AV1 + &ff_cbs_type_av1, +#endif +#if CONFIG_CBS_H264 + &ff_cbs_type_h264, +#endif +#if CONFIG_CBS_H265 + &ff_cbs_type_h265, +#endif +#if CONFIG_CBS_JPEG + &ff_cbs_type_jpeg, +#endif +#if CONFIG_CBS_MPEG2 + &ff_cbs_type_mpeg2, +#endif +#if CONFIG_CBS_VP9 + &ff_cbs_type_vp9, +#endif +}; + +const enum AVCodecID ff_cbs_all_codec_ids[] = { +#if CONFIG_CBS_AV1 + AV_CODEC_ID_AV1, +#endif +#if CONFIG_CBS_H264 + AV_CODEC_ID_H264, +#endif +#if CONFIG_CBS_H265 + AV_CODEC_ID_H265, +#endif +#if CONFIG_CBS_JPEG + AV_CODEC_ID_MJPEG, +#endif +#if CONFIG_CBS_MPEG2 + AV_CODEC_ID_MPEG2VIDEO, +#endif +#if CONFIG_CBS_VP9 + AV_CODEC_ID_VP9, +#endif + AV_CODEC_ID_NONE +}; + +av_cold int ff_cbs_init(CodedBitstreamContext **ctx_ptr, + enum AVCodecID codec_id, void *log_ctx) +{ + CodedBitstreamContext *ctx; + const CodedBitstreamType *type; + int i; + + type = NULL; + for (i = 0; i < FF_ARRAY_ELEMS(cbs_type_table); i++) { + if 
(cbs_type_table[i]->codec_id == codec_id) { + type = cbs_type_table[i]; + break; + } + } + if (!type) + return AVERROR(EINVAL); + + ctx = av_mallocz(sizeof(*ctx)); + if (!ctx) + return AVERROR(ENOMEM); + + ctx->log_ctx = log_ctx; + ctx->codec = type; /* Must be before any error */ + + if (type->priv_data_size) { + ctx->priv_data = av_mallocz(ctx->codec->priv_data_size); + if (!ctx->priv_data) { + av_freep(&ctx); + return AVERROR(ENOMEM); + } + if (type->priv_class) { + *(const AVClass **)ctx->priv_data = type->priv_class; + av_opt_set_defaults(ctx->priv_data); + } + } + + ctx->decompose_unit_types = NULL; + + ctx->trace_enable = 0; + ctx->trace_level = AV_LOG_TRACE; + + *ctx_ptr = ctx; + return 0; +} + +av_cold void ff_cbs_flush(CodedBitstreamContext *ctx) +{ + if (ctx->codec->flush) + ctx->codec->flush(ctx); +} + +av_cold void ff_cbs_close(CodedBitstreamContext **ctx_ptr) +{ + CodedBitstreamContext *ctx = *ctx_ptr; + + if (!ctx) + return; + + if (ctx->codec->close) + ctx->codec->close(ctx); + + av_freep(&ctx->write_buffer); + + if (ctx->codec->priv_class && ctx->priv_data) + av_opt_free(ctx->priv_data); + + av_freep(&ctx->priv_data); + av_freep(ctx_ptr); +} + +static void cbs_unit_uninit(CodedBitstreamUnit *unit) +{ + av_buffer_unref(&unit->content_ref); + unit->content = NULL; + + av_buffer_unref(&unit->data_ref); + unit->data = NULL; + unit->data_size = 0; + unit->data_bit_padding = 0; +} + +void ff_cbs_fragment_reset(CodedBitstreamFragment *frag) +{ + int i; + + for (i = 0; i < frag->nb_units; i++) + cbs_unit_uninit(&frag->units[i]); + frag->nb_units = 0; + + av_buffer_unref(&frag->data_ref); + frag->data = NULL; + frag->data_size = 0; + frag->data_bit_padding = 0; +} + +av_cold void ff_cbs_fragment_free(CodedBitstreamFragment *frag) +{ + ff_cbs_fragment_reset(frag); + + av_freep(&frag->units); + frag->nb_units_allocated = 0; +} + +static int cbs_read_fragment_content(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag) +{ + int err, i, j; + + for (i = 
0; i < frag->nb_units; i++) { + CodedBitstreamUnit *unit = &frag->units[i]; + + if (ctx->decompose_unit_types) { + for (j = 0; j < ctx->nb_decompose_unit_types; j++) { + if (ctx->decompose_unit_types[j] == unit->type) + break; + } + if (j >= ctx->nb_decompose_unit_types) + continue; + } + + av_buffer_unref(&unit->content_ref); + unit->content = NULL; + + av_assert0(unit->data && unit->data_ref); + + err = ctx->codec->read_unit(ctx, unit); + if (err == AVERROR(ENOSYS)) { + av_log(ctx->log_ctx, AV_LOG_VERBOSE, + "Decomposition unimplemented for unit %d " + "(type %"PRIu32").\n", i, unit->type); + } else if (err == AVERROR(EAGAIN)) { + av_log(ctx->log_ctx, AV_LOG_VERBOSE, + "Skipping decomposition of unit %d " + "(type %"PRIu32").\n", i, unit->type); + av_buffer_unref(&unit->content_ref); + unit->content = NULL; + } else if (err < 0) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to read unit %d " + "(type %"PRIu32").\n", i, unit->type); + return err; + } + } + + return 0; +} + +static int cbs_fill_fragment_data(CodedBitstreamFragment *frag, + const uint8_t *data, size_t size) +{ + av_assert0(!frag->data && !frag->data_ref); + + frag->data_ref = + av_buffer_alloc(size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!frag->data_ref) + return AVERROR(ENOMEM); + + frag->data = frag->data_ref->data; + frag->data_size = size; + + memcpy(frag->data, data, size); + memset(frag->data + size, 0, + AV_INPUT_BUFFER_PADDING_SIZE); + + return 0; +} + +static int cbs_read_data(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + AVBufferRef *buf, + const uint8_t *data, size_t size, + int header) +{ + int err; + + if (buf) { + frag->data_ref = av_buffer_ref(buf); + if (!frag->data_ref) + return AVERROR(ENOMEM); + + frag->data = (uint8_t *)data; + frag->data_size = size; + + } else { + err = cbs_fill_fragment_data(frag, data, size); + if (err < 0) + return err; + } + + err = ctx->codec->split_fragment(ctx, frag, header); + if (err < 0) + return err; + + return 
cbs_read_fragment_content(ctx, frag); +} + +int ff_cbs_read_extradata(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const AVCodecParameters *par) +{ + return cbs_read_data(ctx, frag, NULL, + par->extradata, + par->extradata_size, 1); +} + +int ff_cbs_read_extradata_from_codec(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const AVCodecContext *avctx) +{ + return cbs_read_data(ctx, frag, NULL, + avctx->extradata, + avctx->extradata_size, 1); +} + +int ff_cbs_read_packet(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const AVPacket *pkt) +{ + return cbs_read_data(ctx, frag, pkt->buf, + pkt->data, pkt->size, 0); +} + +int ff_cbs_read_packet_side_data(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const AVPacket *pkt) +{ + size_t side_data_size; + const uint8_t *side_data = + av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, + &side_data_size); + + return cbs_read_data(ctx, frag, NULL, + side_data, side_data_size, 1); +} + +int ff_cbs_read(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const uint8_t *data, size_t size) +{ + return cbs_read_data(ctx, frag, NULL, + data, size, 0); +} + +/** + * Allocate a new internal data buffer of the given size in the unit. + * + * The data buffer will have input padding. + */ +static int cbs_alloc_unit_data(CodedBitstreamUnit *unit, + size_t size) +{ + av_assert0(!unit->data && !unit->data_ref); + + unit->data_ref = av_buffer_alloc(size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!unit->data_ref) + return AVERROR(ENOMEM); + + unit->data = unit->data_ref->data; + unit->data_size = size; + + memset(unit->data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + + return 0; +} + +static int cbs_write_unit_data(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit) +{ + PutBitContext pbc; + int ret; + + if (!ctx->write_buffer) { + // Initial write buffer size is 1MB. 
+ ctx->write_buffer_size = 1024 * 1024; + + reallocate_and_try_again: + ret = av_reallocp(&ctx->write_buffer, ctx->write_buffer_size); + if (ret < 0) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Unable to allocate a " + "sufficiently large write buffer (last attempt " + "%"SIZE_SPECIFIER" bytes).\n", ctx->write_buffer_size); + return ret; + } + } + + init_put_bits(&pbc, ctx->write_buffer, ctx->write_buffer_size); + + ret = ctx->codec->write_unit(ctx, unit, &pbc); + if (ret < 0) { + if (ret == AVERROR(ENOSPC)) { + // Overflow. + if (ctx->write_buffer_size == INT_MAX / 8) + return AVERROR(ENOMEM); + ctx->write_buffer_size = FFMIN(2 * ctx->write_buffer_size, INT_MAX / 8); + goto reallocate_and_try_again; + } + // Write failed for some other reason. + return ret; + } + + // Overflow but we didn't notice. + av_assert0(put_bits_count(&pbc) <= 8 * ctx->write_buffer_size); + + if (put_bits_count(&pbc) % 8) + unit->data_bit_padding = 8 - put_bits_count(&pbc) % 8; + else + unit->data_bit_padding = 0; + + flush_put_bits(&pbc); + + ret = cbs_alloc_unit_data(unit, put_bytes_output(&pbc)); + if (ret < 0) + return ret; + + memcpy(unit->data, ctx->write_buffer, unit->data_size); + + return 0; +} + +int ff_cbs_write_fragment_data(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag) +{ + int err, i; + + for (i = 0; i < frag->nb_units; i++) { + CodedBitstreamUnit *unit = &frag->units[i]; + + if (!unit->content) + continue; + + av_buffer_unref(&unit->data_ref); + unit->data = NULL; + + err = cbs_write_unit_data(ctx, unit); + if (err < 0) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to write unit %d " + "(type %"PRIu32").\n", i, unit->type); + return err; + } + av_assert0(unit->data && unit->data_ref); + } + + av_buffer_unref(&frag->data_ref); + frag->data = NULL; + + err = ctx->codec->assemble_fragment(ctx, frag); + if (err < 0) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Failed to assemble fragment.\n"); + return err; + } + av_assert0(frag->data && frag->data_ref); + + return 0; +} + 
+int ff_cbs_write_extradata(CodedBitstreamContext *ctx, + AVCodecParameters *par, + CodedBitstreamFragment *frag) +{ + int err; + + err = ff_cbs_write_fragment_data(ctx, frag); + if (err < 0) + return err; + + av_freep(&par->extradata); + par->extradata_size = 0; + + if (!frag->data_size) + return 0; + + par->extradata = av_malloc(frag->data_size + + AV_INPUT_BUFFER_PADDING_SIZE); + if (!par->extradata) + return AVERROR(ENOMEM); + + memcpy(par->extradata, frag->data, frag->data_size); + memset(par->extradata + frag->data_size, 0, + AV_INPUT_BUFFER_PADDING_SIZE); + par->extradata_size = frag->data_size; + + return 0; +} + +int ff_cbs_write_packet(CodedBitstreamContext *ctx, + AVPacket *pkt, + CodedBitstreamFragment *frag) +{ + AVBufferRef *buf; + int err; + + err = ff_cbs_write_fragment_data(ctx, frag); + if (err < 0) + return err; + + buf = av_buffer_ref(frag->data_ref); + if (!buf) + return AVERROR(ENOMEM); + + av_buffer_unref(&pkt->buf); + + pkt->buf = buf; + pkt->data = frag->data; + pkt->size = frag->data_size; + + return 0; +} + + +void ff_cbs_trace_header(CodedBitstreamContext *ctx, + const char *name) +{ + if (!ctx->trace_enable) + return; + + av_log(ctx->log_ctx, ctx->trace_level, "%s\n", name); +} + +void ff_cbs_trace_syntax_element(CodedBitstreamContext *ctx, int position, + const char *str, const int *subscripts, + const char *bits, int64_t value) +{ + char name[256]; + size_t name_len, bits_len; + int pad, subs, i, j, k, n; + + if (!ctx->trace_enable) + return; + + av_assert0(value >= INT_MIN && value <= UINT32_MAX); + + subs = subscripts ? 
subscripts[0] : 0; + n = 0; + for (i = j = 0; str[i];) { + if (str[i] == '[') { + if (n < subs) { + ++n; + k = snprintf(name + j, sizeof(name) - j, "[%d", subscripts[n]); + av_assert0(k > 0 && j + k < sizeof(name)); + j += k; + for (++i; str[i] && str[i] != ']'; i++); + av_assert0(str[i] == ']'); + } else { + while (str[i] && str[i] != ']') + name[j++] = str[i++]; + av_assert0(str[i] == ']'); + } + } else { + av_assert0(j + 1 < sizeof(name)); + name[j++] = str[i++]; + } + } + av_assert0(j + 1 < sizeof(name)); + name[j] = 0; + av_assert0(n == subs); + + name_len = strlen(name); + bits_len = strlen(bits); + + if (name_len + bits_len > 60) + pad = bits_len + 2; + else + pad = 61 - name_len; + + av_log(ctx->log_ctx, ctx->trace_level, "%-10d %s%*s = %"PRId64"\n", + position, name, pad, bits, value); +} + +int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, GetBitContext *gbc, + int width, const char *name, + const int *subscripts, uint32_t *write_to, + uint32_t range_min, uint32_t range_max) +{ + uint32_t value; + int position; + + av_assert0(width > 0 && width <= 32); + + if (get_bits_left(gbc) < width) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid value at " + "%s: bitstream ended.\n", name); + return AVERROR_INVALIDDATA; + } + + if (ctx->trace_enable) + position = get_bits_count(gbc); + + value = get_bits_long(gbc, width); + + if (ctx->trace_enable) { + char bits[33]; + int i; + for (i = 0; i < width; i++) + bits[i] = value >> (width - i - 1) & 1 ? 
'1' : '0'; + bits[i] = 0; + + ff_cbs_trace_syntax_element(ctx, position, name, subscripts, + bits, value); + } + + if (value < range_min || value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n", + name, value, range_min, range_max); + return AVERROR_INVALIDDATA; + } + + *write_to = value; + return 0; +} + +int ff_cbs_write_unsigned(CodedBitstreamContext *ctx, PutBitContext *pbc, + int width, const char *name, + const int *subscripts, uint32_t value, + uint32_t range_min, uint32_t range_max) +{ + av_assert0(width > 0 && width <= 32); + + if (value < range_min || value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n", + name, value, range_min, range_max); + return AVERROR_INVALIDDATA; + } + + if (put_bits_left(pbc) < width) + return AVERROR(ENOSPC); + + if (ctx->trace_enable) { + char bits[33]; + int i; + for (i = 0; i < width; i++) + bits[i] = value >> (width - i - 1) & 1 ? '1' : '0'; + bits[i] = 0; + + ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc), + name, subscripts, bits, value); + } + + if (width < 32) + put_bits(pbc, width, value); + else + put_bits32(pbc, value); + + return 0; +} + +int ff_cbs_read_signed(CodedBitstreamContext *ctx, GetBitContext *gbc, + int width, const char *name, + const int *subscripts, int32_t *write_to, + int32_t range_min, int32_t range_max) +{ + int32_t value; + int position; + + av_assert0(width > 0 && width <= 32); + + if (get_bits_left(gbc) < width) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid value at " + "%s: bitstream ended.\n", name); + return AVERROR_INVALIDDATA; + } + + if (ctx->trace_enable) + position = get_bits_count(gbc); + + value = get_sbits_long(gbc, width); + + if (ctx->trace_enable) { + char bits[33]; + int i; + for (i = 0; i < width; i++) + bits[i] = value & (1U << (width - i - 1)) ? 
'1' : '0'; + bits[i] = 0; + + ff_cbs_trace_syntax_element(ctx, position, name, subscripts, + bits, value); + } + + if (value < range_min || value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRId32", but must be in [%"PRId32",%"PRId32"].\n", + name, value, range_min, range_max); + return AVERROR_INVALIDDATA; + } + + *write_to = value; + return 0; +} + +int ff_cbs_write_signed(CodedBitstreamContext *ctx, PutBitContext *pbc, + int width, const char *name, + const int *subscripts, int32_t value, + int32_t range_min, int32_t range_max) +{ + av_assert0(width > 0 && width <= 32); + + if (value < range_min || value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRId32", but must be in [%"PRId32",%"PRId32"].\n", + name, value, range_min, range_max); + return AVERROR_INVALIDDATA; + } + + if (put_bits_left(pbc) < width) + return AVERROR(ENOSPC); + + if (ctx->trace_enable) { + char bits[33]; + int i; + for (i = 0; i < width; i++) + bits[i] = value & (1U << (width - i - 1)) ? 
'1' : '0'; + bits[i] = 0; + + ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc), + name, subscripts, bits, value); + } + + if (width < 32) + put_sbits(pbc, width, value); + else + put_bits32(pbc, value); + + return 0; +} + + +static int cbs_insert_unit(CodedBitstreamFragment *frag, + int position) +{ + CodedBitstreamUnit *units; + + if (frag->nb_units < frag->nb_units_allocated) { + units = frag->units; + + if (position < frag->nb_units) + memmove(units + position + 1, units + position, + (frag->nb_units - position) * sizeof(*units)); + } else { + units = av_malloc_array(frag->nb_units*2 + 1, sizeof(*units)); + if (!units) + return AVERROR(ENOMEM); + + frag->nb_units_allocated = 2*frag->nb_units_allocated + 1; + + if (position > 0) + memcpy(units, frag->units, position * sizeof(*units)); + + if (position < frag->nb_units) + memcpy(units + position + 1, frag->units + position, + (frag->nb_units - position) * sizeof(*units)); + } + + memset(units + position, 0, sizeof(*units)); + + if (units != frag->units) { + av_free(frag->units); + frag->units = units; + } + + ++frag->nb_units; + + return 0; +} + +int ff_cbs_insert_unit_content(CodedBitstreamFragment *frag, + int position, + CodedBitstreamUnitType type, + void *content, + AVBufferRef *content_buf) +{ + CodedBitstreamUnit *unit; + AVBufferRef *content_ref; + int err; + + if (position == -1) + position = frag->nb_units; + av_assert0(position >= 0 && position <= frag->nb_units); + + if (content_buf) { + content_ref = av_buffer_ref(content_buf); + if (!content_ref) + return AVERROR(ENOMEM); + } else { + content_ref = NULL; + } + + err = cbs_insert_unit(frag, position); + if (err < 0) { + av_buffer_unref(&content_ref); + return err; + } + + unit = &frag->units[position]; + unit->type = type; + unit->content = content; + unit->content_ref = content_ref; + + return 0; +} + +static int cbs_insert_unit_data(CodedBitstreamFragment *frag, + CodedBitstreamUnitType type, + uint8_t *data, size_t data_size, + AVBufferRef 
*data_buf, + int position) +{ + CodedBitstreamUnit *unit; + AVBufferRef *data_ref; + int err; + + av_assert0(position >= 0 && position <= frag->nb_units); + + if (data_buf) + data_ref = av_buffer_ref(data_buf); + else + data_ref = av_buffer_create(data, data_size, NULL, NULL, 0); + if (!data_ref) { + if (!data_buf) + av_free(data); + return AVERROR(ENOMEM); + } + + err = cbs_insert_unit(frag, position); + if (err < 0) { + av_buffer_unref(&data_ref); + return err; + } + + unit = &frag->units[position]; + unit->type = type; + unit->data = data; + unit->data_size = data_size; + unit->data_ref = data_ref; + + return 0; +} + +int ff_cbs_append_unit_data(CodedBitstreamFragment *frag, + CodedBitstreamUnitType type, + uint8_t *data, size_t data_size, + AVBufferRef *data_buf) +{ + return cbs_insert_unit_data(frag, type, + data, data_size, data_buf, + frag->nb_units); +} + +void ff_cbs_delete_unit(CodedBitstreamFragment *frag, + int position) +{ + av_assert0(0 <= position && position < frag->nb_units + && "Unit to be deleted not in fragment."); + + cbs_unit_uninit(&frag->units[position]); + + --frag->nb_units; + + if (frag->nb_units > 0) + memmove(frag->units + position, + frag->units + position + 1, + (frag->nb_units - position) * sizeof(*frag->units)); +} + +static void cbs_default_free_unit_content(void *opaque, uint8_t *data) +{ + const CodedBitstreamUnitTypeDescriptor *desc = opaque; + + for (int i = 0; i < desc->type.ref.nb_offsets; i++) { + void **ptr = (void**)(data + desc->type.ref.offsets[i]); + av_buffer_unref((AVBufferRef**)(ptr + 1)); + } + av_free(data); +} + +static const CodedBitstreamUnitTypeDescriptor + *cbs_find_unit_type_desc(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit) +{ + const CodedBitstreamUnitTypeDescriptor *desc; + int i, j; + + if (!ctx->codec->unit_types) + return NULL; + + for (i = 0;; i++) { + desc = &ctx->codec->unit_types[i]; + if (desc->nb_unit_types == 0) + break; + if (desc->nb_unit_types == CBS_UNIT_TYPE_RANGE) { + if 
(unit->type >= desc->unit_type.range.start && + unit->type <= desc->unit_type.range.end) + return desc; + } else { + for (j = 0; j < desc->nb_unit_types; j++) { + if (desc->unit_type.list[j] == unit->type) + return desc; + } + } + } + return NULL; +} + +int ff_cbs_alloc_unit_content(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit) +{ + const CodedBitstreamUnitTypeDescriptor *desc; + + av_assert0(!unit->content && !unit->content_ref); + + desc = cbs_find_unit_type_desc(ctx, unit); + if (!desc) + return AVERROR(ENOSYS); + + unit->content = av_mallocz(desc->content_size); + if (!unit->content) + return AVERROR(ENOMEM); + + unit->content_ref = + av_buffer_create(unit->content, desc->content_size, + desc->content_type == CBS_CONTENT_TYPE_COMPLEX + ? desc->type.complex.content_free + : cbs_default_free_unit_content, + (void*)desc, 0); + if (!unit->content_ref) { + av_freep(&unit->content); + return AVERROR(ENOMEM); + } + + return 0; +} + +static int cbs_clone_internal_refs_unit_content(AVBufferRef **clone_ref, + const CodedBitstreamUnit *unit, + const CodedBitstreamUnitTypeDescriptor *desc) +{ + const uint8_t *src; + uint8_t *copy; + int err, i; + + av_assert0(unit->content); + src = unit->content; + + copy = av_memdup(src, desc->content_size); + if (!copy) + return AVERROR(ENOMEM); + + for (i = 0; i < desc->type.ref.nb_offsets; i++) { + const uint8_t *const *src_ptr = (const uint8_t* const*)(src + desc->type.ref.offsets[i]); + const AVBufferRef *src_buf = *(AVBufferRef**)(src_ptr + 1); + uint8_t **copy_ptr = (uint8_t**)(copy + desc->type.ref.offsets[i]); + AVBufferRef **copy_buf = (AVBufferRef**)(copy_ptr + 1); + + if (!*src_ptr) { + av_assert0(!src_buf); + continue; + } + if (!src_buf) { + // We can't handle a non-refcounted pointer here - we don't + // have enough information to handle whatever structure lies + // at the other end of it. 
+ err = AVERROR(EINVAL); + goto fail; + } + + *copy_buf = av_buffer_ref(src_buf); + if (!*copy_buf) { + err = AVERROR(ENOMEM); + goto fail; + } + } + + *clone_ref = av_buffer_create(copy, desc->content_size, + cbs_default_free_unit_content, + (void*)desc, 0); + if (!*clone_ref) { + err = AVERROR(ENOMEM); + goto fail; + } + + return 0; + +fail: + /* Drop the references taken so far. Each offset locates a data pointer; + * the AVBufferRef pointer that owns it is stored immediately after it + * (same layout as used by the loop above and by + * cbs_default_free_unit_content()), so that following slot is the one + * to unreference. Slots that were never referenced hold NULL, which + * av_buffer_unref() is called on harmlessly here. */ + for (--i; i >= 0; i--) + av_buffer_unref((AVBufferRef**)((uint8_t**)(copy + desc->type.ref.offsets[i]) + 1)); + av_freep(&copy); + *clone_ref = NULL; + return err; +} + +/* + * On success, unit->content and unit->content_ref are updated with + * the new content; unit is untouched on failure. + * Any old content_ref is simply overwritten and not freed. + */ +static int cbs_clone_unit_content(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit) +{ + const CodedBitstreamUnitTypeDescriptor *desc; + AVBufferRef *ref; + int err; + + desc = cbs_find_unit_type_desc(ctx, unit); + if (!desc) + return AVERROR(ENOSYS); + + switch (desc->content_type) { + case CBS_CONTENT_TYPE_INTERNAL_REFS: + err = cbs_clone_internal_refs_unit_content(&ref, unit, desc); + break; + + case CBS_CONTENT_TYPE_COMPLEX: + if (!desc->type.complex.content_clone) + return AVERROR_PATCHWELCOME; + err = desc->type.complex.content_clone(&ref, unit); + break; + + default: + av_assert0(0 && "Invalid content type."); + } + + if (err < 0) + return err; + + unit->content_ref = ref; + unit->content = ref->data; + return 0; +} + +/* + * Ensure unit->content is backed by a reference: nothing to do when + * content_ref is already set, otherwise the content is cloned. + */ +int ff_cbs_make_unit_refcounted(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit) +{ + av_assert0(unit->content); + if (unit->content_ref) + return 0; + return cbs_clone_unit_content(ctx, unit); +} + +/* + * Ensure unit->content may be modified: nothing to do when content_ref + * exists and is writable, otherwise the content is cloned. + */ +int ff_cbs_make_unit_writable(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit) +{ + AVBufferRef *ref = unit->content_ref; + int err; + + av_assert0(unit->content); + if (ref && av_buffer_is_writable(ref)) + return 0; + + err = cbs_clone_unit_content(ctx, unit); + if (err < 0) + return err; + /* The clone overwrote unit->content_ref without freeing it; release the + * old reference saved above only now that the clone has succeeded. */ + av_buffer_unref(&ref); + return 0; +} diff --git 
a/media/ffvpx/libavcodec/cbs.h b/media/ffvpx/libavcodec/cbs.h new file mode 100644 index 0000000000..ee21623dac --- /dev/null +++ b/media/ffvpx/libavcodec/cbs.h @@ -0,0 +1,436 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_CBS_H +#define AVCODEC_CBS_H + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/buffer.h" + +#include "codec_id.h" +#include "codec_par.h" +#include "packet.h" + + +/* + * This defines a framework for converting between a coded bitstream + * and structures defining all individual syntax elements found in + * such a stream. + * + * Conversion in both directions is possible. Given a coded bitstream + * (any meaningful fragment), it can be parsed and decomposed into + * syntax elements stored in a set of codec-specific structures. + * Similarly, given a set of those same codec-specific structures the + * syntax elements can be serialised and combined to create a coded + * bitstream. + */ + +struct AVCodecContext; +struct CodedBitstreamType; + +/** + * The codec-specific type of a bitstream unit. 
+ * + * AV1: obu_type + * H.264 / AVC: nal_unit_type + * H.265 / HEVC: nal_unit_type + * JPEG: marker value (without 0xff prefix) + * MPEG-2: start code value (without prefix) + * VP9: unused, set to zero (every unit is a frame) + */ +typedef uint32_t CodedBitstreamUnitType; + +/** + * Coded bitstream unit structure. + * + * A bitstream unit is the smallest element of a bitstream which + * is meaningful on its own. For example, an H.264 NAL unit. + * + * See the codec-specific header for the meaning of this for any + * particular codec. + */ +typedef struct CodedBitstreamUnit { + /** + * Codec-specific type of this unit. + */ + CodedBitstreamUnitType type; + + /** + * Pointer to the directly-parsable bitstream form of this unit. + * + * May be NULL if the unit currently only exists in decomposed form. + */ + uint8_t *data; + /** + * The number of bytes in the bitstream (including any padding bits + * in the final byte). + */ + size_t data_size; + /** + * The number of bits which should be ignored in the final byte. + * + * This supports non-byte-aligned bitstreams. + */ + size_t data_bit_padding; + /** + * A reference to the buffer containing data. + * + * Must be set if data is not NULL. + */ + AVBufferRef *data_ref; + + /** + * Pointer to the decomposed form of this unit. + * + * The type of this structure depends on both the codec and the + * type of this unit. May be NULL if the unit only exists in + * bitstream form. + */ + void *content; + /** + * If content is reference counted, a reference to the buffer containing + * content. Null if content is not reference counted. + */ + AVBufferRef *content_ref; +} CodedBitstreamUnit; + +/** + * Coded bitstream fragment structure, combining one or more units. + * + * This is any sequence of units. It need not form some greater whole, + * though in many cases it will. For example, an H.264 access unit, + * which is composed of a sequence of H.264 NAL units. 
+ */ +typedef struct CodedBitstreamFragment { + /** + * Pointer to the bitstream form of this fragment. + * + * May be NULL if the fragment only exists as component units. + */ + uint8_t *data; + /** + * The number of bytes in the bitstream (including any padding bits + * in the final byte). + */ + size_t data_size; + /** + * The number of bits which should be ignored in the final byte. + */ + size_t data_bit_padding; + /** + * A reference to the buffer containing data. + * + * Must be set if data is not NULL. + */ + AVBufferRef *data_ref; + + /** + * Number of units in this fragment. + * + * This may be zero if the fragment only exists in bitstream form + * and has not been decomposed. + */ + int nb_units; + + /** + * Number of allocated units. + * + * Must always be >= nb_units; designed for internal use by cbs. + */ + int nb_units_allocated; + + /** + * Pointer to an array of units of length nb_units_allocated. + * Only the first nb_units are valid. + * + * Must be NULL if nb_units_allocated is zero. + */ + CodedBitstreamUnit *units; +} CodedBitstreamFragment; + +/** + * Context structure for coded bitstream operations. + */ +typedef struct CodedBitstreamContext { + /** + * Logging context to be passed to all av_log() calls associated + * with this context. + */ + void *log_ctx; + + /** + * Internal codec-specific hooks. + */ + const struct CodedBitstreamType *codec; + + /** + * Internal codec-specific data. + * + * This contains any information needed when reading/writing + * bitstreams which will not necessarily be present in a fragment. + * For example, for H.264 it contains all currently visible + * parameter sets - they are required to determine the bitstream + * syntax but need not be present in every access unit. + */ + void *priv_data; + + /** + * Array of unit types which should be decomposed when reading. + * + * Types not in this list will be available in bitstream form only. 
+ * If NULL, all supported types will be decomposed. + */ + const CodedBitstreamUnitType *decompose_unit_types; + /** + * Length of the decompose_unit_types array. + */ + int nb_decompose_unit_types; + + /** + * Enable trace output during read/write operations. + */ + int trace_enable; + /** + * Log level to use for trace output. + * + * From AV_LOG_*; defaults to AV_LOG_TRACE. + */ + int trace_level; + + /** + * Write buffer. Used as intermediate buffer when writing units. + * For internal use of cbs only. + */ + uint8_t *write_buffer; + size_t write_buffer_size; +} CodedBitstreamContext; + + +/** + * Table of all supported codec IDs. + * + * Terminated by AV_CODEC_ID_NONE. + */ +extern const enum AVCodecID ff_cbs_all_codec_ids[]; + + +/** + * Create and initialise a new context for the given codec. + */ +int ff_cbs_init(CodedBitstreamContext **ctx, + enum AVCodecID codec_id, void *log_ctx); + +/** + * Reset all internal state in a context. + */ +void ff_cbs_flush(CodedBitstreamContext *ctx); + +/** + * Close a context and free all internal state. + */ +void ff_cbs_close(CodedBitstreamContext **ctx); + + +/** + * Read the extradata bitstream found in codec parameters into a + * fragment, then split into units and decompose. + * + * This also updates the internal state, so will need to be called for + * codecs with extradata to read parameter sets necessary for further + * parsing even if the fragment itself is not desired. + * + * The fragment must have been zeroed or reset via ff_cbs_fragment_reset + * before use. + */ +int ff_cbs_read_extradata(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const AVCodecParameters *par); + +/** + * Read the extradata bitstream found in a codec context into a + * fragment, then split into units and decompose. + * + * This acts identical to ff_cbs_read_extradata() for the case where + * you already have a codec context. 
+ */ +int ff_cbs_read_extradata_from_codec(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const struct AVCodecContext *avctx); + +int ff_cbs_read_packet_side_data(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const AVPacket *pkt); + +/** + * Read the data bitstream from a packet into a fragment, then + * split into units and decompose. + * + * This also updates the internal state of the coded bitstream context + * with any persistent data from the fragment which may be required to + * read following fragments (e.g. parameter sets). + * + * The fragment must have been zeroed or reset via ff_cbs_fragment_reset + * before use. + */ +int ff_cbs_read_packet(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const AVPacket *pkt); + +/** + * Read a bitstream from a memory region into a fragment, then + * split into units and decompose. + * + * This also updates the internal state of the coded bitstream context + * with any persistent data from the fragment which may be required to + * read following fragments (e.g. parameter sets). + * + * The fragment must have been zeroed or reset via ff_cbs_fragment_reset + * before use. + */ +int ff_cbs_read(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + const uint8_t *data, size_t size); + + +/** + * Write the content of the fragment to its own internal buffer. + * + * Writes the content of all units and then assembles them into a new + * data buffer. When modifying the content of decomposed units, this + * can be used to regenerate the bitstream form of units or the whole + * fragment so that it can be extracted for other use. + * + * This also updates the internal state of the coded bitstream context + * with any persistent data from the fragment which may be required to + * write following fragments (e.g. parameter sets). 
+ */ +int ff_cbs_write_fragment_data(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag); + +/** + * Write the bitstream of a fragment to the extradata in codec parameters. + * + * Modifies context and fragment as ff_cbs_write_fragment_data does and + * replaces any existing extradata in the structure. + */ +int ff_cbs_write_extradata(CodedBitstreamContext *ctx, + AVCodecParameters *par, + CodedBitstreamFragment *frag); + +/** + * Write the bitstream of a fragment to a packet. + * + * Modifies context and fragment as ff_cbs_write_fragment_data does. + * + * On success, the packet's buf is unreferenced and its buf, data and + * size fields are set to the corresponding values from the newly updated + * fragment; other fields are not touched. On failure, the packet is not + * touched at all. + */ +int ff_cbs_write_packet(CodedBitstreamContext *ctx, + AVPacket *pkt, + CodedBitstreamFragment *frag); + + +/** + * Free the units contained in a fragment as well as the fragment's + * own data buffer, but not the units array itself. + */ +void ff_cbs_fragment_reset(CodedBitstreamFragment *frag); + +/** + * Free the units array of a fragment in addition to what + * ff_cbs_fragment_reset does. + */ +void ff_cbs_fragment_free(CodedBitstreamFragment *frag); + +/** + * Allocate a new internal content buffer matching the type of the unit. + * + * The content will be zeroed. + */ +int ff_cbs_alloc_unit_content(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit); + +/** + * Insert a new unit into a fragment with the given content. + * + * The content structure continues to be owned by the caller if + * content_buf is not supplied. + */ +int ff_cbs_insert_unit_content(CodedBitstreamFragment *frag, + int position, + CodedBitstreamUnitType type, + void *content, + AVBufferRef *content_buf); + +/** + * Add a new unit to a fragment with the given data bitstream. 
+ * + * If data_buf is not supplied then data must have been allocated with + * av_malloc() and will on success become owned by the unit after this + * call or freed on error. + */ +int ff_cbs_append_unit_data(CodedBitstreamFragment *frag, + CodedBitstreamUnitType type, + uint8_t *data, size_t data_size, + AVBufferRef *data_buf); + +/** + * Delete a unit from a fragment and free all memory it uses. + * + * Requires position to be >= 0 and < frag->nb_units. + */ +void ff_cbs_delete_unit(CodedBitstreamFragment *frag, + int position); + + +/** + * Make the content of a unit refcounted. + * + * If the unit is not refcounted, this will do a deep copy of the unit + * content to new refcounted buffers. + * + * It is not valid to call this function on a unit which does not have + * decomposed content. + */ +int ff_cbs_make_unit_refcounted(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit); + +/** + * Make the content of a unit writable so that internal fields can be + * modified. + * + * If it is known that there are no other references to the content of + * the unit, does nothing and returns success. Otherwise (including the + * case where the unit content is not refcounted), it does a full clone + * of the content (including any internal buffers) to make a new copy, + * and replaces the existing references inside the unit with that. + * + * It is not valid to call this function on a unit which does not have + * decomposed content. + */ +int ff_cbs_make_unit_writable(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit); + + +#endif /* AVCODEC_CBS_H */ diff --git a/media/ffvpx/libavcodec/cbs_av1.c b/media/ffvpx/libavcodec/cbs_av1.c new file mode 100644 index 0000000000..45e1288a51 --- /dev/null +++ b/media/ffvpx/libavcodec/cbs_av1.c @@ -0,0 +1,1366 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/opt.h" +#include "libavutil/pixfmt.h" + +#include "avcodec.h" +#include "cbs.h" +#include "cbs_internal.h" +#include "cbs_av1.h" + + +static int cbs_av1_read_uvlc(CodedBitstreamContext *ctx, GetBitContext *gbc, + const char *name, uint32_t *write_to, + uint32_t range_min, uint32_t range_max) +{ + uint32_t zeroes, bits_value, value; + int position; + + if (ctx->trace_enable) + position = get_bits_count(gbc); + + zeroes = 0; + while (1) { + if (get_bits_left(gbc) < 1) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at " + "%s: bitstream ended.\n", name); + return AVERROR_INVALIDDATA; + } + + if (get_bits1(gbc)) + break; + ++zeroes; + } + + if (zeroes >= 32) { + value = MAX_UINT_BITS(32); + } else { + if (get_bits_left(gbc) < zeroes) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid uvlc code at " + "%s: bitstream ended.\n", name); + return AVERROR_INVALIDDATA; + } + + bits_value = get_bits_long(gbc, zeroes); + value = bits_value + (UINT32_C(1) << zeroes) - 1; + } + + if (ctx->trace_enable) { + char bits[65]; + int i, j, k; + + if (zeroes >= 32) { + while (zeroes > 32) { + k = FFMIN(zeroes - 32, 32); + for (i = 0; i < k; i++) + bits[i] = '0'; + bits[i] = 0; + 
ff_cbs_trace_syntax_element(ctx, position, name, + NULL, bits, 0); + zeroes -= k; + position += k; + } + } + + for (i = 0; i < zeroes; i++) + bits[i] = '0'; + bits[i++] = '1'; + + if (zeroes < 32) { + for (j = 0; j < zeroes; j++) + bits[i++] = (bits_value >> (zeroes - j - 1) & 1) ? '1' : '0'; + } + + bits[i] = 0; + ff_cbs_trace_syntax_element(ctx, position, name, + NULL, bits, value); + } + + if (value < range_min || value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n", + name, value, range_min, range_max); + return AVERROR_INVALIDDATA; + } + + *write_to = value; + return 0; +} + +static int cbs_av1_write_uvlc(CodedBitstreamContext *ctx, PutBitContext *pbc, + const char *name, uint32_t value, + uint32_t range_min, uint32_t range_max) +{ + uint32_t v; + int position, zeroes; + + if (value < range_min || value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n", + name, value, range_min, range_max); + return AVERROR_INVALIDDATA; + } + + if (ctx->trace_enable) + position = put_bits_count(pbc); + + zeroes = av_log2(value + 1); + v = value - (1U << zeroes) + 1; + put_bits(pbc, zeroes, 0); + put_bits(pbc, 1, 1); + put_bits(pbc, zeroes, v); + + if (ctx->trace_enable) { + char bits[65]; + int i, j; + i = 0; + for (j = 0; j < zeroes; j++) + bits[i++] = '0'; + bits[i++] = '1'; + for (j = 0; j < zeroes; j++) + bits[i++] = (v >> (zeroes - j - 1) & 1) ? 
'1' : '0'; + bits[i++] = 0; + ff_cbs_trace_syntax_element(ctx, position, name, NULL, + bits, value); + } + + return 0; +} + +static int cbs_av1_read_leb128(CodedBitstreamContext *ctx, GetBitContext *gbc, + const char *name, uint64_t *write_to) +{ + uint64_t value; + int position, err, i; + + if (ctx->trace_enable) + position = get_bits_count(gbc); + + value = 0; + for (i = 0; i < 8; i++) { + int subscript[2] = { 1, i }; + uint32_t byte; + err = ff_cbs_read_unsigned(ctx, gbc, 8, "leb128_byte[i]", subscript, + &byte, 0x00, 0xff); + if (err < 0) + return err; + + value |= (uint64_t)(byte & 0x7f) << (i * 7); + if (!(byte & 0x80)) + break; + } + + if (value > UINT32_MAX) + return AVERROR_INVALIDDATA; + + if (ctx->trace_enable) + ff_cbs_trace_syntax_element(ctx, position, name, NULL, "", value); + + *write_to = value; + return 0; +} + +static int cbs_av1_write_leb128(CodedBitstreamContext *ctx, PutBitContext *pbc, + const char *name, uint64_t value) +{ + int position, err, len, i; + uint8_t byte; + + len = (av_log2(value) + 7) / 7; + + if (ctx->trace_enable) + position = put_bits_count(pbc); + + for (i = 0; i < len; i++) { + int subscript[2] = { 1, i }; + + byte = value >> (7 * i) & 0x7f; + if (i < len - 1) + byte |= 0x80; + + err = ff_cbs_write_unsigned(ctx, pbc, 8, "leb128_byte[i]", subscript, + byte, 0x00, 0xff); + if (err < 0) + return err; + } + + if (ctx->trace_enable) + ff_cbs_trace_syntax_element(ctx, position, name, NULL, "", value); + + return 0; +} + +static int cbs_av1_read_ns(CodedBitstreamContext *ctx, GetBitContext *gbc, + uint32_t n, const char *name, + const int *subscripts, uint32_t *write_to) +{ + uint32_t m, v, extra_bit, value; + int position, w; + + av_assert0(n > 0); + + if (ctx->trace_enable) + position = get_bits_count(gbc); + + w = av_log2(n) + 1; + m = (1 << w) - n; + + if (get_bits_left(gbc) < w) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid non-symmetric value at " + "%s: bitstream ended.\n", name); + return AVERROR_INVALIDDATA; + } + + if 
(w - 1 > 0) + v = get_bits(gbc, w - 1); + else + v = 0; + + if (v < m) { + value = v; + } else { + extra_bit = get_bits1(gbc); + value = (v << 1) - m + extra_bit; + } + + if (ctx->trace_enable) { + char bits[33]; + int i; + for (i = 0; i < w - 1; i++) + bits[i] = (v >> i & 1) ? '1' : '0'; + if (v >= m) + bits[i++] = extra_bit ? '1' : '0'; + bits[i] = 0; + + ff_cbs_trace_syntax_element(ctx, position, + name, subscripts, bits, value); + } + + *write_to = value; + return 0; +} + +static int cbs_av1_write_ns(CodedBitstreamContext *ctx, PutBitContext *pbc, + uint32_t n, const char *name, + const int *subscripts, uint32_t value) +{ + uint32_t w, m, v, extra_bit; + int position; + + if (value > n) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRIu32", but must be in [0,%"PRIu32"].\n", + name, value, n); + return AVERROR_INVALIDDATA; + } + + if (ctx->trace_enable) + position = put_bits_count(pbc); + + w = av_log2(n) + 1; + m = (1 << w) - n; + + if (put_bits_left(pbc) < w) + return AVERROR(ENOSPC); + + if (value < m) { + v = value; + put_bits(pbc, w - 1, v); + } else { + v = m + ((value - m) >> 1); + extra_bit = (value - m) & 1; + put_bits(pbc, w - 1, v); + put_bits(pbc, 1, extra_bit); + } + + if (ctx->trace_enable) { + char bits[33]; + int i; + for (i = 0; i < w - 1; i++) + bits[i] = (v >> i & 1) ? '1' : '0'; + if (value >= m) + bits[i++] = extra_bit ? 
'1' : '0'; + bits[i] = 0; + + ff_cbs_trace_syntax_element(ctx, position, + name, subscripts, bits, value); + } + + return 0; +} + +static int cbs_av1_read_increment(CodedBitstreamContext *ctx, GetBitContext *gbc, + uint32_t range_min, uint32_t range_max, + const char *name, uint32_t *write_to) +{ + uint32_t value; + int position, i; + char bits[33]; + + av_assert0(range_min <= range_max && range_max - range_min < sizeof(bits) - 1); + if (ctx->trace_enable) + position = get_bits_count(gbc); + + for (i = 0, value = range_min; value < range_max;) { + if (get_bits_left(gbc) < 1) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid increment value at " + "%s: bitstream ended.\n", name); + return AVERROR_INVALIDDATA; + } + if (get_bits1(gbc)) { + bits[i++] = '1'; + ++value; + } else { + bits[i++] = '0'; + break; + } + } + + if (ctx->trace_enable) { + bits[i] = 0; + ff_cbs_trace_syntax_element(ctx, position, + name, NULL, bits, value); + } + + *write_to = value; + return 0; +} + +static int cbs_av1_write_increment(CodedBitstreamContext *ctx, PutBitContext *pbc, + uint32_t range_min, uint32_t range_max, + const char *name, uint32_t value) +{ + int len; + + av_assert0(range_min <= range_max && range_max - range_min < 32); + if (value < range_min || value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRIu32", but must be in [%"PRIu32",%"PRIu32"].\n", + name, value, range_min, range_max); + return AVERROR_INVALIDDATA; + } + + if (value == range_max) + len = range_max - range_min; + else + len = value - range_min + 1; + if (put_bits_left(pbc) < len) + return AVERROR(ENOSPC); + + if (ctx->trace_enable) { + char bits[33]; + int i; + for (i = 0; i < len; i++) { + if (range_min + i == value) + bits[i] = '0'; + else + bits[i] = '1'; + } + bits[i] = 0; + ff_cbs_trace_syntax_element(ctx, put_bits_count(pbc), + name, NULL, bits, value); + } + + if (len > 0) + put_bits(pbc, len, (1 << len) - 1 - (value != range_max)); + + return 0; +} + +static int 
cbs_av1_read_subexp(CodedBitstreamContext *ctx, GetBitContext *gbc, + uint32_t range_max, const char *name, + const int *subscripts, uint32_t *write_to) +{ + uint32_t value; + int position, err; + uint32_t max_len, len, range_offset, range_bits; + + if (ctx->trace_enable) + position = get_bits_count(gbc); + + av_assert0(range_max > 0); + max_len = av_log2(range_max - 1) - 3; + + err = cbs_av1_read_increment(ctx, gbc, 0, max_len, + "subexp_more_bits", &len); + if (err < 0) + return err; + + if (len) { + range_bits = 2 + len; + range_offset = 1 << range_bits; + } else { + range_bits = 3; + range_offset = 0; + } + + if (len < max_len) { + err = ff_cbs_read_unsigned(ctx, gbc, range_bits, + "subexp_bits", NULL, &value, + 0, MAX_UINT_BITS(range_bits)); + if (err < 0) + return err; + + } else { + err = cbs_av1_read_ns(ctx, gbc, range_max - range_offset, + "subexp_final_bits", NULL, &value); + if (err < 0) + return err; + } + value += range_offset; + + if (ctx->trace_enable) + ff_cbs_trace_syntax_element(ctx, position, + name, subscripts, "", value); + + *write_to = value; + return err; +} + +static int cbs_av1_write_subexp(CodedBitstreamContext *ctx, PutBitContext *pbc, + uint32_t range_max, const char *name, + const int *subscripts, uint32_t value) +{ + int position, err; + uint32_t max_len, len, range_offset, range_bits; + + if (value > range_max) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "%s out of range: " + "%"PRIu32", but must be in [0,%"PRIu32"].\n", + name, value, range_max); + return AVERROR_INVALIDDATA; + } + + if (ctx->trace_enable) + position = put_bits_count(pbc); + + av_assert0(range_max > 0); + max_len = av_log2(range_max - 1) - 3; + + if (value < 8) { + range_bits = 3; + range_offset = 0; + len = 0; + } else { + range_bits = av_log2(value); + len = range_bits - 2; + if (len > max_len) { + // The top bin is combined with the one below it. 
+ av_assert0(len == max_len + 1); + --range_bits; + len = max_len; + } + range_offset = 1 << range_bits; + } + + err = cbs_av1_write_increment(ctx, pbc, 0, max_len, + "subexp_more_bits", len); + if (err < 0) + return err; + + if (len < max_len) { + err = ff_cbs_write_unsigned(ctx, pbc, range_bits, + "subexp_bits", NULL, + value - range_offset, + 0, MAX_UINT_BITS(range_bits)); + if (err < 0) + return err; + + } else { + err = cbs_av1_write_ns(ctx, pbc, range_max - range_offset, + "subexp_final_bits", NULL, + value - range_offset); + if (err < 0) + return err; + } + + if (ctx->trace_enable) + ff_cbs_trace_syntax_element(ctx, position, + name, subscripts, "", value); + + return err; +} + + +static int cbs_av1_tile_log2(int blksize, int target) +{ + int k; + for (k = 0; (blksize << k) < target; k++); + return k; +} + +static int cbs_av1_get_relative_dist(const AV1RawSequenceHeader *seq, + unsigned int a, unsigned int b) +{ + unsigned int diff, m; + if (!seq->enable_order_hint) + return 0; + diff = a - b; + m = 1 << seq->order_hint_bits_minus_1; + diff = (diff & (m - 1)) - (diff & m); + return diff; +} + +static size_t cbs_av1_get_payload_bytes_left(GetBitContext *gbc) +{ + GetBitContext tmp = *gbc; + size_t size = 0; + for (int i = 0; get_bits_left(&tmp) >= 8; i++) { + if (get_bits(&tmp, 8)) + size = i; + } + return size; +} + + +#define HEADER(name) do { \ + ff_cbs_trace_header(ctx, name); \ + } while (0) + +#define CHECK(call) do { \ + err = (call); \ + if (err < 0) \ + return err; \ + } while (0) + +#define FUNC_NAME(rw, codec, name) cbs_ ## codec ## _ ## rw ## _ ## name +#define FUNC_AV1(rw, name) FUNC_NAME(rw, av1, name) +#define FUNC(name) FUNC_AV1(READWRITE, name) + +#define SUBSCRIPTS(subs, ...) (subs > 0 ? 
((int[subs + 1]){ subs, __VA_ARGS__ }) : NULL) + +#define fb(width, name) \ + xf(width, name, current->name, 0, MAX_UINT_BITS(width), 0, ) +#define fc(width, name, range_min, range_max) \ + xf(width, name, current->name, range_min, range_max, 0, ) +#define flag(name) fb(1, name) +#define su(width, name) \ + xsu(width, name, current->name, 0, ) + +#define fbs(width, name, subs, ...) \ + xf(width, name, current->name, 0, MAX_UINT_BITS(width), subs, __VA_ARGS__) +#define fcs(width, name, range_min, range_max, subs, ...) \ + xf(width, name, current->name, range_min, range_max, subs, __VA_ARGS__) +#define flags(name, subs, ...) \ + xf(1, name, current->name, 0, 1, subs, __VA_ARGS__) +#define sus(width, name, subs, ...) \ + xsu(width, name, current->name, subs, __VA_ARGS__) + +#define fixed(width, name, value) do { \ + av_unused uint32_t fixed_value = value; \ + xf(width, name, fixed_value, value, value, 0, ); \ + } while (0) + + +#define READ +#define READWRITE read +#define RWContext GetBitContext + +#define xf(width, name, var, range_min, range_max, subs, ...) do { \ + uint32_t value; \ + CHECK(ff_cbs_read_unsigned(ctx, rw, width, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), \ + &value, range_min, range_max)); \ + var = value; \ + } while (0) + +#define xsu(width, name, var, subs, ...) do { \ + int32_t value; \ + CHECK(ff_cbs_read_signed(ctx, rw, width, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), &value, \ + MIN_INT_BITS(width), \ + MAX_INT_BITS(width))); \ + var = value; \ + } while (0) + +#define uvlc(name, range_min, range_max) do { \ + uint32_t value; \ + CHECK(cbs_av1_read_uvlc(ctx, rw, #name, \ + &value, range_min, range_max)); \ + current->name = value; \ + } while (0) + +#define ns(max_value, name, subs, ...) 
do { \ + uint32_t value; \ + CHECK(cbs_av1_read_ns(ctx, rw, max_value, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), &value)); \ + current->name = value; \ + } while (0) + +#define increment(name, min, max) do { \ + uint32_t value; \ + CHECK(cbs_av1_read_increment(ctx, rw, min, max, #name, &value)); \ + current->name = value; \ + } while (0) + +#define subexp(name, max, subs, ...) do { \ + uint32_t value; \ + CHECK(cbs_av1_read_subexp(ctx, rw, max, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), &value)); \ + current->name = value; \ + } while (0) + +#define delta_q(name) do { \ + uint8_t delta_coded; \ + int8_t delta_q; \ + xf(1, name.delta_coded, delta_coded, 0, 1, 0, ); \ + if (delta_coded) \ + xsu(1 + 6, name.delta_q, delta_q, 0, ); \ + else \ + delta_q = 0; \ + current->name = delta_q; \ + } while (0) + +#define leb128(name) do { \ + uint64_t value; \ + CHECK(cbs_av1_read_leb128(ctx, rw, #name, &value)); \ + current->name = value; \ + } while (0) + +#define infer(name, value) do { \ + current->name = value; \ + } while (0) + +#define byte_alignment(rw) (get_bits_count(rw) % 8) + +#include "cbs_av1_syntax_template.c" + +#undef READ +#undef READWRITE +#undef RWContext +#undef xf +#undef xsu +#undef uvlc +#undef ns +#undef increment +#undef subexp +#undef delta_q +#undef leb128 +#undef infer +#undef byte_alignment + + +#define WRITE +#define READWRITE write +#define RWContext PutBitContext + +#define xf(width, name, var, range_min, range_max, subs, ...) do { \ + CHECK(ff_cbs_write_unsigned(ctx, rw, width, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), \ + var, range_min, range_max)); \ + } while (0) + +#define xsu(width, name, var, subs, ...) 
do { \ + CHECK(ff_cbs_write_signed(ctx, rw, width, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), var, \ + MIN_INT_BITS(width), \ + MAX_INT_BITS(width))); \ + } while (0) + +#define uvlc(name, range_min, range_max) do { \ + CHECK(cbs_av1_write_uvlc(ctx, rw, #name, current->name, \ + range_min, range_max)); \ + } while (0) + +#define ns(max_value, name, subs, ...) do { \ + CHECK(cbs_av1_write_ns(ctx, rw, max_value, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), \ + current->name)); \ + } while (0) + +#define increment(name, min, max) do { \ + CHECK(cbs_av1_write_increment(ctx, rw, min, max, #name, \ + current->name)); \ + } while (0) + +#define subexp(name, max, subs, ...) do { \ + CHECK(cbs_av1_write_subexp(ctx, rw, max, #name, \ + SUBSCRIPTS(subs, __VA_ARGS__), \ + current->name)); \ + } while (0) + +#define delta_q(name) do { \ + xf(1, name.delta_coded, current->name != 0, 0, 1, 0, ); \ + if (current->name) \ + xsu(1 + 6, name.delta_q, current->name, 0, ); \ + } while (0) + +#define leb128(name) do { \ + CHECK(cbs_av1_write_leb128(ctx, rw, #name, current->name)); \ + } while (0) + +#define infer(name, value) do { \ + if (current->name != (value)) { \ + av_log(ctx->log_ctx, AV_LOG_ERROR, \ + "%s does not match inferred value: " \ + "%"PRId64", but should be %"PRId64".\n", \ + #name, (int64_t)current->name, (int64_t)(value)); \ + return AVERROR_INVALIDDATA; \ + } \ + } while (0) + +#define byte_alignment(rw) (put_bits_count(rw) % 8) + +#include "cbs_av1_syntax_template.c" + +#undef WRITE +#undef READWRITE +#undef RWContext +#undef xf +#undef xsu +#undef uvlc +#undef ns +#undef increment +#undef subexp +#undef delta_q +#undef leb128 +#undef infer +#undef byte_alignment + + +static int cbs_av1_split_fragment(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + int header) +{ + GetBitContext gbc; + uint8_t *data; + size_t size; + uint64_t obu_length; + int pos, err, trace; + + // Don't include this parsing in trace output. 
+ trace = ctx->trace_enable; + ctx->trace_enable = 0; + + data = frag->data; + size = frag->data_size; + + if (INT_MAX / 8 < size) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid fragment: " + "too large (%"SIZE_SPECIFIER" bytes).\n", size); + err = AVERROR_INVALIDDATA; + goto fail; + } + + if (header && size && data[0] & 0x80) { + // first bit is nonzero, the extradata does not consist purely of + // OBUs. Expect MP4/Matroska AV1CodecConfigurationRecord + int config_record_version = data[0] & 0x7f; + + if (config_record_version != 1) { + av_log(ctx->log_ctx, AV_LOG_ERROR, + "Unknown version %d of AV1CodecConfigurationRecord " + "found!\n", + config_record_version); + err = AVERROR_INVALIDDATA; + goto fail; + } + + if (size <= 4) { + if (size < 4) { + av_log(ctx->log_ctx, AV_LOG_WARNING, + "Undersized AV1CodecConfigurationRecord v%d found!\n", + config_record_version); + err = AVERROR_INVALIDDATA; + goto fail; + } + + goto success; + } + + // In AV1CodecConfigurationRecord v1, actual OBUs start after + // four bytes. Thus set the offset as required for properly + // parsing them. 
+ data += 4; + size -= 4; + } + + while (size > 0) { + AV1RawOBUHeader header; + uint64_t obu_size; + + init_get_bits(&gbc, data, 8 * size); + + err = cbs_av1_read_obu_header(ctx, &gbc, &header); + if (err < 0) + goto fail; + + if (header.obu_has_size_field) { + if (get_bits_left(&gbc) < 8) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid OBU: fragment " + "too short (%"SIZE_SPECIFIER" bytes).\n", size); + err = AVERROR_INVALIDDATA; + goto fail; + } + err = cbs_av1_read_leb128(ctx, &gbc, "obu_size", &obu_size); + if (err < 0) + goto fail; + } else + obu_size = size - 1 - header.obu_extension_flag; + + pos = get_bits_count(&gbc); + av_assert0(pos % 8 == 0 && pos / 8 <= size); + + obu_length = pos / 8 + obu_size; + + if (size < obu_length) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid OBU length: " + "%"PRIu64", but only %"SIZE_SPECIFIER" bytes remaining in fragment.\n", + obu_length, size); + err = AVERROR_INVALIDDATA; + goto fail; + } + + err = ff_cbs_append_unit_data(frag, header.obu_type, + data, obu_length, frag->data_ref); + if (err < 0) + goto fail; + + data += obu_length; + size -= obu_length; + } + +success: + err = 0; +fail: + ctx->trace_enable = trace; + return err; +} + +static int cbs_av1_ref_tile_data(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit, + GetBitContext *gbc, + AV1RawTileData *td) +{ + int pos; + + pos = get_bits_count(gbc); + if (pos >= 8 * unit->data_size) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Bitstream ended before " + "any data in tile group (%d bits read).\n", pos); + return AVERROR_INVALIDDATA; + } + // Must be byte-aligned at this point. 
+ av_assert0(pos % 8 == 0); + + td->data_ref = av_buffer_ref(unit->data_ref); + if (!td->data_ref) + return AVERROR(ENOMEM); + + td->data = unit->data + pos / 8; + td->data_size = unit->data_size - pos / 8; + + return 0; +} + +static int cbs_av1_read_unit(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + AV1RawOBU *obu; + GetBitContext gbc; + int err, start_pos, end_pos; + + err = ff_cbs_alloc_unit_content(ctx, unit); + if (err < 0) + return err; + obu = unit->content; + + err = init_get_bits(&gbc, unit->data, 8 * unit->data_size); + if (err < 0) + return err; + + err = cbs_av1_read_obu_header(ctx, &gbc, &obu->header); + if (err < 0) + return err; + av_assert0(obu->header.obu_type == unit->type); + + if (obu->header.obu_has_size_field) { + uint64_t obu_size; + err = cbs_av1_read_leb128(ctx, &gbc, "obu_size", &obu_size); + if (err < 0) + return err; + obu->obu_size = obu_size; + } else { + if (unit->data_size < 1 + obu->header.obu_extension_flag) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid OBU length: " + "unit too short (%"SIZE_SPECIFIER").\n", unit->data_size); + return AVERROR_INVALIDDATA; + } + obu->obu_size = unit->data_size - 1 - obu->header.obu_extension_flag; + } + + start_pos = get_bits_count(&gbc); + + if (obu->header.obu_extension_flag) { + if (obu->header.obu_type != AV1_OBU_SEQUENCE_HEADER && + obu->header.obu_type != AV1_OBU_TEMPORAL_DELIMITER && + priv->operating_point_idc) { + int in_temporal_layer = + (priv->operating_point_idc >> priv->temporal_id ) & 1; + int in_spatial_layer = + (priv->operating_point_idc >> (priv->spatial_id + 8)) & 1; + if (!in_temporal_layer || !in_spatial_layer) { + return AVERROR(EAGAIN); // drop_obu() + } + } + } + + switch (obu->header.obu_type) { + case AV1_OBU_SEQUENCE_HEADER: + { + err = cbs_av1_read_sequence_header_obu(ctx, &gbc, + &obu->obu.sequence_header); + if (err < 0) + return err; + + if (priv->operating_point >= 0) { + AV1RawSequenceHeader 
*sequence_header = &obu->obu.sequence_header; + + if (priv->operating_point > sequence_header->operating_points_cnt_minus_1) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid Operating Point %d requested. " + "Must not be higher than %u.\n", + priv->operating_point, sequence_header->operating_points_cnt_minus_1); + return AVERROR(EINVAL); + } + priv->operating_point_idc = sequence_header->operating_point_idc[priv->operating_point]; + } + + av_buffer_unref(&priv->sequence_header_ref); + priv->sequence_header = NULL; + + priv->sequence_header_ref = av_buffer_ref(unit->content_ref); + if (!priv->sequence_header_ref) + return AVERROR(ENOMEM); + priv->sequence_header = &obu->obu.sequence_header; + } + break; + case AV1_OBU_TEMPORAL_DELIMITER: + { + err = cbs_av1_read_temporal_delimiter_obu(ctx, &gbc); + if (err < 0) + return err; + } + break; + case AV1_OBU_FRAME_HEADER: + case AV1_OBU_REDUNDANT_FRAME_HEADER: + { + err = cbs_av1_read_frame_header_obu(ctx, &gbc, + &obu->obu.frame_header, + obu->header.obu_type == + AV1_OBU_REDUNDANT_FRAME_HEADER, + unit->data_ref); + if (err < 0) + return err; + } + break; + case AV1_OBU_TILE_GROUP: + { + err = cbs_av1_read_tile_group_obu(ctx, &gbc, + &obu->obu.tile_group); + if (err < 0) + return err; + + err = cbs_av1_ref_tile_data(ctx, unit, &gbc, + &obu->obu.tile_group.tile_data); + if (err < 0) + return err; + } + break; + case AV1_OBU_FRAME: + { + err = cbs_av1_read_frame_obu(ctx, &gbc, &obu->obu.frame, + unit->data_ref); + if (err < 0) + return err; + + err = cbs_av1_ref_tile_data(ctx, unit, &gbc, + &obu->obu.frame.tile_group.tile_data); + if (err < 0) + return err; + } + break; + case AV1_OBU_TILE_LIST: + { + err = cbs_av1_read_tile_list_obu(ctx, &gbc, + &obu->obu.tile_list); + if (err < 0) + return err; + + err = cbs_av1_ref_tile_data(ctx, unit, &gbc, + &obu->obu.tile_list.tile_data); + if (err < 0) + return err; + } + break; + case AV1_OBU_METADATA: + { + err = cbs_av1_read_metadata_obu(ctx, &gbc, &obu->obu.metadata); + if (err < 
0) + return err; + } + break; + case AV1_OBU_PADDING: + { + err = cbs_av1_read_padding_obu(ctx, &gbc, &obu->obu.padding); + if (err < 0) + return err; + } + break; + default: + return AVERROR(ENOSYS); + } + + end_pos = get_bits_count(&gbc); + av_assert0(end_pos <= unit->data_size * 8); + + if (obu->obu_size > 0 && + obu->header.obu_type != AV1_OBU_TILE_GROUP && + obu->header.obu_type != AV1_OBU_TILE_LIST && + obu->header.obu_type != AV1_OBU_FRAME) { + int nb_bits = obu->obu_size * 8 + start_pos - end_pos; + + if (nb_bits <= 0) + return AVERROR_INVALIDDATA; + + err = cbs_av1_read_trailing_bits(ctx, &gbc, nb_bits); + if (err < 0) + return err; + } + + return 0; +} + +static int cbs_av1_write_obu(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit, + PutBitContext *pbc) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + AV1RawOBU *obu = unit->content; + PutBitContext pbc_tmp; + AV1RawTileData *td; + size_t header_size; + int err, start_pos, end_pos, data_pos; + CodedBitstreamAV1Context av1ctx; + + // OBUs in the normal bitstream format must contain a size field + // in every OBU (in annex B it is optional, but we don't support + // writing that). + obu->header.obu_has_size_field = 1; + av1ctx = *priv; + + if (priv->sequence_header_ref) { + av1ctx.sequence_header_ref = av_buffer_ref(priv->sequence_header_ref); + if (!av1ctx.sequence_header_ref) + return AVERROR(ENOMEM); + } + + if (priv->frame_header_ref) { + av1ctx.frame_header_ref = av_buffer_ref(priv->frame_header_ref); + if (!av1ctx.frame_header_ref) { + err = AVERROR(ENOMEM); + goto error; + } + } + + err = cbs_av1_write_obu_header(ctx, pbc, &obu->header); + if (err < 0) + goto error; + + if (obu->header.obu_has_size_field) { + pbc_tmp = *pbc; + // Add space for the size field to fill later. 
+ put_bits32(pbc, 0); + put_bits32(pbc, 0); + } + + td = NULL; + start_pos = put_bits_count(pbc); + + switch (obu->header.obu_type) { + case AV1_OBU_SEQUENCE_HEADER: + { + err = cbs_av1_write_sequence_header_obu(ctx, pbc, + &obu->obu.sequence_header); + if (err < 0) + goto error; + + av_buffer_unref(&priv->sequence_header_ref); + priv->sequence_header = NULL; + + err = ff_cbs_make_unit_refcounted(ctx, unit); + if (err < 0) + goto error; + + priv->sequence_header_ref = av_buffer_ref(unit->content_ref); + if (!priv->sequence_header_ref) { + err = AVERROR(ENOMEM); + goto error; + } + + priv->sequence_header = &obu->obu.sequence_header; + } + break; + case AV1_OBU_TEMPORAL_DELIMITER: + { + err = cbs_av1_write_temporal_delimiter_obu(ctx, pbc); + if (err < 0) + goto error; + } + break; + case AV1_OBU_FRAME_HEADER: + case AV1_OBU_REDUNDANT_FRAME_HEADER: + { + err = cbs_av1_write_frame_header_obu(ctx, pbc, + &obu->obu.frame_header, + obu->header.obu_type == + AV1_OBU_REDUNDANT_FRAME_HEADER, + NULL); + if (err < 0) + goto error; + } + break; + case AV1_OBU_TILE_GROUP: + { + err = cbs_av1_write_tile_group_obu(ctx, pbc, + &obu->obu.tile_group); + if (err < 0) + goto error; + + td = &obu->obu.tile_group.tile_data; + } + break; + case AV1_OBU_FRAME: + { + err = cbs_av1_write_frame_obu(ctx, pbc, &obu->obu.frame, NULL); + if (err < 0) + goto error; + + td = &obu->obu.frame.tile_group.tile_data; + } + break; + case AV1_OBU_TILE_LIST: + { + err = cbs_av1_write_tile_list_obu(ctx, pbc, &obu->obu.tile_list); + if (err < 0) + goto error; + + td = &obu->obu.tile_list.tile_data; + } + break; + case AV1_OBU_METADATA: + { + err = cbs_av1_write_metadata_obu(ctx, pbc, &obu->obu.metadata); + if (err < 0) + goto error; + } + break; + case AV1_OBU_PADDING: + { + err = cbs_av1_write_padding_obu(ctx, pbc, &obu->obu.padding); + if (err < 0) + goto error; + } + break; + default: + err = AVERROR(ENOSYS); + goto error; + } + + end_pos = put_bits_count(pbc); + header_size = (end_pos - start_pos + 7) / 
8; + if (td) { + obu->obu_size = header_size + td->data_size; + } else if (header_size > 0) { + // Add trailing bits and recalculate. + err = cbs_av1_write_trailing_bits(ctx, pbc, 8 - end_pos % 8); + if (err < 0) + goto error; + end_pos = put_bits_count(pbc); + obu->obu_size = header_size = (end_pos - start_pos + 7) / 8; + } else { + // Empty OBU. + obu->obu_size = 0; + } + + end_pos = put_bits_count(pbc); + // Must now be byte-aligned. + av_assert0(end_pos % 8 == 0); + flush_put_bits(pbc); + start_pos /= 8; + end_pos /= 8; + + *pbc = pbc_tmp; + err = cbs_av1_write_leb128(ctx, pbc, "obu_size", obu->obu_size); + if (err < 0) + goto error; + + data_pos = put_bits_count(pbc) / 8; + flush_put_bits(pbc); + av_assert0(data_pos <= start_pos); + + if (8 * obu->obu_size > put_bits_left(pbc)) { + av_buffer_unref(&priv->sequence_header_ref); + av_buffer_unref(&priv->frame_header_ref); + *priv = av1ctx; + + return AVERROR(ENOSPC); + } + + if (obu->obu_size > 0) { + memmove(pbc->buf + data_pos, + pbc->buf + start_pos, header_size); + skip_put_bytes(pbc, header_size); + + if (td) { + memcpy(pbc->buf + data_pos + header_size, + td->data, td->data_size); + skip_put_bytes(pbc, td->data_size); + } + } + + // OBU data must be byte-aligned. 
+ av_assert0(put_bits_count(pbc) % 8 == 0); + err = 0; + +error: + av_buffer_unref(&av1ctx.sequence_header_ref); + av_buffer_unref(&av1ctx.frame_header_ref); + + return err; +} + +static int cbs_av1_assemble_fragment(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag) +{ + size_t size, pos; + int i; + + size = 0; + for (i = 0; i < frag->nb_units; i++) + size += frag->units[i].data_size; + + frag->data_ref = av_buffer_alloc(size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!frag->data_ref) + return AVERROR(ENOMEM); + frag->data = frag->data_ref->data; + memset(frag->data + size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + + pos = 0; + for (i = 0; i < frag->nb_units; i++) { + memcpy(frag->data + pos, frag->units[i].data, + frag->units[i].data_size); + pos += frag->units[i].data_size; + } + av_assert0(pos == size); + frag->data_size = size; + + return 0; +} + +static void cbs_av1_flush(CodedBitstreamContext *ctx) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + + av_buffer_unref(&priv->frame_header_ref); + priv->sequence_header = NULL; + priv->frame_header = NULL; + + memset(priv->ref, 0, sizeof(priv->ref)); + priv->operating_point_idc = 0; + priv->seen_frame_header = 0; + priv->tile_num = 0; +} + +static void cbs_av1_close(CodedBitstreamContext *ctx) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + + av_buffer_unref(&priv->sequence_header_ref); + av_buffer_unref(&priv->frame_header_ref); +} + +static void cbs_av1_free_metadata(void *unit, uint8_t *content) +{ + AV1RawOBU *obu = (AV1RawOBU*)content; + AV1RawMetadata *md; + + av_assert0(obu->header.obu_type == AV1_OBU_METADATA); + md = &obu->obu.metadata; + + switch (md->metadata_type) { + case AV1_METADATA_TYPE_ITUT_T35: + av_buffer_unref(&md->metadata.itut_t35.payload_ref); + break; + } + av_free(content); +} + +static const CodedBitstreamUnitTypeDescriptor cbs_av1_unit_types[] = { + CBS_UNIT_TYPE_POD(AV1_OBU_SEQUENCE_HEADER, AV1RawOBU), + CBS_UNIT_TYPE_POD(AV1_OBU_TEMPORAL_DELIMITER, AV1RawOBU), + 
CBS_UNIT_TYPE_POD(AV1_OBU_FRAME_HEADER, AV1RawOBU), + CBS_UNIT_TYPE_POD(AV1_OBU_REDUNDANT_FRAME_HEADER, AV1RawOBU), + + CBS_UNIT_TYPE_INTERNAL_REF(AV1_OBU_TILE_GROUP, AV1RawOBU, + obu.tile_group.tile_data.data), + CBS_UNIT_TYPE_INTERNAL_REF(AV1_OBU_FRAME, AV1RawOBU, + obu.frame.tile_group.tile_data.data), + CBS_UNIT_TYPE_INTERNAL_REF(AV1_OBU_TILE_LIST, AV1RawOBU, + obu.tile_list.tile_data.data), + CBS_UNIT_TYPE_INTERNAL_REF(AV1_OBU_PADDING, AV1RawOBU, + obu.padding.payload), + + CBS_UNIT_TYPE_COMPLEX(AV1_OBU_METADATA, AV1RawOBU, + &cbs_av1_free_metadata), + + CBS_UNIT_TYPE_END_OF_LIST +}; + +#define OFFSET(x) offsetof(CodedBitstreamAV1Context, x) +static const AVOption cbs_av1_options[] = { + { "operating_point", "Set operating point to select layers to parse from a scalable bitstream", + OFFSET(operating_point), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, AV1_MAX_OPERATING_POINTS - 1, 0 }, + { NULL } +}; + +static const AVClass cbs_av1_class = { + .class_name = "cbs_av1", + .item_name = av_default_item_name, + .option = cbs_av1_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const CodedBitstreamType ff_cbs_type_av1 = { + .codec_id = AV_CODEC_ID_AV1, + + .priv_class = &cbs_av1_class, + .priv_data_size = sizeof(CodedBitstreamAV1Context), + + .unit_types = cbs_av1_unit_types, + + .split_fragment = &cbs_av1_split_fragment, + .read_unit = &cbs_av1_read_unit, + .write_unit = &cbs_av1_write_obu, + .assemble_fragment = &cbs_av1_assemble_fragment, + + .flush = &cbs_av1_flush, + .close = &cbs_av1_close, +}; diff --git a/media/ffvpx/libavcodec/cbs_av1.h b/media/ffvpx/libavcodec/cbs_av1.h new file mode 100644 index 0000000000..1fc80dcfa0 --- /dev/null +++ b/media/ffvpx/libavcodec/cbs_av1.h @@ -0,0 +1,464 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_CBS_AV1_H +#define AVCODEC_CBS_AV1_H + +#include <stddef.h> +#include <stdint.h> + +#include "av1.h" +#include "cbs.h" + + +typedef struct AV1RawOBUHeader { + uint8_t obu_forbidden_bit; + uint8_t obu_type; + uint8_t obu_extension_flag; + uint8_t obu_has_size_field; + uint8_t obu_reserved_1bit; + + uint8_t temporal_id; + uint8_t spatial_id; + uint8_t extension_header_reserved_3bits; +} AV1RawOBUHeader; + +typedef struct AV1RawColorConfig { + uint8_t high_bitdepth; + uint8_t twelve_bit; + uint8_t mono_chrome; + + uint8_t color_description_present_flag; + uint8_t color_primaries; + uint8_t transfer_characteristics; + uint8_t matrix_coefficients; + + uint8_t color_range; + uint8_t subsampling_x; + uint8_t subsampling_y; + uint8_t chroma_sample_position; + uint8_t separate_uv_delta_q; +} AV1RawColorConfig; + +typedef struct AV1RawTimingInfo { + uint32_t num_units_in_display_tick; + uint32_t time_scale; + + uint8_t equal_picture_interval; + uint32_t num_ticks_per_picture_minus_1; +} AV1RawTimingInfo; + +typedef struct AV1RawDecoderModelInfo { + uint8_t buffer_delay_length_minus_1; + uint32_t num_units_in_decoding_tick; + uint8_t buffer_removal_time_length_minus_1; + uint8_t frame_presentation_time_length_minus_1; +} 
AV1RawDecoderModelInfo; + +typedef struct AV1RawSequenceHeader { + uint8_t seq_profile; + uint8_t still_picture; + uint8_t reduced_still_picture_header; + + uint8_t timing_info_present_flag; + uint8_t decoder_model_info_present_flag; + uint8_t initial_display_delay_present_flag; + uint8_t operating_points_cnt_minus_1; + + AV1RawTimingInfo timing_info; + AV1RawDecoderModelInfo decoder_model_info; + + uint16_t operating_point_idc[AV1_MAX_OPERATING_POINTS]; + uint8_t seq_level_idx[AV1_MAX_OPERATING_POINTS]; + uint8_t seq_tier[AV1_MAX_OPERATING_POINTS]; + uint8_t decoder_model_present_for_this_op[AV1_MAX_OPERATING_POINTS]; + uint32_t decoder_buffer_delay[AV1_MAX_OPERATING_POINTS]; + uint32_t encoder_buffer_delay[AV1_MAX_OPERATING_POINTS]; + uint8_t low_delay_mode_flag[AV1_MAX_OPERATING_POINTS]; + uint8_t initial_display_delay_present_for_this_op[AV1_MAX_OPERATING_POINTS]; + uint8_t initial_display_delay_minus_1[AV1_MAX_OPERATING_POINTS]; + + uint8_t frame_width_bits_minus_1; + uint8_t frame_height_bits_minus_1; + uint16_t max_frame_width_minus_1; + uint16_t max_frame_height_minus_1; + + uint8_t frame_id_numbers_present_flag; + uint8_t delta_frame_id_length_minus_2; + uint8_t additional_frame_id_length_minus_1; + + uint8_t use_128x128_superblock; + uint8_t enable_filter_intra; + uint8_t enable_intra_edge_filter; + uint8_t enable_interintra_compound; + uint8_t enable_masked_compound; + uint8_t enable_warped_motion; + uint8_t enable_dual_filter; + + uint8_t enable_order_hint; + uint8_t enable_jnt_comp; + uint8_t enable_ref_frame_mvs; + + uint8_t seq_choose_screen_content_tools; + uint8_t seq_force_screen_content_tools; + uint8_t seq_choose_integer_mv; + uint8_t seq_force_integer_mv; + + uint8_t order_hint_bits_minus_1; + + uint8_t enable_superres; + uint8_t enable_cdef; + uint8_t enable_restoration; + + AV1RawColorConfig color_config; + + uint8_t film_grain_params_present; +} AV1RawSequenceHeader; + +typedef struct AV1RawFilmGrainParams { + uint8_t apply_grain; + uint16_t 
grain_seed; + uint8_t update_grain; + uint8_t film_grain_params_ref_idx; + uint8_t num_y_points; + uint8_t point_y_value[14]; + uint8_t point_y_scaling[14]; + uint8_t chroma_scaling_from_luma; + uint8_t num_cb_points; + uint8_t point_cb_value[10]; + uint8_t point_cb_scaling[10]; + uint8_t num_cr_points; + uint8_t point_cr_value[10]; + uint8_t point_cr_scaling[10]; + uint8_t grain_scaling_minus_8; + uint8_t ar_coeff_lag; + uint8_t ar_coeffs_y_plus_128[24]; + uint8_t ar_coeffs_cb_plus_128[25]; + uint8_t ar_coeffs_cr_plus_128[25]; + uint8_t ar_coeff_shift_minus_6; + uint8_t grain_scale_shift; + uint8_t cb_mult; + uint8_t cb_luma_mult; + uint16_t cb_offset; + uint8_t cr_mult; + uint8_t cr_luma_mult; + uint16_t cr_offset; + uint8_t overlap_flag; + uint8_t clip_to_restricted_range; +} AV1RawFilmGrainParams; + +typedef struct AV1RawFrameHeader { + uint8_t show_existing_frame; + uint8_t frame_to_show_map_idx; + uint32_t frame_presentation_time; + uint32_t display_frame_id; + + uint8_t frame_type; + uint8_t show_frame; + uint8_t showable_frame; + + uint8_t error_resilient_mode; + uint8_t disable_cdf_update; + uint8_t allow_screen_content_tools; + uint8_t force_integer_mv; + + uint32_t current_frame_id; + uint8_t frame_size_override_flag; + uint8_t order_hint; + + uint8_t buffer_removal_time_present_flag; + uint32_t buffer_removal_time[AV1_MAX_OPERATING_POINTS]; + + uint8_t primary_ref_frame; + uint16_t frame_width_minus_1; + uint16_t frame_height_minus_1; + uint8_t use_superres; + uint8_t coded_denom; + uint8_t render_and_frame_size_different; + uint16_t render_width_minus_1; + uint16_t render_height_minus_1; + + uint8_t found_ref[AV1_REFS_PER_FRAME]; + + uint8_t refresh_frame_flags; + uint8_t allow_intrabc; + uint8_t ref_order_hint[AV1_NUM_REF_FRAMES]; + uint8_t frame_refs_short_signaling; + uint8_t last_frame_idx; + uint8_t golden_frame_idx; + int8_t ref_frame_idx[AV1_REFS_PER_FRAME]; + uint32_t delta_frame_id_minus1[AV1_REFS_PER_FRAME]; + + uint8_t 
allow_high_precision_mv; + uint8_t is_filter_switchable; + uint8_t interpolation_filter; + uint8_t is_motion_mode_switchable; + uint8_t use_ref_frame_mvs; + + uint8_t disable_frame_end_update_cdf; + + uint8_t uniform_tile_spacing_flag; + uint8_t tile_cols_log2; + uint8_t tile_rows_log2; + uint8_t width_in_sbs_minus_1[AV1_MAX_TILE_COLS]; + uint8_t height_in_sbs_minus_1[AV1_MAX_TILE_ROWS]; + uint16_t context_update_tile_id; + uint8_t tile_size_bytes_minus1; + + // These are derived values, but it's very unhelpful to have to + // recalculate them all the time so we store them here. + uint16_t tile_cols; + uint16_t tile_rows; + + uint8_t base_q_idx; + int8_t delta_q_y_dc; + uint8_t diff_uv_delta; + int8_t delta_q_u_dc; + int8_t delta_q_u_ac; + int8_t delta_q_v_dc; + int8_t delta_q_v_ac; + uint8_t using_qmatrix; + uint8_t qm_y; + uint8_t qm_u; + uint8_t qm_v; + + uint8_t segmentation_enabled; + uint8_t segmentation_update_map; + uint8_t segmentation_temporal_update; + uint8_t segmentation_update_data; + uint8_t feature_enabled[AV1_MAX_SEGMENTS][AV1_SEG_LVL_MAX]; + int16_t feature_value[AV1_MAX_SEGMENTS][AV1_SEG_LVL_MAX]; + + uint8_t delta_q_present; + uint8_t delta_q_res; + uint8_t delta_lf_present; + uint8_t delta_lf_res; + uint8_t delta_lf_multi; + + uint8_t loop_filter_level[4]; + uint8_t loop_filter_sharpness; + uint8_t loop_filter_delta_enabled; + uint8_t loop_filter_delta_update; + uint8_t update_ref_delta[AV1_TOTAL_REFS_PER_FRAME]; + int8_t loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME]; + uint8_t update_mode_delta[2]; + int8_t loop_filter_mode_deltas[2]; + + uint8_t cdef_damping_minus_3; + uint8_t cdef_bits; + uint8_t cdef_y_pri_strength[8]; + uint8_t cdef_y_sec_strength[8]; + uint8_t cdef_uv_pri_strength[8]; + uint8_t cdef_uv_sec_strength[8]; + + uint8_t lr_type[3]; + uint8_t lr_unit_shift; + uint8_t lr_uv_shift; + + uint8_t tx_mode; + uint8_t reference_select; + uint8_t skip_mode_present; + + uint8_t allow_warped_motion; + uint8_t reduced_tx_set; + + uint8_t 
is_global[AV1_TOTAL_REFS_PER_FRAME]; + uint8_t is_rot_zoom[AV1_TOTAL_REFS_PER_FRAME]; + uint8_t is_translation[AV1_TOTAL_REFS_PER_FRAME]; + //AV1RawSubexp gm_params[AV1_TOTAL_REFS_PER_FRAME][6]; + uint32_t gm_params[AV1_TOTAL_REFS_PER_FRAME][6]; + + AV1RawFilmGrainParams film_grain; +} AV1RawFrameHeader; + +typedef struct AV1RawTileData { + uint8_t *data; + AVBufferRef *data_ref; + size_t data_size; +} AV1RawTileData; + +typedef struct AV1RawTileGroup { + uint8_t tile_start_and_end_present_flag; + uint16_t tg_start; + uint16_t tg_end; + + AV1RawTileData tile_data; +} AV1RawTileGroup; + +typedef struct AV1RawFrame { + AV1RawFrameHeader header; + AV1RawTileGroup tile_group; +} AV1RawFrame; + +typedef struct AV1RawTileList { + uint8_t output_frame_width_in_tiles_minus_1; + uint8_t output_frame_height_in_tiles_minus_1; + uint16_t tile_count_minus_1; + + AV1RawTileData tile_data; +} AV1RawTileList; + +typedef struct AV1RawMetadataHDRCLL { + uint16_t max_cll; + uint16_t max_fall; +} AV1RawMetadataHDRCLL; + +typedef struct AV1RawMetadataHDRMDCV { + uint16_t primary_chromaticity_x[3]; + uint16_t primary_chromaticity_y[3]; + uint16_t white_point_chromaticity_x; + uint16_t white_point_chromaticity_y; + uint32_t luminance_max; + uint32_t luminance_min; +} AV1RawMetadataHDRMDCV; + +typedef struct AV1RawMetadataScalability { + uint8_t scalability_mode_idc; + uint8_t spatial_layers_cnt_minus_1; + uint8_t spatial_layer_dimensions_present_flag; + uint8_t spatial_layer_description_present_flag; + uint8_t temporal_group_description_present_flag; + uint8_t scalability_structure_reserved_3bits; + uint16_t spatial_layer_max_width[4]; + uint16_t spatial_layer_max_height[4]; + uint8_t spatial_layer_ref_id[4]; + uint8_t temporal_group_size; + uint8_t temporal_group_temporal_id[255]; + uint8_t temporal_group_temporal_switching_up_point_flag[255]; + uint8_t temporal_group_spatial_switching_up_point_flag[255]; + uint8_t temporal_group_ref_cnt[255]; + uint8_t 
temporal_group_ref_pic_diff[255][7]; +} AV1RawMetadataScalability; + +typedef struct AV1RawMetadataITUTT35 { + uint8_t itu_t_t35_country_code; + uint8_t itu_t_t35_country_code_extension_byte; + + uint8_t *payload; + AVBufferRef *payload_ref; + size_t payload_size; +} AV1RawMetadataITUTT35; + +typedef struct AV1RawMetadataTimecode { + uint8_t counting_type; + uint8_t full_timestamp_flag; + uint8_t discontinuity_flag; + uint8_t cnt_dropped_flag; + uint16_t n_frames; + uint8_t seconds_value; + uint8_t minutes_value; + uint8_t hours_value; + uint8_t seconds_flag; + uint8_t minutes_flag; + uint8_t hours_flag; + uint8_t time_offset_length; + uint32_t time_offset_value; +} AV1RawMetadataTimecode; + +typedef struct AV1RawMetadata { + uint64_t metadata_type; + union { + AV1RawMetadataHDRCLL hdr_cll; + AV1RawMetadataHDRMDCV hdr_mdcv; + AV1RawMetadataScalability scalability; + AV1RawMetadataITUTT35 itut_t35; + AV1RawMetadataTimecode timecode; + } metadata; +} AV1RawMetadata; + +typedef struct AV1RawPadding { + uint8_t *payload; + AVBufferRef *payload_ref; + size_t payload_size; +} AV1RawPadding; + + +typedef struct AV1RawOBU { + AV1RawOBUHeader header; + + size_t obu_size; + + union { + AV1RawSequenceHeader sequence_header; + AV1RawFrameHeader frame_header; + AV1RawFrame frame; + AV1RawTileGroup tile_group; + AV1RawTileList tile_list; + AV1RawMetadata metadata; + AV1RawPadding padding; + } obu; +} AV1RawOBU; + +typedef struct AV1ReferenceFrameState { + int valid; // RefValid + int frame_id; // RefFrameId + int upscaled_width; // RefUpscaledWidth + int frame_width; // RefFrameWidth + int frame_height; // RefFrameHeight + int render_width; // RefRenderWidth + int render_height; // RefRenderHeight + int frame_type; // RefFrameType + int subsampling_x; // RefSubsamplingX + int subsampling_y; // RefSubsamplingY + int bit_depth; // RefBitDepth + int order_hint; // RefOrderHint + + int8_t loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME]; + int8_t loop_filter_mode_deltas[2]; + 
uint8_t feature_enabled[AV1_MAX_SEGMENTS][AV1_SEG_LVL_MAX]; + int16_t feature_value[AV1_MAX_SEGMENTS][AV1_SEG_LVL_MAX]; +} AV1ReferenceFrameState; + +typedef struct CodedBitstreamAV1Context { + const AVClass *class; + + AV1RawSequenceHeader *sequence_header; + AVBufferRef *sequence_header_ref; + + int seen_frame_header; + AVBufferRef *frame_header_ref; + uint8_t *frame_header; + size_t frame_header_size; + + int temporal_id; + int spatial_id; + int operating_point_idc; + + int bit_depth; + int order_hint; + int frame_width; + int frame_height; + int upscaled_width; + int render_width; + int render_height; + + int num_planes; + int coded_lossless; + int all_lossless; + int tile_cols; + int tile_rows; + int tile_num; + + AV1ReferenceFrameState ref[AV1_NUM_REF_FRAMES]; + + // AVOptions + int operating_point; +} CodedBitstreamAV1Context; + + +#endif /* AVCODEC_CBS_AV1_H */ diff --git a/media/ffvpx/libavcodec/cbs_av1_syntax_template.c b/media/ffvpx/libavcodec/cbs_av1_syntax_template.c new file mode 100644 index 0000000000..e95925a493 --- /dev/null +++ b/media/ffvpx/libavcodec/cbs_av1_syntax_template.c @@ -0,0 +1,2050 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +static int FUNC(obu_header)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawOBUHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + int err; + + HEADER("OBU header"); + + fc(1, obu_forbidden_bit, 0, 0); + + fc(4, obu_type, 0, AV1_OBU_PADDING); + flag(obu_extension_flag); + flag(obu_has_size_field); + + fc(1, obu_reserved_1bit, 0, 0); + + if (current->obu_extension_flag) { + fb(3, temporal_id); + fb(2, spatial_id); + fc(3, extension_header_reserved_3bits, 0, 0); + } else { + infer(temporal_id, 0); + infer(spatial_id, 0); + } + + priv->temporal_id = current->temporal_id; + priv->spatial_id = current->spatial_id; + + return 0; +} + +static int FUNC(trailing_bits)(CodedBitstreamContext *ctx, RWContext *rw, int nb_bits) +{ + int err; + + av_assert0(nb_bits > 0); + + fixed(1, trailing_one_bit, 1); + --nb_bits; + + while (nb_bits > 0) { + fixed(1, trailing_zero_bit, 0); + --nb_bits; + } + + return 0; +} + +static int FUNC(byte_alignment)(CodedBitstreamContext *ctx, RWContext *rw) +{ + int err; + + while (byte_alignment(rw) != 0) + fixed(1, zero_bit, 0); + + return 0; +} + +static int FUNC(color_config)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawColorConfig *current, int seq_profile) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + int err; + + flag(high_bitdepth); + + if (seq_profile == FF_PROFILE_AV1_PROFESSIONAL && + current->high_bitdepth) { + flag(twelve_bit); + priv->bit_depth = current->twelve_bit ? 12 : 10; + } else { + priv->bit_depth = current->high_bitdepth ? 10 : 8; + } + + if (seq_profile == FF_PROFILE_AV1_HIGH) + infer(mono_chrome, 0); + else + flag(mono_chrome); + priv->num_planes = current->mono_chrome ? 
1 : 3; + + flag(color_description_present_flag); + if (current->color_description_present_flag) { + fb(8, color_primaries); + fb(8, transfer_characteristics); + fb(8, matrix_coefficients); + } else { + infer(color_primaries, AVCOL_PRI_UNSPECIFIED); + infer(transfer_characteristics, AVCOL_TRC_UNSPECIFIED); + infer(matrix_coefficients, AVCOL_SPC_UNSPECIFIED); + } + + if (current->mono_chrome) { + flag(color_range); + + infer(subsampling_x, 1); + infer(subsampling_y, 1); + infer(chroma_sample_position, AV1_CSP_UNKNOWN); + infer(separate_uv_delta_q, 0); + + } else if (current->color_primaries == AVCOL_PRI_BT709 && + current->transfer_characteristics == AVCOL_TRC_IEC61966_2_1 && + current->matrix_coefficients == AVCOL_SPC_RGB) { + infer(color_range, 1); + infer(subsampling_x, 0); + infer(subsampling_y, 0); + flag(separate_uv_delta_q); + + } else { + flag(color_range); + + if (seq_profile == FF_PROFILE_AV1_MAIN) { + infer(subsampling_x, 1); + infer(subsampling_y, 1); + } else if (seq_profile == FF_PROFILE_AV1_HIGH) { + infer(subsampling_x, 0); + infer(subsampling_y, 0); + } else { + if (priv->bit_depth == 12) { + fb(1, subsampling_x); + if (current->subsampling_x) + fb(1, subsampling_y); + else + infer(subsampling_y, 0); + } else { + infer(subsampling_x, 1); + infer(subsampling_y, 0); + } + } + if (current->subsampling_x && current->subsampling_y) { + fc(2, chroma_sample_position, AV1_CSP_UNKNOWN, + AV1_CSP_COLOCATED); + } + + flag(separate_uv_delta_q); + } + + return 0; +} + +static int FUNC(timing_info)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawTimingInfo *current) +{ + int err; + + fc(32, num_units_in_display_tick, 1, MAX_UINT_BITS(32)); + fc(32, time_scale, 1, MAX_UINT_BITS(32)); + + flag(equal_picture_interval); + if (current->equal_picture_interval) + uvlc(num_ticks_per_picture_minus_1, 0, MAX_UINT_BITS(32) - 1); + + return 0; +} + +static int FUNC(decoder_model_info)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawDecoderModelInfo *current) +{ + int 
err; + + fb(5, buffer_delay_length_minus_1); + fb(32, num_units_in_decoding_tick); + fb(5, buffer_removal_time_length_minus_1); + fb(5, frame_presentation_time_length_minus_1); + + return 0; +} + +static int FUNC(sequence_header_obu)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawSequenceHeader *current) +{ + int i, err; + + HEADER("Sequence Header"); + + fc(3, seq_profile, FF_PROFILE_AV1_MAIN, + FF_PROFILE_AV1_PROFESSIONAL); + flag(still_picture); + flag(reduced_still_picture_header); + + if (current->reduced_still_picture_header) { + infer(timing_info_present_flag, 0); + infer(decoder_model_info_present_flag, 0); + infer(initial_display_delay_present_flag, 0); + infer(operating_points_cnt_minus_1, 0); + infer(operating_point_idc[0], 0); + + fb(5, seq_level_idx[0]); + + infer(seq_tier[0], 0); + infer(decoder_model_present_for_this_op[0], 0); + infer(initial_display_delay_present_for_this_op[0], 0); + + } else { + flag(timing_info_present_flag); + if (current->timing_info_present_flag) { + CHECK(FUNC(timing_info)(ctx, rw, ¤t->timing_info)); + + flag(decoder_model_info_present_flag); + if (current->decoder_model_info_present_flag) { + CHECK(FUNC(decoder_model_info) + (ctx, rw, ¤t->decoder_model_info)); + } + } else { + infer(decoder_model_info_present_flag, 0); + } + + flag(initial_display_delay_present_flag); + + fb(5, operating_points_cnt_minus_1); + for (i = 0; i <= current->operating_points_cnt_minus_1; i++) { + fbs(12, operating_point_idc[i], 1, i); + fbs(5, seq_level_idx[i], 1, i); + + if (current->seq_level_idx[i] > 7) + flags(seq_tier[i], 1, i); + else + infer(seq_tier[i], 0); + + if (current->decoder_model_info_present_flag) { + flags(decoder_model_present_for_this_op[i], 1, i); + if (current->decoder_model_present_for_this_op[i]) { + int n = current->decoder_model_info.buffer_delay_length_minus_1 + 1; + fbs(n, decoder_buffer_delay[i], 1, i); + fbs(n, encoder_buffer_delay[i], 1, i); + flags(low_delay_mode_flag[i], 1, i); + } + } else { + 
infer(decoder_model_present_for_this_op[i], 0); + } + + if (current->initial_display_delay_present_flag) { + flags(initial_display_delay_present_for_this_op[i], 1, i); + if (current->initial_display_delay_present_for_this_op[i]) + fbs(4, initial_display_delay_minus_1[i], 1, i); + } + } + } + + fb(4, frame_width_bits_minus_1); + fb(4, frame_height_bits_minus_1); + + fb(current->frame_width_bits_minus_1 + 1, max_frame_width_minus_1); + fb(current->frame_height_bits_minus_1 + 1, max_frame_height_minus_1); + + if (current->reduced_still_picture_header) + infer(frame_id_numbers_present_flag, 0); + else + flag(frame_id_numbers_present_flag); + if (current->frame_id_numbers_present_flag) { + fb(4, delta_frame_id_length_minus_2); + fb(3, additional_frame_id_length_minus_1); + } + + flag(use_128x128_superblock); + flag(enable_filter_intra); + flag(enable_intra_edge_filter); + + if (current->reduced_still_picture_header) { + infer(enable_interintra_compound, 0); + infer(enable_masked_compound, 0); + infer(enable_warped_motion, 0); + infer(enable_dual_filter, 0); + infer(enable_order_hint, 0); + infer(enable_jnt_comp, 0); + infer(enable_ref_frame_mvs, 0); + + infer(seq_force_screen_content_tools, + AV1_SELECT_SCREEN_CONTENT_TOOLS); + infer(seq_force_integer_mv, + AV1_SELECT_INTEGER_MV); + } else { + flag(enable_interintra_compound); + flag(enable_masked_compound); + flag(enable_warped_motion); + flag(enable_dual_filter); + + flag(enable_order_hint); + if (current->enable_order_hint) { + flag(enable_jnt_comp); + flag(enable_ref_frame_mvs); + } else { + infer(enable_jnt_comp, 0); + infer(enable_ref_frame_mvs, 0); + } + + flag(seq_choose_screen_content_tools); + if (current->seq_choose_screen_content_tools) + infer(seq_force_screen_content_tools, + AV1_SELECT_SCREEN_CONTENT_TOOLS); + else + fb(1, seq_force_screen_content_tools); + if (current->seq_force_screen_content_tools > 0) { + flag(seq_choose_integer_mv); + if (current->seq_choose_integer_mv) + infer(seq_force_integer_mv, 
+ AV1_SELECT_INTEGER_MV); + else + fb(1, seq_force_integer_mv); + } else { + infer(seq_force_integer_mv, AV1_SELECT_INTEGER_MV); + } + + if (current->enable_order_hint) + fb(3, order_hint_bits_minus_1); + } + + flag(enable_superres); + flag(enable_cdef); + flag(enable_restoration); + + CHECK(FUNC(color_config)(ctx, rw, ¤t->color_config, + current->seq_profile)); + + flag(film_grain_params_present); + + return 0; +} + +static int FUNC(temporal_delimiter_obu)(CodedBitstreamContext *ctx, RWContext *rw) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + + HEADER("Temporal Delimiter"); + + priv->seen_frame_header = 0; + + return 0; +} + +static int FUNC(set_frame_refs)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + static const uint8_t ref_frame_list[AV1_NUM_REF_FRAMES - 2] = { + AV1_REF_FRAME_LAST2, AV1_REF_FRAME_LAST3, AV1_REF_FRAME_BWDREF, + AV1_REF_FRAME_ALTREF2, AV1_REF_FRAME_ALTREF + }; + int8_t ref_frame_idx[AV1_REFS_PER_FRAME], used_frame[AV1_NUM_REF_FRAMES]; + int16_t shifted_order_hints[AV1_NUM_REF_FRAMES]; + int cur_frame_hint, latest_order_hint, earliest_order_hint, ref; + int i, j; + + for (i = 0; i < AV1_REFS_PER_FRAME; i++) + ref_frame_idx[i] = -1; + ref_frame_idx[AV1_REF_FRAME_LAST - AV1_REF_FRAME_LAST] = current->last_frame_idx; + ref_frame_idx[AV1_REF_FRAME_GOLDEN - AV1_REF_FRAME_LAST] = current->golden_frame_idx; + + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) + used_frame[i] = 0; + used_frame[current->last_frame_idx] = 1; + used_frame[current->golden_frame_idx] = 1; + + cur_frame_hint = 1 << (seq->order_hint_bits_minus_1); + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) + shifted_order_hints[i] = cur_frame_hint + + cbs_av1_get_relative_dist(seq, priv->ref[i].order_hint, + priv->order_hint); + + latest_order_hint = shifted_order_hints[current->last_frame_idx]; + earliest_order_hint = 
shifted_order_hints[current->golden_frame_idx]; + + ref = -1; + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + int hint = shifted_order_hints[i]; + if (!used_frame[i] && hint >= cur_frame_hint && + (ref < 0 || hint >= latest_order_hint)) { + ref = i; + latest_order_hint = hint; + } + } + if (ref >= 0) { + ref_frame_idx[AV1_REF_FRAME_ALTREF - AV1_REF_FRAME_LAST] = ref; + used_frame[ref] = 1; + } + + ref = -1; + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + int hint = shifted_order_hints[i]; + if (!used_frame[i] && hint >= cur_frame_hint && + (ref < 0 || hint < earliest_order_hint)) { + ref = i; + earliest_order_hint = hint; + } + } + if (ref >= 0) { + ref_frame_idx[AV1_REF_FRAME_BWDREF - AV1_REF_FRAME_LAST] = ref; + used_frame[ref] = 1; + } + + ref = -1; + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + int hint = shifted_order_hints[i]; + if (!used_frame[i] && hint >= cur_frame_hint && + (ref < 0 || hint < earliest_order_hint)) { + ref = i; + earliest_order_hint = hint; + } + } + if (ref >= 0) { + ref_frame_idx[AV1_REF_FRAME_ALTREF2 - AV1_REF_FRAME_LAST] = ref; + used_frame[ref] = 1; + } + + for (i = 0; i < AV1_REFS_PER_FRAME - 2; i++) { + int ref_frame = ref_frame_list[i]; + if (ref_frame_idx[ref_frame - AV1_REF_FRAME_LAST] < 0 ) { + ref = -1; + for (j = 0; j < AV1_NUM_REF_FRAMES; j++) { + int hint = shifted_order_hints[j]; + if (!used_frame[j] && hint < cur_frame_hint && + (ref < 0 || hint >= latest_order_hint)) { + ref = j; + latest_order_hint = hint; + } + } + if (ref >= 0) { + ref_frame_idx[ref_frame - AV1_REF_FRAME_LAST] = ref; + used_frame[ref] = 1; + } + } + } + + ref = -1; + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + int hint = shifted_order_hints[i]; + if (ref < 0 || hint < earliest_order_hint) { + ref = i; + earliest_order_hint = hint; + } + } + for (i = 0; i < AV1_REFS_PER_FRAME; i++) { + if (ref_frame_idx[i] < 0) + ref_frame_idx[i] = ref; + infer(ref_frame_idx[i], ref_frame_idx[i]); + } + + return 0; +} + +static int 
FUNC(superres_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int denom, err; + + if (seq->enable_superres) + flag(use_superres); + else + infer(use_superres, 0); + + if (current->use_superres) { + fb(3, coded_denom); + denom = current->coded_denom + AV1_SUPERRES_DENOM_MIN; + } else { + denom = AV1_SUPERRES_NUM; + } + + priv->upscaled_width = priv->frame_width; + priv->frame_width = (priv->upscaled_width * AV1_SUPERRES_NUM + + denom / 2) / denom; + + return 0; +} + +static int FUNC(frame_size)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int err; + + if (current->frame_size_override_flag) { + fb(seq->frame_width_bits_minus_1 + 1, frame_width_minus_1); + fb(seq->frame_height_bits_minus_1 + 1, frame_height_minus_1); + } else { + infer(frame_width_minus_1, seq->max_frame_width_minus_1); + infer(frame_height_minus_1, seq->max_frame_height_minus_1); + } + + priv->frame_width = current->frame_width_minus_1 + 1; + priv->frame_height = current->frame_height_minus_1 + 1; + + CHECK(FUNC(superres_params)(ctx, rw, current)); + + return 0; +} + +static int FUNC(render_size)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + int err; + + flag(render_and_frame_size_different); + + if (current->render_and_frame_size_different) { + fb(16, render_width_minus_1); + fb(16, render_height_minus_1); + } else { + infer(render_width_minus_1, current->frame_width_minus_1); + infer(render_height_minus_1, current->frame_height_minus_1); + } + + priv->render_width = current->render_width_minus_1 + 1; + priv->render_height = current->render_height_minus_1 + 1; + + return 0; +} + +static int 
FUNC(frame_size_with_refs)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + int i, err; + + for (i = 0; i < AV1_REFS_PER_FRAME; i++) { + flags(found_ref[i], 1, i); + if (current->found_ref[i]) { + AV1ReferenceFrameState *ref = + &priv->ref[current->ref_frame_idx[i]]; + + if (!ref->valid) { + av_log(ctx->log_ctx, AV_LOG_ERROR, + "Missing reference frame needed for frame size " + "(ref = %d, ref_frame_idx = %d).\n", + i, current->ref_frame_idx[i]); + return AVERROR_INVALIDDATA; + } + + infer(frame_width_minus_1, ref->upscaled_width - 1); + infer(frame_height_minus_1, ref->frame_height - 1); + infer(render_width_minus_1, ref->render_width - 1); + infer(render_height_minus_1, ref->render_height - 1); + + priv->upscaled_width = ref->upscaled_width; + priv->frame_width = priv->upscaled_width; + priv->frame_height = ref->frame_height; + priv->render_width = ref->render_width; + priv->render_height = ref->render_height; + break; + } + } + + if (i >= AV1_REFS_PER_FRAME) { + CHECK(FUNC(frame_size)(ctx, rw, current)); + CHECK(FUNC(render_size)(ctx, rw, current)); + } else { + CHECK(FUNC(superres_params)(ctx, rw, current)); + } + + return 0; +} + +static int FUNC(interpolation_filter)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + int err; + + flag(is_filter_switchable); + if (current->is_filter_switchable) + infer(interpolation_filter, + AV1_INTERPOLATION_FILTER_SWITCHABLE); + else + fb(2, interpolation_filter); + + return 0; +} + +static int FUNC(tile_info)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int mi_cols, mi_rows, sb_cols, sb_rows, sb_shift, sb_size; + int max_tile_width_sb, max_tile_height_sb, max_tile_area_sb; + int min_log2_tile_cols, max_log2_tile_cols, max_log2_tile_rows; + int min_log2_tiles, 
min_log2_tile_rows; + int i, err; + + mi_cols = 2 * ((priv->frame_width + 7) >> 3); + mi_rows = 2 * ((priv->frame_height + 7) >> 3); + + sb_cols = seq->use_128x128_superblock ? ((mi_cols + 31) >> 5) + : ((mi_cols + 15) >> 4); + sb_rows = seq->use_128x128_superblock ? ((mi_rows + 31) >> 5) + : ((mi_rows + 15) >> 4); + + sb_shift = seq->use_128x128_superblock ? 5 : 4; + sb_size = sb_shift + 2; + + max_tile_width_sb = AV1_MAX_TILE_WIDTH >> sb_size; + max_tile_area_sb = AV1_MAX_TILE_AREA >> (2 * sb_size); + + min_log2_tile_cols = cbs_av1_tile_log2(max_tile_width_sb, sb_cols); + max_log2_tile_cols = cbs_av1_tile_log2(1, FFMIN(sb_cols, AV1_MAX_TILE_COLS)); + max_log2_tile_rows = cbs_av1_tile_log2(1, FFMIN(sb_rows, AV1_MAX_TILE_ROWS)); + min_log2_tiles = FFMAX(min_log2_tile_cols, + cbs_av1_tile_log2(max_tile_area_sb, sb_rows * sb_cols)); + + flag(uniform_tile_spacing_flag); + + if (current->uniform_tile_spacing_flag) { + int tile_width_sb, tile_height_sb; + + increment(tile_cols_log2, min_log2_tile_cols, max_log2_tile_cols); + + tile_width_sb = (sb_cols + (1 << current->tile_cols_log2) - 1) >> + current->tile_cols_log2; + current->tile_cols = (sb_cols + tile_width_sb - 1) / tile_width_sb; + + min_log2_tile_rows = FFMAX(min_log2_tiles - current->tile_cols_log2, 0); + + increment(tile_rows_log2, min_log2_tile_rows, max_log2_tile_rows); + + tile_height_sb = (sb_rows + (1 << current->tile_rows_log2) - 1) >> + current->tile_rows_log2; + current->tile_rows = (sb_rows + tile_height_sb - 1) / tile_height_sb; + + for (i = 0; i < current->tile_cols - 1; i++) + infer(width_in_sbs_minus_1[i], tile_width_sb - 1); + infer(width_in_sbs_minus_1[i], + sb_cols - (current->tile_cols - 1) * tile_width_sb - 1); + for (i = 0; i < current->tile_rows - 1; i++) + infer(height_in_sbs_minus_1[i], tile_height_sb - 1); + infer(height_in_sbs_minus_1[i], + sb_rows - (current->tile_rows - 1) * tile_height_sb - 1); + + } else { + int widest_tile_sb, start_sb, size_sb, max_width, max_height; + + 
widest_tile_sb = 0; + + start_sb = 0; + for (i = 0; start_sb < sb_cols && i < AV1_MAX_TILE_COLS; i++) { + max_width = FFMIN(sb_cols - start_sb, max_tile_width_sb); + ns(max_width, width_in_sbs_minus_1[i], 1, i); + size_sb = current->width_in_sbs_minus_1[i] + 1; + widest_tile_sb = FFMAX(size_sb, widest_tile_sb); + start_sb += size_sb; + } + current->tile_cols_log2 = cbs_av1_tile_log2(1, i); + current->tile_cols = i; + + if (min_log2_tiles > 0) + max_tile_area_sb = (sb_rows * sb_cols) >> (min_log2_tiles + 1); + else + max_tile_area_sb = sb_rows * sb_cols; + max_tile_height_sb = FFMAX(max_tile_area_sb / widest_tile_sb, 1); + + start_sb = 0; + for (i = 0; start_sb < sb_rows && i < AV1_MAX_TILE_ROWS; i++) { + max_height = FFMIN(sb_rows - start_sb, max_tile_height_sb); + ns(max_height, height_in_sbs_minus_1[i], 1, i); + size_sb = current->height_in_sbs_minus_1[i] + 1; + start_sb += size_sb; + } + current->tile_rows_log2 = cbs_av1_tile_log2(1, i); + current->tile_rows = i; + } + + if (current->tile_cols_log2 > 0 || + current->tile_rows_log2 > 0) { + fb(current->tile_cols_log2 + current->tile_rows_log2, + context_update_tile_id); + fb(2, tile_size_bytes_minus1); + } else { + infer(context_update_tile_id, 0); + } + + priv->tile_cols = current->tile_cols; + priv->tile_rows = current->tile_rows; + + return 0; +} + +static int FUNC(quantization_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int err; + + fb(8, base_q_idx); + + delta_q(delta_q_y_dc); + + if (priv->num_planes > 1) { + if (seq->color_config.separate_uv_delta_q) + flag(diff_uv_delta); + else + infer(diff_uv_delta, 0); + + delta_q(delta_q_u_dc); + delta_q(delta_q_u_ac); + + if (current->diff_uv_delta) { + delta_q(delta_q_v_dc); + delta_q(delta_q_v_ac); + } else { + infer(delta_q_v_dc, current->delta_q_u_dc); + infer(delta_q_v_ac, current->delta_q_u_ac); + } + } else { + 
infer(delta_q_u_dc, 0); + infer(delta_q_u_ac, 0); + infer(delta_q_v_dc, 0); + infer(delta_q_v_ac, 0); + } + + flag(using_qmatrix); + if (current->using_qmatrix) { + fb(4, qm_y); + fb(4, qm_u); + if (seq->color_config.separate_uv_delta_q) + fb(4, qm_v); + else + infer(qm_v, current->qm_u); + } + + return 0; +} + +static int FUNC(segmentation_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + static const uint8_t bits[AV1_SEG_LVL_MAX] = { 8, 6, 6, 6, 6, 3, 0, 0 }; + static const uint8_t sign[AV1_SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0, 0 }; + static const uint8_t default_feature_enabled[AV1_SEG_LVL_MAX] = { 0 }; + static const int16_t default_feature_value[AV1_SEG_LVL_MAX] = { 0 }; + int i, j, err; + + flag(segmentation_enabled); + + if (current->segmentation_enabled) { + if (current->primary_ref_frame == AV1_PRIMARY_REF_NONE) { + infer(segmentation_update_map, 1); + infer(segmentation_temporal_update, 0); + infer(segmentation_update_data, 1); + } else { + flag(segmentation_update_map); + if (current->segmentation_update_map) + flag(segmentation_temporal_update); + else + infer(segmentation_temporal_update, 0); + flag(segmentation_update_data); + } + + for (i = 0; i < AV1_MAX_SEGMENTS; i++) { + const uint8_t *ref_feature_enabled; + const int16_t *ref_feature_value; + + if (current->primary_ref_frame == AV1_PRIMARY_REF_NONE) { + ref_feature_enabled = default_feature_enabled; + ref_feature_value = default_feature_value; + } else { + ref_feature_enabled = + priv->ref[current->ref_frame_idx[current->primary_ref_frame]].feature_enabled[i]; + ref_feature_value = + priv->ref[current->ref_frame_idx[current->primary_ref_frame]].feature_value[i]; + } + + for (j = 0; j < AV1_SEG_LVL_MAX; j++) { + if (current->segmentation_update_data) { + flags(feature_enabled[i][j], 2, i, j); + + if (current->feature_enabled[i][j] && bits[j] > 0) { + if (sign[j]) + sus(1 + bits[j], feature_value[i][j], 2, i, j); 
+ else + fbs(bits[j], feature_value[i][j], 2, i, j); + } else { + infer(feature_value[i][j], 0); + } + } else { + infer(feature_enabled[i][j], ref_feature_enabled[j]); + infer(feature_value[i][j], ref_feature_value[j]); + } + } + } + } else { + for (i = 0; i < AV1_MAX_SEGMENTS; i++) { + for (j = 0; j < AV1_SEG_LVL_MAX; j++) { + infer(feature_enabled[i][j], 0); + infer(feature_value[i][j], 0); + } + } + } + + return 0; +} + +static int FUNC(delta_q_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + int err; + + if (current->base_q_idx > 0) + flag(delta_q_present); + else + infer(delta_q_present, 0); + + if (current->delta_q_present) + fb(2, delta_q_res); + + return 0; +} + +static int FUNC(delta_lf_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + int err; + + if (current->delta_q_present) { + if (!current->allow_intrabc) + flag(delta_lf_present); + else + infer(delta_lf_present, 0); + if (current->delta_lf_present) { + fb(2, delta_lf_res); + flag(delta_lf_multi); + } else { + infer(delta_lf_res, 0); + infer(delta_lf_multi, 0); + } + } else { + infer(delta_lf_present, 0); + infer(delta_lf_res, 0); + infer(delta_lf_multi, 0); + } + + return 0; +} + +static int FUNC(loop_filter_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + static const int8_t default_loop_filter_ref_deltas[AV1_TOTAL_REFS_PER_FRAME] = + { 1, 0, 0, 0, -1, 0, -1, -1 }; + static const int8_t default_loop_filter_mode_deltas[2] = { 0, 0 }; + int i, err; + + if (priv->coded_lossless || current->allow_intrabc) { + infer(loop_filter_level[0], 0); + infer(loop_filter_level[1], 0); + infer(loop_filter_ref_deltas[AV1_REF_FRAME_INTRA], 1); + infer(loop_filter_ref_deltas[AV1_REF_FRAME_LAST], 0); + infer(loop_filter_ref_deltas[AV1_REF_FRAME_LAST2], 0); + infer(loop_filter_ref_deltas[AV1_REF_FRAME_LAST3], 0); + 
infer(loop_filter_ref_deltas[AV1_REF_FRAME_BWDREF], 0); + infer(loop_filter_ref_deltas[AV1_REF_FRAME_GOLDEN], -1); + infer(loop_filter_ref_deltas[AV1_REF_FRAME_ALTREF], -1); + infer(loop_filter_ref_deltas[AV1_REF_FRAME_ALTREF2], -1); + for (i = 0; i < 2; i++) + infer(loop_filter_mode_deltas[i], 0); + return 0; + } + + fb(6, loop_filter_level[0]); + fb(6, loop_filter_level[1]); + + if (priv->num_planes > 1) { + if (current->loop_filter_level[0] || + current->loop_filter_level[1]) { + fb(6, loop_filter_level[2]); + fb(6, loop_filter_level[3]); + } + } + + fb(3, loop_filter_sharpness); + + flag(loop_filter_delta_enabled); + if (current->loop_filter_delta_enabled) { + const int8_t *ref_loop_filter_ref_deltas, *ref_loop_filter_mode_deltas; + + if (current->primary_ref_frame == AV1_PRIMARY_REF_NONE) { + ref_loop_filter_ref_deltas = default_loop_filter_ref_deltas; + ref_loop_filter_mode_deltas = default_loop_filter_mode_deltas; + } else { + ref_loop_filter_ref_deltas = + priv->ref[current->ref_frame_idx[current->primary_ref_frame]].loop_filter_ref_deltas; + ref_loop_filter_mode_deltas = + priv->ref[current->ref_frame_idx[current->primary_ref_frame]].loop_filter_mode_deltas; + } + + flag(loop_filter_delta_update); + for (i = 0; i < AV1_TOTAL_REFS_PER_FRAME; i++) { + if (current->loop_filter_delta_update) + flags(update_ref_delta[i], 1, i); + else + infer(update_ref_delta[i], 0); + if (current->update_ref_delta[i]) + sus(1 + 6, loop_filter_ref_deltas[i], 1, i); + else + infer(loop_filter_ref_deltas[i], ref_loop_filter_ref_deltas[i]); + } + for (i = 0; i < 2; i++) { + if (current->loop_filter_delta_update) + flags(update_mode_delta[i], 1, i); + else + infer(update_mode_delta[i], 0); + if (current->update_mode_delta[i]) + sus(1 + 6, loop_filter_mode_deltas[i], 1, i); + else + infer(loop_filter_mode_deltas[i], ref_loop_filter_mode_deltas[i]); + } + } else { + for (i = 0; i < AV1_TOTAL_REFS_PER_FRAME; i++) + infer(loop_filter_ref_deltas[i], default_loop_filter_ref_deltas[i]); + 
for (i = 0; i < 2; i++) + infer(loop_filter_mode_deltas[i], default_loop_filter_mode_deltas[i]); + } + + return 0; +} + +static int FUNC(cdef_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int i, err; + + if (priv->coded_lossless || current->allow_intrabc || + !seq->enable_cdef) { + infer(cdef_damping_minus_3, 0); + infer(cdef_bits, 0); + infer(cdef_y_pri_strength[0], 0); + infer(cdef_y_sec_strength[0], 0); + infer(cdef_uv_pri_strength[0], 0); + infer(cdef_uv_sec_strength[0], 0); + + return 0; + } + + fb(2, cdef_damping_minus_3); + fb(2, cdef_bits); + + for (i = 0; i < (1 << current->cdef_bits); i++) { + fbs(4, cdef_y_pri_strength[i], 1, i); + fbs(2, cdef_y_sec_strength[i], 1, i); + + if (priv->num_planes > 1) { + fbs(4, cdef_uv_pri_strength[i], 1, i); + fbs(2, cdef_uv_sec_strength[i], 1, i); + } + } + + return 0; +} + +static int FUNC(lr_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int uses_lr, uses_chroma_lr; + int i, err; + + if (priv->all_lossless || current->allow_intrabc || + !seq->enable_restoration) { + return 0; + } + + uses_lr = uses_chroma_lr = 0; + for (i = 0; i < priv->num_planes; i++) { + fbs(2, lr_type[i], 1, i); + + if (current->lr_type[i] != AV1_RESTORE_NONE) { + uses_lr = 1; + if (i > 0) + uses_chroma_lr = 1; + } + } + + if (uses_lr) { + if (seq->use_128x128_superblock) + increment(lr_unit_shift, 1, 2); + else + increment(lr_unit_shift, 0, 2); + + if(seq->color_config.subsampling_x && + seq->color_config.subsampling_y && uses_chroma_lr) { + fb(1, lr_uv_shift); + } else { + infer(lr_uv_shift, 0); + } + } + + return 0; +} + +static int FUNC(read_tx_mode)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context 
*priv = ctx->priv_data; + int err; + + if (priv->coded_lossless) + infer(tx_mode, 0); + else + increment(tx_mode, 1, 2); + + return 0; +} + +static int FUNC(frame_reference_mode)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + int err; + + if (current->frame_type == AV1_FRAME_INTRA_ONLY || + current->frame_type == AV1_FRAME_KEY) + infer(reference_select, 0); + else + flag(reference_select); + + return 0; +} + +static int FUNC(skip_mode_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int skip_mode_allowed; + int err; + + if (current->frame_type == AV1_FRAME_KEY || + current->frame_type == AV1_FRAME_INTRA_ONLY || + !current->reference_select || !seq->enable_order_hint) { + skip_mode_allowed = 0; + } else { + int forward_idx, backward_idx; + int forward_hint, backward_hint; + int ref_hint, dist, i; + + forward_idx = -1; + backward_idx = -1; + for (i = 0; i < AV1_REFS_PER_FRAME; i++) { + ref_hint = priv->ref[current->ref_frame_idx[i]].order_hint; + dist = cbs_av1_get_relative_dist(seq, ref_hint, + priv->order_hint); + if (dist < 0) { + if (forward_idx < 0 || + cbs_av1_get_relative_dist(seq, ref_hint, + forward_hint) > 0) { + forward_idx = i; + forward_hint = ref_hint; + } + } else if (dist > 0) { + if (backward_idx < 0 || + cbs_av1_get_relative_dist(seq, ref_hint, + backward_hint) < 0) { + backward_idx = i; + backward_hint = ref_hint; + } + } + } + + if (forward_idx < 0) { + skip_mode_allowed = 0; + } else if (backward_idx >= 0) { + skip_mode_allowed = 1; + // Frames for skip mode are forward_idx and backward_idx. 
+ } else { + int second_forward_idx; + int second_forward_hint; + + second_forward_idx = -1; + for (i = 0; i < AV1_REFS_PER_FRAME; i++) { + ref_hint = priv->ref[current->ref_frame_idx[i]].order_hint; + if (cbs_av1_get_relative_dist(seq, ref_hint, + forward_hint) < 0) { + if (second_forward_idx < 0 || + cbs_av1_get_relative_dist(seq, ref_hint, + second_forward_hint) > 0) { + second_forward_idx = i; + second_forward_hint = ref_hint; + } + } + } + + if (second_forward_idx < 0) { + skip_mode_allowed = 0; + } else { + skip_mode_allowed = 1; + // Frames for skip mode are forward_idx and second_forward_idx. + } + } + } + + if (skip_mode_allowed) + flag(skip_mode_present); + else + infer(skip_mode_present, 0); + + return 0; +} + +static int FUNC(global_motion_param)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current, + int type, int ref, int idx) +{ + uint32_t abs_bits, prec_bits, num_syms; + int err; + + if (idx < 2) { + if (type == AV1_WARP_MODEL_TRANSLATION) { + abs_bits = AV1_GM_ABS_TRANS_ONLY_BITS - !current->allow_high_precision_mv; + prec_bits = AV1_GM_TRANS_ONLY_PREC_BITS - !current->allow_high_precision_mv; + } else { + abs_bits = AV1_GM_ABS_TRANS_BITS; + prec_bits = AV1_GM_TRANS_PREC_BITS; + } + } else { + abs_bits = AV1_GM_ABS_ALPHA_BITS; + prec_bits = AV1_GM_ALPHA_PREC_BITS; + } + + num_syms = 2 * (1 << abs_bits) + 1; + subexp(gm_params[ref][idx], num_syms, 2, ref, idx); + + // Actual gm_params value is not reconstructed here. 
+ (void)prec_bits; + + return 0; +} + +static int FUNC(global_motion_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + int ref, type; + int err; + + if (current->frame_type == AV1_FRAME_KEY || + current->frame_type == AV1_FRAME_INTRA_ONLY) + return 0; + + for (ref = AV1_REF_FRAME_LAST; ref <= AV1_REF_FRAME_ALTREF; ref++) { + flags(is_global[ref], 1, ref); + if (current->is_global[ref]) { + flags(is_rot_zoom[ref], 1, ref); + if (current->is_rot_zoom[ref]) { + type = AV1_WARP_MODEL_ROTZOOM; + } else { + flags(is_translation[ref], 1, ref); + type = current->is_translation[ref] ? AV1_WARP_MODEL_TRANSLATION + : AV1_WARP_MODEL_AFFINE; + } + } else { + type = AV1_WARP_MODEL_IDENTITY; + } + + if (type >= AV1_WARP_MODEL_ROTZOOM) { + CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 2)); + CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 3)); + if (type == AV1_WARP_MODEL_AFFINE) { + CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 4)); + CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 5)); + } else { + // gm_params[ref][4] = -gm_params[ref][3] + // gm_params[ref][5] = gm_params[ref][2] + } + } + if (type >= AV1_WARP_MODEL_TRANSLATION) { + CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 0)); + CHECK(FUNC(global_motion_param)(ctx, rw, current, type, ref, 1)); + } + } + + return 0; +} + +static int FUNC(film_grain_params)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFilmGrainParams *current, + AV1RawFrameHeader *frame_header) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq = priv->sequence_header; + int num_pos_luma, num_pos_chroma; + int i, err; + + if (!seq->film_grain_params_present || + (!frame_header->show_frame && !frame_header->showable_frame)) + return 0; + + flag(apply_grain); + + if (!current->apply_grain) + return 0; + + fb(16, grain_seed); + + if (frame_header->frame_type == AV1_FRAME_INTER) + flag(update_grain); + else + 
infer(update_grain, 1); + + if (!current->update_grain) { + fb(3, film_grain_params_ref_idx); + return 0; + } + + fc(4, num_y_points, 0, 14); + for (i = 0; i < current->num_y_points; i++) { + fcs(8, point_y_value[i], + i ? current->point_y_value[i - 1] + 1 : 0, + MAX_UINT_BITS(8) - (current->num_y_points - i - 1), + 1, i); + fbs(8, point_y_scaling[i], 1, i); + } + + if (seq->color_config.mono_chrome) + infer(chroma_scaling_from_luma, 0); + else + flag(chroma_scaling_from_luma); + + if (seq->color_config.mono_chrome || + current->chroma_scaling_from_luma || + (seq->color_config.subsampling_x == 1 && + seq->color_config.subsampling_y == 1 && + current->num_y_points == 0)) { + infer(num_cb_points, 0); + infer(num_cr_points, 0); + } else { + fc(4, num_cb_points, 0, 10); + for (i = 0; i < current->num_cb_points; i++) { + fcs(8, point_cb_value[i], + i ? current->point_cb_value[i - 1] + 1 : 0, + MAX_UINT_BITS(8) - (current->num_cb_points - i - 1), + 1, i); + fbs(8, point_cb_scaling[i], 1, i); + } + fc(4, num_cr_points, 0, 10); + for (i = 0; i < current->num_cr_points; i++) { + fcs(8, point_cr_value[i], + i ? 
current->point_cr_value[i - 1] + 1 : 0, + MAX_UINT_BITS(8) - (current->num_cr_points - i - 1), + 1, i); + fbs(8, point_cr_scaling[i], 1, i); + } + } + + fb(2, grain_scaling_minus_8); + fb(2, ar_coeff_lag); + num_pos_luma = 2 * current->ar_coeff_lag * (current->ar_coeff_lag + 1); + if (current->num_y_points) { + num_pos_chroma = num_pos_luma + 1; + for (i = 0; i < num_pos_luma; i++) + fbs(8, ar_coeffs_y_plus_128[i], 1, i); + } else { + num_pos_chroma = num_pos_luma; + } + if (current->chroma_scaling_from_luma || current->num_cb_points) { + for (i = 0; i < num_pos_chroma; i++) + fbs(8, ar_coeffs_cb_plus_128[i], 1, i); + } + if (current->chroma_scaling_from_luma || current->num_cr_points) { + for (i = 0; i < num_pos_chroma; i++) + fbs(8, ar_coeffs_cr_plus_128[i], 1, i); + } + fb(2, ar_coeff_shift_minus_6); + fb(2, grain_scale_shift); + if (current->num_cb_points) { + fb(8, cb_mult); + fb(8, cb_luma_mult); + fb(9, cb_offset); + } + if (current->num_cr_points) { + fb(8, cr_mult); + fb(8, cr_luma_mult); + fb(9, cr_offset); + } + + flag(overlap_flag); + flag(clip_to_restricted_range); + + return 0; +} + +static int FUNC(uncompressed_header)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq; + int id_len, diff_len, all_frames, frame_is_intra, order_hint_bits; + int i, err; + + if (!priv->sequence_header) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "No sequence header available: " + "unable to decode frame header.\n"); + return AVERROR_INVALIDDATA; + } + seq = priv->sequence_header; + + id_len = seq->additional_frame_id_length_minus_1 + + seq->delta_frame_id_length_minus_2 + 3; + all_frames = (1 << AV1_NUM_REF_FRAMES) - 1; + + if (seq->reduced_still_picture_header) { + infer(show_existing_frame, 0); + infer(frame_type, AV1_FRAME_KEY); + infer(show_frame, 1); + infer(showable_frame, 0); + frame_is_intra = 1; + + } else { + flag(show_existing_frame); + + if 
(current->show_existing_frame) { + AV1ReferenceFrameState *ref; + + fb(3, frame_to_show_map_idx); + ref = &priv->ref[current->frame_to_show_map_idx]; + + if (!ref->valid) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Missing reference frame needed for " + "show_existing_frame (frame_to_show_map_idx = %d).\n", + current->frame_to_show_map_idx); + return AVERROR_INVALIDDATA; + } + + if (seq->decoder_model_info_present_flag && + !seq->timing_info.equal_picture_interval) { + fb(seq->decoder_model_info.frame_presentation_time_length_minus_1 + 1, + frame_presentation_time); + } + + if (seq->frame_id_numbers_present_flag) + fb(id_len, display_frame_id); + + infer(frame_type, ref->frame_type); + if (current->frame_type == AV1_FRAME_KEY) { + infer(refresh_frame_flags, all_frames); + + // Section 7.21 + infer(current_frame_id, ref->frame_id); + priv->upscaled_width = ref->upscaled_width; + priv->frame_width = ref->frame_width; + priv->frame_height = ref->frame_height; + priv->render_width = ref->render_width; + priv->render_height = ref->render_height; + priv->bit_depth = ref->bit_depth; + priv->order_hint = ref->order_hint; + } else + infer(refresh_frame_flags, 0); + + infer(frame_width_minus_1, ref->upscaled_width - 1); + infer(frame_height_minus_1, ref->frame_height - 1); + infer(render_width_minus_1, ref->render_width - 1); + infer(render_height_minus_1, ref->render_height - 1); + + // Section 7.20 + goto update_refs; + } + + fb(2, frame_type); + frame_is_intra = (current->frame_type == AV1_FRAME_INTRA_ONLY || + current->frame_type == AV1_FRAME_KEY); + + flag(show_frame); + if (current->show_frame && + seq->decoder_model_info_present_flag && + !seq->timing_info.equal_picture_interval) { + fb(seq->decoder_model_info.frame_presentation_time_length_minus_1 + 1, + frame_presentation_time); + } + if (current->show_frame) + infer(showable_frame, current->frame_type != AV1_FRAME_KEY); + else + flag(showable_frame); + + if (current->frame_type == AV1_FRAME_SWITCH || + 
(current->frame_type == AV1_FRAME_KEY && current->show_frame)) + infer(error_resilient_mode, 1); + else + flag(error_resilient_mode); + } + + if (current->frame_type == AV1_FRAME_KEY && current->show_frame) { + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + priv->ref[i].valid = 0; + priv->ref[i].order_hint = 0; + } + } + + flag(disable_cdf_update); + + if (seq->seq_force_screen_content_tools == + AV1_SELECT_SCREEN_CONTENT_TOOLS) { + flag(allow_screen_content_tools); + } else { + infer(allow_screen_content_tools, + seq->seq_force_screen_content_tools); + } + if (current->allow_screen_content_tools) { + if (seq->seq_force_integer_mv == AV1_SELECT_INTEGER_MV) + flag(force_integer_mv); + else + infer(force_integer_mv, seq->seq_force_integer_mv); + } else { + infer(force_integer_mv, 0); + } + + if (seq->frame_id_numbers_present_flag) { + fb(id_len, current_frame_id); + + diff_len = seq->delta_frame_id_length_minus_2 + 2; + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + if (current->current_frame_id > (1 << diff_len)) { + if (priv->ref[i].frame_id > current->current_frame_id || + priv->ref[i].frame_id < (current->current_frame_id - + (1 << diff_len))) + priv->ref[i].valid = 0; + } else { + if (priv->ref[i].frame_id > current->current_frame_id && + priv->ref[i].frame_id < ((1 << id_len) + + current->current_frame_id - + (1 << diff_len))) + priv->ref[i].valid = 0; + } + } + } else { + infer(current_frame_id, 0); + } + + if (current->frame_type == AV1_FRAME_SWITCH) + infer(frame_size_override_flag, 1); + else if(seq->reduced_still_picture_header) + infer(frame_size_override_flag, 0); + else + flag(frame_size_override_flag); + + order_hint_bits = + seq->enable_order_hint ? 
seq->order_hint_bits_minus_1 + 1 : 0; + if (order_hint_bits > 0) + fb(order_hint_bits, order_hint); + else + infer(order_hint, 0); + priv->order_hint = current->order_hint; + + if (frame_is_intra || current->error_resilient_mode) + infer(primary_ref_frame, AV1_PRIMARY_REF_NONE); + else + fb(3, primary_ref_frame); + + if (seq->decoder_model_info_present_flag) { + flag(buffer_removal_time_present_flag); + if (current->buffer_removal_time_present_flag) { + for (i = 0; i <= seq->operating_points_cnt_minus_1; i++) { + if (seq->decoder_model_present_for_this_op[i]) { + int op_pt_idc = seq->operating_point_idc[i]; + int in_temporal_layer = (op_pt_idc >> priv->temporal_id ) & 1; + int in_spatial_layer = (op_pt_idc >> (priv->spatial_id + 8)) & 1; + if (seq->operating_point_idc[i] == 0 || + (in_temporal_layer && in_spatial_layer)) { + fbs(seq->decoder_model_info.buffer_removal_time_length_minus_1 + 1, + buffer_removal_time[i], 1, i); + } + } + } + } + } + + if (current->frame_type == AV1_FRAME_SWITCH || + (current->frame_type == AV1_FRAME_KEY && current->show_frame)) + infer(refresh_frame_flags, all_frames); + else + fb(8, refresh_frame_flags); + + if (!frame_is_intra || current->refresh_frame_flags != all_frames) { + if (seq->enable_order_hint) { + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + if (current->error_resilient_mode) + fbs(order_hint_bits, ref_order_hint[i], 1, i); + else + infer(ref_order_hint[i], priv->ref[i].order_hint); + if (current->ref_order_hint[i] != priv->ref[i].order_hint) + priv->ref[i].valid = 0; + } + } + } + + if (current->frame_type == AV1_FRAME_KEY || + current->frame_type == AV1_FRAME_INTRA_ONLY) { + CHECK(FUNC(frame_size)(ctx, rw, current)); + CHECK(FUNC(render_size)(ctx, rw, current)); + + if (current->allow_screen_content_tools && + priv->upscaled_width == priv->frame_width) + flag(allow_intrabc); + else + infer(allow_intrabc, 0); + + } else { + if (!seq->enable_order_hint) { + infer(frame_refs_short_signaling, 0); + } else { + 
flag(frame_refs_short_signaling); + if (current->frame_refs_short_signaling) { + fb(3, last_frame_idx); + fb(3, golden_frame_idx); + CHECK(FUNC(set_frame_refs)(ctx, rw, current)); + } + } + + for (i = 0; i < AV1_REFS_PER_FRAME; i++) { + if (!current->frame_refs_short_signaling) + fbs(3, ref_frame_idx[i], 1, i); + if (seq->frame_id_numbers_present_flag) { + fbs(seq->delta_frame_id_length_minus_2 + 2, + delta_frame_id_minus1[i], 1, i); + } + } + + if (current->frame_size_override_flag && + !current->error_resilient_mode) { + CHECK(FUNC(frame_size_with_refs)(ctx, rw, current)); + } else { + CHECK(FUNC(frame_size)(ctx, rw, current)); + CHECK(FUNC(render_size)(ctx, rw, current)); + } + + if (current->force_integer_mv) + infer(allow_high_precision_mv, 0); + else + flag(allow_high_precision_mv); + + CHECK(FUNC(interpolation_filter)(ctx, rw, current)); + + flag(is_motion_mode_switchable); + + if (current->error_resilient_mode || + !seq->enable_ref_frame_mvs) + infer(use_ref_frame_mvs, 0); + else + flag(use_ref_frame_mvs); + + infer(allow_intrabc, 0); + } + + if (!frame_is_intra) { + // Derive reference frame sign biases. + } + + if (seq->reduced_still_picture_header || current->disable_cdf_update) + infer(disable_frame_end_update_cdf, 1); + else + flag(disable_frame_end_update_cdf); + + if (current->primary_ref_frame == AV1_PRIMARY_REF_NONE) { + // Init non-coeff CDFs. + // Setup past independence. + } else { + // Load CDF tables from previous frame. + // Load params from previous frame. + } + + if (current->use_ref_frame_mvs) { + // Perform motion field estimation process. + } + + CHECK(FUNC(tile_info)(ctx, rw, current)); + + CHECK(FUNC(quantization_params)(ctx, rw, current)); + + CHECK(FUNC(segmentation_params)(ctx, rw, current)); + + CHECK(FUNC(delta_q_params)(ctx, rw, current)); + + CHECK(FUNC(delta_lf_params)(ctx, rw, current)); + + // Init coeff CDFs / load previous segments. 
+ + priv->coded_lossless = 1; + for (i = 0; i < AV1_MAX_SEGMENTS; i++) { + int qindex; + if (current->feature_enabled[i][AV1_SEG_LVL_ALT_Q]) { + qindex = (current->base_q_idx + + current->feature_value[i][AV1_SEG_LVL_ALT_Q]); + } else { + qindex = current->base_q_idx; + } + qindex = av_clip_uintp2(qindex, 8); + + if (qindex || current->delta_q_y_dc || + current->delta_q_u_ac || current->delta_q_u_dc || + current->delta_q_v_ac || current->delta_q_v_dc) { + priv->coded_lossless = 0; + } + } + priv->all_lossless = priv->coded_lossless && + priv->frame_width == priv->upscaled_width; + + CHECK(FUNC(loop_filter_params)(ctx, rw, current)); + + CHECK(FUNC(cdef_params)(ctx, rw, current)); + + CHECK(FUNC(lr_params)(ctx, rw, current)); + + CHECK(FUNC(read_tx_mode)(ctx, rw, current)); + + CHECK(FUNC(frame_reference_mode)(ctx, rw, current)); + + CHECK(FUNC(skip_mode_params)(ctx, rw, current)); + + if (frame_is_intra || current->error_resilient_mode || + !seq->enable_warped_motion) + infer(allow_warped_motion, 0); + else + flag(allow_warped_motion); + + flag(reduced_tx_set); + + CHECK(FUNC(global_motion_params)(ctx, rw, current)); + + CHECK(FUNC(film_grain_params)(ctx, rw, &current->film_grain, current)); + + av_log(ctx->log_ctx, AV_LOG_DEBUG, "Frame %d: size %dx%d " + "upscaled %d render %dx%d subsample %dx%d " + "bitdepth %d tiles %dx%d.\n", priv->order_hint, + priv->frame_width, priv->frame_height, priv->upscaled_width, + priv->render_width, priv->render_height, + seq->color_config.subsampling_x + 1, + seq->color_config.subsampling_y + 1, priv->bit_depth, + priv->tile_rows, priv->tile_cols); + +update_refs: + for (i = 0; i < AV1_NUM_REF_FRAMES; i++) { + if (current->refresh_frame_flags & (1 << i)) { + priv->ref[i] = (AV1ReferenceFrameState) { + .valid = 1, + .frame_id = current->current_frame_id, + .upscaled_width = priv->upscaled_width, + .frame_width = priv->frame_width, + .frame_height = priv->frame_height, + .render_width = priv->render_width, + .render_height = 
priv->render_height, + .frame_type = current->frame_type, + .subsampling_x = seq->color_config.subsampling_x, + .subsampling_y = seq->color_config.subsampling_y, + .bit_depth = priv->bit_depth, + .order_hint = priv->order_hint, + }; + memcpy(priv->ref[i].loop_filter_ref_deltas, current->loop_filter_ref_deltas, + sizeof(current->loop_filter_ref_deltas)); + memcpy(priv->ref[i].loop_filter_mode_deltas, current->loop_filter_mode_deltas, + sizeof(current->loop_filter_mode_deltas)); + memcpy(priv->ref[i].feature_enabled, current->feature_enabled, + sizeof(current->feature_enabled)); + memcpy(priv->ref[i].feature_value, current->feature_value, + sizeof(current->feature_value)); + } + } + + return 0; +} + +static int FUNC(frame_header_obu)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrameHeader *current, int redundant, + AVBufferRef *rw_buffer_ref) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + int start_pos, fh_bits, fh_bytes, err; + uint8_t *fh_start; + + if (priv->seen_frame_header) { + if (!redundant) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "Invalid repeated " + "frame header OBU.\n"); + return AVERROR_INVALIDDATA; + } else { + GetBitContext fh; + size_t i, b; + uint32_t val; + + HEADER("Redundant Frame Header"); + + av_assert0(priv->frame_header_ref && priv->frame_header); + + init_get_bits(&fh, priv->frame_header, + priv->frame_header_size); + for (i = 0; i < priv->frame_header_size; i += 8) { + b = FFMIN(priv->frame_header_size - i, 8); + val = get_bits(&fh, b); + xf(b, frame_header_copy[i], + val, val, val, 1, i / 8); + } + } + } else { + if (redundant) + HEADER("Redundant Frame Header (used as Frame Header)"); + else + HEADER("Frame Header"); + +#ifdef READ + start_pos = get_bits_count(rw); +#else + start_pos = put_bits_count(rw); +#endif + + CHECK(FUNC(uncompressed_header)(ctx, rw, current)); + + priv->tile_num = 0; + + if (current->show_existing_frame) { + priv->seen_frame_header = 0; + } else { + priv->seen_frame_header = 1; + + 
av_buffer_unref(&priv->frame_header_ref); + +#ifdef READ + fh_bits = get_bits_count(rw) - start_pos; + fh_start = (uint8_t*)rw->buffer + start_pos / 8; +#else + // Need to flush the bitwriter so that we can copy its output, + // but use a copy so we don't affect the caller's structure. + { + PutBitContext tmp = *rw; + flush_put_bits(&tmp); + } + + fh_bits = put_bits_count(rw) - start_pos; + fh_start = rw->buf + start_pos / 8; +#endif + fh_bytes = (fh_bits + 7) / 8; + + priv->frame_header_size = fh_bits; + + if (rw_buffer_ref) { + priv->frame_header_ref = av_buffer_ref(rw_buffer_ref); + if (!priv->frame_header_ref) + return AVERROR(ENOMEM); + priv->frame_header = fh_start; + } else { + priv->frame_header_ref = + av_buffer_alloc(fh_bytes + AV_INPUT_BUFFER_PADDING_SIZE); + if (!priv->frame_header_ref) + return AVERROR(ENOMEM); + priv->frame_header = priv->frame_header_ref->data; + memcpy(priv->frame_header, fh_start, fh_bytes); + } + } + } + + return 0; +} + +static int FUNC(tile_group_obu)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawTileGroup *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + int num_tiles, tile_bits; + int err; + + HEADER("Tile Group"); + + num_tiles = priv->tile_cols * priv->tile_rows; + if (num_tiles > 1) + flag(tile_start_and_end_present_flag); + else + infer(tile_start_and_end_present_flag, 0); + + if (num_tiles == 1 || !current->tile_start_and_end_present_flag) { + infer(tg_start, 0); + infer(tg_end, num_tiles - 1); + } else { + tile_bits = cbs_av1_tile_log2(1, priv->tile_cols) + + cbs_av1_tile_log2(1, priv->tile_rows); + fc(tile_bits, tg_start, priv->tile_num, num_tiles - 1); + fc(tile_bits, tg_end, current->tg_start, num_tiles - 1); + } + + priv->tile_num = current->tg_end + 1; + + CHECK(FUNC(byte_alignment)(ctx, rw)); + + // Reset header for next frame. + if (current->tg_end == num_tiles - 1) + priv->seen_frame_header = 0; + + // Tile data follows. 
+ + return 0; +} + +static int FUNC(frame_obu)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawFrame *current, + AVBufferRef *rw_buffer_ref) +{ + int err; + + CHECK(FUNC(frame_header_obu)(ctx, rw, &current->header, + 0, rw_buffer_ref)); + + CHECK(FUNC(byte_alignment)(ctx, rw)); + + CHECK(FUNC(tile_group_obu)(ctx, rw, &current->tile_group)); + + return 0; +} + +static int FUNC(tile_list_obu)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawTileList *current) +{ + int err; + + fb(8, output_frame_width_in_tiles_minus_1); + fb(8, output_frame_height_in_tiles_minus_1); + + fb(16, tile_count_minus_1); + + // Tile data follows. + + return 0; +} + +static int FUNC(metadata_hdr_cll)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawMetadataHDRCLL *current) +{ + int err; + + fb(16, max_cll); + fb(16, max_fall); + + return 0; +} + +static int FUNC(metadata_hdr_mdcv)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawMetadataHDRMDCV *current) +{ + int err, i; + + for (i = 0; i < 3; i++) { + fbs(16, primary_chromaticity_x[i], 1, i); + fbs(16, primary_chromaticity_y[i], 1, i); + } + + fb(16, white_point_chromaticity_x); + fb(16, white_point_chromaticity_y); + + fb(32, luminance_max); + fb(32, luminance_min); + + return 0; +} + +static int FUNC(scalability_structure)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawMetadataScalability *current) +{ + CodedBitstreamAV1Context *priv = ctx->priv_data; + const AV1RawSequenceHeader *seq; + int err, i, j; + + if (!priv->sequence_header) { + av_log(ctx->log_ctx, AV_LOG_ERROR, "No sequence header available: " + "unable to parse scalability metadata.\n"); + return AVERROR_INVALIDDATA; + } + seq = priv->sequence_header; + + fb(2, spatial_layers_cnt_minus_1); + flag(spatial_layer_dimensions_present_flag); + flag(spatial_layer_description_present_flag); + flag(temporal_group_description_present_flag); + fc(3, scalability_structure_reserved_3bits, 0, 0); + if (current->spatial_layer_dimensions_present_flag) { + for (i = 0; i <= 
current->spatial_layers_cnt_minus_1; i++) { + fcs(16, spatial_layer_max_width[i], + 0, seq->max_frame_width_minus_1 + 1, 1, i); + fcs(16, spatial_layer_max_height[i], + 0, seq->max_frame_height_minus_1 + 1, 1, i); + } + } + if (current->spatial_layer_description_present_flag) { + for (i = 0; i <= current->spatial_layers_cnt_minus_1; i++) + fbs(8, spatial_layer_ref_id[i], 1, i); + } + if (current->temporal_group_description_present_flag) { + fb(8, temporal_group_size); + for (i = 0; i < current->temporal_group_size; i++) { + fbs(3, temporal_group_temporal_id[i], 1, i); + flags(temporal_group_temporal_switching_up_point_flag[i], 1, i); + flags(temporal_group_spatial_switching_up_point_flag[i], 1, i); + fbs(3, temporal_group_ref_cnt[i], 1, i); + for (j = 0; j < current->temporal_group_ref_cnt[i]; j++) { + fbs(8, temporal_group_ref_pic_diff[i][j], 2, i, j); + } + } + } + + return 0; +} + +static int FUNC(metadata_scalability)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawMetadataScalability *current) +{ + int err; + + fb(8, scalability_mode_idc); + + if (current->scalability_mode_idc == AV1_SCALABILITY_SS) + CHECK(FUNC(scalability_structure)(ctx, rw, current)); + + return 0; +} + +static int FUNC(metadata_itut_t35)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawMetadataITUTT35 *current) +{ + int err; + size_t i; + + fb(8, itu_t_t35_country_code); + if (current->itu_t_t35_country_code == 0xff) + fb(8, itu_t_t35_country_code_extension_byte); + +#ifdef READ + // The payload runs up to the start of the trailing bits, but there might + // be arbitrarily many trailing zeroes so we need to read through twice. 
+ current->payload_size = cbs_av1_get_payload_bytes_left(rw); + + current->payload_ref = av_buffer_alloc(current->payload_size); + if (!current->payload_ref) + return AVERROR(ENOMEM); + current->payload = current->payload_ref->data; +#endif + + for (i = 0; i < current->payload_size; i++) + xf(8, itu_t_t35_payload_bytes[i], current->payload[i], + 0x00, 0xff, 1, i); + + return 0; +} + +static int FUNC(metadata_timecode)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawMetadataTimecode *current) +{ + int err; + + fb(5, counting_type); + flag(full_timestamp_flag); + flag(discontinuity_flag); + flag(cnt_dropped_flag); + fb(9, n_frames); + + if (current->full_timestamp_flag) { + fc(6, seconds_value, 0, 59); + fc(6, minutes_value, 0, 59); + fc(5, hours_value, 0, 23); + } else { + flag(seconds_flag); + if (current->seconds_flag) { + fc(6, seconds_value, 0, 59); + flag(minutes_flag); + if (current->minutes_flag) { + fc(6, minutes_value, 0, 59); + flag(hours_flag); + if (current->hours_flag) + fc(5, hours_value, 0, 23); + } + } + } + + fb(5, time_offset_length); + if (current->time_offset_length > 0) + fb(current->time_offset_length, time_offset_value); + else + infer(time_offset_length, 0); + + return 0; +} + +static int FUNC(metadata_obu)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawMetadata *current) +{ + int err; + + leb128(metadata_type); + + switch (current->metadata_type) { + case AV1_METADATA_TYPE_HDR_CLL: + CHECK(FUNC(metadata_hdr_cll)(ctx, rw, &current->metadata.hdr_cll)); + break; + case AV1_METADATA_TYPE_HDR_MDCV: + CHECK(FUNC(metadata_hdr_mdcv)(ctx, rw, &current->metadata.hdr_mdcv)); + break; + case AV1_METADATA_TYPE_SCALABILITY: + CHECK(FUNC(metadata_scalability)(ctx, rw, &current->metadata.scalability)); + break; + case AV1_METADATA_TYPE_ITUT_T35: + CHECK(FUNC(metadata_itut_t35)(ctx, rw, &current->metadata.itut_t35)); + break; + case AV1_METADATA_TYPE_TIMECODE: + CHECK(FUNC(metadata_timecode)(ctx, rw, &current->metadata.timecode)); + break; + default: + // Unknown metadata type. 
+ return AVERROR_PATCHWELCOME; + } + + return 0; +} + +static int FUNC(padding_obu)(CodedBitstreamContext *ctx, RWContext *rw, + AV1RawPadding *current) +{ + int i, err; + + HEADER("Padding"); + +#ifdef READ + // The payload runs up to the start of the trailing bits, but there might + // be arbitrarily many trailing zeroes so we need to read through twice. + current->payload_size = cbs_av1_get_payload_bytes_left(rw); + + current->payload_ref = av_buffer_alloc(current->payload_size); + if (!current->payload_ref) + return AVERROR(ENOMEM); + current->payload = current->payload_ref->data; +#endif + + for (i = 0; i < current->payload_size; i++) + xf(8, obu_padding_byte[i], current->payload[i], 0x00, 0xff, 1, i); + + return 0; +} diff --git a/media/ffvpx/libavcodec/cbs_internal.h b/media/ffvpx/libavcodec/cbs_internal.h new file mode 100644 index 0000000000..e585c77934 --- /dev/null +++ b/media/ffvpx/libavcodec/cbs_internal.h @@ -0,0 +1,253 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_CBS_INTERNAL_H +#define AVCODEC_CBS_INTERNAL_H + +#include <stdint.h> + +#include "libavutil/buffer.h" +#include "libavutil/log.h" + +#include "cbs.h" +#include "codec_id.h" +#include "get_bits.h" +#include "put_bits.h" + + +enum CBSContentType { + // Unit content may contain some references to other structures, but all + // managed via buffer reference counting. The descriptor defines the + // structure offsets of every buffer reference. + CBS_CONTENT_TYPE_INTERNAL_REFS, + // Unit content is something more complex. The descriptor defines + // special functions to manage the content. + CBS_CONTENT_TYPE_COMPLEX, +}; + +enum { + // Maximum number of unit types described by the same non-range + // unit type descriptor. + CBS_MAX_LIST_UNIT_TYPES = 3, + // Maximum number of reference buffer offsets in any one unit. + CBS_MAX_REF_OFFSETS = 2, + // Special value used in a unit type descriptor to indicate that it + // applies to a large range of types rather than a set of discrete + // values. + CBS_UNIT_TYPE_RANGE = -1, +}; + +typedef const struct CodedBitstreamUnitTypeDescriptor { + // Number of entries in the unit_types array, or the special value + // CBS_UNIT_TYPE_RANGE to indicate that the range fields should be + // used instead. + int nb_unit_types; + + union { + // Array of unit types that this entry describes. + CodedBitstreamUnitType list[CBS_MAX_LIST_UNIT_TYPES]; + // Start and end of unit type range, used if nb_unit_types is + // CBS_UNIT_TYPE_RANGE. + struct { + CodedBitstreamUnitType start; + CodedBitstreamUnitType end; + } range; + } unit_type; + + // The type of content described. 
+ enum CBSContentType content_type; + // The size of the structure which should be allocated to contain + // the decomposed content of this type of unit. + size_t content_size; + + union { + // This union's state is determined by content_type: + // ref for CBS_CONTENT_TYPE_INTERNAL_REFS, + // complex for CBS_CONTENT_TYPE_COMPLEX. + struct { + // Number of entries in the ref_offsets array. + // May be zero, then the structure is POD-like. + int nb_offsets; + // The structure must contain two adjacent elements: + // type *field; + // AVBufferRef *field_ref; + // where field points to something in the buffer referred to by + // field_ref. This offset is then set to offsetof(struct, field). + size_t offsets[CBS_MAX_REF_OFFSETS]; + } ref; + + struct { + void (*content_free)(void *opaque, uint8_t *data); + int (*content_clone)(AVBufferRef **ref, CodedBitstreamUnit *unit); + } complex; + } type; +} CodedBitstreamUnitTypeDescriptor; + +typedef struct CodedBitstreamType { + enum AVCodecID codec_id; + + // A class for the private data, used to declare private AVOptions. + // This field is NULL for types that do not declare any options. + // If this field is non-NULL, the first member of the filter private data + // must be a pointer to AVClass. + const AVClass *priv_class; + + size_t priv_data_size; + + // List of unit type descriptors for this codec. + // Terminated by a descriptor with nb_unit_types equal to zero. + const CodedBitstreamUnitTypeDescriptor *unit_types; + + // Split frag->data into coded bitstream units, creating the + // frag->units array. Fill data but not content on each unit. + // The header argument should be set if the fragment came from + // a header block, which may require different parsing for some + // codecs (e.g. the AVCC header in H.264). + int (*split_fragment)(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag, + int header); + + // Read the unit->data bitstream and decompose it, creating + // unit->content. 
+ int (*read_unit)(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit); + + // Write the data bitstream from unit->content into pbc. + // Return value AVERROR(ENOSPC) indicates that pbc was too small. + int (*write_unit)(CodedBitstreamContext *ctx, + CodedBitstreamUnit *unit, + PutBitContext *pbc); + + // Read the data from all of frag->units and assemble it into + // a bitstream for the whole fragment. + int (*assemble_fragment)(CodedBitstreamContext *ctx, + CodedBitstreamFragment *frag); + + // Reset the codec internal state. + void (*flush)(CodedBitstreamContext *ctx); + + // Free the codec internal state. + void (*close)(CodedBitstreamContext *ctx); +} CodedBitstreamType; + + +// Helper functions for trace output. + +void ff_cbs_trace_header(CodedBitstreamContext *ctx, + const char *name); + +void ff_cbs_trace_syntax_element(CodedBitstreamContext *ctx, int position, + const char *name, const int *subscripts, + const char *bitstring, int64_t value); + + +// Helper functions for read/write of common bitstream elements, including +// generation of trace output. + +int ff_cbs_read_unsigned(CodedBitstreamContext *ctx, GetBitContext *gbc, + int width, const char *name, + const int *subscripts, uint32_t *write_to, + uint32_t range_min, uint32_t range_max); + +int ff_cbs_write_unsigned(CodedBitstreamContext *ctx, PutBitContext *pbc, + int width, const char *name, + const int *subscripts, uint32_t value, + uint32_t range_min, uint32_t range_max); + +int ff_cbs_read_signed(CodedBitstreamContext *ctx, GetBitContext *gbc, + int width, const char *name, + const int *subscripts, int32_t *write_to, + int32_t range_min, int32_t range_max); + +int ff_cbs_write_signed(CodedBitstreamContext *ctx, PutBitContext *pbc, + int width, const char *name, + const int *subscripts, int32_t value, + int32_t range_min, int32_t range_max); + +// The largest unsigned value representable in N bits, suitable for use as +// range_max in the above functions. 
+#define MAX_UINT_BITS(length) ((UINT64_C(1) << (length)) - 1) + +// The largest signed value representable in N bits, suitable for use as +// range_max in the above functions. +#define MAX_INT_BITS(length) ((INT64_C(1) << ((length) - 1)) - 1) + +// The smallest signed value representable in N bits, suitable for use as +// range_min in the above functions. +#define MIN_INT_BITS(length) (-(INT64_C(1) << ((length) - 1))) + +#define TYPE_LIST(...) { __VA_ARGS__ } +#define CBS_UNIT_TYPE_POD(type_, structure) { \ + .nb_unit_types = 1, \ + .unit_type.list = { type_ }, \ + .content_type = CBS_CONTENT_TYPE_INTERNAL_REFS, \ + .content_size = sizeof(structure), \ + .type.ref = { .nb_offsets = 0 }, \ + } +#define CBS_UNIT_RANGE_POD(range_start, range_end, structure) { \ + .nb_unit_types = CBS_UNIT_TYPE_RANGE, \ + .unit_type.range.start = range_start, \ + .unit_type.range.end = range_end, \ + .content_type = CBS_CONTENT_TYPE_INTERNAL_REFS, \ + .content_size = sizeof(structure), \ + .type.ref = { .nb_offsets = 0 }, \ + } + +#define CBS_UNIT_TYPES_INTERNAL_REF(types, structure, ref_field) { \ + .nb_unit_types = FF_ARRAY_ELEMS((CodedBitstreamUnitType[])TYPE_LIST types), \ + .unit_type.list = TYPE_LIST types, \ + .content_type = CBS_CONTENT_TYPE_INTERNAL_REFS, \ + .content_size = sizeof(structure), \ + .type.ref = { .nb_offsets = 1, \ + .offsets = { offsetof(structure, ref_field) } }, \ + } +#define CBS_UNIT_TYPE_INTERNAL_REF(type, structure, ref_field) \ + CBS_UNIT_TYPES_INTERNAL_REF((type), structure, ref_field) + +#define CBS_UNIT_RANGE_INTERNAL_REF(range_start, range_end, structure, ref_field) { \ + .nb_unit_types = CBS_UNIT_TYPE_RANGE, \ + .unit_type.range.start = range_start, \ + .unit_type.range.end = range_end, \ + .content_type = CBS_CONTENT_TYPE_INTERNAL_REFS, \ + .content_size = sizeof(structure), \ + .type.ref = { .nb_offsets = 1, \ + .offsets = { offsetof(structure, ref_field) } }, \ + } + +#define CBS_UNIT_TYPES_COMPLEX(types, structure, free_func) { \ + 
.nb_unit_types = FF_ARRAY_ELEMS((CodedBitstreamUnitType[])TYPE_LIST types), \ + .unit_type.list = TYPE_LIST types, \ + .content_type = CBS_CONTENT_TYPE_COMPLEX, \ + .content_size = sizeof(structure), \ + .type.complex = { .content_free = free_func }, \ + } +#define CBS_UNIT_TYPE_COMPLEX(type, structure, free_func) \ + CBS_UNIT_TYPES_COMPLEX((type), structure, free_func) + +#define CBS_UNIT_TYPE_END_OF_LIST { .nb_unit_types = 0 } + + +extern const CodedBitstreamType ff_cbs_type_av1; +extern const CodedBitstreamType ff_cbs_type_h264; +extern const CodedBitstreamType ff_cbs_type_h265; +extern const CodedBitstreamType ff_cbs_type_jpeg; +extern const CodedBitstreamType ff_cbs_type_mpeg2; +extern const CodedBitstreamType ff_cbs_type_vp9; + + +#endif /* AVCODEC_CBS_INTERNAL_H */ diff --git a/media/ffvpx/libavcodec/codec.h b/media/ffvpx/libavcodec/codec.h new file mode 100644 index 0000000000..3b1995bcfe --- /dev/null +++ b/media/ffvpx/libavcodec/codec.h @@ -0,0 +1,375 @@ +/* + * AVCodec public API + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_CODEC_H +#define AVCODEC_CODEC_H + +#include <stdint.h> + +#include "libavutil/avutil.h" +#include "libavutil/hwcontext.h" +#include "libavutil/log.h" +#include "libavutil/pixfmt.h" +#include "libavutil/rational.h" +#include "libavutil/samplefmt.h" + +#include "libavcodec/codec_id.h" +#include "libavcodec/version_major.h" + +/** + * @addtogroup lavc_core + * @{ + */ + +/** + * Decoder can use draw_horiz_band callback. + */ +#define AV_CODEC_CAP_DRAW_HORIZ_BAND (1 << 0) +/** + * Codec uses get_buffer() or get_encode_buffer() for allocating buffers and + * supports custom allocators. + * If not set, it might not use get_buffer() or get_encode_buffer() at all, or + * use operations that assume the buffer was allocated by + * avcodec_default_get_buffer2 or avcodec_default_get_encode_buffer. + */ +#define AV_CODEC_CAP_DR1 (1 << 1) +/** + * Encoder or decoder requires flushing with NULL input at the end in order to + * give the complete and correct output. + * + * NOTE: If this flag is not set, the codec is guaranteed to never be fed with + * NULL data. The user can still send NULL data to the public encode + * or decode function, but libavcodec will not pass it along to the codec + * unless this flag is set. + * + * Decoders: + * The decoder has a non-zero delay and needs to be fed with avpkt->data=NULL, + * avpkt->size=0 at the end to get the delayed data until the decoder no longer + * returns frames. + * + * Encoders: + * The encoder needs to be fed with NULL data at the end of encoding until the + * encoder no longer returns data. + * + * NOTE: For encoders implementing the AVCodec.encode2() function, setting this + * flag also means that the encoder must set the pts and duration for + * each output packet. 
If this flag is not set, the pts and duration will + * be determined by libavcodec from the input frame. + */ +#define AV_CODEC_CAP_DELAY (1 << 5) +/** + * Codec can be fed a final frame with a smaller size. + * This can be used to prevent truncation of the last audio samples. + */ +#define AV_CODEC_CAP_SMALL_LAST_FRAME (1 << 6) + +/** + * Codec can output multiple frames per AVPacket + * Normally demuxers return one frame at a time, demuxers which do not do + * are connected to a parser to split what they return into proper frames. + * This flag is reserved to the very rare category of codecs which have a + * bitstream that cannot be split into frames without timeconsuming + * operations like full decoding. Demuxers carrying such bitstreams thus + * may return multiple frames in a packet. This has many disadvantages like + * prohibiting stream copy in many cases thus it should only be considered + * as a last resort. + */ +#define AV_CODEC_CAP_SUBFRAMES (1 << 8) +/** + * Codec is experimental and is thus avoided in favor of non experimental + * encoders + */ +#define AV_CODEC_CAP_EXPERIMENTAL (1 << 9) +/** + * Codec should fill in channel configuration and samplerate instead of container + */ +#define AV_CODEC_CAP_CHANNEL_CONF (1 << 10) +/** + * Codec supports frame-level multithreading. + */ +#define AV_CODEC_CAP_FRAME_THREADS (1 << 12) +/** + * Codec supports slice-based (or partition-based) multithreading. + */ +#define AV_CODEC_CAP_SLICE_THREADS (1 << 13) +/** + * Codec supports changed parameters at any point. + */ +#define AV_CODEC_CAP_PARAM_CHANGE (1 << 14) +/** + * Codec supports multithreading through a method other than slice- or + * frame-level multithreading. Typically this marks wrappers around + * multithreading-capable external libraries. + */ +#define AV_CODEC_CAP_OTHER_THREADS (1 << 15) +/** + * Audio encoder supports receiving a different number of samples in each call. 
+ */ +#define AV_CODEC_CAP_VARIABLE_FRAME_SIZE (1 << 16) +/** + * Decoder is not a preferred choice for probing. + * This indicates that the decoder is not a good choice for probing. + * It could for example be a hardware decoder that is expensive to spin up, + * or it could simply not provide a lot of useful information about + * the stream. + * A decoder marked with this flag should only be used as a last-resort + * choice for probing. + */ +#define AV_CODEC_CAP_AVOID_PROBING (1 << 17) + +/** + * Codec is backed by a hardware implementation. Typically used to + * identify a non-hwaccel hardware decoder. For information about hwaccels, use + * avcodec_get_hw_config() instead. + */ +#define AV_CODEC_CAP_HARDWARE (1 << 18) + +/** + * Codec is potentially backed by a hardware implementation, but not + * necessarily. This is used instead of AV_CODEC_CAP_HARDWARE, if the + * implementation provides some sort of internal fallback. + */ +#define AV_CODEC_CAP_HYBRID (1 << 19) + +/** + * This encoder can reorder user opaque values from input AVFrames and return + * them with corresponding output packets. + * @see AV_CODEC_FLAG_COPY_OPAQUE + */ +#define AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE (1 << 20) + +/** + * This encoder can be flushed using avcodec_flush_buffers(). If this flag is + * not set, the encoder must be closed and reopened to ensure that no frames + * remain pending. + */ +#define AV_CODEC_CAP_ENCODER_FLUSH (1 << 21) + +/** + * The encoder is able to output reconstructed frame data, i.e. raw frames that + * would be produced by decoding the encoded bitstream. + * + * Reconstructed frame output is enabled by the AV_CODEC_FLAG_RECON_FRAME flag. + */ +#define AV_CODEC_CAP_ENCODER_RECON_FRAME (1 << 22) + +/** + * AVProfile: pairs a numeric profile value with its short name. + */ +typedef struct AVProfile { + int profile; + const char *name; ///< short name for the profile +} AVProfile; + +/** + * AVCodec: describes one codec implementation - its names, media type, + * codec ID, capabilities and the formats it supports. + */ +typedef struct AVCodec { + /** + * Name of the codec implementation.
+ * The name is globally unique among encoders and among decoders (but an + * encoder and a decoder can share the same name). + * This is the primary way to find a codec from the user perspective. + */ + const char *name; + /** + * Descriptive name for the codec, meant to be more human readable than name. + * You should use the NULL_IF_CONFIG_SMALL() macro to define it. + */ + const char *long_name; + enum AVMediaType type; + enum AVCodecID id; + /** + * Codec capabilities. + * Bit flags, see AV_CODEC_CAP_* + */ + int capabilities; + uint8_t max_lowres; ///< maximum value for lowres supported by the decoder + const AVRational *supported_framerates; ///< array of supported framerates, or NULL if any, array is terminated by {0,0} + const enum AVPixelFormat *pix_fmts; ///< array of supported pixel formats, or NULL if unknown, array is terminated by -1 + const int *supported_samplerates; ///< array of supported audio samplerates, or NULL if unknown, array is terminated by 0 + const enum AVSampleFormat *sample_fmts; ///< array of supported sample formats, or NULL if unknown, array is terminated by -1 +#if FF_API_OLD_CHANNEL_LAYOUT + /** + * @deprecated use ch_layouts instead + */ + attribute_deprecated + const uint64_t *channel_layouts; ///< array of supported channel layouts, or NULL if unknown, array is terminated by 0 +#endif + const AVClass *priv_class; ///< AVClass for the private context + const AVProfile *profiles; ///< array of recognized profiles, or NULL if unknown, array is terminated by {FF_PROFILE_UNKNOWN} + + /** + * Group name of the codec implementation. + * This is a short symbolic name of the wrapper backing this codec. A + * wrapper uses some kind of external implementation for the codec, such + * as an external library, or a codec implementation provided by the OS or + * the hardware. + * If this field is NULL, this is a builtin, libavcodec native codec.
+ * If non-NULL, this will be the suffix in AVCodec.name in most cases + * (usually AVCodec.name will be of the form "<codec_name>_<wrapper_name>"). + */ + const char *wrapper_name; + + /** + * Array of supported channel layouts, terminated with a zeroed layout. + */ + const AVChannelLayout *ch_layouts; +} AVCodec; + +/** + * Iterate over all registered codecs. + * + * @param opaque a pointer where libavcodec will store the iteration state. Must + * point to NULL to start the iteration. + * + * @return the next registered codec or NULL when the iteration is + * finished + */ +const AVCodec *av_codec_iterate(void **opaque); + +/** + * Find a registered decoder with a matching codec ID. + * + * @param id AVCodecID of the requested decoder + * @return A decoder if one was found, NULL otherwise. + */ +const AVCodec *avcodec_find_decoder(enum AVCodecID id); + +/** + * Find a registered decoder with the specified name. + * + * @param name name of the requested decoder + * @return A decoder if one was found, NULL otherwise. + */ +const AVCodec *avcodec_find_decoder_by_name(const char *name); + +/** + * Find a registered encoder with a matching codec ID. + * + * @param id AVCodecID of the requested encoder + * @return An encoder if one was found, NULL otherwise. + */ +const AVCodec *avcodec_find_encoder(enum AVCodecID id); + +/** + * Find a registered encoder with the specified name. + * + * @param name name of the requested encoder + * @return An encoder if one was found, NULL otherwise. + */ +const AVCodec *avcodec_find_encoder_by_name(const char *name); +/** + * Check whether a codec is an encoder. + * + * @param codec the codec to test + * @return a non-zero number if codec is an encoder, zero otherwise + */ +int av_codec_is_encoder(const AVCodec *codec); + +/** + * Check whether a codec is a decoder. + * + * @param codec the codec to test + * @return a non-zero number if codec is a decoder, zero otherwise + */ +int av_codec_is_decoder(const AVCodec *codec); + +/** + * Return a name for the specified profile, if available.
+ * + * @param codec the codec that is searched for the given profile + * @param profile the profile value for which a name is requested + * @return A name for the profile if found, NULL otherwise. + */ +const char *av_get_profile_name(const AVCodec *codec, int profile); + +enum { + /** + * The codec supports this format via the hw_device_ctx interface. + * + * When selecting this format, AVCodecContext.hw_device_ctx should + * have been set to a device of the specified type before calling + * avcodec_open2(). + */ + AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX = 0x01, + /** + * The codec supports this format via the hw_frames_ctx interface. + * + * When selecting this format for a decoder, + * AVCodecContext.hw_frames_ctx should be set to a suitable frames + * context inside the get_format() callback. The frames context + * must have been created on a device of the specified type. + * + * When selecting this format for an encoder, + * AVCodecContext.hw_frames_ctx should be set to the context which + * will be used for the input frames before calling avcodec_open2(). + */ + AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX = 0x02, + /** + * The codec supports this format by some internal method. + * + * This format can be selected without any additional configuration - + * no device or frames context is required. + */ + AV_CODEC_HW_CONFIG_METHOD_INTERNAL = 0x04, + /** + * The codec supports this format by some ad-hoc method. + * + * Additional settings and/or function calls are required. See the + * codec-specific documentation for details. (Methods requiring + * this sort of configuration are deprecated and others should be + * used in preference.) + */ + AV_CODEC_HW_CONFIG_METHOD_AD_HOC = 0x08, +}; + +typedef struct AVCodecHWConfig { + /** + * For decoders, a hardware pixel format which that decoder may be + * able to decode to if suitable hardware is available. + * + * For encoders, a pixel format which the encoder may be able to + * accept.
If set to AV_PIX_FMT_NONE, this applies to all pixel + * formats supported by the codec. + */ + enum AVPixelFormat pix_fmt; + /** + * Bit set of AV_CODEC_HW_CONFIG_METHOD_* flags, describing the possible + * setup methods which can be used with this configuration. + */ + int methods; + /** + * The device type associated with the configuration. + * + * Must be set for AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX and + * AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX, otherwise unused. + */ + enum AVHWDeviceType device_type; +} AVCodecHWConfig; + +/** + * Retrieve supported hardware configurations for a codec. + * + * Values of index from zero to some maximum return the indexed configuration + * descriptor; all other values return NULL. If the codec does not support + * any hardware configurations then it will always return NULL. + * + * @param codec the codec whose hardware configurations are queried + * @param index zero-based index of the configuration descriptor to return + * @return the indexed configuration descriptor, or NULL as described above + */ +const AVCodecHWConfig *avcodec_get_hw_config(const AVCodec *codec, int index); + +/** + * @} + */ + +#endif /* AVCODEC_CODEC_H */ diff --git a/media/ffvpx/libavcodec/codec_desc.c b/media/ffvpx/libavcodec/codec_desc.c new file mode 100644 index 0000000000..199f62df15 --- /dev/null +++ b/media/ffvpx/libavcodec/codec_desc.c @@ -0,0 +1,3693 @@ +/* + * This file is part of FFmpeg. + * + * This table was generated from the long and short names of AVCodecs + * please see the respective codec sources for authorship + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdlib.h> +#include <string.h> + +#include "libavutil/internal.h" +#include "libavutil/macros.h" + +#include "codec_id.h" +#include "codec_desc.h" +#include "profiles.h" + +#define MT(...) (const char *const[]){ __VA_ARGS__, NULL } + +static const AVCodecDescriptor codec_descriptors[] = { + /* video codecs */ + { + .id = AV_CODEC_ID_MPEG1VIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mpeg1video", + .long_name = NULL_IF_CONFIG_SMALL("MPEG-1 video"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_MPEG2VIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mpeg2video", + .long_name = NULL_IF_CONFIG_SMALL("MPEG-2 video"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + .profiles = NULL_IF_CONFIG_SMALL(ff_mpeg2_video_profiles), + }, + { + .id = AV_CODEC_ID_H261, + .type = AVMEDIA_TYPE_VIDEO, + .name = "h261", + .long_name = NULL_IF_CONFIG_SMALL("H.261"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_H263, + .type = AVMEDIA_TYPE_VIDEO, + .name = "h263", + .long_name = NULL_IF_CONFIG_SMALL("H.263 / H.263-1996, H.263+ / H.263-1998 / H.263 version 2"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_RV10, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rv10", + .long_name = NULL_IF_CONFIG_SMALL("RealVideo 1.0"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_RV20, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rv20", + .long_name = NULL_IF_CONFIG_SMALL("RealVideo 2.0"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_MJPEG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mjpeg", + .long_name = NULL_IF_CONFIG_SMALL("Motion JPEG"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + .mime_types= 
MT("image/jpeg"), + .profiles = NULL_IF_CONFIG_SMALL(ff_mjpeg_profiles), + }, + { + .id = AV_CODEC_ID_MJPEGB, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mjpegb", + .long_name = NULL_IF_CONFIG_SMALL("Apple MJPEG-B"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_LJPEG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ljpeg", + .long_name = NULL_IF_CONFIG_SMALL("Lossless JPEG"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SP5X, + .type = AVMEDIA_TYPE_VIDEO, + .name = "sp5x", + .long_name = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_JPEGLS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "jpegls", + .long_name = NULL_IF_CONFIG_SMALL("JPEG-LS"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | + AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MPEG4, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mpeg4", + .long_name = NULL_IF_CONFIG_SMALL("MPEG-4 part 2"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + .profiles = NULL_IF_CONFIG_SMALL(ff_mpeg4_video_profiles), + }, + { + .id = AV_CODEC_ID_RAWVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rawvideo", + .long_name = NULL_IF_CONFIG_SMALL("raw video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MSMPEG4V1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "msmpeg4v1", + .long_name = NULL_IF_CONFIG_SMALL("MPEG-4 part 2 Microsoft variant version 1"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MSMPEG4V2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "msmpeg4v2", + .long_name = NULL_IF_CONFIG_SMALL("MPEG-4 part 2 Microsoft variant version 2"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MSMPEG4V3, + .type = AVMEDIA_TYPE_VIDEO, + .name = "msmpeg4v3", + .long_name = NULL_IF_CONFIG_SMALL("MPEG-4 part 2 Microsoft variant version 3"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = 
AV_CODEC_ID_WMV1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "wmv1", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 7"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WMV2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "wmv2", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 8"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_H263P, + .type = AVMEDIA_TYPE_VIDEO, + .name = "h263p", + .long_name = NULL_IF_CONFIG_SMALL("H.263+ / H.263-1998 / H.263 version 2"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_H263I, + .type = AVMEDIA_TYPE_VIDEO, + .name = "h263i", + .long_name = NULL_IF_CONFIG_SMALL("Intel H.263"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_FLV1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "flv1", + .long_name = NULL_IF_CONFIG_SMALL("FLV / Sorenson Spark / Sorenson H.263 (Flash Video)"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SVQ1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "svq1", + .long_name = NULL_IF_CONFIG_SMALL("Sorenson Vector Quantizer 1 / Sorenson Video 1 / SVQ1"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SVQ3, + .type = AVMEDIA_TYPE_VIDEO, + .name = "svq3", + .long_name = NULL_IF_CONFIG_SMALL("Sorenson Vector Quantizer 3 / Sorenson Video 3 / SVQ3"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_DVVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dvvideo", + .long_name = NULL_IF_CONFIG_SMALL("DV (Digital Video)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HUFFYUV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "huffyuv", + .long_name = NULL_IF_CONFIG_SMALL("HuffYUV"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_CYUV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cyuv", + .long_name = NULL_IF_CONFIG_SMALL("Creative YUV (CYUV)"), + .props = AV_CODEC_PROP_INTRA_ONLY | 
AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_H264, + .type = AVMEDIA_TYPE_VIDEO, + .name = "h264", + .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS | AV_CODEC_PROP_REORDER, + .profiles = NULL_IF_CONFIG_SMALL(ff_h264_profiles), + }, + { + .id = AV_CODEC_ID_INDEO3, + .type = AVMEDIA_TYPE_VIDEO, + .name = "indeo3", + .long_name = NULL_IF_CONFIG_SMALL("Intel Indeo 3"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VP3, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp3", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP3"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_THEORA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "theora", + .long_name = NULL_IF_CONFIG_SMALL("Theora"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ASV1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "asv1", + .long_name = NULL_IF_CONFIG_SMALL("ASUS V1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ASV2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "asv2", + .long_name = NULL_IF_CONFIG_SMALL("ASUS V2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FFV1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ffv1", + .long_name = NULL_IF_CONFIG_SMALL("FFmpeg video codec #1"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_4XM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "4xm", + .long_name = NULL_IF_CONFIG_SMALL("4X Movie"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VCR1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vcr1", + .long_name = NULL_IF_CONFIG_SMALL("ATI VCR1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CLJR, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cljr", + .long_name = NULL_IF_CONFIG_SMALL("Cirrus Logic AccuPak"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MDEC, + .type = 
AVMEDIA_TYPE_VIDEO, + .name = "mdec", + .long_name = NULL_IF_CONFIG_SMALL("Sony PlayStation MDEC (Motion DECoder)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ROQ, + .type = AVMEDIA_TYPE_VIDEO, + .name = "roq", + .long_name = NULL_IF_CONFIG_SMALL("id RoQ video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_INTERPLAY_VIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "interplayvideo", + .long_name = NULL_IF_CONFIG_SMALL("Interplay MVE video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_XAN_WC3, + .type = AVMEDIA_TYPE_VIDEO, + .name = "xan_wc3", + .long_name = NULL_IF_CONFIG_SMALL("Wing Commander III / Xan"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_XAN_WC4, + .type = AVMEDIA_TYPE_VIDEO, + .name = "xan_wc4", + .long_name = NULL_IF_CONFIG_SMALL("Wing Commander IV / Xxan"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_RPZA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rpza", + .long_name = NULL_IF_CONFIG_SMALL("QuickTime video (RPZA)"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CINEPAK, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cinepak", + .long_name = NULL_IF_CONFIG_SMALL("Cinepak"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WS_VQA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ws_vqa", + .long_name = NULL_IF_CONFIG_SMALL("Westwood Studios VQA (Vector Quantized Animation) video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MSRLE, + .type = AVMEDIA_TYPE_VIDEO, + .name = "msrle", + .long_name = NULL_IF_CONFIG_SMALL("Microsoft RLE"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MSVIDEO1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "msvideo1", + .long_name = NULL_IF_CONFIG_SMALL("Microsoft Video 1"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_IDCIN, + .type = AVMEDIA_TYPE_VIDEO, + .name = "idcin", + .long_name = NULL_IF_CONFIG_SMALL("id Quake II CIN video"), + .props = 
AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_8BPS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "8bps", + .long_name = NULL_IF_CONFIG_SMALL("QuickTime 8BPS video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SMC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "smc", + .long_name = NULL_IF_CONFIG_SMALL("QuickTime Graphics (SMC)"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FLIC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "flic", + .long_name = NULL_IF_CONFIG_SMALL("Autodesk Animator Flic video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_TRUEMOTION1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "truemotion1", + .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 1.0"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VMDVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vmdvideo", + .long_name = NULL_IF_CONFIG_SMALL("Sierra VMD video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MSZH, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mszh", + .long_name = NULL_IF_CONFIG_SMALL("LCL (LossLess Codec Library) MSZH"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ZLIB, + .type = AVMEDIA_TYPE_VIDEO, + .name = "zlib", + .long_name = NULL_IF_CONFIG_SMALL("LCL (LossLess Codec Library) ZLIB"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_QTRLE, + .type = AVMEDIA_TYPE_VIDEO, + .name = "qtrle", + .long_name = NULL_IF_CONFIG_SMALL("QuickTime Animation (RLE) video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_TSCC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tscc", + .long_name = NULL_IF_CONFIG_SMALL("TechSmith Screen Capture Codec"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ULTI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ulti", + .long_name = NULL_IF_CONFIG_SMALL("IBM UltiMotion"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_QDRAW, + .type = 
AVMEDIA_TYPE_VIDEO, + .name = "qdraw", + .long_name = NULL_IF_CONFIG_SMALL("Apple QuickDraw"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_VIXL, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vixl", + .long_name = NULL_IF_CONFIG_SMALL("Miro VideoXL"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_QPEG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "qpeg", + .long_name = NULL_IF_CONFIG_SMALL("Q-team QPEG"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PNG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "png", + .long_name = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"), + .props = AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/png"), + }, + { + .id = AV_CODEC_ID_PPM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ppm", + .long_name = NULL_IF_CONFIG_SMALL("PPM (Portable PixelMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PBM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pbm", + .long_name = NULL_IF_CONFIG_SMALL("PBM (Portable BitMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PGM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pgm", + .long_name = NULL_IF_CONFIG_SMALL("PGM (Portable GrayMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PGMYUV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pgmyuv", + .long_name = NULL_IF_CONFIG_SMALL("PGMYUV (Portable GrayMap YUV) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PAM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pam", + .long_name = NULL_IF_CONFIG_SMALL("PAM (Portable AnyMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/x-portable-pixmap"), + }, + { + .id = AV_CODEC_ID_FFVHUFF, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ffvhuff", + .long_name = NULL_IF_CONFIG_SMALL("Huffyuv 
FFmpeg variant"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_RV30, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rv30", + .long_name = NULL_IF_CONFIG_SMALL("RealVideo 3.0"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_RV40, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rv40", + .long_name = NULL_IF_CONFIG_SMALL("RealVideo 4.0"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_VC1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vc1", + .long_name = NULL_IF_CONFIG_SMALL("SMPTE VC-1"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + .profiles = NULL_IF_CONFIG_SMALL(ff_vc1_profiles), + }, + { + .id = AV_CODEC_ID_WMV3, + .type = AVMEDIA_TYPE_VIDEO, + .name = "wmv3", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + .profiles = NULL_IF_CONFIG_SMALL(ff_vc1_profiles), + }, + { + .id = AV_CODEC_ID_LOCO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "loco", + .long_name = NULL_IF_CONFIG_SMALL("LOCO"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_WNV1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "wnv1", + .long_name = NULL_IF_CONFIG_SMALL("Winnov WNV1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_AASC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "aasc", + .long_name = NULL_IF_CONFIG_SMALL("Autodesk RLE"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_INDEO2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "indeo2", + .long_name = NULL_IF_CONFIG_SMALL("Intel Indeo 2"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FRAPS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "fraps", + .long_name = NULL_IF_CONFIG_SMALL("Fraps"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_TRUEMOTION2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "truemotion2", + .long_name = 
NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_BMP, + .type = AVMEDIA_TYPE_VIDEO, + .name = "bmp", + .long_name = NULL_IF_CONFIG_SMALL("BMP (Windows and OS/2 bitmap)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/x-ms-bmp"), + }, + { + .id = AV_CODEC_ID_CSCD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cscd", + .long_name = NULL_IF_CONFIG_SMALL("CamStudio"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MMVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mmvideo", + .long_name = NULL_IF_CONFIG_SMALL("American Laser Games MM Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ZMBV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "zmbv", + .long_name = NULL_IF_CONFIG_SMALL("Zip Motion Blocks Video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_AVS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "avs", + .long_name = NULL_IF_CONFIG_SMALL("AVS (Audio Video Standard) video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SMACKVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "smackvideo", + .long_name = NULL_IF_CONFIG_SMALL("Smacker video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_NUV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "nuv", + .long_name = NULL_IF_CONFIG_SMALL("NuppelVideo/RTJPEG"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_KMVC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "kmvc", + .long_name = NULL_IF_CONFIG_SMALL("Karl Morton's video codec"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FLASHSV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "flashsv", + .long_name = NULL_IF_CONFIG_SMALL("Flash Screen Video v1"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_CAVS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cavs", + .long_name = NULL_IF_CONFIG_SMALL("Chinese AVS (Audio Video Standard) (AVS1-P2, JiZhun profile)"), + .props = AV_CODEC_PROP_LOSSY | 
AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_JPEG2000, + .type = AVMEDIA_TYPE_VIDEO, + .name = "jpeg2000", + .long_name = NULL_IF_CONFIG_SMALL("JPEG 2000"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | + AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/jp2"), + .profiles = NULL_IF_CONFIG_SMALL(ff_jpeg2000_profiles), + }, + { + .id = AV_CODEC_ID_VMNC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vmnc", + .long_name = NULL_IF_CONFIG_SMALL("VMware Screen Codec / VMware Video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_VP5, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp5", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP5"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VP6, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp6", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP6"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VP6F, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp6f", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP6 (Flash version)"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TARGA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "targa", + .long_name = NULL_IF_CONFIG_SMALL("Truevision Targa image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/x-targa", "image/x-tga"), + }, + { + .id = AV_CODEC_ID_DSICINVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dsicinvideo", + .long_name = NULL_IF_CONFIG_SMALL("Delphine Software International CIN video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TIERTEXSEQVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tiertexseqvideo", + .long_name = NULL_IF_CONFIG_SMALL("Tiertex Limited SEQ video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TIFF, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tiff", + .long_name = NULL_IF_CONFIG_SMALL("TIFF image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/tiff"), + }, + { + .id = AV_CODEC_ID_GIF, + .type = 
AVMEDIA_TYPE_VIDEO, + .name = "gif", + .long_name = NULL_IF_CONFIG_SMALL("CompuServe GIF (Graphics Interchange Format)"), + .props = AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/gif"), + }, + { + .id = AV_CODEC_ID_DXA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dxa", + .long_name = NULL_IF_CONFIG_SMALL("Feeble Files/ScummVM DXA"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_DNXHD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dnxhd", + .long_name = NULL_IF_CONFIG_SMALL("VC3/DNxHD"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + .profiles = NULL_IF_CONFIG_SMALL(ff_dnxhd_profiles), + }, + { + .id = AV_CODEC_ID_THP, + .type = AVMEDIA_TYPE_VIDEO, + .name = "thp", + .long_name = NULL_IF_CONFIG_SMALL("Nintendo Gamecube THP video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SGI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "sgi", + .long_name = NULL_IF_CONFIG_SMALL("SGI image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_C93, + .type = AVMEDIA_TYPE_VIDEO, + .name = "c93", + .long_name = NULL_IF_CONFIG_SMALL("Interplay C93"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_BETHSOFTVID, + .type = AVMEDIA_TYPE_VIDEO, + .name = "bethsoftvid", + .long_name = NULL_IF_CONFIG_SMALL("Bethesda VID video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PTX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ptx", + .long_name = NULL_IF_CONFIG_SMALL("V.Flash PTX image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TXD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "txd", + .long_name = NULL_IF_CONFIG_SMALL("Renderware TXD (TeXture Dictionary) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VP6A, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp6a", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP6 (Flash version, with alpha channel)"), + .props = AV_CODEC_PROP_LOSSY, + }, 
+ { + .id = AV_CODEC_ID_AMV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "amv", + .long_name = NULL_IF_CONFIG_SMALL("AMV Video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VB, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vb", + .long_name = NULL_IF_CONFIG_SMALL("Beam Software VB"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PCX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pcx", + .long_name = NULL_IF_CONFIG_SMALL("PC Paintbrush PCX image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/x-pcx"), + }, + { + .id = AV_CODEC_ID_SUNRAST, + .type = AVMEDIA_TYPE_VIDEO, + .name = "sunrast", + .long_name = NULL_IF_CONFIG_SMALL("Sun Rasterfile image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_INDEO4, + .type = AVMEDIA_TYPE_VIDEO, + .name = "indeo4", + .long_name = NULL_IF_CONFIG_SMALL("Intel Indeo Video Interactive 4"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_INDEO5, + .type = AVMEDIA_TYPE_VIDEO, + .name = "indeo5", + .long_name = NULL_IF_CONFIG_SMALL("Intel Indeo Video Interactive 5"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MIMIC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mimic", + .long_name = NULL_IF_CONFIG_SMALL("Mimic"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_RL2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rl2", + .long_name = NULL_IF_CONFIG_SMALL("RL2 video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ESCAPE124, + .type = AVMEDIA_TYPE_VIDEO, + .name = "escape124", + .long_name = NULL_IF_CONFIG_SMALL("Escape 124"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DIRAC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dirac", + .long_name = NULL_IF_CONFIG_SMALL("Dirac"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS | AV_CODEC_PROP_REORDER, + }, + { + .id = AV_CODEC_ID_BFI, + .type = AVMEDIA_TYPE_VIDEO, + 
.name = "bfi", + .long_name = NULL_IF_CONFIG_SMALL("Brute Force & Ignorance"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CMV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cmv", + .long_name = NULL_IF_CONFIG_SMALL("Electronic Arts CMV video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MOTIONPIXELS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "motionpixels", + .long_name = NULL_IF_CONFIG_SMALL("Motion Pixels video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TGV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tgv", + .long_name = NULL_IF_CONFIG_SMALL("Electronic Arts TGV video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TGQ, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tgq", + .long_name = NULL_IF_CONFIG_SMALL("Electronic Arts TGQ video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TQI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tqi", + .long_name = NULL_IF_CONFIG_SMALL("Electronic Arts TQI video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_AURA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "aura", + .long_name = NULL_IF_CONFIG_SMALL("Auravision AURA"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_AURA2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "aura2", + .long_name = NULL_IF_CONFIG_SMALL("Auravision Aura 2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_V210X, + .type = AVMEDIA_TYPE_VIDEO, + .name = "v210x", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_TMV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tmv", + .long_name = NULL_IF_CONFIG_SMALL("8088flex TMV"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_V210, + .type = AVMEDIA_TYPE_VIDEO, + .name = "v210", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"), + .props = 
AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_DPX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dpx", + .long_name = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MAD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mad", + .long_name = NULL_IF_CONFIG_SMALL("Electronic Arts Madcow Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FRWU, + .type = AVMEDIA_TYPE_VIDEO, + .name = "frwu", + .long_name = NULL_IF_CONFIG_SMALL("Forward Uncompressed"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_FLASHSV2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "flashsv2", + .long_name = NULL_IF_CONFIG_SMALL("Flash Screen Video v2"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CDGRAPHICS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cdgraphics", + .long_name = NULL_IF_CONFIG_SMALL("CD Graphics video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_R210, + .type = AVMEDIA_TYPE_VIDEO, + .name = "r210", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed RGB 10-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ANM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "anm", + .long_name = NULL_IF_CONFIG_SMALL("Deluxe Paint Animation"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_BINKVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "binkvideo", + .long_name = NULL_IF_CONFIG_SMALL("Bink video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_IFF_ILBM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "iff_ilbm", + .long_name = NULL_IF_CONFIG_SMALL("IFF ACBM/ANIM/DEEP/ILBM/PBM/RGB8/RGBN"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_KGV1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "kgv1", + .long_name = NULL_IF_CONFIG_SMALL("Kega Game Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_YOP, + 
.type = AVMEDIA_TYPE_VIDEO, + .name = "yop", + .long_name = NULL_IF_CONFIG_SMALL("Psygnosis YOP Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VP8, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp8", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PICTOR, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pictor", + .long_name = NULL_IF_CONFIG_SMALL("Pictor/PC Paint"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ANSI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ansi", + .long_name = NULL_IF_CONFIG_SMALL("ASCII/ANSI art"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_A64_MULTI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "a64_multi", + .long_name = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_A64_MULTI5, + .type = AVMEDIA_TYPE_VIDEO, + .name = "a64_multi5", + .long_name = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64, extended with 5th color (colram)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_R10K, + .type = AVMEDIA_TYPE_VIDEO, + .name = "r10k", + .long_name = NULL_IF_CONFIG_SMALL("AJA Kona 10-bit RGB Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MXPEG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mxpeg", + .long_name = NULL_IF_CONFIG_SMALL("Mobotix MxPEG video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_LAGARITH, + .type = AVMEDIA_TYPE_VIDEO, + .name = "lagarith", + .long_name = NULL_IF_CONFIG_SMALL("Lagarith lossless"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PRORES, + .type = AVMEDIA_TYPE_VIDEO, + .name = "prores", + .long_name = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + .profiles = 
NULL_IF_CONFIG_SMALL(ff_prores_profiles), + }, + { + .id = AV_CODEC_ID_JV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "jv", + .long_name = NULL_IF_CONFIG_SMALL("Bitmap Brothers JV video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DFA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dfa", + .long_name = NULL_IF_CONFIG_SMALL("Chronomaster DFA"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WMV3IMAGE, + .type = AVMEDIA_TYPE_VIDEO, + .name = "wmv3image", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VC1IMAGE, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vc1image", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image v2"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_UTVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "utvideo", + .long_name = NULL_IF_CONFIG_SMALL("Ut Video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_BMV_VIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "bmv_video", + .long_name = NULL_IF_CONFIG_SMALL("Discworld II BMV video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_VBLE, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vble", + .long_name = NULL_IF_CONFIG_SMALL("VBLE Lossless Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_DXTORY, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dxtory", + .long_name = NULL_IF_CONFIG_SMALL("Dxtory"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_V410, + .type = AVMEDIA_TYPE_VIDEO, + .name = "v410", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:4:4 10-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_XWD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "xwd", + .long_name = NULL_IF_CONFIG_SMALL("XWD (X Window Dump) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | 
AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/x-xwindowdump"), + }, + { + .id = AV_CODEC_ID_CDXL, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cdxl", + .long_name = NULL_IF_CONFIG_SMALL("Commodore CDXL video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_XBM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "xbm", + .long_name = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/x-xbitmap"), + }, + { + .id = AV_CODEC_ID_ZEROCODEC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "zerocodec", + .long_name = NULL_IF_CONFIG_SMALL("ZeroCodec Lossless Video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MSS1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mss1", + .long_name = NULL_IF_CONFIG_SMALL("MS Screen 1"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MSA1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "msa1", + .long_name = NULL_IF_CONFIG_SMALL("MS ATC Screen"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TSCC2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tscc2", + .long_name = NULL_IF_CONFIG_SMALL("TechSmith Screen Codec 2"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MTS2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mts2", + .long_name = NULL_IF_CONFIG_SMALL("MS Expression Encoder Screen"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CLLC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cllc", + .long_name = NULL_IF_CONFIG_SMALL("Canopus Lossless Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MSS2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mss2", + .long_name = NULL_IF_CONFIG_SMALL("MS Windows Media Video V9 Screen"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VP9, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp9", + .long_name = NULL_IF_CONFIG_SMALL("Google VP9"), + .props = AV_CODEC_PROP_LOSSY, + 
.profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles), + }, + { + .id = AV_CODEC_ID_AIC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "aic", + .long_name = NULL_IF_CONFIG_SMALL("Apple Intermediate Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ESCAPE130, + .type = AVMEDIA_TYPE_VIDEO, + .name = "escape130", + .long_name = NULL_IF_CONFIG_SMALL("Escape 130"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_G2M, + .type = AVMEDIA_TYPE_VIDEO, + .name = "g2m", + .long_name = NULL_IF_CONFIG_SMALL("Go2Meeting"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WEBP, + .type = AVMEDIA_TYPE_VIDEO, + .name = "webp", + .long_name = NULL_IF_CONFIG_SMALL("WebP"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | + AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/webp"), + }, + { + .id = AV_CODEC_ID_HNM4_VIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "hnm4video", + .long_name = NULL_IF_CONFIG_SMALL("HNM 4 video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HEVC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "hevc", + .long_name = NULL_IF_CONFIG_SMALL("H.265 / HEVC (High Efficiency Video Coding)"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), + }, + { + .id = AV_CODEC_ID_FIC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "fic", + .long_name = NULL_IF_CONFIG_SMALL("Mirillis FIC"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ALIAS_PIX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "alias_pix", + .long_name = NULL_IF_CONFIG_SMALL("Alias/Wavefront PIX image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_BRENDER_PIX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "brender_pix", + .long_name = NULL_IF_CONFIG_SMALL("BRender PIX image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PAF_VIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "paf_video", + 
.long_name = NULL_IF_CONFIG_SMALL("Amazing Studio Packed Animation File Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_EXR, + .type = AVMEDIA_TYPE_VIDEO, + .name = "exr", + .long_name = NULL_IF_CONFIG_SMALL("OpenEXR image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | + AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_VP7, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp7", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP7"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SANM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "sanm", + .long_name = NULL_IF_CONFIG_SMALL("LucasArts SANM/SMUSH video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SGIRLE, + .type = AVMEDIA_TYPE_VIDEO, + .name = "sgirle", + .long_name = NULL_IF_CONFIG_SMALL("SGI RLE 8-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MVC1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mvc1", + .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MVC2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mvc2", + .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HQX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "hqx", + .long_name = NULL_IF_CONFIG_SMALL("Canopus HQX"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TDSC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "tdsc", + .long_name = NULL_IF_CONFIG_SMALL("TDSC"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HQ_HQA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "hq_hqa", + .long_name = NULL_IF_CONFIG_SMALL("Canopus HQ/HQA"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HAP, + .type = AVMEDIA_TYPE_VIDEO, + .name = "hap", + .long_name = 
NULL_IF_CONFIG_SMALL("Vidvox Hap"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DDS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dds", + .long_name = NULL_IF_CONFIG_SMALL("DirectDraw Surface image decoder"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | + AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_DXV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "dxv", + .long_name = NULL_IF_CONFIG_SMALL("Resolume DXV"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SCREENPRESSO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "screenpresso", + .long_name = NULL_IF_CONFIG_SMALL("Screenpresso"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_RSCC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rscc", + .long_name = NULL_IF_CONFIG_SMALL("innoHeim/Rsupport Screen Capture Codec"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_AVS2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "avs2", + .long_name = NULL_IF_CONFIG_SMALL("AVS2-P2/IEEE1857.4"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PGX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pgx", + .long_name = NULL_IF_CONFIG_SMALL("PGX (JPEG2000 Test Format)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_AVS3, + .type = AVMEDIA_TYPE_VIDEO, + .name = "avs3", + .long_name = NULL_IF_CONFIG_SMALL("AVS3-P2/IEEE1857.10"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MSP2, + .type = AVMEDIA_TYPE_VIDEO, + .name = "msp2", + .long_name = NULL_IF_CONFIG_SMALL("Microsoft Paint (MSP) version 2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_VVC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vvc", + .long_name = NULL_IF_CONFIG_SMALL("H.266 / VVC (Versatile Video Coding)"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER, + .profiles = NULL_IF_CONFIG_SMALL(ff_vvc_profiles), + }, + { + .id = AV_CODEC_ID_Y41P, + .type = 
AVMEDIA_TYPE_VIDEO, + .name = "y41p", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_AVRP, + .type = AVMEDIA_TYPE_VIDEO, + .name = "avrp", + .long_name = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_012V, + .type = AVMEDIA_TYPE_VIDEO, + .name = "012v", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_AVUI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "avui", + .long_name = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, +#if FF_API_AYUV_CODECID + { + .id = AV_CODEC_ID_AYUV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ayuv", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, +#endif + { + .id = AV_CODEC_ID_TARGA_Y216, + .type = AVMEDIA_TYPE_VIDEO, + .name = "targa_y216", + .long_name = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_V308, + .type = AVMEDIA_TYPE_VIDEO, + .name = "v308", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_V408, + .type = AVMEDIA_TYPE_VIDEO, + .name = "v408", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_YUV4, + .type = AVMEDIA_TYPE_VIDEO, + .name = "yuv4", + .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_AVRN, + .type = AVMEDIA_TYPE_VIDEO, + .name = 
"avrn", + .long_name = NULL_IF_CONFIG_SMALL("Avid AVI Codec"), + }, + { + .id = AV_CODEC_ID_CPIA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cpia", + .long_name = NULL_IF_CONFIG_SMALL("CPiA video format"), + }, + { + .id = AV_CODEC_ID_XFACE, + .type = AVMEDIA_TYPE_VIDEO, + .name = "xface", + .long_name = NULL_IF_CONFIG_SMALL("X-face image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SNOW, + .type = AVMEDIA_TYPE_VIDEO, + .name = "snow", + .long_name = NULL_IF_CONFIG_SMALL("Snow"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SMVJPEG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "smvjpeg", + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + .long_name = NULL_IF_CONFIG_SMALL("Sigmatel Motion Video"), + }, + { + .id = AV_CODEC_ID_APNG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "apng", + .long_name = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"), + .props = AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/png"), + }, + { + .id = AV_CODEC_ID_DAALA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "daala", + .long_name = NULL_IF_CONFIG_SMALL("Daala"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_CFHD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cfhd", + .long_name = NULL_IF_CONFIG_SMALL("GoPro CineForm HD"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TRUEMOTION2RT, + .type = AVMEDIA_TYPE_VIDEO, + .name = "truemotion2rt", + .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_M101, + .type = AVMEDIA_TYPE_VIDEO, + .name = "m101", + .long_name = NULL_IF_CONFIG_SMALL("Matrox Uncompressed SD"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MAGICYUV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "magicyuv", + .long_name = NULL_IF_CONFIG_SMALL("MagicYUV video"), + .props = 
AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SHEERVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "sheervideo", + .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_YLC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ylc", + .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PSD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "psd", + .long_name = NULL_IF_CONFIG_SMALL("Photoshop PSD file"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PIXLET, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pixlet", + .long_name = NULL_IF_CONFIG_SMALL("Apple Pixlet"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SPEEDHQ, + .type = AVMEDIA_TYPE_VIDEO, + .name = "speedhq", + .long_name = NULL_IF_CONFIG_SMALL("NewTek SpeedHQ"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FMVC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "fmvc", + .long_name = NULL_IF_CONFIG_SMALL("FM Screen Capture Codec"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SCPR, + .type = AVMEDIA_TYPE_VIDEO, + .name = "scpr", + .long_name = NULL_IF_CONFIG_SMALL("ScreenPressor"), + .props = AV_CODEC_PROP_LOSSLESS | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CLEARVIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "clearvideo", + .long_name = NULL_IF_CONFIG_SMALL("Iterated Systems ClearVideo"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_XPM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "xpm", + .long_name = NULL_IF_CONFIG_SMALL("XPM (X PixMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/x-xpixmap"), + }, + { + .id = AV_CODEC_ID_AV1, + .type = AVMEDIA_TYPE_VIDEO, + .name = "av1", + .long_name = 
NULL_IF_CONFIG_SMALL("Alliance for Open Media AV1"), + .props = AV_CODEC_PROP_LOSSY, + .profiles = NULL_IF_CONFIG_SMALL(ff_av1_profiles), + }, + { + .id = AV_CODEC_ID_BITPACKED, + .type = AVMEDIA_TYPE_VIDEO, + .name = "bitpacked", + .long_name = NULL_IF_CONFIG_SMALL("Bitpacked"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MSCC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mscc", + .long_name = NULL_IF_CONFIG_SMALL("Mandsoft Screen Capture Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SRGC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "srgc", + .long_name = NULL_IF_CONFIG_SMALL("Screen Recorder Gold Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SVG, + .type = AVMEDIA_TYPE_VIDEO, + .name = "svg", + .long_name = NULL_IF_CONFIG_SMALL("Scalable Vector Graphics"), + .props = AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/svg+xml"), + }, + { + .id = AV_CODEC_ID_GDV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "gdv", + .long_name = NULL_IF_CONFIG_SMALL("Gremlin Digital Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FITS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "fits", + .long_name = NULL_IF_CONFIG_SMALL("FITS (Flexible Image Transport System)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_IMM4, + .type = AVMEDIA_TYPE_VIDEO, + .name = "imm4", + .long_name = NULL_IF_CONFIG_SMALL("Infinity IMM4"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PROSUMER, + .type = AVMEDIA_TYPE_VIDEO, + .name = "prosumer", + .long_name = NULL_IF_CONFIG_SMALL("Brooktree ProSumer Video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MWSC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mwsc", + .long_name = NULL_IF_CONFIG_SMALL("MatchWare Screen Capture Codec"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_WCMV, + 
.type = AVMEDIA_TYPE_VIDEO, + .name = "wcmv", + .long_name = NULL_IF_CONFIG_SMALL("WinCAM Motion Video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_RASC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "rasc", + .long_name = NULL_IF_CONFIG_SMALL("RemotelyAnywhere Screen Capture"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HYMT, + .type = AVMEDIA_TYPE_VIDEO, + .name = "hymt", + .long_name = NULL_IF_CONFIG_SMALL("HuffYUV MT"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ARBC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "arbc", + .long_name = NULL_IF_CONFIG_SMALL("Gryphon's Anim Compressor"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_AGM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "agm", + .long_name = NULL_IF_CONFIG_SMALL("Amuse Graphics Movie"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_LSCR, + .type = AVMEDIA_TYPE_VIDEO, + .name = "lscr", + .long_name = NULL_IF_CONFIG_SMALL("LEAD Screen Capture"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VP4, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vp4", + .long_name = NULL_IF_CONFIG_SMALL("On2 VP4"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_IMM5, + .type = AVMEDIA_TYPE_VIDEO, + .name = "imm5", + .long_name = NULL_IF_CONFIG_SMALL("Infinity IMM5"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MVDV, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mvdv", + .long_name = NULL_IF_CONFIG_SMALL("MidiVid VQ"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MVHA, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mvha", + .long_name = NULL_IF_CONFIG_SMALL("MidiVid Archive Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CDTOONS, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cdtoons", + .long_name = NULL_IF_CONFIG_SMALL("CDToons video"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MV30, + .type = AVMEDIA_TYPE_VIDEO, + 
.name = "mv30", + .long_name = NULL_IF_CONFIG_SMALL("MidiVid 3.0"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_NOTCHLC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "notchlc", + .long_name = NULL_IF_CONFIG_SMALL("NotchLC"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PFM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "pfm", + .long_name = NULL_IF_CONFIG_SMALL("PFM (Portable FloatMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MOBICLIP, + .type = AVMEDIA_TYPE_VIDEO, + .name = "mobiclip", + .long_name = NULL_IF_CONFIG_SMALL("MobiClip Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PHOTOCD, + .type = AVMEDIA_TYPE_VIDEO, + .name = "photocd", + .long_name = NULL_IF_CONFIG_SMALL("Kodak Photo CD"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_IPU, + .type = AVMEDIA_TYPE_VIDEO, + .name = "ipu", + .long_name = NULL_IF_CONFIG_SMALL("IPU Video"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ARGO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "argo", + .long_name = NULL_IF_CONFIG_SMALL("Argonaut Games Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CRI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "cri", + .long_name = NULL_IF_CONFIG_SMALL("Cintel RAW"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SIMBIOSIS_IMX, + .type = AVMEDIA_TYPE_VIDEO, + .name = "simbiosis_imx", + .long_name = NULL_IF_CONFIG_SMALL("Simbiosis Interactive IMX Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SGA_VIDEO, + .type = AVMEDIA_TYPE_VIDEO, + .name = "sga", + .long_name = NULL_IF_CONFIG_SMALL("Digital Pictures SGA Video"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_GEM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "gem", + .long_name = NULL_IF_CONFIG_SMALL("GEM Raster image"), + .props = 
AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VBN, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vbn", + .long_name = NULL_IF_CONFIG_SMALL("Vizrt Binary Image"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_JPEGXL, + .type = AVMEDIA_TYPE_VIDEO, + .name = "jpegxl", + .long_name = NULL_IF_CONFIG_SMALL("JPEG XL"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | + AV_CODEC_PROP_LOSSLESS, + .mime_types= MT("image/jxl"), + }, + { + .id = AV_CODEC_ID_QOI, + .type = AVMEDIA_TYPE_VIDEO, + .name = "qoi", + .long_name = NULL_IF_CONFIG_SMALL("QOI (Quite OK Image)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PHM, + .type = AVMEDIA_TYPE_VIDEO, + .name = "phm", + .long_name = NULL_IF_CONFIG_SMALL("PHM (Portable HalfFloatMap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_RADIANCE_HDR, + .type = AVMEDIA_TYPE_VIDEO, + .name = "hdr", + .long_name = NULL_IF_CONFIG_SMALL("HDR (Radiance RGBE format) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WBMP, + .type = AVMEDIA_TYPE_VIDEO, + .name = "wbmp", + .long_name = NULL_IF_CONFIG_SMALL("WBMP (Wireless Application Protocol Bitmap) image"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MEDIA100, + .type = AVMEDIA_TYPE_VIDEO, + .name = "media100", + .long_name = NULL_IF_CONFIG_SMALL("Media 100i"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VQC, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vqc", + .long_name = NULL_IF_CONFIG_SMALL("ViewQuest VQC"), + .props = AV_CODEC_PROP_LOSSY, + }, + + /* various PCM "codecs" */ + { + .id = AV_CODEC_ID_PCM_S16LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s16le", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S16BE, + .type = 
AVMEDIA_TYPE_AUDIO, + .name = "pcm_s16be", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_U16LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_u16le", + .long_name = NULL_IF_CONFIG_SMALL("PCM unsigned 16-bit little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_U16BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_u16be", + .long_name = NULL_IF_CONFIG_SMALL("PCM unsigned 16-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S8, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s8", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 8-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_U8, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_u8", + .long_name = NULL_IF_CONFIG_SMALL("PCM unsigned 8-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_MULAW, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_mulaw", + .long_name = NULL_IF_CONFIG_SMALL("PCM mu-law / G.711 mu-law"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PCM_ALAW, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_alaw", + .long_name = NULL_IF_CONFIG_SMALL("PCM A-law / G.711 A-law"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PCM_S32LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s32le", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 32-bit little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S32BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s32be", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 32-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_U32LE, + .type = AVMEDIA_TYPE_AUDIO, + 
.name = "pcm_u32le", + .long_name = NULL_IF_CONFIG_SMALL("PCM unsigned 32-bit little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_U32BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_u32be", + .long_name = NULL_IF_CONFIG_SMALL("PCM unsigned 32-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S24LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s24le", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 24-bit little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S24BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s24be", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 24-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_U24LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_u24le", + .long_name = NULL_IF_CONFIG_SMALL("PCM unsigned 24-bit little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_U24BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_u24be", + .long_name = NULL_IF_CONFIG_SMALL("PCM unsigned 24-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S24DAUD, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s24daud", + .long_name = NULL_IF_CONFIG_SMALL("PCM D-Cinema audio signed 24-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S16LE_PLANAR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s16le_planar", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit little-endian planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_DVD, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_dvd", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 20|24-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + 
{ + .id = AV_CODEC_ID_PCM_F32BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_f32be", + .long_name = NULL_IF_CONFIG_SMALL("PCM 32-bit floating point big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_F32LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_f32le", + .long_name = NULL_IF_CONFIG_SMALL("PCM 32-bit floating point little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_F64BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_f64be", + .long_name = NULL_IF_CONFIG_SMALL("PCM 64-bit floating point big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_F64LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_f64le", + .long_name = NULL_IF_CONFIG_SMALL("PCM 64-bit floating point little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_BLURAY, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_bluray", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16|20|24-bit big-endian for Blu-ray media"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_LXF, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_lxf", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 20-bit little-endian planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_S302M, + .type = AVMEDIA_TYPE_AUDIO, + .name = "s302m", + .long_name = NULL_IF_CONFIG_SMALL("SMPTE 302M"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S8_PLANAR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s8_planar", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 8-bit planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S24LE_PLANAR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s24le_planar", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 24-bit 
little-endian planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S32LE_PLANAR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s32le_planar", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 32-bit little-endian planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S16BE_PLANAR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s16be_planar", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit big-endian planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S64LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s64le", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_S64BE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_s64be", + .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit big-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_F16LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_f16le", + .long_name = NULL_IF_CONFIG_SMALL("PCM 16.8 floating point little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_F24LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_f24le", + .long_name = NULL_IF_CONFIG_SMALL("PCM 24.0 floating point little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_PCM_VIDC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_vidc", + .long_name = NULL_IF_CONFIG_SMALL("PCM Archimedes VIDC"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_PCM_SGA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "pcm_sga", + .long_name = NULL_IF_CONFIG_SMALL("PCM SGA"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + + /* various ADPCM codecs */ + { + .id = AV_CODEC_ID_ADPCM_IMA_QT, + .type = 
AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_qt", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA QuickTime"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_WAV, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_wav", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA WAV"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_DK3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_dk3", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Duck DK3"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_DK4, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_dk4", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Duck DK4"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_WS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_ws", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Westwood"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_SMJPEG, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_smjpeg", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Loki SDL MJPEG"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_MS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ms", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Microsoft"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_4XM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_4xm", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM 4X Movie"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_XA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_xa", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM CDROM XA"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_ADX, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_adx", + .long_name = 
NULL_IF_CONFIG_SMALL("SEGA CRI ADX ADPCM"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_EA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ea", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Electronic Arts"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_G726, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_g726", + .long_name = NULL_IF_CONFIG_SMALL("G.726 ADPCM"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_CT, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ct", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Creative Technology"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_SWF, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_swf", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Shockwave Flash"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_YAMAHA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_yamaha", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Yamaha"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_SBPRO_4, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_sbpro_4", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Sound Blaster Pro 4-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_SBPRO_3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_sbpro_3", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Sound Blaster Pro 2.6-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_SBPRO_2, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_sbpro_2", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Sound Blaster Pro 2-bit"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_THP, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_thp", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo 
THP"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_AMV, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_amv", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA AMV"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_EA_R1, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ea_r1", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Electronic Arts R1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_EA_R3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ea_r3", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Electronic Arts R3"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_EA_R2, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ea_r2", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Electronic Arts R2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_EA_SEAD, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_ea_sead", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Electronic Arts SEAD"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_EA_EACS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_ea_eacs", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Electronic Arts EACS"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_EA_XAS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ea_xas", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Electronic Arts XAS"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_EA_MAXIS_XA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ea_maxis_xa", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Electronic Arts Maxis CDROM XA"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_ISS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_iss", + 
.long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Funcom ISS"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_G722, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_g722", + .long_name = NULL_IF_CONFIG_SMALL("G.722 ADPCM"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_APC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_apc", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA CRYO APC"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_VIMA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_vima", + .long_name = NULL_IF_CONFIG_SMALL("LucasArts VIMA audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_AFC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_afc", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube AFC"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_OKI, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_oki", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Dialogic OKI"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_DTK, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_dtk", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo Gamecube DTK"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_RAD, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_rad", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Radical"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_G726LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_g726le", + .long_name = NULL_IF_CONFIG_SMALL("G.726 ADPCM little-endian"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_THP_LE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_thp_le", + .long_name = 
NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP (Little-Endian)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_PSX, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_psx", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Playstation"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_AICA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_aica", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Yamaha AICA"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_DAT4, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_dat4", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Eurocom DAT4"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_MTAF, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_mtaf", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM MTAF"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_AGM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_agm", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM AmuseGraphics Movie AGM"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_ARGO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_argo", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Argonaut Games"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_SSI, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_ssi", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Simon & Schuster Interactive"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_ZORK, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_zork", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Zork"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_APM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_apm", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA 
Ubisoft APM"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_ALP, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_alp", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA High Voltage Software ALP"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_MTF, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_mtf", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Capcom's MT Framework"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_CUNNING, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_cunning", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Cunning Developments"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_MOFLEX, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_moflex", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA MobiClip MOFLEX"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_IMA_ACORN, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_ima_acorn", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Acorn Replay"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ADPCM_XMD, + .type = AVMEDIA_TYPE_AUDIO, + .name = "adpcm_xmd", + .long_name = NULL_IF_CONFIG_SMALL("ADPCM Konami XMD"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + + /* AMR */ + { + .id = AV_CODEC_ID_AMR_NB, + .type = AVMEDIA_TYPE_AUDIO, + .name = "amr_nb", + .long_name = NULL_IF_CONFIG_SMALL("AMR-NB (Adaptive Multi-Rate NarrowBand)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_AMR_WB, + .type = AVMEDIA_TYPE_AUDIO, + .name = "amr_wb", + .long_name = NULL_IF_CONFIG_SMALL("AMR-WB (Adaptive Multi-Rate WideBand)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + + /* RealAudio codecs*/ + { + .id = AV_CODEC_ID_RA_144, + .type = 
AVMEDIA_TYPE_AUDIO, + .name = "ra_144", + .long_name = NULL_IF_CONFIG_SMALL("RealAudio 1.0 (14.4K)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_RA_288, + .type = AVMEDIA_TYPE_AUDIO, + .name = "ra_288", + .long_name = NULL_IF_CONFIG_SMALL("RealAudio 2.0 (28.8K)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + + /* various DPCM codecs */ + { + .id = AV_CODEC_ID_ROQ_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "roq_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM id RoQ"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_INTERPLAY_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "interplay_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Interplay"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_XAN_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "xan_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Xan"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SOL_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "sol_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Sol"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SDX2_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "sdx2_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Squareroot-Delta-Exact"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_GREMLIN_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "gremlin_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Gremlin"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DERF_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "derf_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Xilam DERF"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WADY_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wady_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Marble 
WADY"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CBD2_DPCM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "cbd2_dpcm", + .long_name = NULL_IF_CONFIG_SMALL("DPCM Cuberoot-Delta-Exact"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + + /* audio codecs */ + { + .id = AV_CODEC_ID_MP2, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mp2", + .long_name = NULL_IF_CONFIG_SMALL("MP2 (MPEG audio layer 2)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MP3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mp3", + .long_name = NULL_IF_CONFIG_SMALL("MP3 (MPEG audio layer 3)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_AAC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "aac", + .long_name = NULL_IF_CONFIG_SMALL("AAC (Advanced Audio Coding)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + .profiles = NULL_IF_CONFIG_SMALL(ff_aac_profiles), + }, + { + .id = AV_CODEC_ID_AC3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "ac3", + .long_name = NULL_IF_CONFIG_SMALL("ATSC A/52A (AC-3)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DTS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dts", + .long_name = NULL_IF_CONFIG_SMALL("DCA (DTS Coherent Acoustics)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS, + .profiles = NULL_IF_CONFIG_SMALL(ff_dca_profiles), + }, + { + .id = AV_CODEC_ID_VORBIS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "vorbis", + .long_name = NULL_IF_CONFIG_SMALL("Vorbis"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DVAUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dvaudio", + .long_name = NULL_IF_CONFIG_SMALL("DV audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WMAV1, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wmav1", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio 
1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WMAV2, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wmav2", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio 2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MACE3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mace3", + .long_name = NULL_IF_CONFIG_SMALL("MACE (Macintosh Audio Compression/Expansion) 3:1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MACE6, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mace6", + .long_name = NULL_IF_CONFIG_SMALL("MACE (Macintosh Audio Compression/Expansion) 6:1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_VMDAUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "vmdaudio", + .long_name = NULL_IF_CONFIG_SMALL("Sierra VMD audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FLAC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "flac", + .long_name = NULL_IF_CONFIG_SMALL("FLAC (Free Lossless Audio Codec)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MP3ADU, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mp3adu", + .long_name = NULL_IF_CONFIG_SMALL("ADU (Application Data Unit) MP3 (MPEG audio layer 3)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MP3ON4, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mp3on4", + .long_name = NULL_IF_CONFIG_SMALL("MP3onMP4"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SHORTEN, + .type = AVMEDIA_TYPE_AUDIO, + .name = "shorten", + .long_name = NULL_IF_CONFIG_SMALL("Shorten"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ALAC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "alac", + .long_name = NULL_IF_CONFIG_SMALL("ALAC (Apple Lossless Audio Codec)"), + .props = AV_CODEC_PROP_INTRA_ONLY | 
AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_WESTWOOD_SND1, + .type = AVMEDIA_TYPE_AUDIO, + .name = "westwood_snd1", + .long_name = NULL_IF_CONFIG_SMALL("Westwood Audio (SND1)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_GSM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "gsm", + .long_name = NULL_IF_CONFIG_SMALL("GSM"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_QDM2, + .type = AVMEDIA_TYPE_AUDIO, + .name = "qdm2", + .long_name = NULL_IF_CONFIG_SMALL("QDesign Music Codec 2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_COOK, + .type = AVMEDIA_TYPE_AUDIO, + .name = "cook", + .long_name = NULL_IF_CONFIG_SMALL("Cook / Cooker / Gecko (RealAudio G2)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TRUESPEECH, + .type = AVMEDIA_TYPE_AUDIO, + .name = "truespeech", + .long_name = NULL_IF_CONFIG_SMALL("DSP Group TrueSpeech"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TTA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "tta", + .long_name = NULL_IF_CONFIG_SMALL("TTA (True Audio)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_SMACKAUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "smackaudio", + .long_name = NULL_IF_CONFIG_SMALL("Smacker audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_QCELP, + .type = AVMEDIA_TYPE_AUDIO, + .name = "qcelp", + .long_name = NULL_IF_CONFIG_SMALL("QCELP / PureVoice"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WAVPACK, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wavpack", + .long_name = NULL_IF_CONFIG_SMALL("WavPack"), + .props = AV_CODEC_PROP_INTRA_ONLY | + AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_DSICINAUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dsicinaudio", + 
.long_name = NULL_IF_CONFIG_SMALL("Delphine Software International CIN audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_IMC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "imc", + .long_name = NULL_IF_CONFIG_SMALL("IMC (Intel Music Coder)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MUSEPACK7, + .type = AVMEDIA_TYPE_AUDIO, + .name = "musepack7", + .long_name = NULL_IF_CONFIG_SMALL("Musepack SV7"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MLP, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mlp", + .long_name = NULL_IF_CONFIG_SMALL("MLP (Meridian Lossless Packing)"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_GSM_MS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "gsm_ms", + .long_name = NULL_IF_CONFIG_SMALL("GSM Microsoft variant"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ATRAC3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "atrac3", + .long_name = NULL_IF_CONFIG_SMALL("ATRAC3 (Adaptive TRansform Acoustic Coding 3)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_APE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "ape", + .long_name = NULL_IF_CONFIG_SMALL("Monkey's Audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_NELLYMOSER, + .type = AVMEDIA_TYPE_AUDIO, + .name = "nellymoser", + .long_name = NULL_IF_CONFIG_SMALL("Nellymoser Asao"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MUSEPACK8, + .type = AVMEDIA_TYPE_AUDIO, + .name = "musepack8", + .long_name = NULL_IF_CONFIG_SMALL("Musepack SV8"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SPEEX, + .type = AVMEDIA_TYPE_AUDIO, + .name = "speex", + .long_name = NULL_IF_CONFIG_SMALL("Speex"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = 
AV_CODEC_ID_WMAVOICE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wmavoice", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WMAPRO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wmapro", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio 9 Professional"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WMALOSSLESS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wmalossless", + .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Lossless"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ATRAC3P, + .type = AVMEDIA_TYPE_AUDIO, + .name = "atrac3p", + .long_name = NULL_IF_CONFIG_SMALL("ATRAC3+ (Adaptive TRansform Acoustic Coding 3+)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_EAC3, + .type = AVMEDIA_TYPE_AUDIO, + .name = "eac3", + .long_name = NULL_IF_CONFIG_SMALL("ATSC A/52B (AC-3, E-AC-3)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SIPR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "sipr", + .long_name = NULL_IF_CONFIG_SMALL("RealAudio SIPR / ACELP.NET"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MP1, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mp1", + .long_name = NULL_IF_CONFIG_SMALL("MP1 (MPEG audio layer 1)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TWINVQ, + .type = AVMEDIA_TYPE_AUDIO, + .name = "twinvq", + .long_name = NULL_IF_CONFIG_SMALL("VQF TwinVQ"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TRUEHD, + .type = AVMEDIA_TYPE_AUDIO, + .name = "truehd", + .long_name = NULL_IF_CONFIG_SMALL("TrueHD"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MP4ALS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mp4als", + .long_name = 
NULL_IF_CONFIG_SMALL("MPEG-4 Audio Lossless Coding (ALS)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ATRAC1, + .type = AVMEDIA_TYPE_AUDIO, + .name = "atrac1", + .long_name = NULL_IF_CONFIG_SMALL("ATRAC1 (Adaptive TRansform Acoustic Coding)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_BINKAUDIO_RDFT, + .type = AVMEDIA_TYPE_AUDIO, + .name = "binkaudio_rdft", + .long_name = NULL_IF_CONFIG_SMALL("Bink Audio (RDFT)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_BINKAUDIO_DCT, + .type = AVMEDIA_TYPE_AUDIO, + .name = "binkaudio_dct", + .long_name = NULL_IF_CONFIG_SMALL("Bink Audio (DCT)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_AAC_LATM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "aac_latm", + .long_name = NULL_IF_CONFIG_SMALL("AAC LATM (Advanced Audio Coding LATM syntax)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + .profiles = NULL_IF_CONFIG_SMALL(ff_aac_profiles), + }, + { + .id = AV_CODEC_ID_QDMC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "qdmc", + .long_name = NULL_IF_CONFIG_SMALL("QDesign Music"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CELT, + .type = AVMEDIA_TYPE_AUDIO, + .name = "celt", + .long_name = NULL_IF_CONFIG_SMALL("Constrained Energy Lapped Transform (CELT)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_G723_1, + .type = AVMEDIA_TYPE_AUDIO, + .name = "g723_1", + .long_name = NULL_IF_CONFIG_SMALL("G.723.1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_G729, + .type = AVMEDIA_TYPE_AUDIO, + .name = "g729", + .long_name = NULL_IF_CONFIG_SMALL("G.729"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_8SVX_EXP, + .type = AVMEDIA_TYPE_AUDIO, + .name = "8svx_exp", + .long_name = 
NULL_IF_CONFIG_SMALL("8SVX exponential"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_8SVX_FIB, + .type = AVMEDIA_TYPE_AUDIO, + .name = "8svx_fib", + .long_name = NULL_IF_CONFIG_SMALL("8SVX fibonacci"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_BMV_AUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "bmv_audio", + .long_name = NULL_IF_CONFIG_SMALL("Discworld II BMV audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_RALF, + .type = AVMEDIA_TYPE_AUDIO, + .name = "ralf", + .long_name = NULL_IF_CONFIG_SMALL("RealAudio Lossless"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_IAC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "iac", + .long_name = NULL_IF_CONFIG_SMALL("IAC (Indeo Audio Coder)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ILBC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "ilbc", + .long_name = NULL_IF_CONFIG_SMALL("iLBC (Internet Low Bitrate Codec)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_OPUS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "opus", + .long_name = NULL_IF_CONFIG_SMALL("Opus (Opus Interactive Audio Codec)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_COMFORT_NOISE, + .type = AVMEDIA_TYPE_AUDIO, + .name = "comfortnoise", + .long_name = NULL_IF_CONFIG_SMALL("RFC 3389 Comfort Noise"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_TAK, + .type = AVMEDIA_TYPE_AUDIO, + .name = "tak", + .long_name = NULL_IF_CONFIG_SMALL("TAK (Tom's lossless Audio Kompressor)"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_METASOUND, + .type = AVMEDIA_TYPE_AUDIO, + .name = "metasound", + .long_name = NULL_IF_CONFIG_SMALL("Voxware MetaSound"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + 
.id = AV_CODEC_ID_PAF_AUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "paf_audio", + .long_name = NULL_IF_CONFIG_SMALL("Amazing Studio Packed Animation File Audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ON2AVC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "avc", + .long_name = NULL_IF_CONFIG_SMALL("On2 Audio for Video Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DSS_SP, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dss_sp", + .long_name = NULL_IF_CONFIG_SMALL("Digital Speech Standard - Standard Play mode (DSS SP)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_CODEC2, + .type = AVMEDIA_TYPE_AUDIO, + .name = "codec2", + .long_name = NULL_IF_CONFIG_SMALL("codec2 (very low bitrate speech codec)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FFWAVESYNTH, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wavesynth", + .long_name = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"), + .props = AV_CODEC_PROP_INTRA_ONLY, + }, + { + .id = AV_CODEC_ID_SONIC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "sonic", + .long_name = NULL_IF_CONFIG_SMALL("Sonic"), + .props = AV_CODEC_PROP_INTRA_ONLY, + }, + { + .id = AV_CODEC_ID_SONIC_LS, + .type = AVMEDIA_TYPE_AUDIO, + .name = "sonicls", + .long_name = NULL_IF_CONFIG_SMALL("Sonic lossless"), + .props = AV_CODEC_PROP_INTRA_ONLY, + }, + { + .id = AV_CODEC_ID_EVRC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "evrc", + .long_name = NULL_IF_CONFIG_SMALL("EVRC (Enhanced Variable Rate Codec)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SMV, + .type = AVMEDIA_TYPE_AUDIO, + .name = "smv", + .long_name = NULL_IF_CONFIG_SMALL("SMV (Selectable Mode Vocoder)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DSD_LSBF, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dsd_lsbf", + .long_name = 
NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DSD_MSBF, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dsd_msbf", + .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DSD_LSBF_PLANAR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dsd_lsbf_planar", + .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), least significant bit first, planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DSD_MSBF_PLANAR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dsd_msbf_planar", + .long_name = NULL_IF_CONFIG_SMALL("DSD (Direct Stream Digital), most significant bit first, planar"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_4GV, + .type = AVMEDIA_TYPE_AUDIO, + .name = "4gv", + .long_name = NULL_IF_CONFIG_SMALL("4GV (Fourth Generation Vocoder)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_INTERPLAY_ACM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "interplayacm", + .long_name = NULL_IF_CONFIG_SMALL("Interplay ACM"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_XMA1, + .type = AVMEDIA_TYPE_AUDIO, + .name = "xma1", + .long_name = NULL_IF_CONFIG_SMALL("Xbox Media Audio 1"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_XMA2, + .type = AVMEDIA_TYPE_AUDIO, + .name = "xma2", + .long_name = NULL_IF_CONFIG_SMALL("Xbox Media Audio 2"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DST, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dst", + .long_name = NULL_IF_CONFIG_SMALL("DST (Direct Stream Transfer)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = 
AV_CODEC_ID_ATRAC3AL, + .type = AVMEDIA_TYPE_AUDIO, + .name = "atrac3al", + .long_name = NULL_IF_CONFIG_SMALL("ATRAC3 AL (Adaptive TRansform Acoustic Coding 3 Advanced Lossless)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_ATRAC3PAL, + .type = AVMEDIA_TYPE_AUDIO, + .name = "atrac3pal", + .long_name = NULL_IF_CONFIG_SMALL("ATRAC3+ AL (Adaptive TRansform Acoustic Coding 3+ Advanced Lossless)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_DOLBY_E, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dolby_e", + .long_name = NULL_IF_CONFIG_SMALL("Dolby E"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_APTX, + .type = AVMEDIA_TYPE_AUDIO, + .name = "aptx", + .long_name = NULL_IF_CONFIG_SMALL("aptX (Audio Processing Technology for Bluetooth)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_APTX_HD, + .type = AVMEDIA_TYPE_AUDIO, + .name = "aptx_hd", + .long_name = NULL_IF_CONFIG_SMALL("aptX HD (Audio Processing Technology for Bluetooth)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SBC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "sbc", + .long_name = NULL_IF_CONFIG_SMALL("SBC (low-complexity subband codec)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ATRAC9, + .type = AVMEDIA_TYPE_AUDIO, + .name = "atrac9", + .long_name = NULL_IF_CONFIG_SMALL("ATRAC9 (Adaptive TRansform Acoustic Coding 9)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HCOM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "hcom", + .long_name = NULL_IF_CONFIG_SMALL("HCOM Audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_ACELP_KELVIN, + .type = AVMEDIA_TYPE_AUDIO, + .name = "acelp.kelvin", + .long_name = NULL_IF_CONFIG_SMALL("Sipro ACELP.KELVIN"), + .props = 
AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MPEGH_3D_AUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "mpegh_3d_audio", + .long_name = NULL_IF_CONFIG_SMALL("MPEG-H 3D Audio"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_SIREN, + .type = AVMEDIA_TYPE_AUDIO, + .name = "siren", + .long_name = NULL_IF_CONFIG_SMALL("Siren"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_HCA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "hca", + .long_name = NULL_IF_CONFIG_SMALL("CRI HCA"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_FASTAUDIO, + .type = AVMEDIA_TYPE_AUDIO, + .name = "fastaudio", + .long_name = NULL_IF_CONFIG_SMALL("MobiClip FastAudio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_MSNSIREN, + .type = AVMEDIA_TYPE_AUDIO, + .name = "msnsiren", + .long_name = NULL_IF_CONFIG_SMALL("MSN Siren"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_DFPWM, + .type = AVMEDIA_TYPE_AUDIO, + .name = "dfpwm", + .long_name = NULL_IF_CONFIG_SMALL("DFPWM (Dynamic Filter Pulse Width Modulation)"), + .props = AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_BONK, + .type = AVMEDIA_TYPE_AUDIO, + .name = "bonk", + .long_name = NULL_IF_CONFIG_SMALL("Bonk audio"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_MISC4, + .type = AVMEDIA_TYPE_AUDIO, + .name = "misc4", + .long_name = NULL_IF_CONFIG_SMALL("Micronas SC-4 Audio"), + .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_INTRA_ONLY, + }, + { + .id = AV_CODEC_ID_APAC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "apac", + .long_name = NULL_IF_CONFIG_SMALL("Marian's A-pac audio"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_FTR, + .type = AVMEDIA_TYPE_AUDIO, + .name = "ftr", + .long_name = NULL_IF_CONFIG_SMALL("FTR Voice"), + .props 
= AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY, + }, + { + .id = AV_CODEC_ID_WAVARC, + .type = AVMEDIA_TYPE_AUDIO, + .name = "wavarc", + .long_name = NULL_IF_CONFIG_SMALL("Waveform Archiver"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_RKA, + .type = AVMEDIA_TYPE_AUDIO, + .name = "rka", + .long_name = NULL_IF_CONFIG_SMALL("RKA (RK Audio)"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS, + }, + + /* subtitle codecs */ + { + .id = AV_CODEC_ID_DVD_SUBTITLE, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "dvd_subtitle", + .long_name = NULL_IF_CONFIG_SMALL("DVD subtitles"), + .props = AV_CODEC_PROP_BITMAP_SUB, + }, + { + .id = AV_CODEC_ID_DVB_SUBTITLE, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "dvb_subtitle", + .long_name = NULL_IF_CONFIG_SMALL("DVB subtitles"), + .props = AV_CODEC_PROP_BITMAP_SUB, + }, + { + .id = AV_CODEC_ID_TEXT, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "text", + .long_name = NULL_IF_CONFIG_SMALL("raw UTF-8 text"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_XSUB, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "xsub", + .long_name = NULL_IF_CONFIG_SMALL("XSUB"), + .props = AV_CODEC_PROP_BITMAP_SUB, + }, + { + .id = AV_CODEC_ID_SSA, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "ssa", + .long_name = NULL_IF_CONFIG_SMALL("SSA (SubStation Alpha) subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_MOV_TEXT, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "mov_text", + .long_name = NULL_IF_CONFIG_SMALL("MOV text"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_HDMV_PGS_SUBTITLE, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "hdmv_pgs_subtitle", + .long_name = NULL_IF_CONFIG_SMALL("HDMV Presentation Graphic Stream subtitles"), + .props = AV_CODEC_PROP_BITMAP_SUB, + }, + { + .id = AV_CODEC_ID_DVB_TELETEXT, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "dvb_teletext", + .long_name = NULL_IF_CONFIG_SMALL("DVB teletext"), + }, + { + 
.id = AV_CODEC_ID_SRT, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "srt", + .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle with embedded timing"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_MICRODVD, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "microdvd", + .long_name = NULL_IF_CONFIG_SMALL("MicroDVD subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_EIA_608, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "eia_608", + .long_name = NULL_IF_CONFIG_SMALL("EIA-608 closed captions"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_JACOSUB, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "jacosub", + .long_name = NULL_IF_CONFIG_SMALL("JACOsub subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_SAMI, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "sami", + .long_name = NULL_IF_CONFIG_SMALL("SAMI subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_REALTEXT, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "realtext", + .long_name = NULL_IF_CONFIG_SMALL("RealText subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_STL, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "stl", + .long_name = NULL_IF_CONFIG_SMALL("Spruce subtitle format"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_SUBVIEWER1, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "subviewer1", + .long_name = NULL_IF_CONFIG_SMALL("SubViewer v1 subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_SUBVIEWER, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "subviewer", + .long_name = NULL_IF_CONFIG_SMALL("SubViewer subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_SUBRIP, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "subrip", + .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_WEBVTT, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "webvtt", + .long_name = NULL_IF_CONFIG_SMALL("WebVTT 
subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_MPL2, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "mpl2", + .long_name = NULL_IF_CONFIG_SMALL("MPL2 subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_VPLAYER, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "vplayer", + .long_name = NULL_IF_CONFIG_SMALL("VPlayer subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_PJS, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "pjs", + .long_name = NULL_IF_CONFIG_SMALL("PJS (Phoenix Japanimation Society) subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_ASS, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "ass", + .long_name = NULL_IF_CONFIG_SMALL("ASS (Advanced SSA) subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_HDMV_TEXT_SUBTITLE, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "hdmv_text_subtitle", + .long_name = NULL_IF_CONFIG_SMALL("HDMV Text subtitle"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_TTML, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "ttml", + .long_name = NULL_IF_CONFIG_SMALL("Timed Text Markup Language"), + .props = AV_CODEC_PROP_TEXT_SUB, + }, + { + .id = AV_CODEC_ID_ARIB_CAPTION, + .type = AVMEDIA_TYPE_SUBTITLE, + .name = "arib_caption", + .long_name = NULL_IF_CONFIG_SMALL("ARIB STD-B24 caption"), + .props = AV_CODEC_PROP_TEXT_SUB, + .profiles = NULL_IF_CONFIG_SMALL(ff_arib_caption_profiles), + }, + + /* other kind of codecs and pseudo-codecs */ + { + .id = AV_CODEC_ID_TTF, + .type = AVMEDIA_TYPE_DATA, + .name = "ttf", + .long_name = NULL_IF_CONFIG_SMALL("TrueType font"), + .mime_types= MT("application/x-truetype-font", "application/x-font"), + }, + { + .id = AV_CODEC_ID_SCTE_35, + .type = AVMEDIA_TYPE_DATA, + .name = "scte_35", + .long_name = NULL_IF_CONFIG_SMALL("SCTE 35 Message Queue"), + }, + { + .id = AV_CODEC_ID_EPG, + .type = AVMEDIA_TYPE_DATA, + .name = "epg", + .long_name = NULL_IF_CONFIG_SMALL("Electronic Program 
Guide"), + }, + { + .id = AV_CODEC_ID_BINTEXT, + .type = AVMEDIA_TYPE_VIDEO, + .name = "bintext", + .long_name = NULL_IF_CONFIG_SMALL("Binary text"), + .props = AV_CODEC_PROP_INTRA_ONLY, + }, + { + .id = AV_CODEC_ID_XBIN, + .type = AVMEDIA_TYPE_VIDEO, + .name = "xbin", + .long_name = NULL_IF_CONFIG_SMALL("eXtended BINary text"), + .props = AV_CODEC_PROP_INTRA_ONLY, + }, + { + .id = AV_CODEC_ID_IDF, + .type = AVMEDIA_TYPE_VIDEO, + .name = "idf", + .long_name = NULL_IF_CONFIG_SMALL("iCEDraw text"), + .props = AV_CODEC_PROP_INTRA_ONLY, + }, + { + .id = AV_CODEC_ID_OTF, + .type = AVMEDIA_TYPE_DATA, + .name = "otf", + .long_name = NULL_IF_CONFIG_SMALL("OpenType font"), + .mime_types= MT("application/vnd.ms-opentype"), + }, + { + .id = AV_CODEC_ID_SMPTE_KLV, + .type = AVMEDIA_TYPE_DATA, + .name = "klv", + .long_name = NULL_IF_CONFIG_SMALL("SMPTE 336M Key-Length-Value (KLV) metadata"), + }, + { + .id = AV_CODEC_ID_DVD_NAV, + .type = AVMEDIA_TYPE_DATA, + .name = "dvd_nav_packet", + .long_name = NULL_IF_CONFIG_SMALL("DVD Nav packet"), + }, + { + .id = AV_CODEC_ID_TIMED_ID3, + .type = AVMEDIA_TYPE_DATA, + .name = "timed_id3", + .long_name = NULL_IF_CONFIG_SMALL("timed ID3 metadata"), + }, + { + .id = AV_CODEC_ID_BIN_DATA, + .type = AVMEDIA_TYPE_DATA, + .name = "bin_data", + .long_name = NULL_IF_CONFIG_SMALL("binary data"), + .mime_types= MT("application/octet-stream"), + }, + { + .id = AV_CODEC_ID_MPEG2TS, + .type = AVMEDIA_TYPE_DATA, + .name = "mpegts", + .long_name = NULL_IF_CONFIG_SMALL("raw MPEG-TS stream"), + .mime_types= MT("application/MP2T"), + }, + { + .id = AV_CODEC_ID_WRAPPED_AVFRAME, + .type = AVMEDIA_TYPE_VIDEO, + .name = "wrapped_avframe", + .long_name = NULL_IF_CONFIG_SMALL("AVFrame to AVPacket passthrough"), + .props = AV_CODEC_PROP_LOSSLESS, + }, + { + .id = AV_CODEC_ID_VNULL, + .type = AVMEDIA_TYPE_VIDEO, + .name = "vnull", + .long_name = NULL_IF_CONFIG_SMALL("Null video codec"), + }, + { + .id = AV_CODEC_ID_ANULL, + .type = AVMEDIA_TYPE_AUDIO, + .name = 
"anull", + .long_name = NULL_IF_CONFIG_SMALL("Null audio codec"), + }, +}; +/** + * bsearch() comparison callback used to search codec_descriptors by ID. + * + * @param key pointer to the enum AVCodecID being searched for + * @param member pointer to the AVCodecDescriptor currently examined + * @return the searched ID minus the member's ID, i.e. a negative, zero or + * positive value when the key is lower than, equal to or higher + * than the member's ID + */ +static int descriptor_compare(const void *key, const void *member) +{ + enum AVCodecID id = *(const enum AVCodecID *) key; + const AVCodecDescriptor *desc = member; + + return id - desc->id; +} +/** + * Look up the descriptor for a codec ID with a binary search over + * codec_descriptors, using descriptor_compare() as the ordering. + * + * bsearch() requires the array to be sorted consistently with the + * comparator, so the table entries have to stay in ascending ID order. + * + * @param id codec ID to look up + * @return the matching table entry, or NULL when bsearch() finds none + */ +const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id) +{ + return bsearch(&id, codec_descriptors, FF_ARRAY_ELEMS(codec_descriptors), + sizeof(codec_descriptors[0]), descriptor_compare); +} +/** + * Step through codec_descriptors one entry at a time. + * + * @param prev entry returned by the previous call, or NULL to start + * @return the first table entry when prev is NULL, prev + 1 while the + * index of prev is below FF_ARRAY_ELEMS(codec_descriptors) - 1, + * and NULL otherwise + */ +const AVCodecDescriptor *avcodec_descriptor_next(const AVCodecDescriptor *prev) +{ + if (!prev) /* NULL starts the iteration at the first entry */ + return &codec_descriptors[0]; + if (prev - codec_descriptors < FF_ARRAY_ELEMS(codec_descriptors) - 1) /* an entry follows prev */ + return prev + 1; + return NULL; +} +/** + * Find a descriptor by its short name. + * + * Walks every entry through avcodec_descriptor_next() and compares the + * entry's name with strcmp(), so the match is exact and case sensitive and + * the cost grows linearly with the table size. + * + * @param name name to search for; it is handed to strcmp() unchecked, so it + * must not be NULL + * @return the first entry whose name equals the argument, or NULL if no + * entry matches + */ +const AVCodecDescriptor *avcodec_descriptor_get_by_name(const char *name) +{ + const AVCodecDescriptor *desc = NULL; + + while ((desc = avcodec_descriptor_next(desc))) + if (!strcmp(desc->name, name)) + return desc; + return NULL; +} +/** + * Report the media type recorded in the descriptor of a codec ID. + * + * @param codec_id codec ID to query + * @return the type member of the descriptor found by + * avcodec_descriptor_get(), or AVMEDIA_TYPE_UNKNOWN when there is + * no descriptor for codec_id + */ +enum AVMediaType avcodec_get_type(enum AVCodecID codec_id) +{ + const AVCodecDescriptor *desc = avcodec_descriptor_get(codec_id); + return desc ? desc->type : AVMEDIA_TYPE_UNKNOWN; +} diff --git a/media/ffvpx/libavcodec/codec_desc.h b/media/ffvpx/libavcodec/codec_desc.h new file mode 100644 index 0000000000..126b52df47 --- /dev/null +++ b/media/ffvpx/libavcodec/codec_desc.h @@ -0,0 +1,128 @@ +/* + * Codec descriptors public API + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_CODEC_DESC_H +#define AVCODEC_CODEC_DESC_H + +#include "libavutil/avutil.h" + +#include "codec_id.h" + +/** + * @addtogroup lavc_core + * @{ + */ + +/** + * This struct describes the properties of a single codec described by an + * AVCodecID. + * @see avcodec_descriptor_get() + */ +typedef struct AVCodecDescriptor { + enum AVCodecID id; /**< ID of the codec this descriptor describes; the lookup key of avcodec_descriptor_get(). */ + enum AVMediaType type; /**< Media type of the codec (an AVMEDIA_TYPE_* value); the value avcodec_get_type() reports. */ + /** + * Name of the codec described by this descriptor. It is non-empty and + * unique for each codec descriptor. It should contain alphanumeric + * characters and '_' only. + */ + const char *name; + /** + * A more descriptive name for this codec. May be NULL. + */ + const char *long_name; + /** + * Codec properties, a combination of AV_CODEC_PROP_* flags. + */ + int props; + /** + * MIME type(s) associated with the codec. + * May be NULL; if not, a NULL-terminated array of MIME types. + * The first item is always non-NULL and is the preferred MIME type. + */ + const char *const *mime_types; + /** + * If non-NULL, an array of profiles recognized for this codec. + * Terminated with FF_PROFILE_UNKNOWN. + */ + const struct AVProfile *profiles; +} AVCodecDescriptor; + +/** + * Codec uses only intra compression. + * Video and audio codecs only. + */ +#define AV_CODEC_PROP_INTRA_ONLY (1 << 0) +/** + * Codec supports lossy compression. Audio and video codecs only. + * @note a codec may support both lossy and lossless + * compression modes + */ +#define AV_CODEC_PROP_LOSSY (1 << 1) +/** + * Codec supports lossless compression. Audio and video codecs only. + */ +#define AV_CODEC_PROP_LOSSLESS (1 << 2) +/** + * Codec supports frame reordering. 
That is, the coded order (the order in which + * the encoded packets are output by the encoders / stored / input to the + * decoders) may be different from the presentation order of the corresponding + * frames. + * + * For codecs that do not have this property set, PTS and DTS should always be + * equal. + */ +#define AV_CODEC_PROP_REORDER (1 << 3) +/** + * Subtitle codec is bitmap based + * Decoded AVSubtitle data can be read from the AVSubtitleRect->pict field. + */ +#define AV_CODEC_PROP_BITMAP_SUB (1 << 16) +/** + * Subtitle codec is text based. + * Decoded AVSubtitle data can be read from the AVSubtitleRect->ass field. + */ +#define AV_CODEC_PROP_TEXT_SUB (1 << 17) + +/** + * @return descriptor for given codec ID or NULL if no descriptor exists. + */ +const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id); + +/** + * Iterate over all codec descriptors known to libavcodec. + * + * @param prev previous descriptor. NULL to get the first descriptor. + * + * @return next descriptor or NULL after the last descriptor + */ +const AVCodecDescriptor *avcodec_descriptor_next(const AVCodecDescriptor *prev); + +/** + * @return codec descriptor with the given name or NULL if no such descriptor + * exists. + */ +const AVCodecDescriptor *avcodec_descriptor_get_by_name(const char *name); + +/** + * @} + */ + +#endif // AVCODEC_CODEC_DESC_H diff --git a/media/ffvpx/libavcodec/codec_id.h b/media/ffvpx/libavcodec/codec_id.h new file mode 100644 index 0000000000..89a4a0cb89 --- /dev/null +++ b/media/ffvpx/libavcodec/codec_id.h @@ -0,0 +1,661 @@ +/* + * Codec IDs + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_CODEC_ID_H +#define AVCODEC_CODEC_ID_H + +#include "libavutil/avutil.h" +#include "libavutil/samplefmt.h" + +#include "version_major.h" + +/** + * @addtogroup lavc_core + * @{ + */ + +/** + * Identify the syntax and semantics of the bitstream. + * The principle is roughly: + * Two decoders with the same ID can decode the same streams. + * Two encoders with the same ID can encode compatible streams. + * There may be slight deviations from the principle due to implementation + * details. + * + * If you add a codec ID to this list, add it so that + * 1. no value of an existing codec ID changes (that would break ABI), + * 2. it is as close as possible to similar codecs + * + * After adding new codec IDs, do not forget to add an entry to the codec + * descriptor list and bump libavcodec minor version. 
+ */ +enum AVCodecID { + AV_CODEC_ID_NONE, + + /* video codecs */ + AV_CODEC_ID_MPEG1VIDEO, + AV_CODEC_ID_MPEG2VIDEO, ///< preferred ID for MPEG-1/2 video decoding + AV_CODEC_ID_H261, + AV_CODEC_ID_H263, + AV_CODEC_ID_RV10, + AV_CODEC_ID_RV20, + AV_CODEC_ID_MJPEG, + AV_CODEC_ID_MJPEGB, + AV_CODEC_ID_LJPEG, + AV_CODEC_ID_SP5X, + AV_CODEC_ID_JPEGLS, + AV_CODEC_ID_MPEG4, + AV_CODEC_ID_RAWVIDEO, + AV_CODEC_ID_MSMPEG4V1, + AV_CODEC_ID_MSMPEG4V2, + AV_CODEC_ID_MSMPEG4V3, + AV_CODEC_ID_WMV1, + AV_CODEC_ID_WMV2, + AV_CODEC_ID_H263P, + AV_CODEC_ID_H263I, + AV_CODEC_ID_FLV1, + AV_CODEC_ID_SVQ1, + AV_CODEC_ID_SVQ3, + AV_CODEC_ID_DVVIDEO, + AV_CODEC_ID_HUFFYUV, + AV_CODEC_ID_CYUV, + AV_CODEC_ID_H264, + AV_CODEC_ID_INDEO3, + AV_CODEC_ID_VP3, + AV_CODEC_ID_THEORA, + AV_CODEC_ID_ASV1, + AV_CODEC_ID_ASV2, + AV_CODEC_ID_FFV1, + AV_CODEC_ID_4XM, + AV_CODEC_ID_VCR1, + AV_CODEC_ID_CLJR, + AV_CODEC_ID_MDEC, + AV_CODEC_ID_ROQ, + AV_CODEC_ID_INTERPLAY_VIDEO, + AV_CODEC_ID_XAN_WC3, + AV_CODEC_ID_XAN_WC4, + AV_CODEC_ID_RPZA, + AV_CODEC_ID_CINEPAK, + AV_CODEC_ID_WS_VQA, + AV_CODEC_ID_MSRLE, + AV_CODEC_ID_MSVIDEO1, + AV_CODEC_ID_IDCIN, + AV_CODEC_ID_8BPS, + AV_CODEC_ID_SMC, + AV_CODEC_ID_FLIC, + AV_CODEC_ID_TRUEMOTION1, + AV_CODEC_ID_VMDVIDEO, + AV_CODEC_ID_MSZH, + AV_CODEC_ID_ZLIB, + AV_CODEC_ID_QTRLE, + AV_CODEC_ID_TSCC, + AV_CODEC_ID_ULTI, + AV_CODEC_ID_QDRAW, + AV_CODEC_ID_VIXL, + AV_CODEC_ID_QPEG, + AV_CODEC_ID_PNG, + AV_CODEC_ID_PPM, + AV_CODEC_ID_PBM, + AV_CODEC_ID_PGM, + AV_CODEC_ID_PGMYUV, + AV_CODEC_ID_PAM, + AV_CODEC_ID_FFVHUFF, + AV_CODEC_ID_RV30, + AV_CODEC_ID_RV40, + AV_CODEC_ID_VC1, + AV_CODEC_ID_WMV3, + AV_CODEC_ID_LOCO, + AV_CODEC_ID_WNV1, + AV_CODEC_ID_AASC, + AV_CODEC_ID_INDEO2, + AV_CODEC_ID_FRAPS, + AV_CODEC_ID_TRUEMOTION2, + AV_CODEC_ID_BMP, + AV_CODEC_ID_CSCD, + AV_CODEC_ID_MMVIDEO, + AV_CODEC_ID_ZMBV, + AV_CODEC_ID_AVS, + AV_CODEC_ID_SMACKVIDEO, + AV_CODEC_ID_NUV, + AV_CODEC_ID_KMVC, + AV_CODEC_ID_FLASHSV, + AV_CODEC_ID_CAVS, + AV_CODEC_ID_JPEG2000, + 
AV_CODEC_ID_VMNC, + AV_CODEC_ID_VP5, + AV_CODEC_ID_VP6, + AV_CODEC_ID_VP6F, + AV_CODEC_ID_TARGA, + AV_CODEC_ID_DSICINVIDEO, + AV_CODEC_ID_TIERTEXSEQVIDEO, + AV_CODEC_ID_TIFF, + AV_CODEC_ID_GIF, + AV_CODEC_ID_DXA, + AV_CODEC_ID_DNXHD, + AV_CODEC_ID_THP, + AV_CODEC_ID_SGI, + AV_CODEC_ID_C93, + AV_CODEC_ID_BETHSOFTVID, + AV_CODEC_ID_PTX, + AV_CODEC_ID_TXD, + AV_CODEC_ID_VP6A, + AV_CODEC_ID_AMV, + AV_CODEC_ID_VB, + AV_CODEC_ID_PCX, + AV_CODEC_ID_SUNRAST, + AV_CODEC_ID_INDEO4, + AV_CODEC_ID_INDEO5, + AV_CODEC_ID_MIMIC, + AV_CODEC_ID_RL2, + AV_CODEC_ID_ESCAPE124, + AV_CODEC_ID_DIRAC, + AV_CODEC_ID_BFI, + AV_CODEC_ID_CMV, + AV_CODEC_ID_MOTIONPIXELS, + AV_CODEC_ID_TGV, + AV_CODEC_ID_TGQ, + AV_CODEC_ID_TQI, + AV_CODEC_ID_AURA, + AV_CODEC_ID_AURA2, + AV_CODEC_ID_V210X, + AV_CODEC_ID_TMV, + AV_CODEC_ID_V210, + AV_CODEC_ID_DPX, + AV_CODEC_ID_MAD, + AV_CODEC_ID_FRWU, + AV_CODEC_ID_FLASHSV2, + AV_CODEC_ID_CDGRAPHICS, + AV_CODEC_ID_R210, + AV_CODEC_ID_ANM, + AV_CODEC_ID_BINKVIDEO, + AV_CODEC_ID_IFF_ILBM, +#define AV_CODEC_ID_IFF_BYTERUN1 AV_CODEC_ID_IFF_ILBM + AV_CODEC_ID_KGV1, + AV_CODEC_ID_YOP, + AV_CODEC_ID_VP8, + AV_CODEC_ID_PICTOR, + AV_CODEC_ID_ANSI, + AV_CODEC_ID_A64_MULTI, + AV_CODEC_ID_A64_MULTI5, + AV_CODEC_ID_R10K, + AV_CODEC_ID_MXPEG, + AV_CODEC_ID_LAGARITH, + AV_CODEC_ID_PRORES, + AV_CODEC_ID_JV, + AV_CODEC_ID_DFA, + AV_CODEC_ID_WMV3IMAGE, + AV_CODEC_ID_VC1IMAGE, + AV_CODEC_ID_UTVIDEO, + AV_CODEC_ID_BMV_VIDEO, + AV_CODEC_ID_VBLE, + AV_CODEC_ID_DXTORY, + AV_CODEC_ID_V410, + AV_CODEC_ID_XWD, + AV_CODEC_ID_CDXL, + AV_CODEC_ID_XBM, + AV_CODEC_ID_ZEROCODEC, + AV_CODEC_ID_MSS1, + AV_CODEC_ID_MSA1, + AV_CODEC_ID_TSCC2, + AV_CODEC_ID_MTS2, + AV_CODEC_ID_CLLC, + AV_CODEC_ID_MSS2, + AV_CODEC_ID_VP9, + AV_CODEC_ID_AIC, + AV_CODEC_ID_ESCAPE130, + AV_CODEC_ID_G2M, + AV_CODEC_ID_WEBP, + AV_CODEC_ID_HNM4_VIDEO, + AV_CODEC_ID_HEVC, +#define AV_CODEC_ID_H265 AV_CODEC_ID_HEVC + AV_CODEC_ID_FIC, + AV_CODEC_ID_ALIAS_PIX, + AV_CODEC_ID_BRENDER_PIX, + AV_CODEC_ID_PAF_VIDEO, + 
AV_CODEC_ID_EXR, + AV_CODEC_ID_VP7, + AV_CODEC_ID_SANM, + AV_CODEC_ID_SGIRLE, + AV_CODEC_ID_MVC1, + AV_CODEC_ID_MVC2, + AV_CODEC_ID_HQX, + AV_CODEC_ID_TDSC, + AV_CODEC_ID_HQ_HQA, + AV_CODEC_ID_HAP, + AV_CODEC_ID_DDS, + AV_CODEC_ID_DXV, + AV_CODEC_ID_SCREENPRESSO, + AV_CODEC_ID_RSCC, + AV_CODEC_ID_AVS2, + AV_CODEC_ID_PGX, + AV_CODEC_ID_AVS3, + AV_CODEC_ID_MSP2, + AV_CODEC_ID_VVC, +#define AV_CODEC_ID_H266 AV_CODEC_ID_VVC + AV_CODEC_ID_Y41P, + AV_CODEC_ID_AVRP, + AV_CODEC_ID_012V, + AV_CODEC_ID_AVUI, +#if FF_API_AYUV_CODECID + AV_CODEC_ID_AYUV, +#endif + AV_CODEC_ID_TARGA_Y216, + AV_CODEC_ID_V308, + AV_CODEC_ID_V408, + AV_CODEC_ID_YUV4, + AV_CODEC_ID_AVRN, + AV_CODEC_ID_CPIA, + AV_CODEC_ID_XFACE, + AV_CODEC_ID_SNOW, + AV_CODEC_ID_SMVJPEG, + AV_CODEC_ID_APNG, + AV_CODEC_ID_DAALA, + AV_CODEC_ID_CFHD, + AV_CODEC_ID_TRUEMOTION2RT, + AV_CODEC_ID_M101, + AV_CODEC_ID_MAGICYUV, + AV_CODEC_ID_SHEERVIDEO, + AV_CODEC_ID_YLC, + AV_CODEC_ID_PSD, + AV_CODEC_ID_PIXLET, + AV_CODEC_ID_SPEEDHQ, + AV_CODEC_ID_FMVC, + AV_CODEC_ID_SCPR, + AV_CODEC_ID_CLEARVIDEO, + AV_CODEC_ID_XPM, + AV_CODEC_ID_AV1, + AV_CODEC_ID_BITPACKED, + AV_CODEC_ID_MSCC, + AV_CODEC_ID_SRGC, + AV_CODEC_ID_SVG, + AV_CODEC_ID_GDV, + AV_CODEC_ID_FITS, + AV_CODEC_ID_IMM4, + AV_CODEC_ID_PROSUMER, + AV_CODEC_ID_MWSC, + AV_CODEC_ID_WCMV, + AV_CODEC_ID_RASC, + AV_CODEC_ID_HYMT, + AV_CODEC_ID_ARBC, + AV_CODEC_ID_AGM, + AV_CODEC_ID_LSCR, + AV_CODEC_ID_VP4, + AV_CODEC_ID_IMM5, + AV_CODEC_ID_MVDV, + AV_CODEC_ID_MVHA, + AV_CODEC_ID_CDTOONS, + AV_CODEC_ID_MV30, + AV_CODEC_ID_NOTCHLC, + AV_CODEC_ID_PFM, + AV_CODEC_ID_MOBICLIP, + AV_CODEC_ID_PHOTOCD, + AV_CODEC_ID_IPU, + AV_CODEC_ID_ARGO, + AV_CODEC_ID_CRI, + AV_CODEC_ID_SIMBIOSIS_IMX, + AV_CODEC_ID_SGA_VIDEO, + AV_CODEC_ID_GEM, + AV_CODEC_ID_VBN, + AV_CODEC_ID_JPEGXL, + AV_CODEC_ID_QOI, + AV_CODEC_ID_PHM, + AV_CODEC_ID_RADIANCE_HDR, + AV_CODEC_ID_WBMP, + AV_CODEC_ID_MEDIA100, + AV_CODEC_ID_VQC, + + /* various PCM "codecs" */ + AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id 
pointing at the start of audio codecs + AV_CODEC_ID_PCM_S16LE = 0x10000, + AV_CODEC_ID_PCM_S16BE, + AV_CODEC_ID_PCM_U16LE, + AV_CODEC_ID_PCM_U16BE, + AV_CODEC_ID_PCM_S8, + AV_CODEC_ID_PCM_U8, + AV_CODEC_ID_PCM_MULAW, + AV_CODEC_ID_PCM_ALAW, + AV_CODEC_ID_PCM_S32LE, + AV_CODEC_ID_PCM_S32BE, + AV_CODEC_ID_PCM_U32LE, + AV_CODEC_ID_PCM_U32BE, + AV_CODEC_ID_PCM_S24LE, + AV_CODEC_ID_PCM_S24BE, + AV_CODEC_ID_PCM_U24LE, + AV_CODEC_ID_PCM_U24BE, + AV_CODEC_ID_PCM_S24DAUD, + AV_CODEC_ID_PCM_ZORK, + AV_CODEC_ID_PCM_S16LE_PLANAR, + AV_CODEC_ID_PCM_DVD, + AV_CODEC_ID_PCM_F32BE, + AV_CODEC_ID_PCM_F32LE, + AV_CODEC_ID_PCM_F64BE, + AV_CODEC_ID_PCM_F64LE, + AV_CODEC_ID_PCM_BLURAY, + AV_CODEC_ID_PCM_LXF, + AV_CODEC_ID_S302M, + AV_CODEC_ID_PCM_S8_PLANAR, + AV_CODEC_ID_PCM_S24LE_PLANAR, + AV_CODEC_ID_PCM_S32LE_PLANAR, + AV_CODEC_ID_PCM_S16BE_PLANAR, + AV_CODEC_ID_PCM_S64LE, + AV_CODEC_ID_PCM_S64BE, + AV_CODEC_ID_PCM_F16LE, + AV_CODEC_ID_PCM_F24LE, + AV_CODEC_ID_PCM_VIDC, + AV_CODEC_ID_PCM_SGA, + + /* various ADPCM codecs */ + AV_CODEC_ID_ADPCM_IMA_QT = 0x11000, + AV_CODEC_ID_ADPCM_IMA_WAV, + AV_CODEC_ID_ADPCM_IMA_DK3, + AV_CODEC_ID_ADPCM_IMA_DK4, + AV_CODEC_ID_ADPCM_IMA_WS, + AV_CODEC_ID_ADPCM_IMA_SMJPEG, + AV_CODEC_ID_ADPCM_MS, + AV_CODEC_ID_ADPCM_4XM, + AV_CODEC_ID_ADPCM_XA, + AV_CODEC_ID_ADPCM_ADX, + AV_CODEC_ID_ADPCM_EA, + AV_CODEC_ID_ADPCM_G726, + AV_CODEC_ID_ADPCM_CT, + AV_CODEC_ID_ADPCM_SWF, + AV_CODEC_ID_ADPCM_YAMAHA, + AV_CODEC_ID_ADPCM_SBPRO_4, + AV_CODEC_ID_ADPCM_SBPRO_3, + AV_CODEC_ID_ADPCM_SBPRO_2, + AV_CODEC_ID_ADPCM_THP, + AV_CODEC_ID_ADPCM_IMA_AMV, + AV_CODEC_ID_ADPCM_EA_R1, + AV_CODEC_ID_ADPCM_EA_R3, + AV_CODEC_ID_ADPCM_EA_R2, + AV_CODEC_ID_ADPCM_IMA_EA_SEAD, + AV_CODEC_ID_ADPCM_IMA_EA_EACS, + AV_CODEC_ID_ADPCM_EA_XAS, + AV_CODEC_ID_ADPCM_EA_MAXIS_XA, + AV_CODEC_ID_ADPCM_IMA_ISS, + AV_CODEC_ID_ADPCM_G722, + AV_CODEC_ID_ADPCM_IMA_APC, + AV_CODEC_ID_ADPCM_VIMA, + AV_CODEC_ID_ADPCM_AFC, + AV_CODEC_ID_ADPCM_IMA_OKI, + AV_CODEC_ID_ADPCM_DTK, + AV_CODEC_ID_ADPCM_IMA_RAD, + 
AV_CODEC_ID_ADPCM_G726LE, + AV_CODEC_ID_ADPCM_THP_LE, + AV_CODEC_ID_ADPCM_PSX, + AV_CODEC_ID_ADPCM_AICA, + AV_CODEC_ID_ADPCM_IMA_DAT4, + AV_CODEC_ID_ADPCM_MTAF, + AV_CODEC_ID_ADPCM_AGM, + AV_CODEC_ID_ADPCM_ARGO, + AV_CODEC_ID_ADPCM_IMA_SSI, + AV_CODEC_ID_ADPCM_ZORK, + AV_CODEC_ID_ADPCM_IMA_APM, + AV_CODEC_ID_ADPCM_IMA_ALP, + AV_CODEC_ID_ADPCM_IMA_MTF, + AV_CODEC_ID_ADPCM_IMA_CUNNING, + AV_CODEC_ID_ADPCM_IMA_MOFLEX, + AV_CODEC_ID_ADPCM_IMA_ACORN, + AV_CODEC_ID_ADPCM_XMD, + + /* AMR */ + AV_CODEC_ID_AMR_NB = 0x12000, + AV_CODEC_ID_AMR_WB, + + /* RealAudio codecs*/ + AV_CODEC_ID_RA_144 = 0x13000, + AV_CODEC_ID_RA_288, + + /* various DPCM codecs */ + AV_CODEC_ID_ROQ_DPCM = 0x14000, + AV_CODEC_ID_INTERPLAY_DPCM, + AV_CODEC_ID_XAN_DPCM, + AV_CODEC_ID_SOL_DPCM, + AV_CODEC_ID_SDX2_DPCM, + AV_CODEC_ID_GREMLIN_DPCM, + AV_CODEC_ID_DERF_DPCM, + AV_CODEC_ID_WADY_DPCM, + AV_CODEC_ID_CBD2_DPCM, + + /* audio codecs */ + AV_CODEC_ID_MP2 = 0x15000, + AV_CODEC_ID_MP3, ///< preferred ID for decoding MPEG audio layer 1, 2 or 3 + AV_CODEC_ID_AAC, + AV_CODEC_ID_AC3, + AV_CODEC_ID_DTS, + AV_CODEC_ID_VORBIS, + AV_CODEC_ID_DVAUDIO, + AV_CODEC_ID_WMAV1, + AV_CODEC_ID_WMAV2, + AV_CODEC_ID_MACE3, + AV_CODEC_ID_MACE6, + AV_CODEC_ID_VMDAUDIO, + AV_CODEC_ID_FLAC, + AV_CODEC_ID_MP3ADU, + AV_CODEC_ID_MP3ON4, + AV_CODEC_ID_SHORTEN, + AV_CODEC_ID_ALAC, + AV_CODEC_ID_WESTWOOD_SND1, + AV_CODEC_ID_GSM, ///< as in Berlin toast format + AV_CODEC_ID_QDM2, + AV_CODEC_ID_COOK, + AV_CODEC_ID_TRUESPEECH, + AV_CODEC_ID_TTA, + AV_CODEC_ID_SMACKAUDIO, + AV_CODEC_ID_QCELP, + AV_CODEC_ID_WAVPACK, + AV_CODEC_ID_DSICINAUDIO, + AV_CODEC_ID_IMC, + AV_CODEC_ID_MUSEPACK7, + AV_CODEC_ID_MLP, + AV_CODEC_ID_GSM_MS, /* as found in WAV */ + AV_CODEC_ID_ATRAC3, + AV_CODEC_ID_APE, + AV_CODEC_ID_NELLYMOSER, + AV_CODEC_ID_MUSEPACK8, + AV_CODEC_ID_SPEEX, + AV_CODEC_ID_WMAVOICE, + AV_CODEC_ID_WMAPRO, + AV_CODEC_ID_WMALOSSLESS, + AV_CODEC_ID_ATRAC3P, + AV_CODEC_ID_EAC3, + AV_CODEC_ID_SIPR, + AV_CODEC_ID_MP1, + AV_CODEC_ID_TWINVQ, + 
AV_CODEC_ID_TRUEHD, + AV_CODEC_ID_MP4ALS, + AV_CODEC_ID_ATRAC1, + AV_CODEC_ID_BINKAUDIO_RDFT, + AV_CODEC_ID_BINKAUDIO_DCT, + AV_CODEC_ID_AAC_LATM, + AV_CODEC_ID_QDMC, + AV_CODEC_ID_CELT, + AV_CODEC_ID_G723_1, + AV_CODEC_ID_G729, + AV_CODEC_ID_8SVX_EXP, + AV_CODEC_ID_8SVX_FIB, + AV_CODEC_ID_BMV_AUDIO, + AV_CODEC_ID_RALF, + AV_CODEC_ID_IAC, + AV_CODEC_ID_ILBC, + AV_CODEC_ID_OPUS, + AV_CODEC_ID_COMFORT_NOISE, + AV_CODEC_ID_TAK, + AV_CODEC_ID_METASOUND, + AV_CODEC_ID_PAF_AUDIO, + AV_CODEC_ID_ON2AVC, + AV_CODEC_ID_DSS_SP, + AV_CODEC_ID_CODEC2, + AV_CODEC_ID_FFWAVESYNTH, + AV_CODEC_ID_SONIC, + AV_CODEC_ID_SONIC_LS, + AV_CODEC_ID_EVRC, + AV_CODEC_ID_SMV, + AV_CODEC_ID_DSD_LSBF, + AV_CODEC_ID_DSD_MSBF, + AV_CODEC_ID_DSD_LSBF_PLANAR, + AV_CODEC_ID_DSD_MSBF_PLANAR, + AV_CODEC_ID_4GV, + AV_CODEC_ID_INTERPLAY_ACM, + AV_CODEC_ID_XMA1, + AV_CODEC_ID_XMA2, + AV_CODEC_ID_DST, + AV_CODEC_ID_ATRAC3AL, + AV_CODEC_ID_ATRAC3PAL, + AV_CODEC_ID_DOLBY_E, + AV_CODEC_ID_APTX, + AV_CODEC_ID_APTX_HD, + AV_CODEC_ID_SBC, + AV_CODEC_ID_ATRAC9, + AV_CODEC_ID_HCOM, + AV_CODEC_ID_ACELP_KELVIN, + AV_CODEC_ID_MPEGH_3D_AUDIO, + AV_CODEC_ID_SIREN, + AV_CODEC_ID_HCA, + AV_CODEC_ID_FASTAUDIO, + AV_CODEC_ID_MSNSIREN, + AV_CODEC_ID_DFPWM, + AV_CODEC_ID_BONK, + AV_CODEC_ID_MISC4, + AV_CODEC_ID_APAC, + AV_CODEC_ID_FTR, + AV_CODEC_ID_WAVARC, + AV_CODEC_ID_RKA, + + /* subtitle codecs */ + AV_CODEC_ID_FIRST_SUBTITLE = 0x17000, ///< A dummy ID pointing at the start of subtitle codecs. 
+ AV_CODEC_ID_DVD_SUBTITLE = 0x17000, + AV_CODEC_ID_DVB_SUBTITLE, + AV_CODEC_ID_TEXT, ///< raw UTF-8 text + AV_CODEC_ID_XSUB, + AV_CODEC_ID_SSA, + AV_CODEC_ID_MOV_TEXT, + AV_CODEC_ID_HDMV_PGS_SUBTITLE, + AV_CODEC_ID_DVB_TELETEXT, + AV_CODEC_ID_SRT, + AV_CODEC_ID_MICRODVD, + AV_CODEC_ID_EIA_608, + AV_CODEC_ID_JACOSUB, + AV_CODEC_ID_SAMI, + AV_CODEC_ID_REALTEXT, + AV_CODEC_ID_STL, + AV_CODEC_ID_SUBVIEWER1, + AV_CODEC_ID_SUBVIEWER, + AV_CODEC_ID_SUBRIP, + AV_CODEC_ID_WEBVTT, + AV_CODEC_ID_MPL2, + AV_CODEC_ID_VPLAYER, + AV_CODEC_ID_PJS, + AV_CODEC_ID_ASS, + AV_CODEC_ID_HDMV_TEXT_SUBTITLE, + AV_CODEC_ID_TTML, + AV_CODEC_ID_ARIB_CAPTION, + + /* other specific kind of codecs (generally used for attachments) */ + AV_CODEC_ID_FIRST_UNKNOWN = 0x18000, ///< A dummy ID pointing at the start of various fake codecs. + AV_CODEC_ID_TTF = 0x18000, + + AV_CODEC_ID_SCTE_35, ///< Contain timestamp estimated through PCR of program stream. + AV_CODEC_ID_EPG, + AV_CODEC_ID_BINTEXT, + AV_CODEC_ID_XBIN, + AV_CODEC_ID_IDF, + AV_CODEC_ID_OTF, + AV_CODEC_ID_SMPTE_KLV, + AV_CODEC_ID_DVD_NAV, + AV_CODEC_ID_TIMED_ID3, + AV_CODEC_ID_BIN_DATA, + + + AV_CODEC_ID_PROBE = 0x19000, ///< codec_id is not known (like AV_CODEC_ID_NONE) but lavf should attempt to identify it + + AV_CODEC_ID_MPEG2TS = 0x20000, /**< _FAKE_ codec to indicate a raw MPEG-2 TS + * stream (only used by libavformat) */ + AV_CODEC_ID_MPEG4SYSTEMS = 0x20001, /**< _FAKE_ codec to indicate a MPEG-4 Systems + * stream (only used by libavformat) */ + AV_CODEC_ID_FFMETADATA = 0x21000, ///< Dummy codec for streams containing only metadata information. + AV_CODEC_ID_WRAPPED_AVFRAME = 0x21001, ///< Passthrough codec, AVFrames wrapped in AVPacket + /** + * Dummy null video codec, useful mainly for development and debugging. + * Null encoder/decoder discard all input and never return any output. + */ + AV_CODEC_ID_VNULL, + /** + * Dummy null audio codec, useful mainly for development and debugging. 
+ * Null encoder/decoder discard all input and never return any output. + */ + AV_CODEC_ID_ANULL, +}; + +/** + * Get the type of the given codec. + */ +enum AVMediaType avcodec_get_type(enum AVCodecID codec_id); + +/** + * Get the name of a codec. + * @return a static string identifying the codec; never NULL + */ +const char *avcodec_get_name(enum AVCodecID id); + +/** + * Return codec bits per sample. + * + * @param[in] codec_id the codec + * @return Number of bits per sample or zero if unknown for the given codec. + */ +int av_get_bits_per_sample(enum AVCodecID codec_id); + +/** + * Return codec bits per sample. + * Only return non-zero if the bits per sample is exactly correct, not an + * approximation. + * + * @param[in] codec_id the codec + * @return Number of bits per sample or zero if unknown for the given codec. + */ +int av_get_exact_bits_per_sample(enum AVCodecID codec_id); + +/** + * Return a name for the specified profile, if available. + * + * @param codec_id the ID of the codec to which the requested profile belongs + * @param profile the profile value for which a name is requested + * @return A name for the profile if found, NULL otherwise. + * + * @note unlike av_get_profile_name(), which searches a list of profiles + * supported by a specific decoder or encoder implementation, this + * function searches the list of profiles from the AVCodecDescriptor + */ +const char *avcodec_profile_name(enum AVCodecID codec_id, int profile); + +/** + * Return the PCM codec associated with a sample format. 
+ * @param be endianness, 0 for little, 1 for big, + * -1 (or anything else) for native + * @return AV_CODEC_ID_PCM_* or AV_CODEC_ID_NONE + */
enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be);

/**
 * @}
 */

#endif // AVCODEC_CODEC_ID_H
diff --git a/media/ffvpx/libavcodec/codec_internal.h b/media/ffvpx/libavcodec/codec_internal.h new file mode 100644 index 0000000000..130a7dc3cd --- /dev/null +++ b/media/ffvpx/libavcodec/codec_internal.h @@ -0,0 +1,330 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */

#ifndef AVCODEC_CODEC_INTERNAL_H
#define AVCODEC_CODEC_INTERNAL_H

#include <stdint.h>

#include "libavutil/attributes.h"
#include "codec.h"
#include "config.h"

/*
 * FF_CODEC_CAP_* are single-bit flags; they are stored in the
 * FFCodec.caps_internal bitfield.
 */

/**
 * The codec is not known to be init-threadsafe (i.e. it might be unsafe
 * to initialize this codec and another codec concurrently, typically because
 * the codec calls external APIs that are not known to be thread-safe).
 * Therefore calling the codec's init function needs to be guarded with a lock.
 */
#define FF_CODEC_CAP_NOT_INIT_THREADSAFE    (1 << 0)
/**
 * The codec allows calling the close function for deallocation even if
 * the init function returned a failure. Without this capability flag, a
 * codec does such cleanup internally when returning failures from the
 * init function and does not expect the close function to be called at
 * all.
 */
#define FF_CODEC_CAP_INIT_CLEANUP           (1 << 1)
/**
 * Decoders marked with FF_CODEC_CAP_SETS_PKT_DTS want to set
 * AVFrame.pkt_dts manually. If the flag is set, decode.c won't overwrite
 * this field. If it's unset, decode.c tries to guess the pkt_dts field
 * from the input AVPacket.
 */
#define FF_CODEC_CAP_SETS_PKT_DTS           (1 << 2)
/**
 * The decoder extracts and fills its parameters even if the frame is
 * skipped due to the skip_frame setting.
 */
#define FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM  (1 << 3)
/**
 * The decoder sets the cropping fields in the output frames manually.
 * If this cap is set, the generic code will initialize output frame
 * dimensions to coded rather than display values.
 */
#define FF_CODEC_CAP_EXPORTS_CROPPING       (1 << 4)
/**
 * Codec initializes slice-based threading with a main function.
 */
#define FF_CODEC_CAP_SLICE_THREAD_HAS_MF    (1 << 5)
/**
 * The codec supports frame threading and has inter-frame dependencies, so it
 * uses ff_thread_report/await_progress().
 */
#define FF_CODEC_CAP_ALLOCATE_PROGRESS      (1 << 6)
/**
 * Codec handles avctx->thread_count == 0 (auto) internally.
 */
#define FF_CODEC_CAP_AUTO_THREADS           (1 << 7)
/**
 * Codec handles output frame properties internally instead of letting the
 * internal logic derive them from AVCodecInternal.last_pkt_props.
 */
#define FF_CODEC_CAP_SETS_FRAME_PROPS       (1 << 8)
/**
 * Codec supports embedded ICC profiles (AV_FRAME_DATA_ICC_PROFILE).
 */
#define FF_CODEC_CAP_ICC_PROFILES           (1 << 9)
+/** + * The encoder has AV_CODEC_CAP_DELAY set, but does not actually have delay - it + * only wants to be flushed at the end to update some context variables (e.g. + * 2pass stats) or produce a trailing packet. 
Besides that it immediately + * produces exactly one output packet per each input frame, just as no-delay + * encoders do. + */
#define FF_CODEC_CAP_EOF_FLUSH              (1 << 10)

/**
 * FFCodec.codec_tags termination value
 */
#define FF_CODEC_TAGS_END -1

/**
 * One key/value string pair of a codec's private default options
 * (see FFCodec.defaults).
 */
typedef struct FFCodecDefault {
    const char *key;
    const char *value;
} FFCodecDefault;

struct AVCodecContext;
struct AVSubtitle;
struct AVPacket;

/**
 * Selects which member of the FFCodec.cb union is valid.
 * The six values fit in the 3-bit FFCodec.cb_type bitfield.
 */
enum FFCodecType {
    /* The codec is a decoder using the decode callback;
     * audio and video codecs only. */
    FF_CODEC_CB_TYPE_DECODE,
    /* The codec is a decoder using the decode_sub callback;
     * subtitle codecs only. */
    FF_CODEC_CB_TYPE_DECODE_SUB,
    /* The codec is a decoder using the receive_frame callback;
     * audio and video codecs only. */
    FF_CODEC_CB_TYPE_RECEIVE_FRAME,
    /* The codec is an encoder using the encode callback;
     * audio and video codecs only. */
    FF_CODEC_CB_TYPE_ENCODE,
    /* The codec is an encoder using the encode_sub callback;
     * subtitle codecs only. */
    FF_CODEC_CB_TYPE_ENCODE_SUB,
    /* The codec is an encoder using the receive_packet callback;
     * audio and video codecs only. */
    FF_CODEC_CB_TYPE_RECEIVE_PACKET,
};

/**
 * Internal codec description: the public AVCodec followed by the
 * library-private callbacks and capabilities.
 */
typedef struct FFCodec {
    /**
     * The public AVCodec. See codec.h for it.
     * It is the first member, which is what allows ffcodec() to cast an
     * AVCodec pointer back to the enclosing FFCodec.
     */
    AVCodec p;

    /**
     * Internal codec capabilities FF_CODEC_CAP_*.
     */
    unsigned caps_internal:29;

    /**
     * This field determines the type of the codec (decoder/encoder)
     * and also the exact callback cb implemented by the codec.
     * cb_type uses enum FFCodecType values.
     */
    unsigned cb_type:3;

    /* NOTE(review): presumably the size of the codec's private context
     * allocated by the generic code - not visible here, confirm. */
    int priv_data_size;
    /**
     * @name Frame-level threading support functions
     * @{
     */
    /**
     * Copy necessary context variables from a previous thread context to the current one.
     * If not defined, the next thread will start automatically; otherwise, the codec
     * must call ff_thread_finish_setup().
     *
     * dst and src will (rarely) point to the same context, in which case memcpy should be skipped.
     */
    int (*update_thread_context)(struct AVCodecContext *dst, const struct AVCodecContext *src);

    /**
     * Copy variables back to the user-facing context
     */
    int (*update_thread_context_for_user)(struct AVCodecContext *dst, const struct AVCodecContext *src);
    /** @} */

    /**
     * Private codec-specific defaults.
     */
    const FFCodecDefault *defaults;

    /**
     * Initialize codec static data, called from av_codec_iterate().
     *
     * This is not intended for time consuming operations as it is
     * run for every codec regardless of that codec being used.
     */
    void (*init_static_data)(struct FFCodec *codec);

    /**
     * Codec init function. Whether close is still called after init
     * returned a failure is governed by FF_CODEC_CAP_INIT_CLEANUP.
     */
    int (*init)(struct AVCodecContext *);

    /**
     * The single processing callback of the codec; cb_type tells which
     * member is set.
     */
    union {
        /**
         * Decode to an AVFrame.
         * cb is in this state if cb_type is FF_CODEC_CB_TYPE_DECODE.
         *
         * @param      avctx          codec context
         * @param[out] frame          AVFrame for output
         * @param[out] got_frame_ptr  decoder sets to 0 or 1 to indicate that
         *                            a non-empty frame was returned in frame.
         * @param[in]  avpkt          AVPacket containing the data to be decoded
         * @return amount of bytes read from the packet on success,
         *         negative error code on failure
         */
        int (*decode)(struct AVCodecContext *avctx, struct AVFrame *frame,
                      int *got_frame_ptr, struct AVPacket *avpkt);
        /**
         * Decode subtitle data to an AVSubtitle.
         * cb is in this state if cb_type is FF_CODEC_CB_TYPE_DECODE_SUB.
         *
         * Apart from that this is like the decode callback.
         */
        int (*decode_sub)(struct AVCodecContext *avctx, struct AVSubtitle *sub,
                          int *got_frame_ptr, const struct AVPacket *avpkt);
        /**
         * Decode API with decoupled packet/frame dataflow.
         * cb is in this state if cb_type is FF_CODEC_CB_TYPE_RECEIVE_FRAME.
         *
         * This function is called to get one output frame. It should call
         * ff_decode_get_packet() to obtain input data.
         */
        int (*receive_frame)(struct AVCodecContext *avctx, struct AVFrame *frame);
        /**
         * Encode data to an AVPacket.
         * cb is in this state if cb_type is FF_CODEC_CB_TYPE_ENCODE.
         *
         * @param      avctx          codec context
         * @param[out] avpkt          output AVPacket
         * @param[in]  frame          AVFrame containing the input to be encoded
         * @param[out] got_packet_ptr encoder sets to 0 or 1 to indicate that a
         *                            non-empty packet was returned in avpkt.
         * @return 0 on success, negative error code on failure
         */
        int (*encode)(struct AVCodecContext *avctx, struct AVPacket *avpkt,
                      const struct AVFrame *frame, int *got_packet_ptr);
        /**
         * Encode subtitles to a raw buffer.
         * cb is in this state if cb_type is FF_CODEC_CB_TYPE_ENCODE_SUB.
         */
        int (*encode_sub)(struct AVCodecContext *avctx, uint8_t *buf,
                          int buf_size, const struct AVSubtitle *sub);
        /**
         * Encode API with decoupled frame/packet dataflow.
         * cb is in this state if cb_type is FF_CODEC_CB_TYPE_RECEIVE_PACKET.
         *
         * This function is called to get one output packet.
         * It should call ff_encode_get_frame() to obtain input data.
         */
        int (*receive_packet)(struct AVCodecContext *avctx, struct AVPacket *avpkt);
    } cb;

    /**
     * Codec close function; see FF_CODEC_CAP_INIT_CLEANUP for when it is
     * expected to be called after a failed init.
     */
    int (*close)(struct AVCodecContext *);

    /**
     * Flush buffers.
     * Will be called when seeking
     */
    void (*flush)(struct AVCodecContext *);

    /**
     * Decoding only, a comma-separated list of bitstream filters to apply to
     * packets before decoding.
     */
    const char *bsfs;

    /**
     * Array of pointers to hardware configurations supported by the codec,
     * or NULL if no hardware supported. The array is terminated by a NULL
     * pointer.
     *
     * The user can only access this field via avcodec_get_hw_config().
     */
    const struct AVCodecHWConfigInternal *const *hw_configs;

    /**
     * List of supported codec_tags, terminated by FF_CODEC_TAGS_END.
     */
    const uint32_t *codec_tags;
} FFCodec;

/* Designated-initializer helper for FFCodec.p.long_name; the string is
 * replaced by NULL when CONFIG_SMALL is set. */
#if CONFIG_SMALL
#define CODEC_LONG_NAME(str) .p.long_name = NULL
#else
#define CODEC_LONG_NAME(str) .p.long_name = str
#endif

/* Designated-initializer helpers for the frame-threading callbacks; they
 * expand to NULL initializers when HAVE_THREADS is not set. */
#if HAVE_THREADS
#define UPDATE_THREAD_CONTEXT(func) \
        .update_thread_context          = (func)
#define UPDATE_THREAD_CONTEXT_FOR_USER(func) \
        .update_thread_context_for_user = (func)
#else
#define UPDATE_THREAD_CONTEXT(func) \
        .update_thread_context          = NULL
#define UPDATE_THREAD_CONTEXT_FOR_USER(func) \
        .update_thread_context_for_user = NULL
#endif

/* CODEC_OLD_CHANNEL_LAYOUTS() wraps its arguments in a zero-terminated
 * const uint64_t[] compound literal and assigns it to the deprecated
 * AVCodec.channel_layouts field; the clang variant additionally brackets
 * the initializer with the deprecation-warning toggles. */
#if FF_API_OLD_CHANNEL_LAYOUT
#define CODEC_OLD_CHANNEL_LAYOUTS(...) CODEC_OLD_CHANNEL_LAYOUTS_ARRAY(((const uint64_t[]) { __VA_ARGS__, 0 }))
#if defined(__clang__)
#define CODEC_OLD_CHANNEL_LAYOUTS_ARRAY(array) \
    FF_DISABLE_DEPRECATION_WARNINGS \
    .p.channel_layouts = (array), \
    FF_ENABLE_DEPRECATION_WARNINGS
#else
#define CODEC_OLD_CHANNEL_LAYOUTS_ARRAY(array) .p.channel_layouts = (array),
#endif
#else
/* This is only provided to allow to test disabling FF_API_OLD_CHANNEL_LAYOUT
 * without removing all the FF_API_OLD_CHANNEL_LAYOUT codeblocks.
 * It is of course still expected to be removed when FF_API_OLD_CHANNEL_LAYOUT
 * will be finally removed (along with all usages of these macros). */
#define CODEC_OLD_CHANNEL_LAYOUTS(...)
#define CODEC_OLD_CHANNEL_LAYOUTS_ARRAY(array)
#endif

/*
 * Designated-initializer helpers for FFCodec: each one sets cb_type and the
 * matching member of the cb union together, so the two cannot get out of
 * sync in a codec definition.
 */
#define FF_CODEC_DECODE_CB(func)                          \
    .cb_type           = FF_CODEC_CB_TYPE_DECODE,         \
    .cb.decode         = (func)
#define FF_CODEC_DECODE_SUB_CB(func)                      \
    .cb_type           = FF_CODEC_CB_TYPE_DECODE_SUB,     \
    .cb.decode_sub     = (func)
#define FF_CODEC_RECEIVE_FRAME_CB(func)                   \
    .cb_type           = FF_CODEC_CB_TYPE_RECEIVE_FRAME,  \
    .cb.receive_frame  = (func)
#define FF_CODEC_ENCODE_CB(func)                          \
    .cb_type           = FF_CODEC_CB_TYPE_ENCODE,         \
    .cb.encode         = (func)
#define FF_CODEC_ENCODE_SUB_CB(func)                      \
    .cb_type           = FF_CODEC_CB_TYPE_ENCODE_SUB,     \
    .cb.encode_sub     = (func)
#define FF_CODEC_RECEIVE_PACKET_CB(func)                  \
    .cb_type           = FF_CODEC_CB_TYPE_RECEIVE_PACKET, \
    .cb.receive_packet = (func)

/**
 * Convert a public AVCodec pointer to the FFCodec that contains it.
 *
 * This is a plain pointer cast; it is only meaningful because the AVCodec
 * is the first member (p) of FFCodec.
 *
 * @param codec public codec pointer
 * @return the same address viewed as a const FFCodec
 */
static av_always_inline const FFCodec *ffcodec(const AVCodec *codec)
{
    return (const FFCodec*)codec;
}

#endif /* AVCODEC_CODEC_INTERNAL_H */
diff --git a/media/ffvpx/libavcodec/codec_list.c b/media/ffvpx/libavcodec/codec_list.c new file mode 100644 index 0000000000..db49c8fffb --- /dev/null +++ b/media/ffvpx/libavcodec/codec_list.c @@ -0,0 +1,20 @@
/* NULL-terminated table of the codecs built into this library; every entry
 * is compiled in only when its CONFIG_* switch is enabled. */
static const FFCodec * const codec_list[] = {
#if CONFIG_VP8_DECODER
    &ff_vp8_decoder,
#endif
#if CONFIG_VP9_DECODER
    &ff_vp9_decoder,
#endif
#if CONFIG_FLAC_DECODER
    &ff_flac_decoder,
#endif
#if CONFIG_MP3_DECODER
    &ff_mp3_decoder,
#endif
#if CONFIG_LIBDAV1D
    &ff_libdav1d_decoder,
#endif
#if CONFIG_AV1_DECODER
    &ff_av1_decoder,
#endif
    NULL };
diff --git a/media/ffvpx/libavcodec/codec_par.c b/media/ffvpx/libavcodec/codec_par.c new file mode 100644 index 0000000000..abda649aa8 --- /dev/null +++ b/media/ffvpx/libavcodec/codec_par.c @@ -0,0 +1,263 @@ +/* + * AVCodecParameters functions for libavcodec + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * AVCodecParameters functions for libavcodec. + */ + +#include <string.h> +#include "libavutil/mem.h" +#include "avcodec.h" +#include "codec_par.h" + +static void codec_parameters_reset(AVCodecParameters *par) +{ + av_freep(&par->extradata); + av_channel_layout_uninit(&par->ch_layout); + + memset(par, 0, sizeof(*par)); + + par->codec_type = AVMEDIA_TYPE_UNKNOWN; + par->codec_id = AV_CODEC_ID_NONE; + par->format = -1; + par->ch_layout.order = AV_CHANNEL_ORDER_UNSPEC; + par->field_order = AV_FIELD_UNKNOWN; + par->color_range = AVCOL_RANGE_UNSPECIFIED; + par->color_primaries = AVCOL_PRI_UNSPECIFIED; + par->color_trc = AVCOL_TRC_UNSPECIFIED; + par->color_space = AVCOL_SPC_UNSPECIFIED; + par->chroma_location = AVCHROMA_LOC_UNSPECIFIED; + par->sample_aspect_ratio = (AVRational){ 0, 1 }; + par->profile = FF_PROFILE_UNKNOWN; + par->level = FF_LEVEL_UNKNOWN; +} + +AVCodecParameters *avcodec_parameters_alloc(void) +{ + AVCodecParameters *par = av_mallocz(sizeof(*par)); + + if (!par) + return NULL; + codec_parameters_reset(par); + return par; +} + +void avcodec_parameters_free(AVCodecParameters **ppar) +{ + AVCodecParameters *par = *ppar; + + if (!par) + return; + codec_parameters_reset(par); + + av_freep(ppar); +} + +int 
avcodec_parameters_copy(AVCodecParameters *dst, const AVCodecParameters *src) +{ + int ret; + + codec_parameters_reset(dst); + memcpy(dst, src, sizeof(*dst)); + + dst->ch_layout = (AVChannelLayout){0}; + dst->extradata = NULL; + dst->extradata_size = 0; + if (src->extradata) { + dst->extradata = av_mallocz(src->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!dst->extradata) + return AVERROR(ENOMEM); + memcpy(dst->extradata, src->extradata, src->extradata_size); + dst->extradata_size = src->extradata_size; + } + + ret = av_channel_layout_copy(&dst->ch_layout, &src->ch_layout); + if (ret < 0) + return ret; + + return 0; +} + +int avcodec_parameters_from_context(AVCodecParameters *par, + const AVCodecContext *codec) +{ + int ret; + + codec_parameters_reset(par); + + par->codec_type = codec->codec_type; + par->codec_id = codec->codec_id; + par->codec_tag = codec->codec_tag; + + par->bit_rate = codec->bit_rate; + par->bits_per_coded_sample = codec->bits_per_coded_sample; + par->bits_per_raw_sample = codec->bits_per_raw_sample; + par->profile = codec->profile; + par->level = codec->level; + + switch (par->codec_type) { + case AVMEDIA_TYPE_VIDEO: + par->format = codec->pix_fmt; + par->width = codec->width; + par->height = codec->height; + par->field_order = codec->field_order; + par->color_range = codec->color_range; + par->color_primaries = codec->color_primaries; + par->color_trc = codec->color_trc; + par->color_space = codec->colorspace; + par->chroma_location = codec->chroma_sample_location; + par->sample_aspect_ratio = codec->sample_aspect_ratio; + par->video_delay = codec->has_b_frames; + break; + case AVMEDIA_TYPE_AUDIO: + par->format = codec->sample_fmt; +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + // if the old/new fields are set inconsistently, prefer the old ones + if ((codec->channels && codec->channels != codec->ch_layout.nb_channels) || + (codec->channel_layout && (codec->ch_layout.order != AV_CHANNEL_ORDER_NATIVE || + 
codec->ch_layout.u.mask != codec->channel_layout))) { + if (codec->channel_layout) + av_channel_layout_from_mask(&par->ch_layout, codec->channel_layout); + else { + par->ch_layout.order = AV_CHANNEL_ORDER_UNSPEC; + par->ch_layout.nb_channels = codec->channels; + } +FF_ENABLE_DEPRECATION_WARNINGS + } else { +#endif + ret = av_channel_layout_copy(&par->ch_layout, &codec->ch_layout); + if (ret < 0) + return ret; +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + } + par->channel_layout = par->ch_layout.order == AV_CHANNEL_ORDER_NATIVE ? + par->ch_layout.u.mask : 0; + par->channels = par->ch_layout.nb_channels; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + par->sample_rate = codec->sample_rate; + par->block_align = codec->block_align; + par->frame_size = codec->frame_size; + par->initial_padding = codec->initial_padding; + par->trailing_padding = codec->trailing_padding; + par->seek_preroll = codec->seek_preroll; + break; + case AVMEDIA_TYPE_SUBTITLE: + par->width = codec->width; + par->height = codec->height; + break; + } + + if (codec->extradata) { + par->extradata = av_mallocz(codec->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!par->extradata) + return AVERROR(ENOMEM); + memcpy(par->extradata, codec->extradata, codec->extradata_size); + par->extradata_size = codec->extradata_size; + } + + return 0; +} + +int avcodec_parameters_to_context(AVCodecContext *codec, + const AVCodecParameters *par) +{ + int ret; + + codec->codec_type = par->codec_type; + codec->codec_id = par->codec_id; + codec->codec_tag = par->codec_tag; + + codec->bit_rate = par->bit_rate; + codec->bits_per_coded_sample = par->bits_per_coded_sample; + codec->bits_per_raw_sample = par->bits_per_raw_sample; + codec->profile = par->profile; + codec->level = par->level; + + switch (par->codec_type) { + case AVMEDIA_TYPE_VIDEO: + codec->pix_fmt = par->format; + codec->width = par->width; + codec->height = par->height; + codec->field_order = par->field_order; + codec->color_range = 
par->color_range; + codec->color_primaries = par->color_primaries; + codec->color_trc = par->color_trc; + codec->colorspace = par->color_space; + codec->chroma_sample_location = par->chroma_location; + codec->sample_aspect_ratio = par->sample_aspect_ratio; + codec->has_b_frames = par->video_delay; + break; + case AVMEDIA_TYPE_AUDIO: + codec->sample_fmt = par->format; +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + // if the old/new fields are set inconsistently, prefer the old ones + if ((par->channels && par->channels != par->ch_layout.nb_channels) || + (par->channel_layout && (par->ch_layout.order != AV_CHANNEL_ORDER_NATIVE || + par->ch_layout.u.mask != par->channel_layout))) { + if (par->channel_layout) + av_channel_layout_from_mask(&codec->ch_layout, par->channel_layout); + else { + codec->ch_layout.order = AV_CHANNEL_ORDER_UNSPEC; + codec->ch_layout.nb_channels = par->channels; + } +FF_ENABLE_DEPRECATION_WARNINGS + } else { +#endif + ret = av_channel_layout_copy(&codec->ch_layout, &par->ch_layout); + if (ret < 0) + return ret; +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + } + codec->channel_layout = codec->ch_layout.order == AV_CHANNEL_ORDER_NATIVE ? 
+ codec->ch_layout.u.mask : 0; + codec->channels = codec->ch_layout.nb_channels; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + codec->sample_rate = par->sample_rate; + codec->block_align = par->block_align; + codec->frame_size = par->frame_size; + codec->delay = + codec->initial_padding = par->initial_padding; + codec->trailing_padding = par->trailing_padding; + codec->seek_preroll = par->seek_preroll; + break; + case AVMEDIA_TYPE_SUBTITLE: + codec->width = par->width; + codec->height = par->height; + break; + } + + if (par->extradata) { + av_freep(&codec->extradata); + codec->extradata = av_mallocz(par->extradata_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (!codec->extradata) + return AVERROR(ENOMEM); + memcpy(codec->extradata, par->extradata, par->extradata_size); + codec->extradata_size = par->extradata_size; + } + + return 0; +} diff --git a/media/ffvpx/libavcodec/codec_par.h b/media/ffvpx/libavcodec/codec_par.h new file mode 100644 index 0000000000..f51d27c590 --- /dev/null +++ b/media/ffvpx/libavcodec/codec_par.h @@ -0,0 +1,247 @@ +/* + * Codec parameters public API + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */

#ifndef AVCODEC_CODEC_PAR_H
#define AVCODEC_CODEC_PAR_H

#include <stdint.h>

#include "libavutil/avutil.h"
#include "libavutil/channel_layout.h"
#include "libavutil/rational.h"
#include "libavutil/pixfmt.h"

#include "codec_id.h"

/**
 * @addtogroup lavc_core
 * @{
 */

/**
 * Field order of interlaced video (see AVCodecParameters.field_order).
 */
enum AVFieldOrder {
    AV_FIELD_UNKNOWN,
    AV_FIELD_PROGRESSIVE,
    AV_FIELD_TT,          ///< Top coded first, top displayed first
    AV_FIELD_BB,          ///< Bottom coded first, bottom displayed first
    AV_FIELD_TB,          ///< Top coded first, bottom displayed first
    AV_FIELD_BT,          ///< Bottom coded first, top displayed first
};

/**
 * This struct describes the properties of an encoded stream.
 *
 * sizeof(AVCodecParameters) is not a part of the public ABI, this struct must
 * be allocated with avcodec_parameters_alloc() and freed with
 * avcodec_parameters_free().
 */
typedef struct AVCodecParameters {
    /**
     * General type of the encoded data.
     */
    enum AVMediaType codec_type;
    /**
     * Specific type of the encoded data (the codec used).
     */
    enum AVCodecID   codec_id;
    /**
     * Additional information about the codec (corresponds to the AVI FOURCC).
     */
    uint32_t         codec_tag;

    /**
     * Extra binary data needed for initializing the decoder, codec-dependent.
     *
     * Must be allocated with av_malloc() and will be freed by
     * avcodec_parameters_free(). The allocated size of extradata must be at
     * least extradata_size + AV_INPUT_BUFFER_PADDING_SIZE, with the padding
     * bytes zeroed.
     */
    uint8_t *extradata;
    /**
     * Size of the extradata content in bytes.
     */
    int      extradata_size;

    /**
     * - video: the pixel format, the value corresponds to enum AVPixelFormat.
     * - audio: the sample format, the value corresponds to enum AVSampleFormat.
     */
    int format;

    /**
     * The average bitrate of the encoded data (in bits per second).
     */
    int64_t bit_rate;

    /**
     * The number of bits per sample in the coded words.
     *
     * This is basically the bitrate per sample. It is mandatory for a bunch of
     * formats to actually decode them. It's the number of bits for one sample in
     * the actual coded bitstream.
     *
     * This could be for example 4 for ADPCM
     * For PCM formats this matches bits_per_raw_sample
     * Can be 0
     */
    int bits_per_coded_sample;

    /**
     * This is the number of valid bits in each output sample. If the
     * sample format has more bits, the least significant bits are additional
     * padding bits, which are always 0. Use right shifts to reduce the sample
     * to its actual size. For example, audio formats with 24 bit samples will
     * have bits_per_raw_sample set to 24, and format set to AV_SAMPLE_FMT_S32.
     * To get the original sample use "(int32_t)sample >> 8".
     *
     * For ADPCM this might be 12 or 16 or similar
     * Can be 0
     */
    int bits_per_raw_sample;

    /**
     * Codec-specific bitstream restrictions that the stream conforms to.
     */
    int profile;
    int level;

    /**
     * Video only. The dimensions of the video frame in pixels.
     */
    int width;
    int height;

    /**
     * Video only. The aspect ratio (width / height) which a single pixel
     * should have when displayed.
     *
     * When the aspect ratio is unknown / undefined, the numerator should be
     * set to 0 (the denominator may have any value).
     */
    AVRational sample_aspect_ratio;

    /**
     * Video only. The order of the fields in interlaced video.
     */
    enum AVFieldOrder                  field_order;

    /**
     * Video only. Additional colorspace characteristics.
     */
    enum AVColorRange                  color_range;
    enum AVColorPrimaries              color_primaries;
    enum AVColorTransferCharacteristic color_trc;
    enum AVColorSpace                  color_space;
    enum AVChromaLocation              chroma_location;

    /**
     * Video only. Number of delayed frames.
     */
    int video_delay;

#if FF_API_OLD_CHANNEL_LAYOUT
    /**
     * Audio only. The channel layout bitmask. May be 0 if the channel layout is
     * unknown or unspecified, otherwise the number of bits set must be equal to
     * the channels field.
     * @deprecated use ch_layout
     */
    attribute_deprecated
    uint64_t channel_layout;
    /**
     * Audio only. The number of audio channels.
     * @deprecated use ch_layout.nb_channels
     */
    attribute_deprecated
    int      channels;
#endif
    /**
     * Audio only. The number of audio samples per second.
     */
    int      sample_rate;
    /**
     * Audio only. The number of bytes per coded audio frame, required by some
     * formats.
     *
     * Corresponds to nBlockAlign in WAVEFORMATEX.
     */
    int      block_align;
    /**
     * Audio only. Audio frame size, if known. Required by some formats to be static.
     */
    int      frame_size;

    /**
     * Audio only. The amount of padding (in samples) inserted by the encoder at
     * the beginning of the audio. I.e. this number of leading decoded samples
     * must be discarded by the caller to get the original audio without leading
     * padding.
     */
    int initial_padding;
    /**
     * Audio only. The amount of padding (in samples) appended by the encoder to
     * the end of the audio. I.e. this number of decoded samples must be
     * discarded by the caller from the end of the stream to get the original
     * audio without any trailing padding.
     */
    int trailing_padding;
    /**
     * Audio only. Number of samples to skip after a discontinuity.
     */
    int seek_preroll;

    /**
     * Audio only. The channel layout and number of channels.
     */
    AVChannelLayout ch_layout;
} AVCodecParameters;

/**
 * Allocate a new AVCodecParameters and set its fields to default values
 * (unknown/invalid/0). The returned struct must be freed with
 * avcodec_parameters_free().
 */
AVCodecParameters *avcodec_parameters_alloc(void);

+/** + * Free an AVCodecParameters instance and everything associated with it and + * write NULL to the supplied pointer. 
+ */ +void avcodec_parameters_free(AVCodecParameters **par); + +/** + * Copy the contents of src to dst. Any allocated fields in dst are freed and + * replaced with newly allocated duplicates of the corresponding fields in src. + * + * @return >= 0 on success, a negative AVERROR code on failure. + */ +int avcodec_parameters_copy(AVCodecParameters *dst, const AVCodecParameters *src); + +/** + * This function is the same as av_get_audio_frame_duration(), except it works + * with AVCodecParameters instead of an AVCodecContext. + */ +int av_get_audio_frame_duration2(AVCodecParameters *par, int frame_bytes); + +/** + * @} + */ + +#endif // AVCODEC_CODEC_PAR_H diff --git a/media/ffvpx/libavcodec/dct.c b/media/ffvpx/libavcodec/dct.c new file mode 100644 index 0000000000..eeb4d154e0 --- /dev/null +++ b/media/ffvpx/libavcodec/dct.c @@ -0,0 +1,228 @@ +/* + * (I)DCT Transforms + * Copyright (c) 2009 Peter Ross <pross@xvid.org> + * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com> + * Copyright (c) 2010 Vitor Sessak + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * (Inverse) Discrete Cosine Transforms. These are also known as the + * type II and type III DCTs respectively. 
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "libavutil/error.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/mem.h"
+#include "dct.h"
+#include "dct32.h"
+
+/* sin(M_PI * x / (2 * n)), read from the cosine table at index n - x */
+#define SIN(s, n, x) (s->costab[(n) - (x)])
+
+/* cos(M_PI * x / (2 * n)), read from the cosine table at index x */
+#define COS(s, n, x) (s->costab[x])
+
+/**
+ * DST-I handler; ff_dct_init() installs it as dct_calc for DST_I.
+ *
+ * Works in place on data[0 .. n-1] with n = 1 << ctx->nbits: the samples are
+ * combined pairwise using SIN(), passed through the RDFT held in ctx->rdft
+ * and then unpacked. data[0] is overwritten with 0 before anything reads it,
+ * so its input value is ignored.
+ */
+static void dst_calc_I_c(DCTContext *ctx, FFTSample *data)
+{
+    int n = 1 << ctx->nbits;
+    int i;
+
+    data[0] = 0;
+    for (i = 1; i < n / 2; i++) {
+        float tmp1 = data[i ];
+        float tmp2 = data[n - i];
+        float s = SIN(ctx, n, 2 * i);
+
+        s *= tmp1 + tmp2;
+        tmp1 = (tmp1 - tmp2) * 0.5f;
+        data[i] = s + tmp1;
+        data[n - i] = s - tmp1;
+    }
+
+    data[n / 2] *= 2;
+    ctx->rdft.rdft_calc(&ctx->rdft, data);
+
+    data[0] *= 0.5f;
+
+    /* unpack the RDFT output in place; data[i + 2] is read before the next
+     * iteration overwrites it */
+    for (i = 1; i < n - 2; i += 2) {
+        data[i + 1] += data[i - 1];
+        data[i] = -data[i + 2];
+    }
+
+    data[n - 1] = 0;
+}
+
+/**
+ * DCT-I handler; ff_dct_init() installs it as dct_calc for DCT_I.
+ *
+ * Works in place and touches data[0 .. n] with n = 1 << ctx->nbits, i.e. the
+ * buffer must hold n + 1 samples.
+ */
+static void dct_calc_I_c(DCTContext *ctx, FFTSample *data)
+{
+    int n = 1 << ctx->nbits;
+    int i;
+    /* running sum that is stored to data[1] after the RDFT */
+    float next = -0.5f * (data[0] - data[n]);
+
+    for (i = 0; i < n / 2; i++) {
+        float tmp1 = data[i];
+        float tmp2 = data[n - i];
+        float s = SIN(ctx, n, 2 * i);
+        float c = COS(ctx, n, 2 * i);
+
+        c *= tmp1 - tmp2;
+        s *= tmp1 - tmp2;
+
+        next += c;
+
+        tmp1 = (tmp1 + tmp2) * 0.5f;
+        data[i] = tmp1 - s;
+        data[n - i] = tmp1 + s;
+    }
+
+    ctx->rdft.rdft_calc(&ctx->rdft, data);
+    /* move the value the RDFT left in data[1] to the extra slot data[n] */
+    data[n] = data[1];
+    data[1] = next;
+
+    for (i = 3; i <= n; i += 2)
+        data[i] = data[i - 2] - data[i];
+}
+
+/**
+ * DCT-III handler; ff_dct_init() installs it as dct_calc for DCT_III.
+ *
+ * Works in place on data[0 .. n-1] with n = 1 << ctx->nbits: a rotation by
+ * COS()/SIN() walking down from i = n - 2, the RDFT from ctx->rdft, then a
+ * final pass that scales by 1 / n and applies the ctx->csc2 table.
+ */
+static void dct_calc_III_c(DCTContext *ctx, FFTSample *data)
+{
+    int n = 1 << ctx->nbits;
+    int i;
+
+    /* saved before the loop below overwrites data[n - 1] */
+    float next = data[n - 1];
+    float inv_n = 1.0f / n;
+
+    for (i = n - 2; i >= 2; i -= 2) {
+        float val1 = data[i];
+        float val2 = data[i - 1] - data[i + 1];
+        float c = COS(ctx, n, i);
+        float s = SIN(ctx, n, i);
+
+        data[i] = c * val1 + s * val2;
+        data[i + 1] = s * val1 - c * val2;
+    }
+
+    data[1] = 2 * next;
+
+    ctx->rdft.rdft_calc(&ctx->rdft, data);
+
+    for (i = 0; i < n / 2; i++) {
+        float tmp1 = data[i] * inv_n;
+        float tmp2 = data[n - i - 1] * inv_n;
+        float csc = ctx->csc2[i] * (tmp1 - tmp2);
+
+        tmp1 += tmp2;
+        data[i] = tmp1 + csc;
+        data[n - i - 1] = tmp1 - csc;
+    }
+}
+
+/**
+ * DCT-II handler; ff_dct_init() installs it as dct_calc for DCT_II, except
+ * for nbits == 5 where dct32_func() is used instead.
+ *
+ * Works in place on data[0 .. n-1] with n = 1 << ctx->nbits.
+ */
+static void dct_calc_II_c(DCTContext *ctx, FFTSample *data)
+{
+    int n = 1 << ctx->nbits;
+    int i;
+    float next;
+
+    for (i = 0; i < n / 2; i++) {
+        float tmp1 = data[i];
+        float tmp2 = data[n - i - 1];
+        float s = SIN(ctx, n, 2 * i + 1);
+
+        s *= tmp1 - tmp2;
+        tmp1 = (tmp1 + tmp2) * 0.5f;
+
+        data[i] = tmp1 + s;
+        data[n-i-1] = tmp1 - s;
+    }
+
+    ctx->rdft.rdft_calc(&ctx->rdft, data);
+
+    next = data[1] * 0.5;
+    data[1] *= -1;
+
+    /* walk down in pairs; "next" carries the running sum that becomes the
+     * odd-indexed output of each pair */
+    for (i = n - 2; i >= 0; i -= 2) {
+        float inr = data[i ];
+        float ini = data[i + 1];
+        float c = COS(ctx, n, i);
+        float s = SIN(ctx, n, i);
+
+        data[i] = c * inr + s * ini;
+        data[i + 1] = next;
+
+        next += s * inr - c * ini;
+    }
+}
+
+/**
+ * Adapter with the dct_calc signature around ctx->dct32: the same pointer is
+ * passed as output and input, so data is transformed in place.
+ */
+static void dct32_func(DCTContext *ctx, FFTSample *data)
+{
+    ctx->dct32(data, data);
+}
+
+/**
+ * Initialize a DCT/DST context.
+ *
+ * The context is zeroed first. For DCT_II with nbits == 5 only the dct32
+ * adapter is installed; no table or RDFT context is set up in that case.
+ * Otherwise the cosine table, the csc2 table (n / 2 entries) and the embedded
+ * RDFT context are prepared and dct_calc is selected from the transform type.
+ *
+ * @param s       context to initialize
+ * @param nbits   log2 of the transform size n
+ * @param inverse transform type (an enum DCTTransformType, despite the name)
+ * @return 0 on success, AVERROR(ENOMEM) if csc2 cannot be allocated, or the
+ *         negative value returned by ff_rdft_init()
+ */
+av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
+{
+    int n = 1 << nbits;
+    int i;
+    int ret;
+
+    memset(s, 0, sizeof(*s));
+
+    s->nbits = nbits;
+    s->inverse = inverse;
+
+    if (inverse == DCT_II && nbits == 5) {
+        s->dct_calc = dct32_func;
+    } else {
+        ff_init_ff_cos_tabs(nbits + 2);
+
+        s->costab = ff_cos_tabs[nbits + 2];
+        s->csc2 = av_malloc_array(n / 2, sizeof(FFTSample));
+        if (!s->csc2)
+            return AVERROR(ENOMEM);
+
+        if ((ret = ff_rdft_init(&s->rdft, nbits, inverse == DCT_III)) < 0) {
+            av_freep(&s->csc2);
+            return ret;
+        }
+
+        /* csc2[i] = 0.5 / sin(pi * (2 * i + 1) / (2 * n)), used by DCT-III */
+        for (i = 0; i < n / 2; i++)
+            s->csc2[i] = 0.5 / sin((M_PI / (2 * n) * (2 * i + 1)));
+
+        /* NOTE(review): there is no default case; for any other value
+         * dct_calc stays NULL (from the memset) and 0 is still returned */
+        switch (inverse) {
+        case DCT_I : s->dct_calc = dct_calc_I_c; break;
+        case DCT_II : s->dct_calc = dct_calc_II_c; break;
+        case DCT_III: s->dct_calc = dct_calc_III_c; break;
+        case DST_I : s->dct_calc = dst_calc_I_c; break;
+        }
+    }
+
+    s->dct32 = ff_dct32_float;
+#if ARCH_X86
+    ff_dct_init_x86(s);
+#endif
+
+    return 0;
+}
+
+/**
+ * Release what ff_dct_init() set up: ends the embedded RDFT context and
+ * frees the csc2 table.
+ */
+av_cold void ff_dct_end(DCTContext *s)
+{
+    ff_rdft_end(&s->rdft);
+    av_freep(&s->csc2);
+}
diff --git a/media/ffvpx/libavcodec/dct.h b/media/ffvpx/libavcodec/dct.h
new file mode 100644
index 0000000000..0a03e256d1
--- /dev/null
+++ b/media/ffvpx/libavcodec/dct.h
@@ -0,0 +1,69 @@
+/*
+ * (I)DCT Transforms
+ * Copyright (c) 2009 Peter Ross <pross@xvid.org>
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ * Copyright (c) 2010 Vitor Sessak
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
+#define AVCODEC_DCT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "rdft.h"
+
+/* State of one DCT/DST instance; filled in by ff_dct_init(). */
+struct DCTContext {
+    int nbits;              /* log2 of the transform size */
+    int inverse;            /* transform type that was passed to ff_dct_init() */
+    RDFTContext rdft;       /* real FFT used by the dct_calc handlers */
+    const float *costab;    /* cosine table read through the SIN()/COS() macros */
+    FFTSample *csc2;        /* table allocated by ff_dct_init(), freed by ff_dct_end() */
+    void (*dct_calc)(struct DCTContext *s, FFTSample *data); /* in-place transform */
+    void (*dct32)(FFTSample *out, const FFTSample *in);      /* 32-point routine */
+};
+
+/**
+ * Set up DCT.
+ * @param nbits size of the input array:
+ * (1 << nbits) for DCT-II, DCT-III and DST-I
+ * (1 << nbits) + 1 for DCT-I
+ * @param type which transform to compute with this context
+ * @param s context to set up
+ *
+ * @note the first element of the input of DST-I is ignored
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type);
+/** Release the resources held by a context set up with ff_dct_init(). */
+void ff_dct_end (DCTContext *s);
+
+void ff_dct_init_x86(DCTContext *s);
+
+void ff_fdct_ifast(int16_t *data);
+void ff_fdct_ifast248(int16_t *data);
+void ff_jpeg_fdct_islow_8(int16_t *data);
+void ff_jpeg_fdct_islow_10(int16_t *data);
+void ff_fdct248_islow_8(int16_t *data);
+void ff_fdct248_islow_10(int16_t *data);
+
+void ff_j_rev_dct(int16_t *data);
+void ff_j_rev_dct4(int16_t *data);
+void ff_j_rev_dct2(int16_t *data);
+void ff_j_rev_dct1(int16_t *data);
+void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
+#endif /* AVCODEC_DCT_H */
diff --git a/media/ffvpx/libavcodec/dct32.h b/media/ffvpx/libavcodec/dct32.h
new file mode 100644
index 0000000000..61bf223a8d
--- /dev/null
+++ b/media/ffvpx/libavcodec/dct32.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_DCT32_H +#define AVCODEC_DCT32_H + +void ff_dct32_float(float *dst, const float *src); +void ff_dct32_fixed(int *dst, const int *src); + +#endif /* AVCODEC_DCT32_H */ diff --git a/media/ffvpx/libavcodec/dct32_fixed.c b/media/ffvpx/libavcodec/dct32_fixed.c new file mode 100644 index 0000000000..9025d5efdd --- /dev/null +++ b/media/ffvpx/libavcodec/dct32_fixed.c @@ -0,0 +1,20 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define DCT32_FLOAT 0 +#include "dct32_template.c" diff --git a/media/ffvpx/libavcodec/dct32_float.c b/media/ffvpx/libavcodec/dct32_float.c new file mode 100644 index 0000000000..597c9bb639 --- /dev/null +++ b/media/ffvpx/libavcodec/dct32_float.c @@ -0,0 +1,20 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define DCT32_FLOAT 1 +#include "dct32_template.c" diff --git a/media/ffvpx/libavcodec/dct32_template.c b/media/ffvpx/libavcodec/dct32_template.c new file mode 100644 index 0000000000..51cebc053f --- /dev/null +++ b/media/ffvpx/libavcodec/dct32_template.c @@ -0,0 +1,288 @@ +/* + * Template for the Discrete Cosine Transform for 32 samples + * Copyright (c) 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dct32.h"
+#include "mathops.h"
+#include "libavutil/internal.h"
+
+/* Integer type used for intermediate values: signed when CHECKED is defined,
+ * unsigned otherwise. */
+#ifdef CHECKED
+#define SUINT int
+#define SUINT32 int32_t
+#else
+#define SUINT unsigned
+#define SUINT32 uint32_t
+#endif
+
+/* The including file sets DCT32_FLOAT to choose the sample type:
+ *   dct32      - name of the function this template defines
+ *   FIXHR(x)   - constant in the working format (float, or scaled by 2^32)
+ *   MULH3      - x * y * s in float; MULH((s)*(x), y) in fixed point
+ *   INTFLOAT   - sample type of the public interface
+ *   SUINTFLOAT - type of the intermediate values */
+#if DCT32_FLOAT
+# define dct32 ff_dct32_float
+# define FIXHR(x) ((float)(x))
+# define MULH3(x, y, s) ((s)*(y)*(x))
+# define INTFLOAT float
+# define SUINTFLOAT float
+#else
+# define dct32 ff_dct32_fixed
+# define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5))
+# define MULH3(x, y, s) MULH((s)*(x), y)
+# define INTFLOAT int
+# define SUINTFLOAT SUINT
+#endif
+
+
+/* COSj_k = 1.0 / (2.0 * cos(pi*(2*k+1) / 2^(6 - j))), stored divided by a
+ * power of two (the /2, /4, ... inside FIXHR) that the butterflies multiply
+ * back through their 1<<(s) argument */
+
+/* cos(i*pi/64) */
+
+#define COS0_0 FIXHR(0.50060299823519630134/2)
+#define COS0_1 FIXHR(0.50547095989754365998/2)
+#define COS0_2 FIXHR(0.51544730992262454697/2)
+#define COS0_3 FIXHR(0.53104259108978417447/2)
+#define COS0_4 FIXHR(0.55310389603444452782/2)
+#define COS0_5 FIXHR(0.58293496820613387367/2)
+#define COS0_6 FIXHR(0.62250412303566481615/2)
+#define COS0_7 FIXHR(0.67480834145500574602/2)
+#define COS0_8 FIXHR(0.74453627100229844977/2)
+#define COS0_9 FIXHR(0.83934964541552703873/2)
+#define COS0_10 FIXHR(0.97256823786196069369/2)
+#define COS0_11 FIXHR(1.16943993343288495515/4)
+#define COS0_12 FIXHR(1.48416461631416627724/4)
+#define COS0_13 FIXHR(2.05778100995341155085/8)
+#define COS0_14 FIXHR(3.40760841846871878570/8)
+#define COS0_15 FIXHR(10.19000812354805681150/32)
+
+#define COS1_0 FIXHR(0.50241928618815570551/2)
+#define COS1_1 FIXHR(0.52249861493968888062/2)
+#define COS1_2 FIXHR(0.56694403481635770368/2)
+#define COS1_3 FIXHR(0.64682178335999012954/2)
+#define COS1_4 FIXHR(0.78815462345125022473/2)
+#define COS1_5 FIXHR(1.06067768599034747134/4)
+#define COS1_6 FIXHR(1.72244709823833392782/4)
+#define COS1_7 FIXHR(5.10114861868916385802/16)
+
+#define COS2_0 FIXHR(0.50979557910415916894/2)
+#define COS2_1 FIXHR(0.60134488693504528054/2)
+#define COS2_2 FIXHR(0.89997622313641570463/2)
+#define COS2_3 FIXHR(2.56291544774150617881/8)
+
+#define COS3_0 FIXHR(0.54119610014619698439/2)
+#define COS3_1 FIXHR(1.30656296487637652785/4)
+
+#define COS4_0 FIXHR(M_SQRT1_2/2)
+
+/* butterfly operator: val<a> receives the sum, val<b> the difference scaled
+ * by c * (1 << s); uses the tmp0/tmp1 locals of the expanding function */
+#define BF(a, b, c, s)\
+{\
+    tmp0 = val##a + val##b;\
+    tmp1 = val##a - val##b;\
+    val##a = tmp0;\
+    val##b = MULH3(tmp1, c, 1<<(s));\
+}
+
+/* same butterfly, but the operands are loaded from tab[a] and tab[b] */
+#define BF0(a, b, c, s)\
+{\
+    tmp0 = tab[a] + tab[b];\
+    tmp1 = tab[a] - tab[b];\
+    val##a = tmp0;\
+    val##b = MULH3(tmp1, c, 1<<(s));\
+}
+
+/* two COS4_0 butterflies on (a, b) and (c, d), then val<c> += val<d> */
+#define BF1(a, b, c, d)\
+{\
+    BF(a, b, COS4_0, 1);\
+    BF(c, d,-COS4_0, 1);\
+    val##c += val##d;\
+}
+
+/* BF1 followed by three more accumulations across the four values */
+#define BF2(a, b, c, d)\
+{\
+    BF(a, b, COS4_0, 1);\
+    BF(c, d,-COS4_0, 1);\
+    val##c += val##d;\
+    val##a += val##c;\
+    val##c += val##b;\
+    val##b += val##d;\
+}
+
+/* val<a> += val<b>; no trailing semicolon, the caller supplies it */
+#define ADD(a, b) val##a += val##b
+
+/* DCT32 without 1/sqrt(2) coef zero scaling.
+ *
+ * Reads the 32 input samples from tab_arg and writes 32 results to out.
+ * Every input is loaded by one of the "pass 1" BF0 butterflies, and all of
+ * those run before the first store to out.
+ */
+void dct32(INTFLOAT *out, const INTFLOAT *tab_arg)
+{
+    const SUINTFLOAT *tab = tab_arg;
+    SUINTFLOAT tmp0, tmp1;
+
+    /* one local per sample; the BF*/ADD macros address them by pasted index */
+    SUINTFLOAT val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7 ,
+               val8 , val9 , val10, val11, val12, val13, val14, val15,
+               val16, val17, val18, val19, val20, val21, val22, val23,
+               val24, val25, val26, val27, val28, val29, val30, val31;
+
+    /* pass 1 */
+    BF0( 0, 31, COS0_0 , 1);
+    BF0(15, 16, COS0_15, 5);
+    /* pass 2 */
+    BF( 0, 15, COS1_0 , 1);
+    BF(16, 31,-COS1_0 , 1);
+    /* pass 1 */
+    BF0( 7, 24, COS0_7 , 1);
+    BF0( 8, 23, COS0_8 , 1);
+    /* pass 2 */
+    BF( 7, 8, COS1_7 , 4);
+    BF(23, 24,-COS1_7 , 4);
+    /* pass 3 */
+    BF( 0, 7, COS2_0 , 1);
+    BF( 8, 15,-COS2_0 , 1);
+    BF(16, 23, COS2_0 , 1);
+    BF(24, 31,-COS2_0 , 1);
+    /* pass 1 */
+    BF0( 3, 28, COS0_3 , 1);
+    BF0(12, 19, COS0_12, 2);
+    /* pass 2 */
+    BF( 3, 12, COS1_3 , 1);
+    BF(19, 28,-COS1_3 , 1);
+    /* pass 1 */
+    BF0( 4, 27, COS0_4 , 1);
+    BF0(11, 20, COS0_11, 2);
+    /* pass 2 */
+    BF( 4, 11, COS1_4 , 1);
+    BF(20, 27,-COS1_4 , 1);
+    /* pass 3 */
+    BF( 3, 4, COS2_3 , 3);
+    BF(11, 12,-COS2_3 , 3);
+    BF(19, 20, COS2_3 , 3);
+    BF(27, 28,-COS2_3 , 3);
+    /* pass 4 */
+    BF( 0, 3, COS3_0 , 1);
+    BF( 4, 7,-COS3_0 , 1);
+    BF( 8, 11, COS3_0 , 1);
+    BF(12, 15,-COS3_0 , 1);
+    BF(16, 19, COS3_0 , 1);
+    BF(20, 23,-COS3_0 , 1);
+    BF(24, 27, COS3_0 , 1);
+    BF(28, 31,-COS3_0 , 1);
+
+
+
+    /* pass 1 */
+    BF0( 1, 30, COS0_1 , 1);
+    BF0(14, 17, COS0_14, 3);
+    /* pass 2 */
+    BF( 1, 14, COS1_1 , 1);
+    BF(17, 30,-COS1_1 , 1);
+    /* pass 1 */
+    BF0( 6, 25, COS0_6 , 1);
+    BF0( 9, 22, COS0_9 , 1);
+    /* pass 2 */
+    BF( 6, 9, COS1_6 , 2);
+    BF(22, 25,-COS1_6 , 2);
+    /* pass 3 */
+    BF( 1, 6, COS2_1 , 1);
+    BF( 9, 14,-COS2_1 , 1);
+    BF(17, 22, COS2_1 , 1);
+    BF(25, 30,-COS2_1 , 1);
+
+    /* pass 1 */
+    BF0( 2, 29, COS0_2 , 1);
+    BF0(13, 18, COS0_13, 3);
+    /* pass 2 */
+    BF( 2, 13, COS1_2 , 1);
+    BF(18, 29,-COS1_2 , 1);
+    /* pass 1 */
+    BF0( 5, 26, COS0_5 , 1);
+    BF0(10, 21, COS0_10, 1);
+    /* pass 2 */
+    BF( 5, 10, COS1_5 , 2);
+    BF(21, 26,-COS1_5 , 2);
+    /* pass 3 */
+    BF( 2, 5, COS2_2 , 1);
+    BF(10, 13,-COS2_2 , 1);
+    BF(18, 21, COS2_2 , 1);
+    BF(26, 29,-COS2_2 , 1);
+    /* pass 4 */
+    BF( 1, 2, COS3_1 , 2);
+    BF( 5, 6,-COS3_1 , 2);
+    BF( 9, 10, COS3_1 , 2);
+    BF(13, 14,-COS3_1 , 2);
+    BF(17, 18, COS3_1 , 2);
+    BF(21, 22,-COS3_1 , 2);
+    BF(25, 26, COS3_1 , 2);
+    BF(29, 30,-COS3_1 , 2);
+
+    /* pass 5 */
+    BF1( 0, 1, 2, 3);
+    BF2( 4, 5, 6, 7);
+    BF1( 8, 9, 10, 11);
+    BF2(12, 13, 14, 15);
+    BF1(16, 17, 18, 19);
+    BF2(20, 21, 22, 23);
+    BF1(24, 25, 26, 27);
+    BF2(28, 29, 30, 31);
+
+    /* pass 6 */
+
+    ADD( 8, 12);
+    ADD(12, 10);
+    ADD(10, 14);
+    ADD(14, 9);
+    ADD( 9, 13);
+    ADD(13, 11);
+    ADD(11, 15);
+
+    /* val0..val15 go to the even output positions */
+    out[ 0] = val0;
+    out[16] = val1;
+    out[ 8] = val2;
+    out[24] = val3;
+    out[ 4] = val4;
+    out[20] = val5;
+    out[12] = val6;
+    out[28] = val7;
+    out[ 2] = val8;
+    out[18] = val9;
+    out[10] = val10;
+    out[26] = val11;
+    out[ 6] = val12;
+    out[22] = val13;
+    out[14] = val14;
+    out[30] = val15;
+
+    ADD(24, 28);
+    ADD(28, 26);
+    ADD(26, 30);
+    ADD(30, 25);
+    ADD(25, 29);
+    ADD(29, 27);
+    ADD(27, 31);
+
+    /* the odd output positions are built from val16..val31 */
+    out[ 1] = val16 + val24;
+    out[17] = val17 + val25;
+    out[ 9] = val18 + val26;
+    out[25] = val19 + val27;
+    out[ 5] = val20 + val28;
+    out[21] = val21 + val29;
+    out[13] = val22 + val30;
+    out[29] = val23 + val31;
+    out[ 3] = val24 + val20;
+    out[19] = val25 + val21;
+    out[11] = val26 + val22;
+    out[27] = val27 + val23;
+    out[ 7] = val28 + val18;
+    out[23] = val29 + val19;
+    out[15] = val30 + val17;
+    out[31] = val31;
+}
diff --git a/media/ffvpx/libavcodec/decode.c b/media/ffvpx/libavcodec/decode.c
new file mode 100644
index 0000000000..be2be81089
--- /dev/null
+++ b/media/ffvpx/libavcodec/decode.c
@@ -0,0 +1,1687 @@
+/*
+ * generic decoding-related code
+ *
+ * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include <string.h> + +#include "config.h" + +#if CONFIG_ICONV +# include <iconv.h> +#endif + +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" +#include "libavutil/bprint.h" +#include "libavutil/channel_layout.h" +#include "libavutil/common.h" +#include "libavutil/fifo.h" +#include "libavutil/frame.h" +#include "libavutil/hwcontext.h" +#include "libavutil/imgutils.h" +#include "libavutil/internal.h" +#include "libavutil/intmath.h" +#include "libavutil/opt.h" + +#include "avcodec.h" +#include "bytestream.h" +#include "bsf.h" +#include "codec_internal.h" +#include "decode.h" +#include "hwconfig.h" +#include "internal.h" +#include "thread.h" + +static int apply_param_change(AVCodecContext *avctx, const AVPacket *avpkt) +{ + int ret; + size_t size; + const uint8_t *data; + uint32_t flags; + int64_t val; + + data = av_packet_get_side_data(avpkt, AV_PKT_DATA_PARAM_CHANGE, &size); + if (!data) + return 0; + + if (!(avctx->codec->capabilities & AV_CODEC_CAP_PARAM_CHANGE)) { + av_log(avctx, AV_LOG_ERROR, "This decoder does not support parameter " + "changes, but PARAM_CHANGE side data was sent to it.\n"); + ret = AVERROR(EINVAL); + goto fail2; + } + + if (size < 4) + goto fail; + + flags = 
bytestream_get_le32(&data); + size -= 4; + +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT) { + if (size < 4) + goto fail; + val = bytestream_get_le32(&data); + if (val <= 0 || val > INT_MAX) { + av_log(avctx, AV_LOG_ERROR, "Invalid channel count"); + ret = AVERROR_INVALIDDATA; + goto fail2; + } + avctx->channels = val; + size -= 4; + } + if (flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT) { + if (size < 8) + goto fail; + avctx->channel_layout = bytestream_get_le64(&data); + size -= 8; + } +FF_ENABLE_DEPRECATION_WARNINGS +#endif + if (flags & AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE) { + if (size < 4) + goto fail; + val = bytestream_get_le32(&data); + if (val <= 0 || val > INT_MAX) { + av_log(avctx, AV_LOG_ERROR, "Invalid sample rate"); + ret = AVERROR_INVALIDDATA; + goto fail2; + } + avctx->sample_rate = val; + size -= 4; + } + if (flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS) { + if (size < 8) + goto fail; + avctx->width = bytestream_get_le32(&data); + avctx->height = bytestream_get_le32(&data); + size -= 8; + ret = ff_set_dimensions(avctx, avctx->width, avctx->height); + if (ret < 0) + goto fail2; + } + + return 0; +fail: + av_log(avctx, AV_LOG_ERROR, "PARAM_CHANGE side data too small.\n"); + ret = AVERROR_INVALIDDATA; +fail2: + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Error applying parameter changes.\n"); + if (avctx->err_recognition & AV_EF_EXPLODE) + return ret; + } + return 0; +} + +static int extract_packet_props(AVCodecInternal *avci, const AVPacket *pkt) +{ + int ret = 0; + + av_packet_unref(avci->last_pkt_props); + if (pkt) { + ret = av_packet_copy_props(avci->last_pkt_props, pkt); + if (!ret) + avci->last_pkt_props->opaque = (void *)(intptr_t)pkt->size; // Needed for ff_decode_frame_props(). 
+    }
+    return ret;
+}
+
+/**
+ * Create and initialize the decoder's bitstream filter chain from
+ * codec->bsfs. Does nothing if avci->bsf already exists.
+ *
+ * @return 0 on success, a negative AVERROR code on failure; on failure
+ *         avci->bsf is freed again
+ */
+static int decode_bsfs_init(AVCodecContext *avctx)
+{
+    AVCodecInternal *avci = avctx->internal;
+    const FFCodec *const codec = ffcodec(avctx->codec);
+    int ret;
+
+    if (avci->bsf)
+        return 0;
+
+    ret = av_bsf_list_parse_str(codec->bsfs, &avci->bsf);
+    if (ret < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error parsing decoder bitstream filters '%s': %s\n", codec->bsfs, av_err2str(ret));
+        /* every parse failure other than out-of-memory is reported as
+         * AVERROR_BUG */
+        if (ret != AVERROR(ENOMEM))
+            ret = AVERROR_BUG;
+        goto fail;
+    }
+
+    /* We do not currently have an API for passing the input timebase into decoders,
+     * but no filters used here should actually need it.
+     * So we make up some plausible-looking number (the MPEG 90kHz timebase) */
+    avci->bsf->time_base_in = (AVRational){ 1, 90000 };
+    ret = avcodec_parameters_from_context(avci->bsf->par_in, avctx);
+    if (ret < 0)
+        goto fail;
+
+    ret = av_bsf_init(avci->bsf);
+    if (ret < 0)
+        goto fail;
+
+    return 0;
+fail:
+    av_bsf_free(&avci->bsf);
+    return ret;
+}
+
+/**
+ * Fetch the next packet to decode from the bitstream filter chain.
+ *
+ * Unless the codec sets FF_CODEC_CAP_SETS_FRAME_PROPS, the packet properties
+ * are saved through extract_packet_props(); PARAM_CHANGE side data is then
+ * applied to avctx.
+ *
+ * @return 0 on success with pkt filled in; AVERROR_EOF while draining (the
+ *         draining flag is set the first time the filter reports EOF); any
+ *         other negative value from the filter is returned as is. If a later
+ *         step fails, pkt is unreferenced and that error is returned.
+ */
+int ff_decode_get_packet(AVCodecContext *avctx, AVPacket *pkt)
+{
+    AVCodecInternal *avci = avctx->internal;
+    int ret;
+
+    if (avci->draining)
+        return AVERROR_EOF;
+
+    ret = av_bsf_receive_packet(avci->bsf, pkt);
+    if (ret == AVERROR_EOF)
+        avci->draining = 1;
+    if (ret < 0)
+        return ret;
+
+    if (!(ffcodec(avctx->codec)->caps_internal & FF_CODEC_CAP_SETS_FRAME_PROPS)) {
+        ret = extract_packet_props(avctx->internal, pkt);
+        if (ret < 0)
+            goto finish;
+    }
+
+    ret = apply_param_change(avctx, pkt);
+    if (ret < 0)
+        goto finish;
+
+    return 0;
+finish:
+    av_packet_unref(pkt);
+    return ret;
+}
+
+/**
+ * Attempt to guess proper monotonic timestamps for decoded video frames
+ * which might have incorrect times. Input timestamps may wrap around, in
+ * which case the output will as well.
+ *
+ * @param ctx codec context holding the pts_correction_* state that is updated
+ * @param reordered_pts the pts field of the decoded AVPacket, as passed through
+ * AVFrame.pts
+ * @param dts the dts field of the decoded AVPacket
+ * @return one of the input values, may be AV_NOPTS_VALUE
+ */
+static int64_t guess_correct_pts(AVCodecContext *ctx,
+                                 int64_t reordered_pts, int64_t dts)
+{
+    int64_t pts = AV_NOPTS_VALUE;
+
+    /* a timestamp counts as faulty when it does not increase */
+    if (dts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_dts += dts <= ctx->pts_correction_last_dts;
+        ctx->pts_correction_last_dts = dts;
+    } else if (reordered_pts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_dts = reordered_pts;
+
+    if (reordered_pts != AV_NOPTS_VALUE) {
+        ctx->pts_correction_num_faulty_pts += reordered_pts <= ctx->pts_correction_last_pts;
+        ctx->pts_correction_last_pts = reordered_pts;
+    } else if(dts != AV_NOPTS_VALUE)
+        ctx->pts_correction_last_pts = dts;
+
+    /* use the pts when it exists and has not been faulty more often than the
+     * dts (or there is no dts); otherwise fall back to the dts */
+    if ((ctx->pts_correction_num_faulty_pts<=ctx->pts_correction_num_faulty_dts || dts == AV_NOPTS_VALUE)
+       && reordered_pts != AV_NOPTS_VALUE)
+        pts = reordered_pts;
+    else
+        pts = dts;
+
+    return pts;
+}
+
+/*
+ * The core of the receive_frame_wrapper for the decoders implementing
+ * the simple API. Certain decoders might consume partial packets without
+ * returning any output, so this function needs to be called in a loop until it
+ * returns EAGAIN.
+ *
+ * When no frame is output, frame is unreferenced before returning. For audio,
+ * the number of samples dropped here is added to *discarded_samples.
+ * Returns 0 or a negative error code.
+ **/
+static inline int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame, int64_t *discarded_samples)
+{
+    AVCodecInternal *avci = avctx->internal;
+    AVPacket *const pkt = avci->in_pkt;
+    const FFCodec *const codec = ffcodec(avctx->codec);
+    int got_frame, actual_got_frame;
+    int ret;
+
+    /* fetch a new packet only when the previous one is used up */
+    if (!pkt->data && !avci->draining) {
+        av_packet_unref(pkt);
+        ret = ff_decode_get_packet(avctx, pkt);
+        if (ret < 0 && ret != AVERROR_EOF)
+            return ret;
+    }
+
+    // Some codecs (at least wma lossless) will crash when feeding drain packets
+    // after EOF was signaled.
+    if (avci->draining_done)
+        return AVERROR_EOF;
+
+    /* an empty packet is only passed on to decoders with
+     * AV_CODEC_CAP_DELAY or when frame threading is active */
+    if (!pkt->data &&
+        !(avctx->codec->capabilities & AV_CODEC_CAP_DELAY ||
+          avctx->active_thread_type & FF_THREAD_FRAME))
+        return AVERROR_EOF;
+
+    got_frame = 0;
+
+    if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME) {
+        ret = ff_thread_decode_frame(avctx, frame, &got_frame, pkt);
+    } else {
+        ret = codec->cb.decode(avctx, frame, &got_frame, pkt);
+
+        if (!(codec->caps_internal & FF_CODEC_CAP_SETS_PKT_DTS))
+            frame->pkt_dts = pkt->dts;
+        if (avctx->codec->type == AVMEDIA_TYPE_VIDEO) {
+            if(!avctx->has_b_frames)
+                frame->pkt_pos = pkt->pos;
+            //FIXME these should be under if(!avctx->has_b_frames)
+            /* get_buffer is supposed to set frame parameters */
+            if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1)) {
+                if (!frame->sample_aspect_ratio.num) frame->sample_aspect_ratio = avctx->sample_aspect_ratio;
+                if (!frame->width) frame->width = avctx->width;
+                if (!frame->height) frame->height = avctx->height;
+                if (frame->format == AV_PIX_FMT_NONE) frame->format = avctx->pix_fmt;
+            }
+        }
+    }
+    emms_c();
+    /* remember what the decoder reported; got_frame may be cleared below
+     * when the frame is discarded */
+    actual_got_frame = got_frame;
+
+    if (avctx->codec->type == AVMEDIA_TYPE_VIDEO) {
+        if (frame->flags & AV_FRAME_FLAG_DISCARD)
+            got_frame = 0;
+    } else if (avctx->codec->type == AVMEDIA_TYPE_AUDIO) {
+        uint8_t *side;
+        size_t side_size;
+        uint32_t discard_padding = 0;
+        uint8_t skip_reason = 0;
+        uint8_t discard_reason = 0;
+
+        /* fill in frame fields the decoder left unset from the context */
+        if (ret >= 0 && got_frame) {
+            if (frame->format == AV_SAMPLE_FMT_NONE)
+                frame->format = avctx->sample_fmt;
+            if (!frame->ch_layout.nb_channels) {
+                int ret2 = av_channel_layout_copy(&frame->ch_layout, &avctx->ch_layout);
+                if (ret2 < 0) {
+                    ret = ret2;
+                    got_frame = 0;
+                }
+            }
+#if FF_API_OLD_CHANNEL_LAYOUT
+FF_DISABLE_DEPRECATION_WARNINGS
+            if (!frame->channel_layout)
+                frame->channel_layout = avctx->ch_layout.order == AV_CHANNEL_ORDER_NATIVE ?
+                                        avctx->ch_layout.u.mask : 0;
+            if (!frame->channels)
+                frame->channels = avctx->ch_layout.nb_channels;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+            if (!frame->sample_rate)
+                frame->sample_rate = avctx->sample_rate;
+        }
+
+        /* SKIP_SAMPLES side data as read here: 32-bit skip count, 32-bit
+         * discard count, then one byte each for skip and discard reason */
+        side= av_packet_get_side_data(avci->last_pkt_props, AV_PKT_DATA_SKIP_SAMPLES, &side_size);
+        if(side && side_size>=10) {
+            avci->skip_samples = AV_RL32(side);
+            avci->skip_samples = FFMAX(0, avci->skip_samples);
+            discard_padding = AV_RL32(side + 4);
+            av_log(avctx, AV_LOG_DEBUG, "skip %d / discard %d samples due to side data\n",
+                   avci->skip_samples, (int)discard_padding);
+            skip_reason = AV_RL8(side + 8);
+            discard_reason = AV_RL8(side + 9);
+        }
+
+        /* frame flagged for discard: drop it whole */
+        if ((frame->flags & AV_FRAME_FLAG_DISCARD) && got_frame &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            avci->skip_samples = FFMAX(0, avci->skip_samples - frame->nb_samples);
+            got_frame = 0;
+            *discarded_samples += frame->nb_samples;
+        }
+
+        /* drop samples from the start of the frame */
+        if (avci->skip_samples > 0 && got_frame &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if(frame->nb_samples <= avci->skip_samples){
+                got_frame = 0;
+                *discarded_samples += frame->nb_samples;
+                avci->skip_samples -= frame->nb_samples;
+                av_log(avctx, AV_LOG_DEBUG, "skip whole frame, skip left: %d\n",
+                       avci->skip_samples);
+            } else {
+                /* move the kept samples to the front of the same buffers */
+                av_samples_copy(frame->extended_data, frame->extended_data, 0, avci->skip_samples,
+                                frame->nb_samples - avci->skip_samples, avctx->ch_layout.nb_channels, frame->format);
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(avci->skip_samples,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    if(frame->pts!=AV_NOPTS_VALUE)
+                        frame->pts += diff_ts;
+                    if(frame->pkt_dts!=AV_NOPTS_VALUE)
+                        frame->pkt_dts += diff_ts;
+                    if (frame->duration >= diff_ts)
+                        frame->duration -= diff_ts;
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for skipped samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "skip %d/%d samples\n",
+                       avci->skip_samples, frame->nb_samples);
+                *discarded_samples += avci->skip_samples;
+                frame->nb_samples -= avci->skip_samples;
+                avci->skip_samples = 0;
+            }
+        }
+
+        /* drop samples from the end of the frame */
+        if (discard_padding > 0 && discard_padding <= frame->nb_samples && got_frame &&
+            !(avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL)) {
+            if (discard_padding == frame->nb_samples) {
+                *discarded_samples += frame->nb_samples;
+                got_frame = 0;
+            } else {
+                if(avctx->pkt_timebase.num && avctx->sample_rate) {
+                    int64_t diff_ts = av_rescale_q(frame->nb_samples - discard_padding,
+                                                   (AVRational){1, avctx->sample_rate},
+                                                   avctx->pkt_timebase);
+                    frame->duration = diff_ts;
+                } else {
+                    av_log(avctx, AV_LOG_WARNING, "Could not update timestamps for discarded samples.\n");
+                }
+                av_log(avctx, AV_LOG_DEBUG, "discard %d/%d samples\n",
+                       (int)discard_padding, frame->nb_samples);
+                frame->nb_samples -= discard_padding;
+            }
+        }
+
+        /* with AV_CODEC_FLAG2_SKIP_MANUAL nothing is dropped here; the
+         * values are attached to the frame as side data instead */
+        if ((avctx->flags2 & AV_CODEC_FLAG2_SKIP_MANUAL) && got_frame) {
+            AVFrameSideData *fside = av_frame_new_side_data(frame, AV_FRAME_DATA_SKIP_SAMPLES, 10);
+            if (fside) {
+                AV_WL32(fside->data, avci->skip_samples);
+                AV_WL32(fside->data + 4, discard_padding);
+                AV_WL8(fside->data + 8, skip_reason);
+                AV_WL8(fside->data + 9, discard_reason);
+                avci->skip_samples = 0;
+            }
+        }
+    }
+
+    if (avctx->codec->type == AVMEDIA_TYPE_AUDIO &&
+        !avci->showed_multi_packet_warning &&
+        ret >= 0 && ret != pkt->size && !(avctx->codec->capabilities & AV_CODEC_CAP_SUBFRAMES)) {
+        av_log(avctx, AV_LOG_WARNING, "Multiple frames in a packet.\n");
+        avci->showed_multi_packet_warning = 1;
+    }
+
+    if (!got_frame)
+        av_frame_unref(frame);
+
+    /* video packets are always treated as fully consumed */
+    if (ret >= 0 && avctx->codec->type == AVMEDIA_TYPE_VIDEO)
+        ret = pkt->size;
+
+    /* do not stop draining when actual_got_frame != 0 or ret < 0 */
+    /* got_frame == 0 but actual_got_frame != 0 when frame is discarded */
+    if (avci->draining && !actual_got_frame) {
+        if (ret < 0) {
+            /* prevent an infinite loop if a decoder wrongly always returns an error on draining */
+            /* reasonable nb_errors_max = maximum b frames + thread count */
+            int nb_errors_max = 20 + (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME ?
+                                avctx->thread_count : 1);
+
+            if (avci->nb_draining_errors++ >= nb_errors_max) {
+                av_log(avctx, AV_LOG_ERROR, "Too many errors when draining, this is a bug. "
+                       "Stop draining and force EOF.\n");
+                avci->draining_done = 1;
+                ret = AVERROR_BUG;
+            }
+        } else {
+            avci->draining_done = 1;
+        }
+    }
+
+    if (ret >= pkt->size || ret < 0) {
+        av_packet_unref(pkt);
+    } else {
+        /* partially consumed packet: advance past the consumed bytes and
+         * keep the rest for the next call */
+        int consumed = ret;
+
+        pkt->data += consumed;
+        pkt->size -= consumed;
+        pkt->pts = AV_NOPTS_VALUE;
+        pkt->dts = AV_NOPTS_VALUE;
+        if (!(codec->caps_internal & FF_CODEC_CAP_SETS_FRAME_PROPS)) {
+            // See extract_packet_props() comment.
+            avci->last_pkt_props->opaque = (void *)((intptr_t)avci->last_pkt_props->opaque - consumed);
+            avci->last_pkt_props->pts = AV_NOPTS_VALUE;
+            avci->last_pkt_props->dts = AV_NOPTS_VALUE;
+        }
+    }
+
+    if (got_frame)
+        av_assert0(frame->buf[0]);
+
+    return ret < 0 ? ret : 0;
+}
+
+/*
+ * With CONFIG_LCMS2 and AV_CODEC_FLAG2_ICC_PROFILES set: read the frame's ICC
+ * profile side data and, where the profile yields a known value, set
+ * frame->color_primaries and frame->color_trc from it. Returns 0 when there
+ * is nothing to do or on success, a negative error code otherwise.
+ * Without CONFIG_LCMS2 this is a stub that returns 0.
+ */
+#if CONFIG_LCMS2
+static int detect_colorspace(AVCodecContext *avctx, AVFrame *frame)
+{
+    AVCodecInternal *avci = avctx->internal;
+    enum AVColorTransferCharacteristic trc;
+    AVColorPrimariesDesc coeffs;
+    enum AVColorPrimaries prim;
+    cmsHPROFILE profile;
+    AVFrameSideData *sd;
+    int ret;
+    if (!(avctx->flags2 & AV_CODEC_FLAG2_ICC_PROFILES))
+        return 0;
+
+    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_ICC_PROFILE);
+    if (!sd || !sd->size)
+        return 0;
+
+    /* the ICC context is created on first use */
+    if (!avci->icc.avctx) {
+        ret = ff_icc_context_init(&avci->icc, avctx);
+        if (ret < 0)
+            return ret;
+    }
+
+    profile = cmsOpenProfileFromMemTHR(avci->icc.ctx, sd->data, sd->size);
+    if (!profile)
+        return AVERROR_INVALIDDATA;
+
+    ret = ff_icc_profile_read_primaries(&avci->icc, profile, &coeffs);
+    if (!ret)
+        ret = ff_icc_profile_detect_transfer(&avci->icc, profile, &trc);
+    cmsCloseProfile(profile);
+    if (ret < 0)
+        return ret;
+
+    prim = av_csp_primaries_id_from_desc(&coeffs);
+    if (prim != AVCOL_PRI_UNSPECIFIED)
+        frame->color_primaries = prim;
+    if (trc != AVCOL_TRC_UNSPECIFIED)
+        frame->color_trc = trc;
+    return 0;
+}
+#else /* !CONFIG_LCMS2 */
+static int detect_colorspace(av_unused AVCodecContext *c, av_unused AVFrame *f)
+{
+    return 0;
+}
+#endif
+
+/**
+ * Call decode_simple_internal() until it has put a frame into frame or
+ * fails.
+ *
+ * @return 0 once frame->buf[0] is set; AVERROR(EAGAIN) when more than
+ *         avctx->max_samples samples have been discarded during this call;
+ *         otherwise the negative value from decode_simple_internal()
+ */
+static int decode_simple_receive_frame(AVCodecContext *avctx, AVFrame *frame)
+{
+    int ret;
+    int64_t discarded_samples = 0;
+
+    while (!frame->buf[0]) {
+        if (discarded_samples > avctx->max_samples)
+            return AVERROR(EAGAIN);
+        ret = decode_simple_internal(avctx, frame, &discarded_samples);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/**
+ * Obtain one frame from the codec, through its receive_frame callback or
+ * through the simple-API loop, then post-process it: colorspace detection,
+ * best_effort_timestamp and the per-frame post_process hook.
+ *
+ * frame must be empty on entry. On AVERROR_EOF the draining_done flag is
+ * set. The frame's private_ref is released before returning.
+ *
+ * @return 0 on success, a negative error code otherwise
+ */
+static int decode_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)
+{
+    AVCodecInternal *avci = avctx->internal;
+    const FFCodec *const codec = ffcodec(avctx->codec);
+    int ret, ok;
+
+    av_assert0(!frame->buf[0]);
+
+    if (codec->cb_type == FF_CODEC_CB_TYPE_RECEIVE_FRAME) {
+        ret = codec->cb.receive_frame(avctx, frame);
+    } else
+        ret = decode_simple_receive_frame(avctx, frame);
+
+    if (ret == AVERROR_EOF)
+        avci->draining_done = 1;
+
+    /* preserve ret */
+    ok = detect_colorspace(avctx, frame);
+    if (ok < 0) {
+        av_frame_unref(frame);
+        return ok;
+    }
+
+    if (!ret) {
+        frame->best_effort_timestamp = guess_correct_pts(avctx,
+                                                         frame->pts,
+                                                         frame->pkt_dts);
+
+#if FF_API_PKT_DURATION
+FF_DISABLE_DEPRECATION_WARNINGS
+        frame->pkt_duration = frame->duration;
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+        /* the only case where decode data is not set should be decoders
+         * that do not call ff_get_buffer() */
+        av_assert0((frame->private_ref && frame->private_ref->size == sizeof(FrameDecodeData)) ||
+                   !(avctx->codec->capabilities & AV_CODEC_CAP_DR1));
+
+        if (frame->private_ref) {
+            FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
+
+            if (fdd->post_process) {
+                ret = fdd->post_process(avctx, frame);
+                if (ret < 0) {
+                    av_frame_unref(frame);
+                    return ret;
+                }
+            }
+        }
+    }
+
+    /* free the per-frame decode data */
+    av_buffer_unref(&frame->private_ref);
+
+    return ret;
+} + +int attribute_align_arg avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt) +{ + AVCodecInternal *avci = avctx->internal; + int ret; + + if (!avcodec_is_open(avctx) || !av_codec_is_decoder(avctx->codec)) + return AVERROR(EINVAL); + + if (avctx->internal->draining) + return AVERROR_EOF; + + if (avpkt && !avpkt->size && avpkt->data) + return AVERROR(EINVAL); + + av_packet_unref(avci->buffer_pkt); + if (avpkt && (avpkt->data || avpkt->side_data_elems)) { + ret = av_packet_ref(avci->buffer_pkt, avpkt); + if (ret < 0) + return ret; + } + + ret = av_bsf_send_packet(avci->bsf, avci->buffer_pkt); + if (ret < 0) { + av_packet_unref(avci->buffer_pkt); + return ret; + } + + if (!avci->buffer_frame->buf[0]) { + ret = decode_receive_frame_internal(avctx, avci->buffer_frame); + if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) + return ret; + } + + return 0; +} + +static int apply_cropping(AVCodecContext *avctx, AVFrame *frame) +{ + /* make sure we are noisy about decoders returning invalid cropping data */ + if (frame->crop_left >= INT_MAX - frame->crop_right || + frame->crop_top >= INT_MAX - frame->crop_bottom || + (frame->crop_left + frame->crop_right) >= frame->width || + (frame->crop_top + frame->crop_bottom) >= frame->height) { + av_log(avctx, AV_LOG_WARNING, + "Invalid cropping information set by a decoder: " + "%"SIZE_SPECIFIER"/%"SIZE_SPECIFIER"/%"SIZE_SPECIFIER"/%"SIZE_SPECIFIER" " + "(frame size %dx%d). This is a bug, please report it\n", + frame->crop_left, frame->crop_right, frame->crop_top, frame->crop_bottom, + frame->width, frame->height); + frame->crop_left = 0; + frame->crop_right = 0; + frame->crop_top = 0; + frame->crop_bottom = 0; + return 0; + } + + if (!avctx->apply_cropping) + return 0; + + return av_frame_apply_cropping(frame, avctx->flags & AV_CODEC_FLAG_UNALIGNED ? 
+ AV_FRAME_CROP_UNALIGNED : 0); +} + +// make sure frames returned to the caller are valid +static int frame_validate(AVCodecContext *avctx, AVFrame *frame) +{ + if (!frame->buf[0] || frame->format < 0) + goto fail; + + switch (avctx->codec_type) { + case AVMEDIA_TYPE_VIDEO: + if (frame->width <= 0 || frame->height <= 0) + goto fail; + break; + case AVMEDIA_TYPE_AUDIO: + if (!av_channel_layout_check(&frame->ch_layout) || + frame->sample_rate <= 0) + goto fail; + + break; + default: av_assert0(0); + } + + return 0; +fail: + av_log(avctx, AV_LOG_ERROR, "An invalid frame was output by a decoder. " + "This is a bug, please report it.\n"); + return AVERROR_BUG; +} + +int ff_decode_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + AVCodecInternal *avci = avctx->internal; + int ret, changed; + + if (!avcodec_is_open(avctx) || !av_codec_is_decoder(avctx->codec)) + return AVERROR(EINVAL); + + if (avci->buffer_frame->buf[0]) { + av_frame_move_ref(frame, avci->buffer_frame); + } else { + ret = decode_receive_frame_internal(avctx, frame); + if (ret < 0) + return ret; + } + + ret = frame_validate(avctx, frame); + if (ret < 0) + goto fail; + + if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) { + ret = apply_cropping(avctx, frame); + if (ret < 0) + goto fail; + } + + avctx->frame_num++; +#if FF_API_AVCTX_FRAME_NUMBER +FF_DISABLE_DEPRECATION_WARNINGS + avctx->frame_number = avctx->frame_num; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + if (avctx->flags & AV_CODEC_FLAG_DROPCHANGED) { + + if (avctx->frame_num == 1) { + avci->initial_format = frame->format; + switch(avctx->codec_type) { + case AVMEDIA_TYPE_VIDEO: + avci->initial_width = frame->width; + avci->initial_height = frame->height; + break; + case AVMEDIA_TYPE_AUDIO: + avci->initial_sample_rate = frame->sample_rate ? 
frame->sample_rate : + avctx->sample_rate; + ret = av_channel_layout_copy(&avci->initial_ch_layout, &frame->ch_layout); + if (ret < 0) + goto fail; + break; + } + } + + if (avctx->frame_num > 1) { + changed = avci->initial_format != frame->format; + + switch(avctx->codec_type) { + case AVMEDIA_TYPE_VIDEO: + changed |= avci->initial_width != frame->width || + avci->initial_height != frame->height; + break; + case AVMEDIA_TYPE_AUDIO: + changed |= avci->initial_sample_rate != frame->sample_rate || + avci->initial_sample_rate != avctx->sample_rate || + av_channel_layout_compare(&avci->initial_ch_layout, &frame->ch_layout); + break; + } + + if (changed) { + avci->changed_frames_dropped++; + av_log(avctx, AV_LOG_INFO, "dropped changed frame #%"PRId64" pts %"PRId64 + " drop count: %d \n", + avctx->frame_num, frame->pts, + avci->changed_frames_dropped); + ret = AVERROR_INPUT_CHANGED; + goto fail; + } + } + } + return 0; +fail: + av_frame_unref(frame); + return ret; +} + +static void get_subtitle_defaults(AVSubtitle *sub) +{ + memset(sub, 0, sizeof(*sub)); + sub->pts = AV_NOPTS_VALUE; +} + +#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */ +static int recode_subtitle(AVCodecContext *avctx, const AVPacket **outpkt, + const AVPacket *inpkt, AVPacket *buf_pkt) +{ +#if CONFIG_ICONV + iconv_t cd = (iconv_t)-1; + int ret = 0; + char *inb, *outb; + size_t inl, outl; +#endif + + if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_PRE_DECODER || inpkt->size == 0) { + *outpkt = inpkt; + return 0; + } + +#if CONFIG_ICONV + inb = inpkt->data; + inl = inpkt->size; + + if (inl >= INT_MAX / UTF8_MAX_BYTES - AV_INPUT_BUFFER_PADDING_SIZE) { + av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n"); + return AVERROR(ERANGE); + } + + cd = iconv_open("UTF-8", avctx->sub_charenc); + av_assert0(cd != (iconv_t)-1); + + ret = av_new_packet(buf_pkt, inl * UTF8_MAX_BYTES); + if (ret < 0) + goto end; + ret = av_packet_copy_props(buf_pkt, inpkt); + if (ret < 0) 
+ goto end; + outb = buf_pkt->data; + outl = buf_pkt->size; + + if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 || + iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 || + outl >= buf_pkt->size || inl != 0) { + ret = FFMIN(AVERROR(errno), -1); + av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" " + "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc); + goto end; + } + buf_pkt->size -= outl; + memset(buf_pkt->data + buf_pkt->size, 0, outl); + *outpkt = buf_pkt; + + ret = 0; +end: + if (ret < 0) + av_packet_unref(buf_pkt); + if (cd != (iconv_t)-1) + iconv_close(cd); + return ret; +#else + av_log(avctx, AV_LOG_ERROR, "requesting subtitles recoding without iconv"); + return AVERROR(EINVAL); +#endif +} + +static int utf8_check(const uint8_t *str) +{ + const uint8_t *byte; + uint32_t codepoint, min; + + while (*str) { + byte = str; + GET_UTF8(codepoint, *(byte++), return 0;); + min = byte - str == 1 ? 0 : byte - str == 2 ? 0x80 : + 1 << (5 * (byte - str) - 4); + if (codepoint < min || codepoint >= 0x110000 || + codepoint == 0xFFFE /* BOM */ || + codepoint >= 0xD800 && codepoint <= 0xDFFF /* surrogates */) + return 0; + str = byte; + } + return 1; +} + +int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub, + int *got_sub_ptr, const AVPacket *avpkt) +{ + int ret = 0; + + if (!avpkt->data && avpkt->size) { + av_log(avctx, AV_LOG_ERROR, "invalid packet: NULL data, size != 0\n"); + return AVERROR(EINVAL); + } + if (!avctx->codec) + return AVERROR(EINVAL); + if (avctx->codec->type != AVMEDIA_TYPE_SUBTITLE) { + av_log(avctx, AV_LOG_ERROR, "Invalid media type for subtitles\n"); + return AVERROR(EINVAL); + } + + *got_sub_ptr = 0; + get_subtitle_defaults(sub); + + if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size) { + AVCodecInternal *avci = avctx->internal; + const AVPacket *pkt; + + ret = recode_subtitle(avctx, &pkt, avpkt, avci->buffer_pkt); + if (ret < 0) + return ret; + + if (avctx->pkt_timebase.num && avpkt->pts != 
AV_NOPTS_VALUE) + sub->pts = av_rescale_q(avpkt->pts, + avctx->pkt_timebase, AV_TIME_BASE_Q); + ret = ffcodec(avctx->codec)->cb.decode_sub(avctx, sub, got_sub_ptr, pkt); + if (pkt == avci->buffer_pkt) // did we recode? + av_packet_unref(avci->buffer_pkt); + if (ret < 0) { + *got_sub_ptr = 0; + avsubtitle_free(sub); + return ret; + } + av_assert1(!sub->num_rects || *got_sub_ptr); + + if (sub->num_rects && !sub->end_display_time && avpkt->duration && + avctx->pkt_timebase.num) { + AVRational ms = { 1, 1000 }; + sub->end_display_time = av_rescale_q(avpkt->duration, + avctx->pkt_timebase, ms); + } + + if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) + sub->format = 0; + else if (avctx->codec_descriptor->props & AV_CODEC_PROP_TEXT_SUB) + sub->format = 1; + + for (unsigned i = 0; i < sub->num_rects; i++) { + if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_IGNORE && + sub->rects[i]->ass && !utf8_check(sub->rects[i]->ass)) { + av_log(avctx, AV_LOG_ERROR, + "Invalid UTF-8 in decoded subtitles text; " + "maybe missing -sub_charenc option\n"); + avsubtitle_free(sub); + *got_sub_ptr = 0; + return AVERROR_INVALIDDATA; + } + } + + if (*got_sub_ptr) + avctx->frame_num++; +#if FF_API_AVCTX_FRAME_NUMBER +FF_DISABLE_DEPRECATION_WARNINGS + avctx->frame_number = avctx->frame_num; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + } + + return ret; +} + +enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *avctx, + const enum AVPixelFormat *fmt) +{ + const AVPixFmtDescriptor *desc; + const AVCodecHWConfig *config; + int i, n; + + // If a device was supplied when the codec was opened, assume that the + // user wants to use it. 
+ if (avctx->hw_device_ctx && ffcodec(avctx->codec)->hw_configs) { + AVHWDeviceContext *device_ctx = + (AVHWDeviceContext*)avctx->hw_device_ctx->data; + for (i = 0;; i++) { + config = &ffcodec(avctx->codec)->hw_configs[i]->public; + if (!config) + break; + if (!(config->methods & + AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX)) + continue; + if (device_ctx->type != config->device_type) + continue; + for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++) { + if (config->pix_fmt == fmt[n]) + return fmt[n]; + } + } + } + // No device or other setup, so we have to choose from things which + // don't any other external information. + + // If the last element of the list is a software format, choose it + // (this should be best software format if any exist). + for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++); + desc = av_pix_fmt_desc_get(fmt[n - 1]); + if (!(desc->flags & AV_PIX_FMT_FLAG_HWACCEL)) + return fmt[n - 1]; + + // Finally, traverse the list in order and choose the first entry + // with no external dependencies (if there is no hardware configuration + // information available then this just picks the first entry). + for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++) { + for (i = 0;; i++) { + config = avcodec_get_hw_config(avctx->codec, i); + if (!config) + break; + if (config->pix_fmt == fmt[n]) + break; + } + if (!config) { + // No specific config available, so the decoder must be able + // to handle this format without any additional setup. + return fmt[n]; + } + if (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL) { + // Usable with only internal setup. + return fmt[n]; + } + } + + // Nothing is usable, give up. 
+ return AV_PIX_FMT_NONE; +} + +int ff_decode_get_hw_frames_ctx(AVCodecContext *avctx, + enum AVHWDeviceType dev_type) +{ + AVHWDeviceContext *device_ctx; + AVHWFramesContext *frames_ctx; + int ret; + + if (!avctx->hwaccel) + return AVERROR(ENOSYS); + + if (avctx->hw_frames_ctx) + return 0; + if (!avctx->hw_device_ctx) { + av_log(avctx, AV_LOG_ERROR, "A hardware frames or device context is " + "required for hardware accelerated decoding.\n"); + return AVERROR(EINVAL); + } + + device_ctx = (AVHWDeviceContext *)avctx->hw_device_ctx->data; + if (device_ctx->type != dev_type) { + av_log(avctx, AV_LOG_ERROR, "Device type %s expected for hardware " + "decoding, but got %s.\n", av_hwdevice_get_type_name(dev_type), + av_hwdevice_get_type_name(device_ctx->type)); + return AVERROR(EINVAL); + } + + ret = avcodec_get_hw_frames_parameters(avctx, + avctx->hw_device_ctx, + avctx->hwaccel->pix_fmt, + &avctx->hw_frames_ctx); + if (ret < 0) + return ret; + + frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data; + + + if (frames_ctx->initial_pool_size) { + // We guarantee 4 base work surfaces. The function above guarantees 1 + // (the absolute minimum), so add the missing count. 
+ frames_ctx->initial_pool_size += 3; + } + + ret = av_hwframe_ctx_init(avctx->hw_frames_ctx); + if (ret < 0) { + av_buffer_unref(&avctx->hw_frames_ctx); + return ret; + } + + return 0; +} + +int avcodec_get_hw_frames_parameters(AVCodecContext *avctx, + AVBufferRef *device_ref, + enum AVPixelFormat hw_pix_fmt, + AVBufferRef **out_frames_ref) +{ + AVBufferRef *frames_ref = NULL; + const AVCodecHWConfigInternal *hw_config; + const AVHWAccel *hwa; + int i, ret; + + for (i = 0;; i++) { + hw_config = ffcodec(avctx->codec)->hw_configs[i]; + if (!hw_config) + return AVERROR(ENOENT); + if (hw_config->public.pix_fmt == hw_pix_fmt) + break; + } + + hwa = hw_config->hwaccel; + if (!hwa || !hwa->frame_params) + return AVERROR(ENOENT); + + frames_ref = av_hwframe_ctx_alloc(device_ref); + if (!frames_ref) + return AVERROR(ENOMEM); + + ret = hwa->frame_params(avctx, frames_ref); + if (ret >= 0) { + AVHWFramesContext *frames_ctx = (AVHWFramesContext*)frames_ref->data; + + if (frames_ctx->initial_pool_size) { + // If the user has requested that extra output surfaces be + // available then add them here. + if (avctx->extra_hw_frames > 0) + frames_ctx->initial_pool_size += avctx->extra_hw_frames; + + // If frame threading is enabled then an extra surface per thread + // is also required. 
+ if (avctx->active_thread_type & FF_THREAD_FRAME) + frames_ctx->initial_pool_size += avctx->thread_count; + } + + *out_frames_ref = frames_ref; + } else { + av_buffer_unref(&frames_ref); + } + return ret; +} + +static int hwaccel_init(AVCodecContext *avctx, + const AVCodecHWConfigInternal *hw_config) +{ + const AVHWAccel *hwaccel; + int err; + + hwaccel = hw_config->hwaccel; + if (hwaccel->capabilities & AV_HWACCEL_CODEC_CAP_EXPERIMENTAL && + avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) { + av_log(avctx, AV_LOG_WARNING, "Ignoring experimental hwaccel: %s\n", + hwaccel->name); + return AVERROR_PATCHWELCOME; + } + + if (hwaccel->priv_data_size) { + avctx->internal->hwaccel_priv_data = + av_mallocz(hwaccel->priv_data_size); + if (!avctx->internal->hwaccel_priv_data) + return AVERROR(ENOMEM); + } + + avctx->hwaccel = hwaccel; + if (hwaccel->init) { + err = hwaccel->init(avctx); + if (err < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed setup for format %s: " + "hwaccel initialisation returned error.\n", + av_get_pix_fmt_name(hw_config->public.pix_fmt)); + av_freep(&avctx->internal->hwaccel_priv_data); + avctx->hwaccel = NULL; + return err; + } + } + + return 0; +} + +static void hwaccel_uninit(AVCodecContext *avctx) +{ + if (avctx->hwaccel && avctx->hwaccel->uninit) + avctx->hwaccel->uninit(avctx); + + av_freep(&avctx->internal->hwaccel_priv_data); + + avctx->hwaccel = NULL; + + av_buffer_unref(&avctx->hw_frames_ctx); +} + +int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt) +{ + const AVPixFmtDescriptor *desc; + enum AVPixelFormat *choices; + enum AVPixelFormat ret, user_choice; + const AVCodecHWConfigInternal *hw_config; + const AVCodecHWConfig *config; + int i, n, err; + + // Find end of list. + for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++); + // Must contain at least one entry. + av_assert0(n >= 1); + // If a software format is available, it must be the last entry. 
+ desc = av_pix_fmt_desc_get(fmt[n - 1]); + if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) { + // No software format is available. + } else { + avctx->sw_pix_fmt = fmt[n - 1]; + } + + choices = av_memdup(fmt, (n + 1) * sizeof(*choices)); + if (!choices) + return AV_PIX_FMT_NONE; + + for (;;) { + // Remove the previous hwaccel, if there was one. + hwaccel_uninit(avctx); + + user_choice = avctx->get_format(avctx, choices); + if (user_choice == AV_PIX_FMT_NONE) { + // Explicitly chose nothing, give up. + ret = AV_PIX_FMT_NONE; + break; + } + + desc = av_pix_fmt_desc_get(user_choice); + if (!desc) { + av_log(avctx, AV_LOG_ERROR, "Invalid format returned by " + "get_format() callback.\n"); + ret = AV_PIX_FMT_NONE; + break; + } + av_log(avctx, AV_LOG_DEBUG, "Format %s chosen by get_format().\n", + desc->name); + + for (i = 0; i < n; i++) { + if (choices[i] == user_choice) + break; + } + if (i == n) { + av_log(avctx, AV_LOG_ERROR, "Invalid return from get_format(): " + "%s not in possible list.\n", desc->name); + ret = AV_PIX_FMT_NONE; + break; + } + + if (ffcodec(avctx->codec)->hw_configs) { + for (i = 0;; i++) { + hw_config = ffcodec(avctx->codec)->hw_configs[i]; + if (!hw_config) + break; + if (hw_config->public.pix_fmt == user_choice) + break; + } + } else { + hw_config = NULL; + } + + if (!hw_config) { + // No config available, so no extra setup required. 
+ ret = user_choice; + break; + } + config = &hw_config->public; + + if (config->methods & + AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX && + avctx->hw_frames_ctx) { + const AVHWFramesContext *frames_ctx = + (AVHWFramesContext*)avctx->hw_frames_ctx->data; + if (frames_ctx->format != user_choice) { + av_log(avctx, AV_LOG_ERROR, "Invalid setup for format %s: " + "does not match the format of the provided frames " + "context.\n", desc->name); + goto try_again; + } + } else if (config->methods & + AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX && + avctx->hw_device_ctx) { + const AVHWDeviceContext *device_ctx = + (AVHWDeviceContext*)avctx->hw_device_ctx->data; + if (device_ctx->type != config->device_type) { + av_log(avctx, AV_LOG_ERROR, "Invalid setup for format %s: " + "does not match the type of the provided device " + "context.\n", desc->name); + goto try_again; + } + } else if (config->methods & + AV_CODEC_HW_CONFIG_METHOD_INTERNAL) { + // Internal-only setup, no additional configuration. + } else if (config->methods & + AV_CODEC_HW_CONFIG_METHOD_AD_HOC) { + // Some ad-hoc configuration we can't see and can't check. 
+ } else { + av_log(avctx, AV_LOG_ERROR, "Invalid setup for format %s: " + "missing configuration.\n", desc->name); + goto try_again; + } + if (hw_config->hwaccel) { + av_log(avctx, AV_LOG_DEBUG, "Format %s requires hwaccel " + "initialisation.\n", desc->name); + err = hwaccel_init(avctx, hw_config); + if (err < 0) + goto try_again; + } + ret = user_choice; + break; + + try_again: + av_log(avctx, AV_LOG_DEBUG, "Format %s not usable, retrying " + "get_format() without it.\n", desc->name); + for (i = 0; i < n; i++) { + if (choices[i] == user_choice) + break; + } + for (; i + 1 < n; i++) + choices[i] = choices[i + 1]; + --n; + } + + av_freep(&choices); + return ret; +} + +static int add_metadata_from_side_data(const AVPacket *avpkt, AVFrame *frame) +{ + size_t size; + const uint8_t *side_metadata; + + AVDictionary **frame_md = &frame->metadata; + + side_metadata = av_packet_get_side_data(avpkt, + AV_PKT_DATA_STRINGS_METADATA, &size); + return av_packet_unpack_dictionary(side_metadata, size, frame_md); +} + +int ff_decode_frame_props_from_pkt(const AVCodecContext *avctx, + AVFrame *frame, const AVPacket *pkt) +{ + static const struct { + enum AVPacketSideDataType packet; + enum AVFrameSideDataType frame; + } sd[] = { + { AV_PKT_DATA_REPLAYGAIN , AV_FRAME_DATA_REPLAYGAIN }, + { AV_PKT_DATA_DISPLAYMATRIX, AV_FRAME_DATA_DISPLAYMATRIX }, + { AV_PKT_DATA_SPHERICAL, AV_FRAME_DATA_SPHERICAL }, + { AV_PKT_DATA_STEREO3D, AV_FRAME_DATA_STEREO3D }, + { AV_PKT_DATA_AUDIO_SERVICE_TYPE, AV_FRAME_DATA_AUDIO_SERVICE_TYPE }, + { AV_PKT_DATA_MASTERING_DISPLAY_METADATA, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA }, + { AV_PKT_DATA_CONTENT_LIGHT_LEVEL, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL }, + { AV_PKT_DATA_A53_CC, AV_FRAME_DATA_A53_CC }, + { AV_PKT_DATA_ICC_PROFILE, AV_FRAME_DATA_ICC_PROFILE }, + { AV_PKT_DATA_S12M_TIMECODE, AV_FRAME_DATA_S12M_TIMECODE }, + { AV_PKT_DATA_DYNAMIC_HDR10_PLUS, AV_FRAME_DATA_DYNAMIC_HDR_PLUS }, + }; + + frame->pts = pkt->pts; + frame->pkt_pos = pkt->pos; + 
frame->duration = pkt->duration; + frame->pkt_size = pkt->size; + + for (int i = 0; i < FF_ARRAY_ELEMS(sd); i++) { + size_t size; + uint8_t *packet_sd = av_packet_get_side_data(pkt, sd[i].packet, &size); + if (packet_sd) { + AVFrameSideData *frame_sd = av_frame_new_side_data(frame, + sd[i].frame, + size); + if (!frame_sd) + return AVERROR(ENOMEM); + + memcpy(frame_sd->data, packet_sd, size); + } + } + add_metadata_from_side_data(pkt, frame); + + if (pkt->flags & AV_PKT_FLAG_DISCARD) { + frame->flags |= AV_FRAME_FLAG_DISCARD; + } else { + frame->flags = (frame->flags & ~AV_FRAME_FLAG_DISCARD); + } + + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + int ret = av_buffer_replace(&frame->opaque_ref, pkt->opaque_ref); + if (ret < 0) + return ret; + frame->opaque = pkt->opaque; + } + + return 0; +} + +int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame) +{ + const AVPacket *pkt = avctx->internal->last_pkt_props; + + if (!(ffcodec(avctx->codec)->caps_internal & FF_CODEC_CAP_SETS_FRAME_PROPS)) { + int ret = ff_decode_frame_props_from_pkt(avctx, frame, pkt); + if (ret < 0) + return ret; + frame->pkt_size = (int)(intptr_t)pkt->opaque; + } +#if FF_API_REORDERED_OPAQUE +FF_DISABLE_DEPRECATION_WARNINGS + frame->reordered_opaque = avctx->reordered_opaque; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + if (frame->color_primaries == AVCOL_PRI_UNSPECIFIED) + frame->color_primaries = avctx->color_primaries; + if (frame->color_trc == AVCOL_TRC_UNSPECIFIED) + frame->color_trc = avctx->color_trc; + if (frame->colorspace == AVCOL_SPC_UNSPECIFIED) + frame->colorspace = avctx->colorspace; + if (frame->color_range == AVCOL_RANGE_UNSPECIFIED) + frame->color_range = avctx->color_range; + if (frame->chroma_location == AVCHROMA_LOC_UNSPECIFIED) + frame->chroma_location = avctx->chroma_sample_location; + + switch (avctx->codec->type) { + case AVMEDIA_TYPE_VIDEO: + frame->format = avctx->pix_fmt; + if (!frame->sample_aspect_ratio.num) + frame->sample_aspect_ratio = 
avctx->sample_aspect_ratio; + + if (frame->width && frame->height && + av_image_check_sar(frame->width, frame->height, + frame->sample_aspect_ratio) < 0) { + av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %u/%u\n", + frame->sample_aspect_ratio.num, + frame->sample_aspect_ratio.den); + frame->sample_aspect_ratio = (AVRational){ 0, 1 }; + } + + break; + case AVMEDIA_TYPE_AUDIO: + if (!frame->sample_rate) + frame->sample_rate = avctx->sample_rate; + if (frame->format < 0) + frame->format = avctx->sample_fmt; + if (!frame->ch_layout.nb_channels) { + int ret = av_channel_layout_copy(&frame->ch_layout, &avctx->ch_layout); + if (ret < 0) + return ret; + } +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + frame->channels = frame->ch_layout.nb_channels; + frame->channel_layout = frame->ch_layout.order == AV_CHANNEL_ORDER_NATIVE ? + frame->ch_layout.u.mask : 0; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + break; + } + return 0; +} + +static void validate_avframe_allocation(AVCodecContext *avctx, AVFrame *frame) +{ + if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) { + int i; + int num_planes = av_pix_fmt_count_planes(frame->format); + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + int flags = desc ? desc->flags : 0; + if (num_planes == 1 && (flags & AV_PIX_FMT_FLAG_PAL)) + num_planes = 2; + for (i = 0; i < num_planes; i++) { + av_assert0(frame->data[i]); + } + // For formats without data like hwaccel allow unused pointers to be non-NULL. 
+ for (i = num_planes; num_planes > 0 && i < FF_ARRAY_ELEMS(frame->data); i++) { + if (frame->data[i]) + av_log(avctx, AV_LOG_ERROR, "Buffer returned by get_buffer2() did not zero unused plane pointers\n"); + frame->data[i] = NULL; + } + } +} + +static void decode_data_free(void *opaque, uint8_t *data) +{ + FrameDecodeData *fdd = (FrameDecodeData*)data; + + if (fdd->post_process_opaque_free) + fdd->post_process_opaque_free(fdd->post_process_opaque); + + if (fdd->hwaccel_priv_free) + fdd->hwaccel_priv_free(fdd->hwaccel_priv); + + av_freep(&fdd); +} + +int ff_attach_decode_data(AVFrame *frame) +{ + AVBufferRef *fdd_buf; + FrameDecodeData *fdd; + + av_assert1(!frame->private_ref); + av_buffer_unref(&frame->private_ref); + + fdd = av_mallocz(sizeof(*fdd)); + if (!fdd) + return AVERROR(ENOMEM); + + fdd_buf = av_buffer_create((uint8_t*)fdd, sizeof(*fdd), decode_data_free, + NULL, AV_BUFFER_FLAG_READONLY); + if (!fdd_buf) { + av_freep(&fdd); + return AVERROR(ENOMEM); + } + + frame->private_ref = fdd_buf; + + return 0; +} + +int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags) +{ + const AVHWAccel *hwaccel = avctx->hwaccel; + int override_dimensions = 1; + int ret; + + av_assert0(av_codec_is_decoder(avctx->codec)); + + if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) { + if ((unsigned)avctx->width > INT_MAX - STRIDE_ALIGN || + (ret = av_image_check_size2(FFALIGN(avctx->width, STRIDE_ALIGN), avctx->height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx)) < 0 || avctx->pix_fmt<0) { + av_log(avctx, AV_LOG_ERROR, "video_get_buffer: image parameters invalid\n"); + ret = AVERROR(EINVAL); + goto fail; + } + + if (frame->width <= 0 || frame->height <= 0) { + frame->width = FFMAX(avctx->width, AV_CEIL_RSHIFT(avctx->coded_width, avctx->lowres)); + frame->height = FFMAX(avctx->height, AV_CEIL_RSHIFT(avctx->coded_height, avctx->lowres)); + override_dimensions = 0; + } + + if (frame->data[0] || frame->data[1] || frame->data[2] || frame->data[3]) { + av_log(avctx, AV_LOG_ERROR, 
"pic->data[*]!=NULL in get_buffer_internal\n"); + ret = AVERROR(EINVAL); + goto fail; + } + } else if (avctx->codec_type == AVMEDIA_TYPE_AUDIO) { +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + /* compat layer for old-style get_buffer() implementations */ + avctx->channels = avctx->ch_layout.nb_channels; + avctx->channel_layout = (avctx->ch_layout.order == AV_CHANNEL_ORDER_NATIVE) ? + avctx->ch_layout.u.mask : 0; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + if (frame->nb_samples * (int64_t)avctx->ch_layout.nb_channels > avctx->max_samples) { + av_log(avctx, AV_LOG_ERROR, "samples per frame %d, exceeds max_samples %"PRId64"\n", frame->nb_samples, avctx->max_samples); + ret = AVERROR(EINVAL); + goto fail; + } + } + ret = ff_decode_frame_props(avctx, frame); + if (ret < 0) + goto fail; + + if (hwaccel) { + if (hwaccel->alloc_frame) { + ret = hwaccel->alloc_frame(avctx, frame); + goto end; + } + } else + avctx->sw_pix_fmt = avctx->pix_fmt; + + ret = avctx->get_buffer2(avctx, frame, flags); + if (ret < 0) + goto fail; + + validate_avframe_allocation(avctx, frame); + + ret = ff_attach_decode_data(frame); + if (ret < 0) + goto fail; + +end: + if (avctx->codec_type == AVMEDIA_TYPE_VIDEO && !override_dimensions && + !(ffcodec(avctx->codec)->caps_internal & FF_CODEC_CAP_EXPORTS_CROPPING)) { + frame->width = avctx->width; + frame->height = avctx->height; + } + +fail: + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n"); + av_frame_unref(frame); + } + + return ret; +} + +static int reget_buffer_internal(AVCodecContext *avctx, AVFrame *frame, int flags) +{ + AVFrame *tmp; + int ret; + + av_assert0(avctx->codec_type == AVMEDIA_TYPE_VIDEO); + + if (frame->data[0] && (frame->width != avctx->width || frame->height != avctx->height || frame->format != avctx->pix_fmt)) { + av_log(avctx, AV_LOG_WARNING, "Picture changed from size:%dx%d fmt:%s to size:%dx%d fmt:%s in reget buffer()\n", + frame->width, frame->height, 
av_get_pix_fmt_name(frame->format), avctx->width, avctx->height, av_get_pix_fmt_name(avctx->pix_fmt)); + av_frame_unref(frame); + } + + if (!frame->data[0]) + return ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF); + + if ((flags & FF_REGET_BUFFER_FLAG_READONLY) || av_frame_is_writable(frame)) + return ff_decode_frame_props(avctx, frame); + + tmp = av_frame_alloc(); + if (!tmp) + return AVERROR(ENOMEM); + + av_frame_move_ref(tmp, frame); + + ret = ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF); + if (ret < 0) { + av_frame_free(&tmp); + return ret; + } + + av_frame_copy(frame, tmp); + av_frame_free(&tmp); + + return 0; +} + +int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame, int flags) +{ + int ret = reget_buffer_internal(avctx, frame, flags); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "reget_buffer() failed\n"); + return ret; +} + +int ff_decode_preinit(AVCodecContext *avctx) +{ + AVCodecInternal *avci = avctx->internal; + int ret = 0; + + /* if the decoder init function was already called previously, + * free the already allocated subtitle_header before overwriting it */ + av_freep(&avctx->subtitle_header); + + if (avctx->codec->max_lowres < avctx->lowres || avctx->lowres < 0) { + av_log(avctx, AV_LOG_WARNING, "The maximum value for lowres supported by the decoder is %d\n", + avctx->codec->max_lowres); + avctx->lowres = avctx->codec->max_lowres; + } + if (avctx->sub_charenc) { + if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) { + av_log(avctx, AV_LOG_ERROR, "Character encoding is only " + "supported with subtitles codecs\n"); + return AVERROR(EINVAL); + } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) { + av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, " + "subtitles character encoding will be ignored\n", + avctx->codec_descriptor->name); + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING; + } else { + /* input character encoding is set for a text based subtitle + * codec at this point */ + if 
(avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC) + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_PRE_DECODER; + + if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_PRE_DECODER) { +#if CONFIG_ICONV + iconv_t cd = iconv_open("UTF-8", avctx->sub_charenc); + if (cd == (iconv_t)-1) { + ret = AVERROR(errno); + av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context " + "with input character encoding \"%s\"\n", avctx->sub_charenc); + return ret; + } + iconv_close(cd); +#else + av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles " + "conversion needs a libavcodec built with iconv support " + "for this codec\n"); + return AVERROR(ENOSYS); +#endif + } + } + } + + avctx->pts_correction_num_faulty_pts = + avctx->pts_correction_num_faulty_dts = 0; + avctx->pts_correction_last_pts = + avctx->pts_correction_last_dts = INT64_MIN; + + if ( !CONFIG_GRAY && avctx->flags & AV_CODEC_FLAG_GRAY + && avctx->codec_descriptor->type == AVMEDIA_TYPE_VIDEO) + av_log(avctx, AV_LOG_WARNING, + "gray decoding requested but not enabled at configuration time\n"); + if (avctx->flags2 & AV_CODEC_FLAG2_EXPORT_MVS) { + avctx->export_side_data |= AV_CODEC_EXPORT_DATA_MVS; + } + + avci->in_pkt = av_packet_alloc(); + avci->last_pkt_props = av_packet_alloc(); + if (!avci->in_pkt || !avci->last_pkt_props) + return AVERROR(ENOMEM); + + ret = decode_bsfs_init(avctx); + if (ret < 0) + return ret; + + return 0; +} + +int ff_copy_palette(void *dst, const AVPacket *src, void *logctx) +{ + size_t size; + const void *pal = av_packet_get_side_data(src, AV_PKT_DATA_PALETTE, &size); + + if (pal && size == AVPALETTE_SIZE) { + memcpy(dst, pal, AVPALETTE_SIZE); + return 1; + } else if (pal) { + av_log(logctx, AV_LOG_ERROR, + "Palette size %"SIZE_SPECIFIER" is wrong\n", size); + } + return 0; +} diff --git a/media/ffvpx/libavcodec/decode.h b/media/ffvpx/libavcodec/decode.h new file mode 100644 index 0000000000..8430ffbd66 --- /dev/null +++ b/media/ffvpx/libavcodec/decode.h @@ -0,0 +1,153 @@ +/* + * 
generic decoding-related code
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_DECODE_H
#define AVCODEC_DECODE_H

#include "libavutil/buffer.h"
#include "libavutil/frame.h"
#include "libavutil/hwcontext.h"

#include "avcodec.h"

/**
 * This struct stores per-frame lavc-internal data and is attached to the
 * frame via AVFrame.private_ref.
 */
typedef struct FrameDecodeData {
    /**
     * The callback to perform some delayed processing on the frame right
     * before it is returned to the caller.
     *
     * @note This code is called at some unspecified point after the frame is
     * returned from the decoder's decode/receive_frame call. Therefore it cannot rely
     * on AVCodecContext being in any specific state, so it does not get to
     * access AVCodecContext directly at all. All the state it needs must be
     * stored in the post_process_opaque object.
     */
    int (*post_process)(void *logctx, AVFrame *frame);
    /** State for post_process; handed to post_process_opaque_free() on release. */
    void *post_process_opaque;
    /** Optional; called with post_process_opaque when this struct is freed. */
    void (*post_process_opaque_free)(void *opaque);

    /**
     * Per-frame private data for hwaccels.
     */
    void *hwaccel_priv;
    /** Optional; called with hwaccel_priv when this struct is freed. */
    void (*hwaccel_priv_free)(void *priv);
} FrameDecodeData;

/**
 * avcodec_receive_frame() implementation for decoders.
+ */ +int ff_decode_receive_frame(AVCodecContext *avctx, AVFrame *frame); + +/** + * Called by decoders to get the next packet for decoding. + * + * @param pkt An empty packet to be filled with data. + * @return 0 if a new reference has been successfully written to pkt + * AVERROR(EAGAIN) if no data is currently available + * AVERROR_EOF if the end of stream has been reached, so no more data + * will be available + */ +int ff_decode_get_packet(AVCodecContext *avctx, AVPacket *pkt); + +/** + * Set various frame properties from the provided packet. + */ +int ff_decode_frame_props_from_pkt(const AVCodecContext *avctx, + AVFrame *frame, const AVPacket *pkt); + +/** + * Set various frame properties from the codec context / packet data. + */ +int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame); + +/** + * Make sure avctx.hw_frames_ctx is set. If it's not set, the function will + * try to allocate it from hw_device_ctx. If that is not possible, an error + * message is printed, and an error code is returned. + */ +int ff_decode_get_hw_frames_ctx(AVCodecContext *avctx, + enum AVHWDeviceType dev_type); + +int ff_attach_decode_data(AVFrame *frame); + +/** + * Check whether the side-data of src contains a palette of + * size AVPALETTE_SIZE; if so, copy it to dst and return 1; + * else return 0. + * Also emit an error message upon encountering a palette + * with invalid size. + */ +int ff_copy_palette(void *dst, const AVPacket *src, void *logctx); + +/** + * Perform decoder initialization and validation. + * Called when opening the decoder, before the FFCodec.init() call. + */ +int ff_decode_preinit(AVCodecContext *avctx); + +/** + * Check that the provided frame dimensions are valid and set them on the codec + * context. + */ +int ff_set_dimensions(AVCodecContext *s, int width, int height); + +/** + * Check that the provided sample aspect ratio is valid and set it on the codec + * context.
+ */ +int ff_set_sar(AVCodecContext *avctx, AVRational sar); + +/** + * Select the (possibly hardware accelerated) pixel format. + * This is a wrapper around AVCodecContext.get_format() and should be used + * instead of calling get_format() directly. + * + * The list of pixel formats must contain at least one valid entry, and is + * terminated with AV_PIX_FMT_NONE. If it is possible to decode to software, + * the last entry in the list must be the most accurate software format. + * If it is not possible to decode to software, AVCodecContext.sw_pix_fmt + * must be set before calling this function. + */ +int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt); + +/** + * Get a buffer for a frame. This is a wrapper around + * AVCodecContext.get_buffer() and should be used instead of calling get_buffer() + * directly. + */ +int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags); + +#define FF_REGET_BUFFER_FLAG_READONLY 1 ///< the returned buffer does not need to be writable +/** + * Identical in function to ff_get_buffer(), except it reuses the existing buffer + * if available. + */ +int ff_reget_buffer(AVCodecContext *avctx, AVFrame *frame, int flags); + +/** + * Add or update AV_FRAME_DATA_MATRIXENCODING side data. + */ +int ff_side_data_update_matrix_encoding(AVFrame *frame, + enum AVMatrixEncoding matrix_encoding); + +#endif /* AVCODEC_DECODE_H */ diff --git a/media/ffvpx/libavcodec/defs.h b/media/ffvpx/libavcodec/defs.h new file mode 100644 index 0000000000..fbe3254db2 --- /dev/null +++ b/media/ffvpx/libavcodec/defs.h @@ -0,0 +1,192 @@ +/* + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_DEFS_H +#define AVCODEC_DEFS_H + +/** + * @file + * @ingroup libavc + * Misc types and constants that do not belong anywhere else. + */ + +#include <stdint.h> +#include <stdlib.h> + +/** + * @ingroup lavc_decoding + * Required number of additionally allocated bytes at the end of the input bitstream for decoding. + * This is mainly needed because some optimized bitstream readers read + * 32 or 64 bit at once and could read over the end.<br> + * Note: If the first 23 bits of the additional bytes are not 0, then damaged + * MPEG bitstreams could cause overread and segfault. + */ +#define AV_INPUT_BUFFER_PADDING_SIZE 64 + +/** + * Verify checksums embedded in the bitstream (could be of either encoded or + * decoded data, depending on the format) and print an error message on mismatch. + * If AV_EF_EXPLODE is also set, a mismatching checksum will result in the + * decoder/demuxer returning an error. 
+ */ +#define AV_EF_CRCCHECK (1<<0) +#define AV_EF_BITSTREAM (1<<1) ///< detect bitstream specification deviations +#define AV_EF_BUFFER (1<<2) ///< detect improper bitstream length +#define AV_EF_EXPLODE (1<<3) ///< abort decoding on minor error detection + +#define AV_EF_IGNORE_ERR (1<<15) ///< ignore errors and continue +#define AV_EF_CAREFUL (1<<16) ///< consider things that violate the spec, are fast to calculate and have not been seen in the wild as errors +#define AV_EF_COMPLIANT (1<<17) ///< consider all spec non compliances as errors +#define AV_EF_AGGRESSIVE (1<<18) ///< consider things that a sane encoder/muxer should not do as an error + +#define FF_COMPLIANCE_VERY_STRICT 2 ///< Strictly conform to an older more strict version of the spec or reference software. +#define FF_COMPLIANCE_STRICT 1 ///< Strictly conform to all the things in the spec no matter what consequences. +#define FF_COMPLIANCE_NORMAL 0 +#define FF_COMPLIANCE_UNOFFICIAL -1 ///< Allow unofficial extensions +#define FF_COMPLIANCE_EXPERIMENTAL -2 ///< Allow nonstandardized experimental things. + +/** + * @ingroup lavc_decoding + */ +enum AVDiscard{ + /* We leave some space between them for extensions (drop some + * keyframes for intra-only or drop just some bidir frames). 
*/ + AVDISCARD_NONE =-16, ///< discard nothing + AVDISCARD_DEFAULT = 0, ///< discard useless packets like 0 size packets in avi + AVDISCARD_NONREF = 8, ///< discard all non reference + AVDISCARD_BIDIR = 16, ///< discard all bidirectional frames + AVDISCARD_NONINTRA= 24, ///< discard all non intra frames + AVDISCARD_NONKEY = 32, ///< discard all frames except keyframes + AVDISCARD_ALL = 48, ///< discard all +}; + +enum AVAudioServiceType { + AV_AUDIO_SERVICE_TYPE_MAIN = 0, + AV_AUDIO_SERVICE_TYPE_EFFECTS = 1, + AV_AUDIO_SERVICE_TYPE_VISUALLY_IMPAIRED = 2, + AV_AUDIO_SERVICE_TYPE_HEARING_IMPAIRED = 3, + AV_AUDIO_SERVICE_TYPE_DIALOGUE = 4, + AV_AUDIO_SERVICE_TYPE_COMMENTARY = 5, + AV_AUDIO_SERVICE_TYPE_EMERGENCY = 6, + AV_AUDIO_SERVICE_TYPE_VOICE_OVER = 7, + AV_AUDIO_SERVICE_TYPE_KARAOKE = 8, + AV_AUDIO_SERVICE_TYPE_NB , ///< Not part of ABI +}; + +/** + * Pan Scan area. + * This specifies the area which should be displayed. + * Note there may be multiple such areas for one frame. + */ +typedef struct AVPanScan { + /** + * id + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + int id; + + /** + * width and height in 1/16 pel + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + int width; + int height; + + /** + * position of the top left corner in 1/16 pel for up to 3 fields/frames + * - encoding: Set by user. + * - decoding: Set by libavcodec. + */ + int16_t position[3][2]; +} AVPanScan; + +/** + * This structure describes the bitrate properties of an encoded bitstream. It + * roughly corresponds to a subset of the VBV parameters for MPEG-2 or HRD + * parameters for H.264/HEVC. + */ +typedef struct AVCPBProperties { + /** + * Maximum bitrate of the stream, in bits per second. + * Zero if unknown or unspecified. + */ + int64_t max_bitrate; + /** + * Minimum bitrate of the stream, in bits per second. + * Zero if unknown or unspecified. + */ + int64_t min_bitrate; + /** + * Average bitrate of the stream, in bits per second.
+ * Zero if unknown or unspecified. + */ + int64_t avg_bitrate; + + /** + * The size of the buffer to which the ratecontrol is applied, in bits. + * Zero if unknown or unspecified. + */ + int64_t buffer_size; + + /** + * The delay between the time the packet this structure is associated with + * is received and the time when it should be decoded, in periods of a 27MHz + * clock. + * + * UINT64_MAX when unknown or unspecified. + */ + uint64_t vbv_delay; +} AVCPBProperties; + +/** + * Allocate a CPB properties structure and initialize its fields to default + * values. + * + * @param size if non-NULL, the size of the allocated struct will be written + * here. This is useful for embedding it in side data. + * + * @return the newly allocated struct or NULL on failure + */ +AVCPBProperties *av_cpb_properties_alloc(size_t *size); + +/** + * This structure supplies correlation between a packet timestamp and a wall clock + * production time. The definition follows the Producer Reference Time ('prft') + * as defined in ISO/IEC 14496-12 + */ +typedef struct AVProducerReferenceTime { + /** + * A UTC timestamp, in microseconds, since Unix epoch (e.g, av_gettime()). + */ + int64_t wallclock; + int flags; +} AVProducerReferenceTime; + +/** + * Encode extradata length to a buffer. Used by xiph codecs. + * + * @param s buffer to write to; must be at least (v/255+1) bytes long + * @param v size of extradata in bytes + * @return number of bytes written to the buffer. + */ +unsigned int av_xiphlacing(unsigned char *s, unsigned int v); + +#endif // AVCODEC_DEFS_H diff --git a/media/ffvpx/libavcodec/encode.c b/media/ffvpx/libavcodec/encode.c new file mode 100644 index 0000000000..041fc7670e --- /dev/null +++ b/media/ffvpx/libavcodec/encode.c @@ -0,0 +1,774 @@ +/* + * generic encoding-related code + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/channel_layout.h" +#include "libavutil/frame.h" +#include "libavutil/imgutils.h" +#include "libavutil/internal.h" +#include "libavutil/samplefmt.h" + +#include "avcodec.h" +#include "codec_internal.h" +#include "encode.h" +#include "frame_thread_encoder.h" +#include "internal.h" + +int ff_alloc_packet(AVCodecContext *avctx, AVPacket *avpkt, int64_t size) +{ + if (size < 0 || size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) { + av_log(avctx, AV_LOG_ERROR, "Invalid minimum required packet size %"PRId64" (max allowed is %d)\n", + size, INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE); + return AVERROR(EINVAL); + } + + av_assert0(!avpkt->data); + + av_fast_padded_malloc(&avctx->internal->byte_buffer, + &avctx->internal->byte_buffer_size, size); + avpkt->data = avctx->internal->byte_buffer; + if (!avpkt->data) { + av_log(avctx, AV_LOG_ERROR, "Failed to allocate packet of size %"PRId64"\n", size); + return AVERROR(ENOMEM); + } + avpkt->size = size; + + return 0; +} + +int avcodec_default_get_encode_buffer(AVCodecContext *avctx, AVPacket *avpkt, int flags) +{ + int ret; + + if (avpkt->size < 0 || avpkt->size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) + return 
AVERROR(EINVAL); + + if (avpkt->data || avpkt->buf) { + av_log(avctx, AV_LOG_ERROR, "avpkt->{data,buf} != NULL in avcodec_default_get_encode_buffer()\n"); + return AVERROR(EINVAL); + } + + ret = av_buffer_realloc(&avpkt->buf, avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to allocate packet of size %d\n", avpkt->size); + return ret; + } + avpkt->data = avpkt->buf->data; + + return 0; +} + +int ff_get_encode_buffer(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int flags) +{ + int ret; + + if (size < 0 || size > INT_MAX - AV_INPUT_BUFFER_PADDING_SIZE) + return AVERROR(EINVAL); + + av_assert0(!avpkt->data && !avpkt->buf); + + avpkt->size = size; + ret = avctx->get_encode_buffer(avctx, avpkt, flags); + if (ret < 0) + goto fail; + + if (!avpkt->data || !avpkt->buf) { + av_log(avctx, AV_LOG_ERROR, "No buffer returned by get_encode_buffer()\n"); + ret = AVERROR(EINVAL); + goto fail; + } + memset(avpkt->data + avpkt->size, 0, AV_INPUT_BUFFER_PADDING_SIZE); + + ret = 0; +fail: + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "get_encode_buffer() failed\n"); + av_packet_unref(avpkt); + } + + return ret; +} + +static int encode_make_refcounted(AVCodecContext *avctx, AVPacket *avpkt) +{ + uint8_t *data = avpkt->data; + int ret; + + if (avpkt->buf) + return 0; + + avpkt->data = NULL; + ret = ff_get_encode_buffer(avctx, avpkt, avpkt->size, 0); + if (ret < 0) + return ret; + memcpy(avpkt->data, data, avpkt->size); + + return 0; +} + +/** + * Pad last frame with silence. 
+ */ +static int pad_last_frame(AVCodecContext *s, AVFrame *frame, const AVFrame *src, int out_samples) +{ + int ret; + + frame->format = src->format; + frame->nb_samples = out_samples; + ret = av_channel_layout_copy(&frame->ch_layout, &s->ch_layout); + if (ret < 0) + goto fail; + ret = av_frame_get_buffer(frame, 0); + if (ret < 0) + goto fail; + + ret = av_frame_copy_props(frame, src); + if (ret < 0) + goto fail; + + if ((ret = av_samples_copy(frame->extended_data, src->extended_data, 0, 0, + src->nb_samples, s->ch_layout.nb_channels, + s->sample_fmt)) < 0) + goto fail; + if ((ret = av_samples_set_silence(frame->extended_data, src->nb_samples, + frame->nb_samples - src->nb_samples, + s->ch_layout.nb_channels, s->sample_fmt)) < 0) + goto fail; + + return 0; + +fail: + av_frame_unref(frame); + s->internal->last_audio_frame = 0; + return ret; +} + +int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size, + const AVSubtitle *sub) +{ + int ret; + if (sub->start_display_time) { + av_log(avctx, AV_LOG_ERROR, "start_display_time must be 0.\n"); + return -1; + } + + ret = ffcodec(avctx->codec)->cb.encode_sub(avctx, buf, buf_size, sub); + avctx->frame_num++; +#if FF_API_AVCTX_FRAME_NUMBER +FF_DISABLE_DEPRECATION_WARNINGS + avctx->frame_number = avctx->frame_num; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + return ret; +} + +int ff_encode_get_frame(AVCodecContext *avctx, AVFrame *frame) +{ + AVCodecInternal *avci = avctx->internal; + + if (avci->draining) + return AVERROR_EOF; + + if (!avci->buffer_frame->buf[0]) + return AVERROR(EAGAIN); + + av_frame_move_ref(frame, avci->buffer_frame); + + return 0; +} + +int ff_encode_reordered_opaque(AVCodecContext *avctx, + AVPacket *pkt, const AVFrame *frame) +{ +#if FF_API_REORDERED_OPAQUE +FF_DISABLE_DEPRECATION_WARNINGS + avctx->reordered_opaque = frame->reordered_opaque; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) { + int ret = av_buffer_replace(&pkt->opaque_ref, 
frame->opaque_ref); + if (ret < 0) + return ret; + pkt->opaque = frame->opaque; + } + + return 0; +} + +int ff_encode_encode_cb(AVCodecContext *avctx, AVPacket *avpkt, + AVFrame *frame, int *got_packet) +{ + const FFCodec *const codec = ffcodec(avctx->codec); + int ret; + + ret = codec->cb.encode(avctx, avpkt, frame, got_packet); + emms_c(); + av_assert0(ret <= 0); + + if (!ret && *got_packet) { + if (avpkt->data) { + ret = encode_make_refcounted(avctx, avpkt); + if (ret < 0) + goto unref; + // Data returned by encoders must always be ref-counted + av_assert0(avpkt->buf); + } + + // set the timestamps for the simple no-delay case + // encoders with delay have to set the timestamps themselves + if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || + (frame && (codec->caps_internal & FF_CODEC_CAP_EOF_FLUSH))) { + if (avpkt->pts == AV_NOPTS_VALUE) + avpkt->pts = frame->pts; + + if (!avpkt->duration) { + if (frame->duration) + avpkt->duration = frame->duration; + else if (avctx->codec->type == AVMEDIA_TYPE_AUDIO) { + avpkt->duration = ff_samples_to_time_base(avctx, + frame->nb_samples); + } + } + + ret = ff_encode_reordered_opaque(avctx, avpkt, frame); + if (ret < 0) + goto unref; + } + + // dts equals pts unless there is reordering + // there can be no reordering if there is no encoder delay + if (!(avctx->codec_descriptor->props & AV_CODEC_PROP_REORDER) || + !(avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || + (codec->caps_internal & FF_CODEC_CAP_EOF_FLUSH)) + avpkt->dts = avpkt->pts; + } else { +unref: + av_packet_unref(avpkt); + } + + if (frame) + av_frame_unref(frame); + + return ret; +} + +static int encode_simple_internal(AVCodecContext *avctx, AVPacket *avpkt) +{ + AVCodecInternal *avci = avctx->internal; + AVFrame *frame = avci->in_frame; + const FFCodec *const codec = ffcodec(avctx->codec); + int got_packet; + int ret; + + if (avci->draining_done) + return AVERROR_EOF; + + if (!frame->buf[0] && !avci->draining) { + av_frame_unref(frame); + ret =
ff_encode_get_frame(avctx, frame); + if (ret < 0 && ret != AVERROR_EOF) + return ret; + } + + if (!frame->buf[0]) { + if (!(avctx->codec->capabilities & AV_CODEC_CAP_DELAY || + avci->frame_thread_encoder)) + return AVERROR_EOF; + + // Flushing is signaled with a NULL frame + frame = NULL; + } + + got_packet = 0; + + av_assert0(codec->cb_type == FF_CODEC_CB_TYPE_ENCODE); + + if (CONFIG_FRAME_THREAD_ENCODER && avci->frame_thread_encoder) + /* This will unref frame. */ + ret = ff_thread_video_encode_frame(avctx, avpkt, frame, &got_packet); + else { + ret = ff_encode_encode_cb(avctx, avpkt, frame, &got_packet); + } + + if (avci->draining && !got_packet) + avci->draining_done = 1; + + return ret; +} + +static int encode_simple_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) +{ + int ret; + + while (!avpkt->data && !avpkt->side_data) { + ret = encode_simple_internal(avctx, avpkt); + if (ret < 0) + return ret; + } + + return 0; +} + +static int encode_receive_packet_internal(AVCodecContext *avctx, AVPacket *avpkt) +{ + AVCodecInternal *avci = avctx->internal; + int ret; + + if (avci->draining_done) + return AVERROR_EOF; + + av_assert0(!avpkt->data && !avpkt->side_data); + + if (avctx->codec->type == AVMEDIA_TYPE_VIDEO) { + if ((avctx->flags & AV_CODEC_FLAG_PASS1) && avctx->stats_out) + avctx->stats_out[0] = '\0'; + if (av_image_check_size2(avctx->width, avctx->height, avctx->max_pixels, AV_PIX_FMT_NONE, 0, avctx)) + return AVERROR(EINVAL); + } + + if (ffcodec(avctx->codec)->cb_type == FF_CODEC_CB_TYPE_RECEIVE_PACKET) { + ret = ffcodec(avctx->codec)->cb.receive_packet(avctx, avpkt); + if (ret < 0) + av_packet_unref(avpkt); + else + // Encoders must always return ref-counted buffers. + // Side-data only packets have no data and can be not ref-counted. 
+ av_assert0(!avpkt->data || avpkt->buf); + } else + ret = encode_simple_receive_packet(avctx, avpkt); + if (ret >= 0) + avpkt->flags |= avci->intra_only_flag; + + if (ret == AVERROR_EOF) + avci->draining_done = 1; + + return ret; +} + +#if CONFIG_LCMS2 +static int encode_generate_icc_profile(AVCodecContext *avctx, AVFrame *frame) +{ + enum AVColorTransferCharacteristic trc = frame->color_trc; + enum AVColorPrimaries prim = frame->color_primaries; + const FFCodec *const codec = ffcodec(avctx->codec); + AVCodecInternal *avci = avctx->internal; + cmsHPROFILE profile; + int ret; + + /* don't generate ICC profiles if disabled or unsupported */ + if (!(avctx->flags2 & AV_CODEC_FLAG2_ICC_PROFILES)) + return 0; + if (!(codec->caps_internal & FF_CODEC_CAP_ICC_PROFILES)) + return 0; + + if (trc == AVCOL_TRC_UNSPECIFIED) + trc = avctx->color_trc; + if (prim == AVCOL_PRI_UNSPECIFIED) + prim = avctx->color_primaries; + if (trc == AVCOL_TRC_UNSPECIFIED || prim == AVCOL_PRI_UNSPECIFIED) + return 0; /* can't generate ICC profile with missing csp tags */ + + if (av_frame_get_side_data(frame, AV_FRAME_DATA_ICC_PROFILE)) + return 0; /* don't overwrite existing ICC profile */ + + if (!avci->icc.avctx) { + ret = ff_icc_context_init(&avci->icc, avctx); + if (ret < 0) + return ret; + } + + ret = ff_icc_profile_generate(&avci->icc, prim, trc, &profile); + if (ret < 0) + return ret; + + ret = ff_icc_profile_attach(&avci->icc, profile, frame); + cmsCloseProfile(profile); + return ret; +} +#else /* !CONFIG_LCMS2 */ +static int encode_generate_icc_profile(av_unused AVCodecContext *c, av_unused AVFrame *f) +{ + return 0; +} +#endif + +static int encode_send_frame_internal(AVCodecContext *avctx, const AVFrame *src) +{ + AVCodecInternal *avci = avctx->internal; + AVFrame *dst = avci->buffer_frame; + int ret; + + if (avctx->codec->type == AVMEDIA_TYPE_AUDIO) { + /* extract audio service type metadata */ + AVFrameSideData *sd = av_frame_get_side_data(src, AV_FRAME_DATA_AUDIO_SERVICE_TYPE); + if 
(sd && sd->size >= sizeof(enum AVAudioServiceType)) + avctx->audio_service_type = *(enum AVAudioServiceType*)sd->data; + + /* check for valid frame size */ + if (!(avctx->codec->capabilities & AV_CODEC_CAP_VARIABLE_FRAME_SIZE)) { + /* if we already got an undersized frame, that must have been the last */ + if (avctx->internal->last_audio_frame) { + av_log(avctx, AV_LOG_ERROR, "frame_size (%d) was not respected for a non-last frame\n", avctx->frame_size); + return AVERROR(EINVAL); + } + if (src->nb_samples > avctx->frame_size) { + av_log(avctx, AV_LOG_ERROR, "nb_samples (%d) > frame_size (%d)\n", src->nb_samples, avctx->frame_size); + return AVERROR(EINVAL); + } + if (src->nb_samples < avctx->frame_size) { + avctx->internal->last_audio_frame = 1; + if (!(avctx->codec->capabilities & AV_CODEC_CAP_SMALL_LAST_FRAME)) { + int pad_samples = avci->pad_samples ? avci->pad_samples : avctx->frame_size; + int out_samples = (src->nb_samples + pad_samples - 1) / pad_samples * pad_samples; + + if (out_samples != src->nb_samples) { + ret = pad_last_frame(avctx, dst, src, out_samples); + if (ret < 0) + return ret; + goto finish; + } + } + } + } + } + + ret = av_frame_ref(dst, src); + if (ret < 0) + return ret; + +finish: + +#if FF_API_PKT_DURATION +FF_DISABLE_DEPRECATION_WARNINGS + if (dst->pkt_duration && dst->pkt_duration != dst->duration) + dst->duration = dst->pkt_duration; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + if (avctx->codec->type == AVMEDIA_TYPE_VIDEO) { + ret = encode_generate_icc_profile(avctx, dst); + if (ret < 0) + return ret; + } + + // unset frame duration unless AV_CODEC_FLAG_FRAME_DURATION is set, + // since otherwise we cannot be sure that whatever value it has is in the + // right timebase, so we would produce an incorrect value, which is worse + // than none at all + if (!(avctx->flags & AV_CODEC_FLAG_FRAME_DURATION)) + dst->duration = 0; + + return 0; +} + +int attribute_align_arg avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame) +{ + 
AVCodecInternal *avci = avctx->internal; + int ret; + + if (!avcodec_is_open(avctx) || !av_codec_is_encoder(avctx->codec)) + return AVERROR(EINVAL); + + if (avci->draining) + return AVERROR_EOF; + + if (avci->buffer_frame->buf[0]) + return AVERROR(EAGAIN); + + if (!frame) { + avci->draining = 1; + } else { + ret = encode_send_frame_internal(avctx, frame); + if (ret < 0) + return ret; + } + + if (!avci->buffer_pkt->data && !avci->buffer_pkt->side_data) { + ret = encode_receive_packet_internal(avctx, avci->buffer_pkt); + if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) + return ret; + } + + avctx->frame_num++; +#if FF_API_AVCTX_FRAME_NUMBER +FF_DISABLE_DEPRECATION_WARNINGS + avctx->frame_number = avctx->frame_num; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + return 0; +} + +int attribute_align_arg avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt) +{ + AVCodecInternal *avci = avctx->internal; + int ret; + + av_packet_unref(avpkt); + + if (!avcodec_is_open(avctx) || !av_codec_is_encoder(avctx->codec)) + return AVERROR(EINVAL); + + if (avci->buffer_pkt->data || avci->buffer_pkt->side_data) { + av_packet_move_ref(avpkt, avci->buffer_pkt); + } else { + ret = encode_receive_packet_internal(avctx, avpkt); + if (ret < 0) + return ret; + } + + return 0; +} + +static int encode_preinit_video(AVCodecContext *avctx) +{ + const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(avctx->pix_fmt); + int i; + + if (avctx->codec->pix_fmts) { + for (i = 0; avctx->codec->pix_fmts[i] != AV_PIX_FMT_NONE; i++) + if (avctx->pix_fmt == avctx->codec->pix_fmts[i]) + break; + if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_NONE) { + char buf[128]; + snprintf(buf, sizeof(buf), "%d", avctx->pix_fmt); + av_log(avctx, AV_LOG_ERROR, "Specified pixel format %s is invalid or not supported\n", + (char *)av_x_if_null(av_get_pix_fmt_name(avctx->pix_fmt), buf)); + return AVERROR(EINVAL); + } + if (avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ420P || + avctx->codec->pix_fmts[i] == 
AV_PIX_FMT_YUVJ411P || + avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ422P || + avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ440P || + avctx->codec->pix_fmts[i] == AV_PIX_FMT_YUVJ444P) + avctx->color_range = AVCOL_RANGE_JPEG; + } + + if ( avctx->bits_per_raw_sample < 0 + || (avctx->bits_per_raw_sample > 8 && pixdesc->comp[0].depth <= 8)) { + av_log(avctx, AV_LOG_WARNING, "Specified bit depth %d not possible with the specified pixel formats depth %d\n", + avctx->bits_per_raw_sample, pixdesc->comp[0].depth); + avctx->bits_per_raw_sample = pixdesc->comp[0].depth; + } + if (avctx->width <= 0 || avctx->height <= 0) { + av_log(avctx, AV_LOG_ERROR, "dimensions not set\n"); + return AVERROR(EINVAL); + } + + if (avctx->ticks_per_frame && avctx->time_base.num && + avctx->ticks_per_frame > INT_MAX / avctx->time_base.num) { + av_log(avctx, AV_LOG_ERROR, + "ticks_per_frame %d too large for the timebase %d/%d.", + avctx->ticks_per_frame, + avctx->time_base.num, + avctx->time_base.den); + return AVERROR(EINVAL); + } + + if (avctx->hw_frames_ctx) { + AVHWFramesContext *frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data; + if (frames_ctx->format != avctx->pix_fmt) { + av_log(avctx, AV_LOG_ERROR, + "Mismatching AVCodecContext.pix_fmt and AVHWFramesContext.format\n"); + return AVERROR(EINVAL); + } + if (avctx->sw_pix_fmt != AV_PIX_FMT_NONE && + avctx->sw_pix_fmt != frames_ctx->sw_format) { + av_log(avctx, AV_LOG_ERROR, + "Mismatching AVCodecContext.sw_pix_fmt (%s) " + "and AVHWFramesContext.sw_format (%s)\n", + av_get_pix_fmt_name(avctx->sw_pix_fmt), + av_get_pix_fmt_name(frames_ctx->sw_format)); + return AVERROR(EINVAL); + } + avctx->sw_pix_fmt = frames_ctx->sw_format; + } + + return 0; +} + +static int encode_preinit_audio(AVCodecContext *avctx) +{ + int i; + + if (avctx->codec->sample_fmts) { + for (i = 0; avctx->codec->sample_fmts[i] != AV_SAMPLE_FMT_NONE; i++) { + if (avctx->sample_fmt == avctx->codec->sample_fmts[i]) + break; + if (avctx->ch_layout.nb_channels == 1 && + 
av_get_planar_sample_fmt(avctx->sample_fmt) == + av_get_planar_sample_fmt(avctx->codec->sample_fmts[i])) { + avctx->sample_fmt = avctx->codec->sample_fmts[i]; + break; + } + } + if (avctx->codec->sample_fmts[i] == AV_SAMPLE_FMT_NONE) { + char buf[128]; + snprintf(buf, sizeof(buf), "%d", avctx->sample_fmt); + av_log(avctx, AV_LOG_ERROR, "Specified sample format %s is invalid or not supported\n", + (char *)av_x_if_null(av_get_sample_fmt_name(avctx->sample_fmt), buf)); + return AVERROR(EINVAL); + } + } + if (avctx->codec->supported_samplerates) { + for (i = 0; avctx->codec->supported_samplerates[i] != 0; i++) + if (avctx->sample_rate == avctx->codec->supported_samplerates[i]) + break; + if (avctx->codec->supported_samplerates[i] == 0) { + av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n", + avctx->sample_rate); + return AVERROR(EINVAL); + } + } + if (avctx->sample_rate < 0) { + av_log(avctx, AV_LOG_ERROR, "Specified sample rate %d is not supported\n", + avctx->sample_rate); + return AVERROR(EINVAL); + } + if (avctx->codec->ch_layouts) { + for (i = 0; avctx->codec->ch_layouts[i].nb_channels; i++) { + if (!av_channel_layout_compare(&avctx->ch_layout, &avctx->codec->ch_layouts[i])) + break; + } + if (!avctx->codec->ch_layouts[i].nb_channels) { + char buf[512]; + int ret = av_channel_layout_describe(&avctx->ch_layout, buf, sizeof(buf)); + if (ret > 0) + av_log(avctx, AV_LOG_ERROR, "Specified channel layout '%s' is not supported\n", buf); + return AVERROR(EINVAL); + } + } + + if (!avctx->bits_per_raw_sample) + avctx->bits_per_raw_sample = 8 * av_get_bytes_per_sample(avctx->sample_fmt); + + return 0; +} + +int ff_encode_preinit(AVCodecContext *avctx) +{ + AVCodecInternal *avci = avctx->internal; + int ret = 0; + + if (avctx->time_base.num <= 0 || avctx->time_base.den <= 0) { + av_log(avctx, AV_LOG_ERROR, "The encoder timebase is not set.\n"); + return AVERROR(EINVAL); + } + + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE && + 
!(avctx->codec->capabilities & AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE)) { + av_log(avctx, AV_LOG_ERROR, "The copy_opaque flag is set, but the " + "encoder does not support it.\n"); + return AVERROR(EINVAL); + } + + switch (avctx->codec_type) { + case AVMEDIA_TYPE_VIDEO: ret = encode_preinit_video(avctx); break; + case AVMEDIA_TYPE_AUDIO: ret = encode_preinit_audio(avctx); break; + } + if (ret < 0) + return ret; + + if ( (avctx->codec_type == AVMEDIA_TYPE_VIDEO || avctx->codec_type == AVMEDIA_TYPE_AUDIO) + && avctx->bit_rate>0 && avctx->bit_rate<1000) { + av_log(avctx, AV_LOG_WARNING, "Bitrate %"PRId64" is extremely low, maybe you mean %"PRId64"k\n", avctx->bit_rate, avctx->bit_rate); + } + + if (!avctx->rc_initial_buffer_occupancy) + avctx->rc_initial_buffer_occupancy = avctx->rc_buffer_size * 3LL / 4; + + if (avctx->codec_descriptor->props & AV_CODEC_PROP_INTRA_ONLY) + avctx->internal->intra_only_flag = AV_PKT_FLAG_KEY; + + if (ffcodec(avctx->codec)->cb_type == FF_CODEC_CB_TYPE_ENCODE) { + avci->in_frame = av_frame_alloc(); + if (!avci->in_frame) + return AVERROR(ENOMEM); + } + + if ((avctx->flags & AV_CODEC_FLAG_RECON_FRAME)) { + if (!(avctx->codec->capabilities & AV_CODEC_CAP_ENCODER_RECON_FRAME)) { + av_log(avctx, AV_LOG_ERROR, "Reconstructed frame output requested " + "from an encoder not supporting it\n"); + return AVERROR(ENOSYS); + } + + avci->recon_frame = av_frame_alloc(); + if (!avci->recon_frame) + return AVERROR(ENOMEM); + } + + if (CONFIG_FRAME_THREAD_ENCODER) { + ret = ff_frame_thread_encoder_init(avctx); + if (ret < 0) + return ret; + } + + return 0; +} + +int ff_encode_alloc_frame(AVCodecContext *avctx, AVFrame *frame) +{ + int ret; + + switch (avctx->codec->type) { + case AVMEDIA_TYPE_VIDEO: + frame->format = avctx->pix_fmt; + if (frame->width <= 0 || frame->height <= 0) { + frame->width = FFMAX(avctx->width, avctx->coded_width); + frame->height = FFMAX(avctx->height, avctx->coded_height); + } + + break; + case AVMEDIA_TYPE_AUDIO: + 
frame->sample_rate = avctx->sample_rate; + frame->format = avctx->sample_fmt; + if (!frame->ch_layout.nb_channels) { + ret = av_channel_layout_copy(&frame->ch_layout, &avctx->ch_layout); + if (ret < 0) + return ret; + } + break; + } + + ret = avcodec_default_get_buffer2(avctx, frame, 0); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n"); + av_frame_unref(frame); + return ret; + } + + return 0; +} + +int ff_encode_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + AVCodecInternal *avci = avctx->internal; + + if (!avci->recon_frame) + return AVERROR(EINVAL); + if (!avci->recon_frame->buf[0]) + return avci->draining_done ? AVERROR_EOF : AVERROR(EAGAIN); + + av_frame_move_ref(frame, avci->recon_frame); + return 0; +} diff --git a/media/ffvpx/libavcodec/encode.h b/media/ffvpx/libavcodec/encode.h new file mode 100644 index 0000000000..26a3304045 --- /dev/null +++ b/media/ffvpx/libavcodec/encode.h @@ -0,0 +1,99 @@ +/* + * generic encoding-related code + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ENCODE_H +#define AVCODEC_ENCODE_H + +#include "libavutil/frame.h" + +#include "avcodec.h" +#include "packet.h" + +/** + * avcodec_receive_frame() implementation for encoders. 
+ */ +int ff_encode_receive_frame(AVCodecContext *avctx, AVFrame *frame); + +/** + * Called by encoders to get the next frame for encoding. + * + * @param frame An empty frame to be filled with data. + * @return 0 if a new reference has been successfully written to frame + * AVERROR(EAGAIN) if no data is currently available + * AVERROR_EOF if end of stream has been reached, so no more data + * will be available + */ +int ff_encode_get_frame(AVCodecContext *avctx, AVFrame *frame); + +/** + * Get a buffer for a packet. This is a wrapper around + * AVCodecContext.get_encode_buffer() and should be used instead of calling get_encode_buffer() + * directly. + */ +int ff_get_encode_buffer(AVCodecContext *avctx, AVPacket *avpkt, int64_t size, int flags); + +/** + * Allocate buffers for a frame. Encoder equivalent to ff_get_buffer(). + */ +int ff_encode_alloc_frame(AVCodecContext *avctx, AVFrame *frame); + +/** + * Check AVPacket size and allocate data. + * + * Encoders of type FF_CODEC_CB_TYPE_ENCODE can use this as a convenience to + * obtain a big enough buffer for the encoded bitstream. + * + * @param avctx the AVCodecContext of the encoder + * @param avpkt The AVPacket: on success, avpkt->data will point to a buffer + * of size at least `size`; the packet will not be refcounted. + * This packet must be initially blank. + * @param size an upper bound of the size of the packet to encode + * @return non negative on success, negative error code on failure + */ +int ff_alloc_packet(AVCodecContext *avctx, AVPacket *avpkt, int64_t size); + +/** + * Propagate user opaque values from the frame to avctx/pkt as needed. + */ +int ff_encode_reordered_opaque(AVCodecContext *avctx, + AVPacket *pkt, const AVFrame *frame); + +/* + * Perform encoder initialization and validation. + * Called when opening the encoder, before the FFCodec.init() call. 
+ */ +int ff_encode_preinit(AVCodecContext *avctx); + +int ff_encode_encode_cb(AVCodecContext *avctx, AVPacket *avpkt, + AVFrame *frame, int *got_packet); + +/** + * Rescale from sample rate to AVCodecContext.time_base. + */ +static av_always_inline int64_t ff_samples_to_time_base(const AVCodecContext *avctx, + int64_t samples) +{ + if (samples == AV_NOPTS_VALUE) + return AV_NOPTS_VALUE; + return av_rescale_q(samples, (AVRational){ 1, avctx->sample_rate }, + avctx->time_base); +} + +#endif /* AVCODEC_ENCODE_H */ diff --git a/media/ffvpx/libavcodec/error_resilience.h b/media/ffvpx/libavcodec/error_resilience.h new file mode 100644 index 0000000000..47cc8a4fc6 --- /dev/null +++ b/media/ffvpx/libavcodec/error_resilience.h @@ -0,0 +1,97 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/*
 * Status flags passed as the `status` argument of ff_er_add_slice().
 * Each flag is a distinct single bit, so they can be OR-ed together.
 */
/// current MB is the first after a resync marker
#define VP_START               1
#define ER_AC_ERROR            2
#define ER_DC_ERROR            4
#define ER_MV_ERROR            8
#define ER_AC_END              16
#define ER_DC_END              32
#define ER_MV_END              64

/* Convenience masks: all three "error" bits / all three "end" bits. */
#define ER_MB_ERROR (ER_AC_ERROR|ER_DC_ERROR|ER_MV_ERROR)
#define ER_MB_END   (ER_AC_END|ER_DC_END|ER_MV_END)

/**
 * Description of one picture as seen by the error-resilience code.
 * ERContext holds three of these (cur_pic, last_pic, next_pic).
 */
typedef struct ERPicture {
    AVFrame *f;
    ThreadFrame *tf;

    // it is the caller's responsibility to allocate these buffers
    int16_t (*motion_val[2])[2];
    int8_t *ref_index[2];

    uint32_t *mb_type;
    int field_picture;
} ERPicture;

/**
 * State handed to ff_er_frame_start(), ff_er_add_slice() and
 * ff_er_frame_end(), all of which take a pointer to this struct.
 */
typedef struct ERContext {
    AVCodecContext *avctx;

    me_cmp_func sad;
    int mecc_inited;

    /* macroblock geometry; NOTE(review): presumably filled in by the owning
     * decoder before ff_er_frame_start() -- confirm against the callers */
    int *mb_index2xy;
    int mb_num;
    int mb_width, mb_height;
    ptrdiff_t mb_stride;
    ptrdiff_t b8_stride;

    /* atomic_int: the only field declared with an atomic type */
    atomic_int error_count;
    int error_occurred;
    uint8_t *error_status_table;
    uint8_t *er_temp_buffer;
    int16_t *dc_val[3];
    uint8_t *mbskip_table;
    uint8_t *mbintra_table;
    int mv[2][4][2];

    ERPicture cur_pic;
    ERPicture last_pic;
    ERPicture next_pic;

    int8_t *ref_index[2];
    int16_t (*motion_val_base[2])[2];

    uint16_t pp_time;
    uint16_t pb_time;
    int quarter_sample;
    int partitioned_frame;

    /* Macroblock decode callback and the pointer declared alongside it.
     * NOTE(review): `opaque` is presumably what gets passed as the
     * callback's first argument -- confirm in error_resilience.c */
    void (*decode_mb)(void *opaque, int ref, int mv_dir, int mv_type,
                      int (*mv)[2][4][2],
                      int mb_x, int mb_y, int mb_intra, int mb_skipped);
    void *opaque;
} ERContext;

void ff_er_frame_start(ERContext *s);
void ff_er_frame_end(ERContext *s);
void ff_er_add_slice(ERContext *s, int startx, int starty, int endx, int endy,
                     int status);
--git a/media/ffvpx/libavcodec/faandct.c b/media/ffvpx/libavcodec/faandct.c new file mode 100644 index 0000000000..38c392bbae --- /dev/null +++ b/media/ffvpx/libavcodec/faandct.c @@ -0,0 +1,215 @@ +/* + * Floating point AAN DCT + * this implementation is based upon the IJG integer AAN DCT (see jfdctfst.c) + * + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2003 Roman Shaposhnik + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/** + * @file + * @brief + * Floating point AAN DCT + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#include "faandct.h" +#include "libavutil/internal.h" +#include "libavutil/libm.h" + +typedef float FLOAT; + +/* numbers generated by arbitrary precision arithmetic followed by truncation +to 36 fractional digits (enough for a 128-bit IEEE quad, see /usr/include/math.h +for this approach). Unfortunately, long double is not always available correctly, +e.g. ppc has issues. +TODO: add L suffixes when ppc and toolchains sort out their stuff. 
+*/ +#define B0 1.000000000000000000000000000000000000 +#define B1 0.720959822006947913789091890943021267 // (cos(pi*1/16)sqrt(2))^-1 +#define B2 0.765366864730179543456919968060797734 // (cos(pi*2/16)sqrt(2))^-1 +#define B3 0.850430094767256448766702844371412325 // (cos(pi*3/16)sqrt(2))^-1 +#define B4 1.000000000000000000000000000000000000 // (cos(pi*4/16)sqrt(2))^-1 +#define B5 1.272758580572833938461007018281767032 // (cos(pi*5/16)sqrt(2))^-1 +#define B6 1.847759065022573512256366378793576574 // (cos(pi*6/16)sqrt(2))^-1 +#define B7 3.624509785411551372409941227504289587 // (cos(pi*7/16)sqrt(2))^-1 + +#define A1 M_SQRT1_2 // cos(pi*4/16) +#define A2 0.54119610014619698435 // cos(pi*6/16)sqrt(2) +#define A5 0.38268343236508977170 // cos(pi*6/16) +#define A4 1.30656296487637652774 // cos(pi*2/16)sqrt(2) + +static const FLOAT postscale[64]={ +B0*B0, B0*B1, B0*B2, B0*B3, B0*B4, B0*B5, B0*B6, B0*B7, +B1*B0, B1*B1, B1*B2, B1*B3, B1*B4, B1*B5, B1*B6, B1*B7, +B2*B0, B2*B1, B2*B2, B2*B3, B2*B4, B2*B5, B2*B6, B2*B7, +B3*B0, B3*B1, B3*B2, B3*B3, B3*B4, B3*B5, B3*B6, B3*B7, +B4*B0, B4*B1, B4*B2, B4*B3, B4*B4, B4*B5, B4*B6, B4*B7, +B5*B0, B5*B1, B5*B2, B5*B3, B5*B4, B5*B5, B5*B6, B5*B7, +B6*B0, B6*B1, B6*B2, B6*B3, B6*B4, B6*B5, B6*B6, B6*B7, +B7*B0, B7*B1, B7*B2, B7*B3, B7*B4, B7*B5, B7*B6, B7*B7, +}; + +static av_always_inline void row_fdct(FLOAT temp[64], int16_t *data) +{ + FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + FLOAT tmp10, tmp11, tmp12, tmp13; + FLOAT z2, z4, z11, z13; + int i; + + for (i=0; i<8*8; i+=8) { + tmp0= data[0 + i] + data[7 + i]; + tmp7= data[0 + i] - data[7 + i]; + tmp1= data[1 + i] + data[6 + i]; + tmp6= data[1 + i] - data[6 + i]; + tmp2= data[2 + i] + data[5 + i]; + tmp5= data[2 + i] - data[5 + i]; + tmp3= data[3 + i] + data[4 + i]; + tmp4= data[3 + i] - data[4 + i]; + + tmp10= tmp0 + tmp3; + tmp13= tmp0 - tmp3; + tmp11= tmp1 + tmp2; + tmp12= tmp1 - tmp2; + + temp[0 + i]= tmp10 + tmp11; + temp[4 + i]= tmp10 - tmp11; + + tmp12 += tmp13; + 
tmp12 *= A1; + temp[2 + i]= tmp13 + tmp12; + temp[6 + i]= tmp13 - tmp12; + + tmp4 += tmp5; + tmp5 += tmp6; + tmp6 += tmp7; + + z2= tmp4*(A2+A5) - tmp6*A5; + z4= tmp6*(A4-A5) + tmp4*A5; + + tmp5*=A1; + + z11= tmp7 + tmp5; + z13= tmp7 - tmp5; + + temp[5 + i]= z13 + z2; + temp[3 + i]= z13 - z2; + temp[1 + i]= z11 + z4; + temp[7 + i]= z11 - z4; + } +} + +void ff_faandct(int16_t *data) +{ + FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + FLOAT tmp10, tmp11, tmp12, tmp13; + FLOAT z2, z4, z11, z13; + FLOAT temp[64]; + int i; + + emms_c(); + + row_fdct(temp, data); + + for (i=0; i<8; i++) { + tmp0= temp[8*0 + i] + temp[8*7 + i]; + tmp7= temp[8*0 + i] - temp[8*7 + i]; + tmp1= temp[8*1 + i] + temp[8*6 + i]; + tmp6= temp[8*1 + i] - temp[8*6 + i]; + tmp2= temp[8*2 + i] + temp[8*5 + i]; + tmp5= temp[8*2 + i] - temp[8*5 + i]; + tmp3= temp[8*3 + i] + temp[8*4 + i]; + tmp4= temp[8*3 + i] - temp[8*4 + i]; + + tmp10= tmp0 + tmp3; + tmp13= tmp0 - tmp3; + tmp11= tmp1 + tmp2; + tmp12= tmp1 - tmp2; + + data[8*0 + i]= lrintf(postscale[8*0 + i] * (tmp10 + tmp11)); + data[8*4 + i]= lrintf(postscale[8*4 + i] * (tmp10 - tmp11)); + + tmp12 += tmp13; + tmp12 *= A1; + data[8*2 + i]= lrintf(postscale[8*2 + i] * (tmp13 + tmp12)); + data[8*6 + i]= lrintf(postscale[8*6 + i] * (tmp13 - tmp12)); + + tmp4 += tmp5; + tmp5 += tmp6; + tmp6 += tmp7; + + z2= tmp4*(A2+A5) - tmp6*A5; + z4= tmp6*(A4-A5) + tmp4*A5; + + tmp5*=A1; + + z11= tmp7 + tmp5; + z13= tmp7 - tmp5; + + data[8*5 + i]= lrintf(postscale[8*5 + i] * (z13 + z2)); + data[8*3 + i]= lrintf(postscale[8*3 + i] * (z13 - z2)); + data[8*1 + i]= lrintf(postscale[8*1 + i] * (z11 + z4)); + data[8*7 + i]= lrintf(postscale[8*7 + i] * (z11 - z4)); + } +} + +void ff_faandct248(int16_t *data) +{ + FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + FLOAT tmp10, tmp11, tmp12, tmp13; + FLOAT temp[64]; + int i; + + emms_c(); + + row_fdct(temp, data); + + for (i=0; i<8; i++) { + tmp0 = temp[8*0 + i] + temp[8*1 + i]; + tmp1 = temp[8*2 + i] + temp[8*3 + 
i]; + tmp2 = temp[8*4 + i] + temp[8*5 + i]; + tmp3 = temp[8*6 + i] + temp[8*7 + i]; + tmp4 = temp[8*0 + i] - temp[8*1 + i]; + tmp5 = temp[8*2 + i] - temp[8*3 + i]; + tmp6 = temp[8*4 + i] - temp[8*5 + i]; + tmp7 = temp[8*6 + i] - temp[8*7 + i]; + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + tmp13 = tmp0 - tmp3; + + data[8*0 + i] = lrintf(postscale[8*0 + i] * (tmp10 + tmp11)); + data[8*4 + i] = lrintf(postscale[8*4 + i] * (tmp10 - tmp11)); + + tmp12 += tmp13; + tmp12 *= A1; + data[8*2 + i] = lrintf(postscale[8*2 + i] * (tmp13 + tmp12)); + data[8*6 + i] = lrintf(postscale[8*6 + i] * (tmp13 - tmp12)); + + tmp10 = tmp4 + tmp7; + tmp11 = tmp5 + tmp6; + tmp12 = tmp5 - tmp6; + tmp13 = tmp4 - tmp7; + + data[8*1 + i] = lrintf(postscale[8*0 + i] * (tmp10 + tmp11)); + data[8*5 + i] = lrintf(postscale[8*4 + i] * (tmp10 - tmp11)); + + tmp12 += tmp13; + tmp12 *= A1; + data[8*3 + i] = lrintf(postscale[8*2 + i] * (tmp13 + tmp12)); + data[8*7 + i] = lrintf(postscale[8*6 + i] * (tmp13 - tmp12)); + } +} diff --git a/media/ffvpx/libavcodec/faandct.h b/media/ffvpx/libavcodec/faandct.h new file mode 100644 index 0000000000..c5ef96dcf1 --- /dev/null +++ b/media/ffvpx/libavcodec/faandct.h @@ -0,0 +1,37 @@ +/* + * Floating point AAN DCT + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * @brief + * Floating point AAN DCT + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#ifndef AVCODEC_FAANDCT_H +#define AVCODEC_FAANDCT_H + +#include <stdint.h> + +void ff_faandct(int16_t *data); +void ff_faandct248(int16_t *data); + +#endif /* AVCODEC_FAANDCT_H */ diff --git a/media/ffvpx/libavcodec/faanidct.c b/media/ffvpx/libavcodec/faanidct.c new file mode 100644 index 0000000000..3921f82dae --- /dev/null +++ b/media/ffvpx/libavcodec/faanidct.c @@ -0,0 +1,166 @@ +/* + * Floating point AAN IDCT + * Copyright (c) 2008 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "faanidct.h" +#include "libavutil/common.h" + +/* To allow switching to double. 
/* Working precision of the transform; change here to switch to double. */
typedef float FLOAT;

#define B0 1.0000000000000000000000
#define B1 1.3870398453221474618216 // cos(pi*1/16)sqrt(2)
#define B2 1.3065629648763765278566 // cos(pi*2/16)sqrt(2)
#define B3 1.1758756024193587169745 // cos(pi*3/16)sqrt(2)
#define B4 1.0000000000000000000000 // cos(pi*4/16)sqrt(2)
#define B5 0.7856949583871021812779 // cos(pi*5/16)sqrt(2)
#define B6 0.5411961001461969843997 // cos(pi*6/16)sqrt(2)
#define B7 0.2758993792829430123360 // cos(pi*7/16)sqrt(2)

#define A4 0.70710678118654752438 // cos(pi*4/16)
#define A2 0.92387953251128675613 // cos(pi*2/16)

/* prescale[8*r + c] = B<r> * B<c> / 8: applied once, before the row pass. */
static const FLOAT prescale[64]={
B0*B0/8, B0*B1/8, B0*B2/8, B0*B3/8, B0*B4/8, B0*B5/8, B0*B6/8, B0*B7/8,
B1*B0/8, B1*B1/8, B1*B2/8, B1*B3/8, B1*B4/8, B1*B5/8, B1*B6/8, B1*B7/8,
B2*B0/8, B2*B1/8, B2*B2/8, B2*B3/8, B2*B4/8, B2*B5/8, B2*B6/8, B2*B7/8,
B3*B0/8, B3*B1/8, B3*B2/8, B3*B3/8, B3*B4/8, B3*B5/8, B3*B6/8, B3*B7/8,
B4*B0/8, B4*B1/8, B4*B2/8, B4*B3/8, B4*B4/8, B4*B5/8, B4*B6/8, B4*B7/8,
B5*B0/8, B5*B1/8, B5*B2/8, B5*B3/8, B5*B4/8, B5*B5/8, B5*B6/8, B5*B7/8,
B6*B0/8, B6*B1/8, B6*B2/8, B6*B3/8, B6*B4/8, B6*B5/8, B6*B6/8, B6*B7/8,
B7*B0/8, B7*B1/8, B7*B2/8, B7*B3/8, B7*B4/8, B7*B5/8, B7*B6/8, B7*B7/8,
};

/**
 * One pass of eight 8-point inverse transforms over temp[].
 *
 * @param data   destination for type 1 (may be NULL otherwise)
 * @param temp   pre-scaled coefficients, read by every pass
 * @param dest   destination pixels for types 2 and 3 (may be NULL otherwise)
 * @param stride distance between the eight outputs of one transform in dest
 * @param x      distance between the eight inputs of one transform in temp
 * @param y      distance between consecutive transforms
 * @param type   0: write back to temp[]; 1: round into data[];
 *               2: round, add to dest and clip to 8 bits;
 *               anything else: round and clip into dest
 */
static inline void p8idct(int16_t data[64], FLOAT temp[64], uint8_t *dest,
                          ptrdiff_t stride, int x, int y, int type)
{
    int pos;

    for (pos = 0; pos < y*8; pos += y) {
        FLOAT *t = temp + pos;
        FLOAT s04, d04, s17, d17, s26, d26, s53, d53;
        FLOAT ev07, ev16, ev25, ev34;   /* even-part contributions */
        FLOAT od07, od16, od25, od34;   /* odd-part contributions  */

        /* odd part */
        s17 = t[1*x] + t[7*x];
        d17 = t[1*x] - t[7*x];
        s53 = t[5*x] + t[3*x];
        d53 = t[5*x] - t[3*x];

        od07 =  s17 + s53;
        od25 = (s17 - s53)*(2*A4);

        od34 = d17*(2*(B6-A2)) - d53*(2*A2);
        od16 = d53*(2*(A2-B2)) + d17*(2*A2);

        od16 -= od07;
        od25 -= od16;
        od34 += od25;

        /* even part */
        s26  = t[2*x] + t[6*x];
        d26  = t[2*x] - t[6*x];
        d26 *= 2*A4;
        d26 -= s26;

        s04 = t[0*x] + t[4*x];
        d04 = t[0*x] - t[4*x];

        ev07 = s04 + s26;
        ev34 = s04 - s26;
        ev16 = d04 + d26;
        ev25 = d04 - d26;

        switch (type) {
        case 0:
            t[0*x] = ev07 + od07;
            t[7*x] = ev07 - od07;
            t[1*x] = ev16 + od16;
            t[6*x] = ev16 - od16;
            t[2*x] = ev25 + od25;
            t[5*x] = ev25 - od25;
            t[3*x] = ev34 - od34;
            t[4*x] = ev34 + od34;
            break;
        case 1: {
            int16_t *out = data + pos;

            out[0*x] = lrintf(ev07 + od07);
            out[7*x] = lrintf(ev07 - od07);
            out[1*x] = lrintf(ev16 + od16);
            out[6*x] = lrintf(ev16 - od16);
            out[2*x] = lrintf(ev25 + od25);
            out[5*x] = lrintf(ev25 - od25);
            out[3*x] = lrintf(ev34 - od34);
            out[4*x] = lrintf(ev34 + od34);
            break;
        }
        case 2: {
            uint8_t *px = dest + pos;

            px[0*stride] = av_clip_uint8(((int)px[0*stride]) + lrintf(ev07 + od07));
            px[7*stride] = av_clip_uint8(((int)px[7*stride]) + lrintf(ev07 - od07));
            px[1*stride] = av_clip_uint8(((int)px[1*stride]) + lrintf(ev16 + od16));
            px[6*stride] = av_clip_uint8(((int)px[6*stride]) + lrintf(ev16 - od16));
            px[2*stride] = av_clip_uint8(((int)px[2*stride]) + lrintf(ev25 + od25));
            px[5*stride] = av_clip_uint8(((int)px[5*stride]) + lrintf(ev25 - od25));
            px[3*stride] = av_clip_uint8(((int)px[3*stride]) + lrintf(ev34 - od34));
            px[4*stride] = av_clip_uint8(((int)px[4*stride]) + lrintf(ev34 + od34));
            break;
        }
        default: {
            uint8_t *px = dest + pos;

            px[0*stride] = av_clip_uint8(lrintf(ev07 + od07));
            px[7*stride] = av_clip_uint8(lrintf(ev07 - od07));
            px[1*stride] = av_clip_uint8(lrintf(ev16 + od16));
            px[6*stride] = av_clip_uint8(lrintf(ev16 - od16));
            px[2*stride] = av_clip_uint8(lrintf(ev25 + od25));
            px[5*stride] = av_clip_uint8(lrintf(ev25 - od25));
            px[3*stride] = av_clip_uint8(lrintf(ev34 - od34));
            px[4*stride] = av_clip_uint8(lrintf(ev34 + od34));
            break;
        }
        }
    }
}

/* Pre-scale block[] into temp[] and run the first (x=1, y=8) pass in place. */
static inline void faanidct_first_pass(FLOAT temp[64], int16_t block[64])
{
    int k;

    for (k = 0; k < 64; k++)
        temp[k] = block[k] * prescale[k];

    p8idct(block, temp, NULL, 0, 1, 8, 0);
}

/** Floating point AAN IDCT of an 8x8 block, result rounded back into block[]. */
void ff_faanidct(int16_t block[64])
{
    FLOAT temp[64];

    emms_c();

    faanidct_first_pass(temp, block);
    p8idct(block, temp, NULL, 0, 8, 1, 1);
}

/** Same transform, with the result added to dest and clipped to 8 bits. */
void ff_faanidct_add(uint8_t *dest, ptrdiff_t line_size, int16_t block[64])
{
    FLOAT temp[64];

    emms_c();

    faanidct_first_pass(temp, block);
    p8idct(NULL, temp, dest, line_size, 8, 1, 2);
}

/** Same transform, with the clipped result stored to dest. */
void ff_faanidct_put(uint8_t *dest, ptrdiff_t line_size, int16_t block[64])
{
    FLOAT temp[64];

    emms_c();

    faanidct_first_pass(temp, block);
    p8idct(NULL, temp, dest, line_size, 8, 1, 3);
}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FAANIDCT_H +#define AVCODEC_FAANIDCT_H + +#include <stddef.h> +#include <stdint.h> + +void ff_faanidct(int16_t block[64]); +void ff_faanidct_add(uint8_t *dest, ptrdiff_t line_size, int16_t block[64]); +void ff_faanidct_put(uint8_t *dest, ptrdiff_t line_size, int16_t block[64]); + +#endif /* AVCODEC_FAANIDCT_H */ diff --git a/media/ffvpx/libavcodec/fdctdsp.c b/media/ffvpx/libavcodec/fdctdsp.c new file mode 100644 index 0000000000..5306c9d047 --- /dev/null +++ b/media/ffvpx/libavcodec/fdctdsp.c @@ -0,0 +1,51 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "avcodec.h" +#include "dct.h" +#include "faandct.h" +#include "fdctdsp.h" +#include "config.h" + +av_cold void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx) +{ + av_unused const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8; + + if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) { + c->fdct = ff_jpeg_fdct_islow_10; + c->fdct248 = ff_fdct248_islow_10; + } else if (avctx->dct_algo == FF_DCT_FASTINT) { + c->fdct = ff_fdct_ifast; + c->fdct248 = ff_fdct_ifast248; +#if CONFIG_FAANDCT + } else if (avctx->dct_algo == FF_DCT_FAAN) { + c->fdct = ff_faandct; + c->fdct248 = ff_faandct248; +#endif /* CONFIG_FAANDCT */ + } else { + c->fdct = ff_jpeg_fdct_islow_8; // slow/accurate/default + c->fdct248 = ff_fdct248_islow_8; + } + +#if ARCH_PPC + ff_fdctdsp_init_ppc(c, avctx, high_bit_depth); +#elif ARCH_X86 + ff_fdctdsp_init_x86(c, avctx, high_bit_depth); +#endif +} diff --git a/media/ffvpx/libavcodec/fdctdsp.h b/media/ffvpx/libavcodec/fdctdsp.h new file mode 100644 index 0000000000..3e1f683b9e --- /dev/null +++ b/media/ffvpx/libavcodec/fdctdsp.h @@ -0,0 +1,37 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * Table of forward-DCT entry points, filled in by ff_fdctdsp_init().
 * Both functions take a single block pointer and no output argument;
 * the "align 16" note is the alignment expected of that pointer.
 */
typedef struct FDCTDSPContext {
    void (*fdct)(int16_t *block /* align 16 */);
    void (*fdct248)(int16_t *block /* align 16 */);
} FDCTDSPContext;

/** Fill in c; the selection depends on fields of avctx. */
void ff_fdctdsp_init(FDCTDSPContext *c, AVCodecContext *avctx);
/* Arch-specific initializers; both take an extra high_bit_depth flag. */
void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
                         unsigned high_bit_depth);
void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx,
                         unsigned high_bit_depth);
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/fdctdsp.h" +#include "fdct.h" + +av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + const int dct_algo = avctx->dct_algo; + + if (!high_bit_depth) { + if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) { + if (INLINE_MMX(cpu_flags)) + c->fdct = ff_fdct_mmx; + + if (INLINE_MMXEXT(cpu_flags)) + c->fdct = ff_fdct_mmxext; + + if (INLINE_SSE2(cpu_flags)) + c->fdct = ff_fdct_sse2; + } + } +} diff --git a/media/ffvpx/libavcodec/fft-internal.h b/media/ffvpx/libavcodec/fft-internal.h new file mode 100644 index 0000000000..d89a3e38ca --- /dev/null +++ b/media/ffvpx/libavcodec/fft-internal.h @@ -0,0 +1,62 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
#if FFT_FLOAT

/* Float build: FIX15() is the identity and sqrthalf is M_SQRT1_2 as float. */
#define FIX15(v) (v)
#define sqrthalf (float)M_SQRT1_2

/* Butterfly: x receives the difference a - b, y receives the sum a + b. */
#define BF(x, y, a, b) do {                     \
        x = a - b;                              \
        y = a + b;                              \
    } while (0)

/* Complex multiply: (dre + i*dim) = (are + i*aim) * (bre + i*bim). */
#define CMUL(dre, dim, are, aim, bre, bim) do { \
        (dre) = (are) * (bre) - (aim) * (bim);  \
        (dim) = (are) * (bim) + (aim) * (bre);  \
    } while (0)

#else /* FFT_FLOAT */

/*
 * Fixed-point complex multiply. The products are accumulated in 64 bits,
 * then 0x40000000 (half of 1 << 31) is added before the right shift by 31,
 * so the shifted result is rounded rather than truncated.
 * `accu` is declared inside the macro body.
 */
#define CMUL(dre, dim, are, aim, bre, bim) do {             \
        int64_t accu;                                     \
        (accu)  = (int64_t)(bre) * (are);                 \
        (accu) -= (int64_t)(bim) * (aim);                 \
        (dre)   = (int)(((accu) + 0x40000000) >> 31);       \
        (accu)  = (int64_t)(bre) * (aim);                 \
        (accu) += (int64_t)(bim) * (are);                 \
        (dim)   = (int)(((accu) + 0x40000000) >> 31);       \
    } while (0)

#endif /* FFT_FLOAT */
/**
 * State of one FFT/MDCT instance.
 *
 * The transform entry points are function pointers, so arch-specific init
 * code can replace them (ff_fft_init_aarch64() installs NEON versions of
 * fft_permute, fft_calc and the MDCT functions).
 */
struct FFTContext {
    /* NOTE(review): presumably the nbits/inverse arguments given to
     * ff_fft_init() (log2 of the length, forward/inverse) -- confirm */
    int nbits;
    int inverse;
    uint16_t *revtab;
    FFTComplex *tmp_buf;
    int mdct_size; /* size of MDCT (i.e. number of input data * 2) */
    int mdct_bits; /* n = 2^nbits */
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    /**
     * Do the permutation needed BEFORE calling fft_calc().
     */
    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
    /**
     * Do a complex FFT with the parameters defined in ff_fft_init(). The
     * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
     */
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
    /* MDCT entry points; all three share the (context, output, input) shape */
    void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    enum fft_permutation_type fft_permutation;
    enum mdct_permutation_type mdct_permutation;
    /* NOTE(review): looks like a 32-bit counterpart of revtab -- confirm */
    uint32_t *revtab32;
};
+ * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +int ff_fft_init(FFTContext *s, int nbits, int inverse); + +void ff_fft_init_aarch64(FFTContext *s); +void ff_fft_init_x86(FFTContext *s); +void ff_fft_init_arm(FFTContext *s); +void ff_fft_init_mips(FFTContext *s); +void ff_fft_init_ppc(FFTContext *s); + +void ff_fft_end(FFTContext *s); + +#define ff_mdct_init FFT_NAME(ff_mdct_init) +#define ff_mdct_end FFT_NAME(ff_mdct_end) + +int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale); +void ff_mdct_end(FFTContext *s); + +#endif /* AVCODEC_FFT_H */ diff --git a/media/ffvpx/libavcodec/fft_fixed_32.c b/media/ffvpx/libavcodec/fft_fixed_32.c new file mode 100644 index 0000000000..e18dc83891 --- /dev/null +++ b/media/ffvpx/libavcodec/fft_fixed_32.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. 
BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Authors: Stanislav Ocovaj (socovaj@mips.com)
+ * Goran Cordasic (goran@mips.com)
+ * Djordje Pesut (djordje@mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/*
+ * This translation unit is the shared template compiled with FFT_FLOAT
+ * set to 0. fft_template.c then takes its !FFT_FLOAT branch, which
+ * includes fft_table.h instead of defining the cosine tables.
+ */
+#define FFT_FLOAT 0
+#include "fft_template.c"
diff --git a/media/ffvpx/libavcodec/fft_float.c b/media/ffvpx/libavcodec/fft_float.c
new file mode 100644
index 0000000000..a9fd01978d
--- /dev/null
+++ b/media/ffvpx/libavcodec/fft_float.c
@@ -0,0 +1,20 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/*
+ * This translation unit is the shared template compiled with FFT_FLOAT
+ * set to 1. fft_template.c then takes the #else side of its !FFT_FLOAT
+ * test, where the cosine tables and init_ff_cos_tabs() are defined when
+ * CONFIG_HARDCODED_TABLES is off.
+ */
+#define FFT_FLOAT 1
+#include "fft_template.c"
diff --git a/media/ffvpx/libavcodec/fft_init_table.c b/media/ffvpx/libavcodec/fft_init_table.c
new file mode 100644
index 0000000000..83e35ffb7c
--- /dev/null
+++ b/media/ffvpx/libavcodec/fft_init_table.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC.
BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Stanislav Ocovaj (socovaj@mips.com) + * Goran Cordasic (goran@mips.com) + * Djordje Pesut (djordje@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * definitions and initialization of LUT table for FFT + */ +#include "libavutil/thread.h" + +#include "libavcodec/fft_table.h" + +const int32_t ff_w_tab_sr[MAX_FFT_SIZE/(4*16)] = { +2147483647, 2147483016, 2147481121, 2147477963, 2147473542, 2147467857, 2147460908, 2147452697, +2147443222, 2147432484, 2147420483, 2147407218, 2147392690, 2147376899, 2147359845, 2147341527, +2147321946, 2147301102, 2147278995, 2147255625, 2147230991, 2147205094, 2147177934, 2147149511, +2147119825, 2147088876, 2147056664, 2147023188, 2146988450, 2146952448, 2146915184, 2146876656, +2146836866, 2146795813, 2146753497, 2146709917, 2146665076, 2146618971, 2146571603, 2146522973, +2146473080, 2146421924, 2146369505, 2146315824, 2146260881, 2146204674, 2146147205, 2146088474, +2146028480, 2145967224, 2145904705, 2145840924, 2145775880, 2145709574, 2145642006, 2145573176, +2145503083, 2145431729, 2145359112, 2145285233, 2145210092, 2145133690, 2145056025, 2144977098, +2144896910, 2144815460, 2144732748, 2144648774, 2144563539, 2144477042, 2144389283, 2144300264, +2144209982, 2144118439, 2144025635, 2143931570, 2143836244, 2143739656, 2143641807, 2143542697, +2143442326, 2143340694, 2143237802, 2143133648, 2143028234, 2142921559, 2142813624, 2142704427, +2142593971, 2142482254, 2142369276, 2142255039, 2142139541, 2142022783, 2141904764, 2141785486, +2141664948, 2141543150, 2141420092, 2141295774, 2141170197, 2141043360, 2140915264, 2140785908, +2140655293, 2140523418, 2140390284, 2140255892, 2140120240, 2139983329, 2139845159, 2139705730, +2139565043, 2139423097, 2139279892, 2139135429, 2138989708, 2138842728, 2138694490, 2138544994, +2138394240, 2138242228, 2138088958, 2137934430, 2137778644, 2137621601, 2137463301, 2137303743, +2137142927, 
2136980855, 2136817525, 2136652938, 2136487095, 2136319994, 2136151637, 2135982023, +2135811153, 2135639026, 2135465642, 2135291003, 2135115107, 2134937956, 2134759548, 2134579885, +2134398966, 2134216791, 2134033361, 2133848675, 2133662734, 2133475538, 2133287087, 2133097381, +2132906420, 2132714204, 2132520734, 2132326009, 2132130030, 2131932796, 2131734309, 2131534567, +2131333572, 2131131322, 2130927819, 2130723062, 2130517052, 2130309789, 2130101272, 2129891502, +2129680480, 2129468204, 2129254676, 2129039895, 2128823862, 2128606576, 2128388038, 2128168248, +2127947206, 2127724913, 2127501367, 2127276570, 2127050522, 2126823222, 2126594672, 2126364870, +2126133817, 2125901514, 2125667960, 2125433155, 2125197100, 2124959795, 2124721240, 2124481435, +2124240380, 2123998076, 2123754522, 2123509718, 2123263666, 2123016364, 2122767814, 2122518015, +2122266967, 2122014670, 2121761126, 2121506333, 2121250292, 2120993003, 2120734467, 2120474683, +2120213651, 2119951372, 2119687847, 2119423074, 2119157054, 2118889788, 2118621275, 2118351516, +2118080511, 2117808259, 2117534762, 2117260020, 2116984031, 2116706797, 2116428319, 2116148595, +2115867626, 2115585412, 2115301954, 2115017252, 2114731305, 2114444114, 2114155680, 2113866001, +2113575080, 2113282914, 2112989506, 2112694855, 2112398960, 2112101824, 2111803444, 2111503822, +2111202959, 2110900853, 2110597505, 2110292916, 2109987085, 2109680013, 2109371700, 2109062146, +2108751352, 2108439317, 2108126041, 2107811526, 2107495770, 2107178775, 2106860540, 2106541065, +2106220352, 2105898399, 2105575208, 2105250778, 2104925109, 2104598202, 2104270057, 2103940674, +2103610054, 2103278196, 2102945101, 2102610768, 2102275199, 2101938393, 2101600350, 2101261071, +2100920556, 2100578805, 2100235819, 2099891596, 2099546139, 2099199446, 2098851519, 2098502357, +2098151960, 2097800329, 2097447464, 2097093365, 2096738032, 2096381466, 2096023667, 2095664635, +2095304370, 2094942872, 2094580142, 2094216179, 2093850985, 2093484559, 
2093116901, 2092748012, +2092377892, 2092006541, 2091633960, 2091260147, 2090885105, 2090508833, 2090131331, 2089752599, +2089372638, 2088991448, 2088609029, 2088225381, 2087840505, 2087454400, 2087067068, 2086678508, +2086288720, 2085897705, 2085505463, 2085111994, 2084717298, 2084321376, 2083924228, 2083525854, +2083126254, 2082725429, 2082323379, 2081920103, 2081515603, 2081109879, 2080702930, 2080294757, +2079885360, 2079474740, 2079062896, 2078649830, 2078235540, 2077820028, 2077403294, 2076985338, +2076566160, 2076145760, 2075724139, 2075301296, 2074877233, 2074451950, 2074025446, 2073597721, +2073168777, 2072738614, 2072307231, 2071874629, 2071440808, 2071005769, 2070569511, 2070132035, +2069693342, 2069253430, 2068812302, 2068369957, 2067926394, 2067481616, 2067035621, 2066588410, +2066139983, 2065690341, 2065239484, 2064787411, 2064334124, 2063879623, 2063423908, 2062966978, +2062508835, 2062049479, 2061588910, 2061127128, 2060664133, 2060199927, 2059734508, 2059267877, +2058800036, 2058330983, 2057860719, 2057389244, 2056916560, 2056442665, 2055967560, 2055491246, +2055013723, 2054534991, 2054055050, 2053573901, 2053091544, 2052607979, 2052123207, 2051637227, +2051150040, 2050661647, 2050172048, 2049681242, 2049189231, 2048696014, 2048201592, 2047705965, +2047209133, 2046711097, 2046211857, 2045711414, 2045209767, 2044706916, 2044202863, 2043697608, +2043191150, 2042683490, 2042174628, 2041664565, 2041153301, 2040640837, 2040127172, 2039612306, +2039096241, 2038578976, 2038060512, 2037540850, 2037019988, 2036497928, 2035974670, 2035450215, +2034924562, 2034397712, 2033869665, 2033340422, 2032809982, 2032278347, 2031745516, 2031211490, +2030676269, 2030139853, 2029602243, 2029063439, 2028523442, 2027982251, 2027439867, 2026896291, +2026351522, 2025805561, 2025258408, 2024710064, 2024160529, 2023609803, 2023057887, 2022504780, +2021950484, 2021394998, 2020838323, 2020280460, 2019721407, 2019161167, 2018599739, 2018037123, +2017473321, 2016908331, 
2016342155, 2015774793, 2015206245, 2014636511, 2014065592, 2013493489, +2012920201, 2012345729, 2011770073, 2011193233, 2010615210, 2010036005, 2009455617, 2008874047, +2008291295, 2007707362, 2007122248, 2006535953, 2005948478, 2005359822, 2004769987, 2004178973, +2003586779, 2002993407, 2002398857, 2001803128, 2001206222, 2000608139, 2000008879, 1999408442, +1998806829, 1998204040, 1997600076, 1996994937, 1996388622, 1995781134, 1995172471, 1994562635, +1993951625, 1993339442, 1992726087, 1992111559, 1991495860, 1990878989, 1990260946, 1989641733, +1989021350, 1988399796, 1987777073, 1987153180, 1986528118, 1985901888, 1985274489, 1984645923, +1984016189, 1983385288, 1982753220, 1982119985, 1981485585, 1980850019, 1980213288, 1979575392, +1978936331, 1978296106, 1977654717, 1977012165, 1976368450, 1975723572, 1975077532, 1974430331, +1973781967, 1973132443, 1972481757, 1971829912, 1971176906, 1970522741, 1969867417, 1969210933, +1968553292, 1967894492, 1967234535, 1966573420, 1965911148, 1965247720, 1964583136, 1963917396, +1963250501, 1962582451, 1961913246, 1961242888, 1960571375, 1959898709, 1959224890, 1958549919, +1957873796, 1957196520, 1956518093, 1955838516, 1955157788, 1954475909, 1953792881, 1953108703, +1952423377, 1951736902, 1951049279, 1950360508, 1949670589, 1948979524, 1948287312, 1947593954, +1946899451, 1946203802, 1945507008, 1944809070, 1944109987, 1943409761, 1942708392, 1942005880, +1941302225, 1940597428, 1939891490, 1939184411, 1938476190, 1937766830, 1937056329, 1936344689, +1935631910, 1934917992, 1934202936, 1933486742, 1932769411, 1932050943, 1931331338, 1930610597, +1929888720, 1929165708, 1928441561, 1927716279, 1926989864, 1926262315, 1925533633, 1924803818, +1924072871, 1923340791, 1922607581, 1921873239, 1921137767, 1920401165, 1919663432, 1918924571, +1918184581, 1917443462, 1916701216, 1915957841, 1915213340, 1914467712, 1913720958, 1912973078, +1912224073, 1911473942, 1910722688, 1909970309, 1909216806, 1908462181, 1907706433, 
1906949562, +1906191570, 1905432457, 1904672222, 1903910867, 1903148392, 1902384797, 1901620084, 1900854251, +1900087301, 1899319232, 1898550047, 1897779744, 1897008325, 1896235790, 1895462140, 1894687374, +1893911494, 1893134500, 1892356392, 1891577171, 1890796837, 1890015391, 1889232832, 1888449163, +1887664383, 1886878492, 1886091491, 1885303381, 1884514161, 1883723833, 1882932397, 1882139853, +1881346202, 1880551444, 1879755580, 1878958610, 1878160535, 1877361354, 1876561070, 1875759681, +1874957189, 1874153594, 1873348897, 1872543097, 1871736196, 1870928194, 1870119091, 1869308888, +1868497586, 1867685184, 1866871683, 1866057085, 1865241388, 1864424594, 1863606704, 1862787717, +1861967634, 1861146456, 1860324183, 1859500816, 1858676355, 1857850800, 1857024153, 1856196413, +1855367581, 1854537657, 1853706643, 1852874538, 1852041343, 1851207059, 1850371686, 1849535224, +1848697674, 1847859036, 1847019312, 1846178501, 1845336604, 1844493621, 1843649553, 1842804401, +1841958164, 1841110844, 1840262441, 1839412956, 1838562388, 1837710739, 1836858008, 1836004197, +1835149306, 1834293336, 1833436286, 1832578158, 1831718951, 1830858668, 1829997307, 1829134869, +1828271356, 1827406767, 1826541103, 1825674364, 1824806552, 1823937666, 1823067707, 1822196675, +1821324572, 1820451397, 1819577151, 1818701835, 1817825449, 1816947994, 1816069469, 1815189877, +1814309216, 1813427489, 1812544694, 1811660833, 1810775906, 1809889915, 1809002858, 1808114737, +1807225553, 1806335305, 1805443995, 1804551623, 1803658189, 1802763694, 1801868139, 1800971523, +1800073849, 1799175115, 1798275323, 1797374472, 1796472565, 1795569601, 1794665580, 1793760504, +1792854372, 1791947186, 1791038946, 1790129652, 1789219305, 1788307905, 1787395453, 1786481950, +1785567396, 1784651792, 1783735137, 1782817434, 1781898681, 1780978881, 1780058032, 1779136137, +1778213194, 1777289206, 1776364172, 1775438094, 1774510970, 1773582803, 1772653593, 1771723340, +1770792044, 1769859707, 1768926328, 
1767991909, 1767056450, 1766119952, 1765182414, 1764243838, +1763304224, 1762363573, 1761421885, 1760479161, 1759535401, 1758590607, 1757644777, 1756697914, +1755750017, 1754801087, 1753851126, 1752900132, 1751948107, 1750995052, 1750040966, 1749085851, +1748129707, 1747172535, 1746214334, 1745255107, 1744294853, 1743333573, 1742371267, 1741407936, +1740443581, 1739478202, 1738511799, 1737544374, 1736575927, 1735606458, 1734635968, 1733664458, +1732691928, 1731718378, 1730743810, 1729768224, 1728791620, 1727813999, 1726835361, 1725855708, +1724875040, 1723893357, 1722910659, 1721926948, 1720942225, 1719956488, 1718969740, 1717981981, +1716993211, 1716003431, 1715012642, 1714020844, 1713028037, 1712034223, 1711039401, 1710043573, +1709046739, 1708048900, 1707050055, 1706050207, 1705049355, 1704047500, 1703044642, 1702040783, +1701035922, 1700030061, 1699023199, 1698015339, 1697006479, 1695996621, 1694985765, 1693973912, +1692961062, 1691947217, 1690932376, 1689916541, 1688899711, 1687881888, 1686863072, 1685843263, +1684822463, 1683800672, 1682777890, 1681754118, 1680729357, 1679703608, 1678676870, 1677649144, +1676620432, 1675590733, 1674560049, 1673528379, 1672495725, 1671462087, 1670427466, 1669391862, +1668355276, 1667317709, 1666279161, 1665239632, 1664199124, 1663157637, 1662115172, 1661071729, +1660027308, 1658981911, 1657935539, 1656888190, 1655839867, 1654790570, 1653740300, 1652689057, +1651636841, 1650583654, 1649529496, 1648474367, 1647418269, 1646361202, 1645303166, 1644244162, +1643184191, 1642123253, 1641061349, 1639998480, 1638934646, 1637869848, 1636804087, 1635737362, +1634669676, 1633601027, 1632531418, 1631460848, 1630389319, 1629316830, 1628243383, 1627168978, +1626093616, 1625017297, 1623940023, 1622861793, 1621782608, 1620702469, 1619621377, 1618539332, +1617456335, 1616372386, 1615287487, 1614201637, 1613114838, 1612027089, 1610938393, 1609848749, +1608758157, 1607666620, 1606574136, 1605480708, 1604386335, 1603291018, 1602194758, 1601097555, 
+1599999411, 1598900325, 1597800299, 1596699333, 1595597428, 1594494583, 1593390801, 1592286082, +1591180426, 1590073833, 1588966306, 1587857843, 1586748447, 1585638117, 1584526854, 1583414660, +1582301533, 1581187476, 1580072489, 1578956572, 1577839726, 1576721952, 1575603251, 1574483623, +1573363068, 1572241588, 1571119183, 1569995854, 1568871601, 1567746425, 1566620327, 1565493307, +1564365367, 1563236506, 1562106725, 1560976026, 1559844408, 1558711873, 1557578421, 1556444052, +1555308768, 1554172569, 1553035455, 1551897428, 1550758488, 1549618636, 1548477872, 1547336197, +1546193612, 1545050118, 1543905714, 1542760402, 1541614183, 1540467057, 1539319024, 1538170087, +1537020244, 1535869497, 1534717846, 1533565293, 1532411837, 1531257480, 1530102222, 1528946064, +1527789007, 1526631051, 1525472197, 1524312445, 1523151797, 1521990252, 1520827813, 1519664478, +1518500250, 1517335128, 1516169114, 1515002208, 1513834411, 1512665723, 1511496145, 1510325678, +1509154322, 1507982079, 1506808949, 1505634932, 1504460029, 1503284242, 1502107570, 1500930014, +1499751576, 1498572255, 1497392053, 1496210969, 1495029006, 1493846163, 1492662441, 1491477842, +1490292364, 1489106011, 1487918781, 1486730675, 1485541696, 1484351842, 1483161115, 1481969516, +1480777044, 1479583702, 1478389489, 1477194407, 1475998456, 1474801636, 1473603949, 1472405394, +1471205974, 1470005688, 1468804538, 1467602523, 1466399645, 1465195904, 1463991302, 1462785838, +1461579514, 1460372329, 1459164286, 1457955385, 1456745625, 1455535009, 1454323536, 1453111208, +1451898025, 1450683988, 1449469098, 1448253355, 1447036760, 1445819314, 1444601017, 1443381870, +1442161874, 1440941030, 1439719338, 1438496799, 1437273414, 1436049184, 1434824109, 1433598189, +1432371426, 1431143821, 1429915374, 1428686085, 1427455956, 1426224988, 1424993180, 1423760534, +1422527051, 1421292730, 1420057574, 1418821582, 1417584755, 1416347095, 1415108601, 1413869275, +1412629117, 1411388129, 1410146309, 1408903661, 
1407660183, 1406415878, 1405170745, 1403924785, +1402678000, 1401430389, 1400181954, 1398932695, 1397682613, 1396431709, 1395179984, 1393927438, +1392674072, 1391419886, 1390164882, 1388909060, 1387652422, 1386394966, 1385136696, 1383877610, +1382617710, 1381356997, 1380095472, 1378833134, 1377569986, 1376306026, 1375041258, 1373775680, +1372509294, 1371242101, 1369974101, 1368705296, 1367435685, 1366165269, 1364894050, 1363622028, +1362349204, 1361075579, 1359801152, 1358525926, 1357249901, 1355973077, 1354695455, 1353417037, +1352137822, 1350857812, 1349577007, 1348295409, 1347013017, 1345729833, 1344445857, 1343161090, +1341875533, 1340589187, 1339302052, 1338014129, 1336725419, 1335435923, 1334145641, 1332854574, +1331562723, 1330270089, 1328976672, 1327682474, 1326387494, 1325091734, 1323795195, 1322497877, +1321199781, 1319900907, 1318601257, 1317300832, 1315999631, 1314697657, 1313394909, 1312091388, +1310787095, 1309482032, 1308176198, 1306869594, 1305562222, 1304254082, 1302945174, 1301635500, +1300325060, 1299013855, 1297701886, 1296389154, 1295075659, 1293761402, 1292446384, 1291130606, +1289814068, 1288496772, 1287178717, 1285859905, 1284540337, 1283220013, 1281898935, 1280577102, +1279254516, 1277931177, 1276607086, 1275282245, 1273956653, 1272630312, 1271303222, 1269975384, +1268646800, 1267317469, 1265987392, 1264656571, 1263325005, 1261992697, 1260659646, 1259325853, +1257991320, 1256656047, 1255320034, 1253983283, 1252645794, 1251307568, 1249968606, 1248628909, +1247288478, 1245947312, 1244605414, 1243262783, 1241919421, 1240575329, 1239230506, 1237884955, +1236538675, 1235191668, 1233843935, 1232495475, 1231146291, 1229796382, 1228445750, 1227094395, +1225742318, 1224389521, 1223036002, 1221681765, 1220326809, 1218971135, 1217614743, 1216257636, +1214899813, 1213541275, 1212182024, 1210822059, 1209461382, 1208099993, 1206737894, 1205375085, +1204011567, 1202647340, 1201282407, 1199916766, 1198550419, 1197183368, 1195815612, 1194447153, 
+1193077991, 1191708127, 1190337562, 1188966297, 1187594332, 1186221669, 1184848308, 1183474250, +1182099496, 1180724046, 1179347902, 1177971064, 1176593533, 1175215310, 1173836395, 1172456790, +1171076495, 1169695512, 1168313840, 1166931481, 1165548435, 1164164704, 1162780288, 1161395188, +1160009405, 1158622939, 1157235792, 1155847964, 1154459456, 1153070269, 1151680403, 1150289860, +1148898640, 1147506745, 1146114174, 1144720929, 1143327011, 1141932420, 1140537158, 1139141224, +1137744621, 1136347348, 1134949406, 1133550797, 1132151521, 1130751579, 1129350972, 1127949701, +1126547765, 1125145168, 1123741908, 1122337987, 1120933406, 1119528166, 1118122267, 1116715710, +1115308496, 1113900627, 1112492101, 1111082922, 1109673089, 1108262603, 1106851465, 1105439676, +1104027237, 1102614148, 1101200410, 1099786025, 1098370993, 1096955314, 1095538991, 1094122023, +1092704411, 1091286156, 1089867259, 1088447722, 1087027544, 1085606726, 1084185270, 1082763176, +1081340445, 1079917078, 1078493076, 1077068439, 1075643169, 1074217266, 1072790730, 1071363564, +1069935768, 1068507342, 1067078288, 1065648605, 1064218296, 1062787361, 1061355801, 1059923616, +1058490808, 1057057377, 1055623324, 1054188651, 1052753357, 1051317443, 1049880912, 1048443763, +1047005996, 1045567615, 1044128617, 1042689006, 1041248781, 1039807944, 1038366495, 1036924436, +1035481766, 1034038487, 1032594600, 1031150105, 1029705004, 1028259297, 1026812985, 1025366069, +1023918550, 1022470428, 1021021705, 1019572382, 1018122458, 1016671936, 1015220816, 1013769098, +1012316784, 1010863875, 1009410370, 1007956272, 1006501581, 1005046298, 1003590424, 1002133959, +1000676905, 999219262, 997761031, 996302214, 994842810, 993382821, 991922248, 990461091, +988999351, 987537030, 986074127, 984610645, 983146583, 981681943, 980216726, 978750932, +977284562, 975817617, 974350098, 972882006, 971413342, 969944106, 968474300, 967003923, +965532978, 964061465, 962589385, 961116739, 959643527, 958169751, 956695411, 
955220508, +953745043, 952269017, 950792431, 949315286, 947837582, 946359321, 944880503, 943401129, +941921200, 940440717, 938959681, 937478092, 935995952, 934513261, 933030021, 931546231, +930061894, 928577010, 927091579, 925605603, 924119082, 922632018, 921144411, 919656262, +918167572, 916678342, 915188572, 913698265, 912207419, 910716038, 909224120, 907731667, +906238681, 904745161, 903251110, 901756526, 900261413, 898765769, 897269597, 895772898, +894275671, 892777918, 891279640, 889780838, 888281512, 886781663, 885281293, 883780402, +882278992, 880777062, 879274614, 877771649, 876268167, 874764170, 873259659, 871754633, +870249095, 868743045, 867236484, 865729413, 864221832, 862713743, 861205147, 859696043, +858186435, 856676321, 855165703, 853654582, 852142959, 850630835, 849118210, 847605086, +846091463, 844577343, 843062726, 841547612, 840032004, 838515901, 836999305, 835482217, +833964638, 832446567, 830928007, 829408958, 827889422, 826369398, 824848888, 823327893, +821806413, 820284450, 818762005, 817239078, 815715670, 814191782, 812667415, 811142571, +809617249, 808091450, 806565177, 805038429, 803511207, 801983513, 800455346, 798926709, +797397602, 795868026, 794337982, 792807470, 791276492, 789745049, 788213141, 786680769, +785147934, 783614638, 782080880, 780546663, 779011986, 777476851, 775941259, 774405210, +772868706, 771331747, 769794334, 768256469, 766718151, 765179382, 763640164, 762100496, +760560380, 759019816, 757478806, 755937350, 754395449, 752853105, 751310318, 749767089, +748223418, 746679308, 745134758, 743589770, 742044345, 740498483, 738952186, 737405453, +735858287, 734310688, 732762657, 731214195, 729665303, 728115982, 726566232, 725016055, +723465451, 721914422, 720362968, 718811090, 717258790, 715706067, 714152924, 712599360, +711045377, 709490976, 707936158, 706380923, 704825272, 703269207, 701712728, 700155836, +698598533, 697040818, 695482694, 693924160, 692365218, 690805869, 689246113, 687685952, +686125387, 684564417, 
683003045, 681441272, 679879097, 678316522, 676753549, 675190177, +673626408, 672062243, 670497682, 668932727, 667367379, 665801638, 664235505, 662668981, +661102068, 659534766, 657967075, 656398998, 654830535, 653261686, 651692453, 650122837, +648552838, 646982457, 645411696, 643840556, 642269036, 640697139, 639124865, 637552215, +635979190, 634405791, 632832018, 631257873, 629683357, 628108471, 626533215, 624957590, +623381598, 621805239, 620228514, 618651424, 617073971, 615496154, 613917975, 612339436, +610760536, 609181276, 607601658, 606021683, 604441352, 602860664, 601279623, 599698227, +598116479, 596534378, 594951927, 593369126, 591785976, 590202477, 588618632, 587034440, +585449903, 583865021, 582279796, 580694229, 579108320, 577522070, 575935480, 574348552, +572761285, 571173682, 569585743, 567997469, 566408860, 564819919, 563230645, 561641039, +560051104, 558460839, 556870245, 555279324, 553688076, 552096502, 550504604, 548912382, +547319836, 545726969, 544133781, 542540273, 540946445, 539352300, 537757837, 536163058, +534567963, 532972554, 531376831, 529780796, 528184449, 526587791, 524990824, 523393547, +521795963, 520198072, 518599875, 517001373, 515402566, 513803457, 512204045, 510604332, +509004318, 507404005, 505803394, 504202485, 502601279, 500999778, 499397982, 497795892, +496193509, 494590835, 492987869, 491384614, 489781069, 488177236, 486573117, 484968710, +483364019, 481759043, 480153784, 478548243, 476942419, 475336316, 473729932, 472123270, +470516330, 468909114, 467301622, 465693854, 464085813, 462477499, 460868912, 459260055, +457650927, 456041530, 454431865, 452821933, 451211734, 449601270, 447990541, 446379549, +444768294, 443156777, 441545000, 439932963, 438320667, 436708113, 435095303, 433482236, +431868915, 430255339, 428641511, 427027430, 425413098, 423798515, 422183684, 420568604, +418953276, 417337703, 415721883, 414105819, 412489512, 410872962, 409256170, 407639137, +406021865, 404404353, 402786604, 401168618, 399550396, 
397931939, 396313247, 394694323, +393075166, 391455778, 389836160, 388216313, 386596237, 384975934, 383355404, 381734649, +380113669, 378492466, 376871039, 375249392, 373627523, 372005435, 370383128, 368760603, +367137861, 365514903, 363891730, 362268343, 360644742, 359020930, 357396906, 355772673, +354148230, 352523578, 350898719, 349273654, 347648383, 346022908, 344397230, 342771348, +341145265, 339518981, 337892498, 336265816, 334638936, 333011859, 331384586, 329757119, +328129457, 326501602, 324873555, 323245317, 321616889, 319988272, 318359466, 316730474, +315101295, 313471930, 311842381, 310212649, 308582734, 306952638, 305322361, 303691904, +302061269, 300430456, 298799466, 297168301, 295536961, 293905447, 292273760, 290641901, +289009871, 287377671, 285745302, 284112765, 282480061, 280847190, 279214155, 277580955, +275947592, 274314066, 272680379, 271046532, 269412525, 267778360, 266144038, 264509558, +262874923, 261240134, 259605191, 257970095, 256334847, 254699448, 253063900, 251428203, +249792358, 248156366, 246520228, 244883945, 243247518, 241610947, 239974235, 238337382, +236700388, 235063255, 233425984, 231788575, 230151030, 228513350, 226875535, 225237587, +223599506, 221961294, 220322951, 218684479, 217045878, 215407149, 213768293, 212129312, +210490206, 208850976, 207211624, 205572149, 203932553, 202292838, 200653003, 199013051, +197372981, 195732795, 194092495, 192452080, 190811551, 189170911, 187530159, 185889297, +184248325, 182607245, 180966058, 179324764, 177683365, 176041861, 174400254, 172758544, +171116733, 169474820, 167832808, 166190698, 164548489, 162906184, 161263783, 159621287, +157978697, 156336015, 154693240, 153050374, 151407418, 149764374, 148121241, 146478021, +144834714, 143191323, 141547847, 139904288, 138260647, 136616925, 134973122, 133329239, +131685278, 130041240, 128397125, 126752935, 125108670, 123464332, 121819921, 120175438, +118530885, 116886262, 115241570, 113596810, 111951983, 110307091, 108662134, 107017112, 
+105372028, 103726882, 102081675, 100436408, 98791081, 97145697, 95500255, 93854758,
+ 92209205, 90563597, 88917937, 87272224, 85626460, 83980645, 82334782, 80688869,
+ 79042909, 77396903, 75750851, 74104755, 72458615, 70812432, 69166208, 67519943,
+ 65873638, 64227295, 62580914, 60934496, 59288042, 57641553, 55995030, 54348475,
+ 52701887, 51055268, 49408620, 47761942, 46115236, 44468503, 42821744, 41174960,
+ 39528151, 37881320, 36234466, 34587590, 32940695, 31293780, 29646846, 27999895,
+ 26352928, 24705945, 23058947, 21411936, 19764913, 18117878, 16470832, 14823776,
+ 13176712, 11529640, 9882561, 8235476, 6588387, 4941294, 3294197, 1647099
+};
+/**
+ * Offset lookup table filled at run time by ff_fft_lut_init().
+ * 21845 is exactly the number of entries fft_lut_init() writes when it is
+ * started with size 1 << 17, as fft_lut_init_start() does.
+ */
+uint16_t ff_fft_offsets_lut[21845];
+/**
+ * Recursively fill the offset table.
+ *
+ * A range of length size starting at off is split into one half-size part
+ * at off and two quarter-size parts at off + size/2 and off + 3*size/4.
+ * Once size drops below 16, the range contributes a single entry, off >> 2.
+ *
+ * @param table destination array
+ * @param off   start offset of the current range
+ * @param size  length of the current range
+ * @param index in/out: next free position in table; advanced by one for
+ *              every entry written
+ */
+static void fft_lut_init(uint16_t *table, int off, int size, int *index)
+{
+    if (size < 16) {                  /* leaf: emit one entry and stop */
+        table[*index] = off >> 2;
+        (*index)++;
+    }
+    else {
+        fft_lut_init(table, off, size >> 1, index);                   /* first half */
+        fft_lut_init(table, off + (size >> 1), size >> 2, index);     /* third quarter */
+        fft_lut_init(table, off + 3 * (size >> 2), size >> 2, index); /* fourth quarter */
+    }
+}
+/**
+ * Fill ff_fft_offsets_lut from position 0 for a range of 1 << 17 starting
+ * at offset 0. Used as the callback of ff_fft_lut_init().
+ */
+static void fft_lut_init_start(void)
+{
+    int n = 0;  /* running write position, passed by address to the recursion */
+
+    fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n);
+}
+/**
+ * Public entry point: hand fft_lut_init_start() to ff_thread_once() together
+ * with a function-local static AVOnce control object.
+ * NOTE(review): ff_thread_once() comes from libavutil/thread.h and is
+ * expected to run the callback at most once per control object - confirm.
+ */
+void ff_fft_lut_init(void)
+{
+    static AVOnce init_once = AV_ONCE_INIT;
+
+    ff_thread_once(&init_once, fft_lut_init_start);
+}
diff --git a/media/ffvpx/libavcodec/fft_table.h b/media/ffvpx/libavcodec/fft_table.h
new file mode 100644
index 0000000000..09df49f2b8
--- /dev/null
+++ b/media/ffvpx/libavcodec/fft_table.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Stanislav Ocovaj (socovaj@mips.com) + * Goran Cordasic (goran@mips.com) + * Djordje Pesut (djordje@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * definitions and tables for FFT + */ +#ifndef AVCODEC_FFT_TABLE_H +#define AVCODEC_FFT_TABLE_H + +#include "libavcodec/fft.h" + +#define MAX_LOG2_NFFT 17 //!< Specifies maximum allowed fft size +#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT) + +extern const int32_t ff_w_tab_sr[]; +extern uint16_t ff_fft_offsets_lut[]; +void ff_fft_lut_init(void); + +#endif /* AVCODEC_FFT_TABLE_H */ diff --git a/media/ffvpx/libavcodec/fft_template.c b/media/ffvpx/libavcodec/fft_template.c new file mode 100644 index 0000000000..f2742a3ae8 --- /dev/null +++ b/media/ffvpx/libavcodec/fft_template.c @@ -0,0 +1,628 @@ +/* + * FFT/IFFT transforms + * Copyright (c) 2008 Loren Merritt + * Copyright (c) 2002 Fabrice Bellard + * Partly based on libdjbfft by D. J. Bernstein + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * FFT/IFFT transforms. 
+ */ + +#include <stdlib.h> +#include <string.h> +#include "libavutil/mathematics.h" +#include "libavutil/thread.h" +#include "fft.h" +#include "fft-internal.h" + +#if !FFT_FLOAT +#include "fft_table.h" +#else /* !FFT_FLOAT */ + +/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ +#if !CONFIG_HARDCODED_TABLES +COSTABLE(16); +COSTABLE(32); +COSTABLE(64); +COSTABLE(128); +COSTABLE(256); +COSTABLE(512); +COSTABLE(1024); +COSTABLE(2048); +COSTABLE(4096); +COSTABLE(8192); +COSTABLE(16384); +COSTABLE(32768); +COSTABLE(65536); +COSTABLE(131072); + +static av_cold void init_ff_cos_tabs(int index) +{ + int i; + int m = 1<<index; + double freq = 2*M_PI/m; + FFTSample *tab = FFT_NAME(ff_cos_tabs)[index]; + for(i=0; i<=m/4; i++) + tab[i] = FIX15(cos(i*freq)); + for(i=1; i<m/4; i++) + tab[m/2-i] = tab[i]; +} + +typedef struct CosTabsInitOnce { + void (*func)(void); + AVOnce control; +} CosTabsInitOnce; + +#define INIT_FF_COS_TABS_FUNC(index, size) \ +static av_cold void init_ff_cos_tabs_ ## size (void)\ +{ \ + init_ff_cos_tabs(index); \ +} + +INIT_FF_COS_TABS_FUNC(4, 16) +INIT_FF_COS_TABS_FUNC(5, 32) +INIT_FF_COS_TABS_FUNC(6, 64) +INIT_FF_COS_TABS_FUNC(7, 128) +INIT_FF_COS_TABS_FUNC(8, 256) +INIT_FF_COS_TABS_FUNC(9, 512) +INIT_FF_COS_TABS_FUNC(10, 1024) +INIT_FF_COS_TABS_FUNC(11, 2048) +INIT_FF_COS_TABS_FUNC(12, 4096) +INIT_FF_COS_TABS_FUNC(13, 8192) +INIT_FF_COS_TABS_FUNC(14, 16384) +INIT_FF_COS_TABS_FUNC(15, 32768) +INIT_FF_COS_TABS_FUNC(16, 65536) +INIT_FF_COS_TABS_FUNC(17, 131072) + +static CosTabsInitOnce cos_tabs_init_once[] = { + { NULL }, + { NULL }, + { NULL }, + { NULL }, + { init_ff_cos_tabs_16, AV_ONCE_INIT }, + { init_ff_cos_tabs_32, AV_ONCE_INIT }, + { init_ff_cos_tabs_64, AV_ONCE_INIT }, + { init_ff_cos_tabs_128, AV_ONCE_INIT }, + { init_ff_cos_tabs_256, AV_ONCE_INIT }, + { init_ff_cos_tabs_512, AV_ONCE_INIT }, + { init_ff_cos_tabs_1024, AV_ONCE_INIT }, + { init_ff_cos_tabs_2048, AV_ONCE_INIT }, + { init_ff_cos_tabs_4096, AV_ONCE_INIT }, + { 
init_ff_cos_tabs_8192, AV_ONCE_INIT }, + { init_ff_cos_tabs_16384, AV_ONCE_INIT }, + { init_ff_cos_tabs_32768, AV_ONCE_INIT }, + { init_ff_cos_tabs_65536, AV_ONCE_INIT }, + { init_ff_cos_tabs_131072, AV_ONCE_INIT }, +}; + +av_cold void ff_init_ff_cos_tabs(int index) +{ + ff_thread_once(&cos_tabs_init_once[index].control, cos_tabs_init_once[index].func); +} +#endif +COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = { + NULL, NULL, NULL, NULL, + FFT_NAME(ff_cos_16), + FFT_NAME(ff_cos_32), + FFT_NAME(ff_cos_64), + FFT_NAME(ff_cos_128), + FFT_NAME(ff_cos_256), + FFT_NAME(ff_cos_512), + FFT_NAME(ff_cos_1024), + FFT_NAME(ff_cos_2048), + FFT_NAME(ff_cos_4096), + FFT_NAME(ff_cos_8192), + FFT_NAME(ff_cos_16384), + FFT_NAME(ff_cos_32768), + FFT_NAME(ff_cos_65536), + FFT_NAME(ff_cos_131072), +}; + +#endif /* FFT_FLOAT */ + +static void fft_permute_c(FFTContext *s, FFTComplex *z); +static void fft_calc_c(FFTContext *s, FFTComplex *z); + +static int split_radix_permutation(int i, int n, int inverse) +{ + int m; + if(n <= 2) return i&1; + m = n >> 1; + if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; + m >>= 1; + if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; + else return split_radix_permutation(i, m, inverse)*4 - 1; +} + + +static const int avx_tab[] = { + 0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15 +}; + +static int is_second_half_of_fft32(int i, int n) +{ + if (n <= 32) + return i >= 16; + else if (i < n/2) + return is_second_half_of_fft32(i, n/2); + else if (i < 3*n/4) + return is_second_half_of_fft32(i - n/2, n/4); + else + return is_second_half_of_fft32(i - 3*n/4, n/4); +} + +static av_cold void fft_perm_avx(FFTContext *s) +{ + int i; + int n = 1 << s->nbits; + + for (i = 0; i < n; i += 16) { + int k; + if (is_second_half_of_fft32(i, n)) { + for (k = 0; k < 16; k++) + s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = + i + avx_tab[k]; + + } else { + for (k = 0; k < 16; k++) { + int j = i + k; + j 
= (j & ~7) | ((j >> 1) & 3) | ((j << 2) & 4); + s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = j; + } + } + } +} + +av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) +{ + int i, j, n; + + s->revtab = NULL; + s->revtab32 = NULL; + /* the fail path frees tmp_buf too, so it must be NULL before the first goto fail */ + s->tmp_buf = NULL; + + if (nbits < 2 || nbits > 17) + goto fail; + s->nbits = nbits; + n = 1 << nbits; + + if (nbits <= 16) { + s->revtab = av_malloc(n * sizeof(uint16_t)); + if (!s->revtab) + goto fail; + } else { + s->revtab32 = av_malloc(n * sizeof(uint32_t)); + if (!s->revtab32) + goto fail; + } + s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); + if (!s->tmp_buf) + goto fail; + s->inverse = inverse; + s->fft_permutation = FF_FFT_PERM_DEFAULT; + + s->fft_permute = fft_permute_c; + s->fft_calc = fft_calc_c; +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_c; + s->imdct_half = ff_imdct_half_c; + s->mdct_calc = ff_mdct_calc_c; +#endif + +#if FFT_FLOAT +#if ARCH_AARCH64 + ff_fft_init_aarch64(s); +#elif ARCH_ARM + ff_fft_init_arm(s); +#elif ARCH_PPC + ff_fft_init_ppc(s); +#elif ARCH_X86 + ff_fft_init_x86(s); +#endif +#if HAVE_MIPSFPU + ff_fft_init_mips(s); +#endif + for(j=4; j<=nbits; j++) { + ff_init_ff_cos_tabs(j); + } +#else /* FFT_FLOAT */ + ff_fft_lut_init(); +#endif + + + if (ARCH_X86 && FFT_FLOAT && s->fft_permutation == FF_FFT_PERM_AVX) { + fft_perm_avx(s); + } else { +#define PROCESS_FFT_PERM_SWAP_LSBS(num) do {\ + for(i = 0; i < n; i++) {\ + int k;\ + j = i;\ + j = (j & ~3) | ((j >> 1) & 1) | ((j << 1) & 2);\ + k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\ + s->revtab##num[k] = j;\ + } \ +} while(0); + +#define PROCESS_FFT_PERM_DEFAULT(num) do {\ + for(i = 0; i < n; i++) {\ + int k;\ + j = i;\ + k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\ + s->revtab##num[k] = j;\ + } \ +} while(0); + +#define SPLIT_RADIX_PERMUTATION(num) do { \ + if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) {\ + PROCESS_FFT_PERM_SWAP_LSBS(num) \ + } else {\ + PROCESS_FFT_PERM_DEFAULT(num) \ + }\ +} while(0); + + if 
(s->revtab) + SPLIT_RADIX_PERMUTATION() + if (s->revtab32) + SPLIT_RADIX_PERMUTATION(32) + +#undef PROCESS_FFT_PERM_DEFAULT +#undef PROCESS_FFT_PERM_SWAP_LSBS +#undef SPLIT_RADIX_PERMUTATION + } + + return 0; + fail: + av_freep(&s->revtab); + av_freep(&s->revtab32); + av_freep(&s->tmp_buf); + return -1; +} + +static void fft_permute_c(FFTContext *s, FFTComplex *z) +{ + int j, np; + const uint16_t *revtab = s->revtab; + const uint32_t *revtab32 = s->revtab32; + np = 1 << s->nbits; + /* TODO: handle split-radix permute in a more optimal way, probably in-place */ + if (revtab) { + for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; + } else + for(j=0;j<np;j++) s->tmp_buf[revtab32[j]] = z[j]; + + memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); +} + +av_cold void ff_fft_end(FFTContext *s) +{ + av_freep(&s->revtab); + av_freep(&s->revtab32); + av_freep(&s->tmp_buf); +} + +#if !FFT_FLOAT + +static void fft_calc_c(FFTContext *s, FFTComplex *z) { + + int nbits, i, n, num_transforms, offset, step; + int n4, n2, n34; + unsigned tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + FFTComplex *tmpz; + const int fft_size = (1 << s->nbits); + int64_t accu; + + num_transforms = (0x2aab >> (16 - s->nbits)) | 1; + + for (n=0; n<num_transforms; n++){ + offset = ff_fft_offsets_lut[n] << 2; + tmpz = z + offset; + + tmp1 = tmpz[0].re + (unsigned)tmpz[1].re; + tmp5 = tmpz[2].re + (unsigned)tmpz[3].re; + tmp2 = tmpz[0].im + (unsigned)tmpz[1].im; + tmp6 = tmpz[2].im + (unsigned)tmpz[3].im; + tmp3 = tmpz[0].re - (unsigned)tmpz[1].re; + tmp8 = tmpz[2].im - (unsigned)tmpz[3].im; + tmp4 = tmpz[0].im - (unsigned)tmpz[1].im; + tmp7 = tmpz[2].re - (unsigned)tmpz[3].re; + + tmpz[0].re = tmp1 + tmp5; + tmpz[2].re = tmp1 - tmp5; + tmpz[0].im = tmp2 + tmp6; + tmpz[2].im = tmp2 - tmp6; + tmpz[1].re = tmp3 + tmp8; + tmpz[3].re = tmp3 - tmp8; + tmpz[1].im = tmp4 - tmp7; + tmpz[3].im = tmp4 + tmp7; + } + + if (fft_size < 8) + return; + + num_transforms = (num_transforms >> 1) | 1; + + for (n=0; 
n<num_transforms; n++){ + offset = ff_fft_offsets_lut[n] << 3; + tmpz = z + offset; + + tmp1 = tmpz[4].re + (unsigned)tmpz[5].re; + tmp3 = tmpz[6].re + (unsigned)tmpz[7].re; + tmp2 = tmpz[4].im + (unsigned)tmpz[5].im; + tmp4 = tmpz[6].im + (unsigned)tmpz[7].im; + tmp5 = tmp1 + tmp3; + tmp7 = tmp1 - tmp3; + tmp6 = tmp2 + tmp4; + tmp8 = tmp2 - tmp4; + + tmp1 = tmpz[4].re - (unsigned)tmpz[5].re; + tmp2 = tmpz[4].im - (unsigned)tmpz[5].im; + tmp3 = tmpz[6].re - (unsigned)tmpz[7].re; + tmp4 = tmpz[6].im - (unsigned)tmpz[7].im; + + tmpz[4].re = tmpz[0].re - tmp5; + tmpz[0].re = tmpz[0].re + tmp5; + tmpz[4].im = tmpz[0].im - tmp6; + tmpz[0].im = tmpz[0].im + tmp6; + tmpz[6].re = tmpz[2].re - tmp8; + tmpz[2].re = tmpz[2].re + tmp8; + tmpz[6].im = tmpz[2].im + tmp7; + tmpz[2].im = tmpz[2].im - tmp7; + + accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp1 + tmp2); + tmp5 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 - tmp4); + tmp7 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp2 - tmp1); + tmp6 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 + tmp4); + tmp8 = (int32_t)((accu + 0x40000000) >> 31); + tmp1 = tmp5 + tmp7; + tmp3 = tmp5 - tmp7; + tmp2 = tmp6 + tmp8; + tmp4 = tmp6 - tmp8; + + tmpz[5].re = tmpz[1].re - tmp1; + tmpz[1].re = tmpz[1].re + tmp1; + tmpz[5].im = tmpz[1].im - tmp2; + tmpz[1].im = tmpz[1].im + tmp2; + tmpz[7].re = tmpz[3].re - tmp4; + tmpz[3].re = tmpz[3].re + tmp4; + tmpz[7].im = tmpz[3].im + tmp3; + tmpz[3].im = tmpz[3].im - tmp3; + } + + step = 1 << ((MAX_LOG2_NFFT-4) - 4); + n4 = 4; + + for (nbits=4; nbits<=s->nbits; nbits++){ + n2 = 2*n4; + n34 = 3*n4; + num_transforms = (num_transforms >> 1) | 1; + + for (n=0; n<num_transforms; n++){ + const FFTSample *w_re_ptr = ff_w_tab_sr + step; + const FFTSample *w_im_ptr = ff_w_tab_sr + MAX_FFT_SIZE/(4*16) - step; + offset = ff_fft_offsets_lut[n] << nbits; + tmpz = z + offset; + + tmp5 = tmpz[ n2].re + 
(unsigned)tmpz[n34].re; + tmp1 = tmpz[ n2].re - (unsigned)tmpz[n34].re; + tmp6 = tmpz[ n2].im + (unsigned)tmpz[n34].im; + tmp2 = tmpz[ n2].im - (unsigned)tmpz[n34].im; + + tmpz[ n2].re = tmpz[ 0].re - tmp5; + tmpz[ 0].re = tmpz[ 0].re + tmp5; + tmpz[ n2].im = tmpz[ 0].im - tmp6; + tmpz[ 0].im = tmpz[ 0].im + tmp6; + tmpz[n34].re = tmpz[n4].re - tmp2; + tmpz[ n4].re = tmpz[n4].re + tmp2; + tmpz[n34].im = tmpz[n4].im + tmp1; + tmpz[ n4].im = tmpz[n4].im - tmp1; + + for (i=1; i<n4; i++){ + FFTSample w_re = w_re_ptr[0]; + FFTSample w_im = w_im_ptr[0]; + accu = (int64_t)w_re*tmpz[ n2+i].re; + accu += (int64_t)w_im*tmpz[ n2+i].im; + tmp1 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)w_re*tmpz[ n2+i].im; + accu -= (int64_t)w_im*tmpz[ n2+i].re; + tmp2 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)w_re*tmpz[n34+i].re; + accu -= (int64_t)w_im*tmpz[n34+i].im; + tmp3 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)w_re*tmpz[n34+i].im; + accu += (int64_t)w_im*tmpz[n34+i].re; + tmp4 = (int32_t)((accu + 0x40000000) >> 31); + + tmp5 = tmp1 + tmp3; + tmp1 = tmp1 - tmp3; + tmp6 = tmp2 + tmp4; + tmp2 = tmp2 - tmp4; + + tmpz[ n2+i].re = tmpz[ i].re - tmp5; + tmpz[ i].re = tmpz[ i].re + tmp5; + tmpz[ n2+i].im = tmpz[ i].im - tmp6; + tmpz[ i].im = tmpz[ i].im + tmp6; + tmpz[n34+i].re = tmpz[n4+i].re - tmp2; + tmpz[ n4+i].re = tmpz[n4+i].re + tmp2; + tmpz[n34+i].im = tmpz[n4+i].im + tmp1; + tmpz[ n4+i].im = tmpz[n4+i].im - tmp1; + + w_re_ptr += step; + w_im_ptr -= step; + } + } + step >>= 1; + n4 <<= 1; + } +} + +#else /* !FFT_FLOAT */ + +#define BUTTERFLIES(a0,a1,a2,a3) {\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, a0.re, t5);\ + BF(a3.im, a1.im, a1.im, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, a1.re, t4);\ + BF(a2.im, a0.im, a0.im, t6);\ +} + +// force loading all the inputs before storing any. +// this is slightly slower for small data, but avoids store->load aliasing +// for addresses separated by large powers of 2. 
+#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ + FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, r0, t5);\ + BF(a3.im, a1.im, i1, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, r1, t4);\ + BF(a2.im, a0.im, i0, t6);\ +} + +#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ + CMUL(t1, t2, a2.re, a2.im, wre, -wim);\ + CMUL(t5, t6, a3.re, a3.im, wre, wim);\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +#define TRANSFORM_ZERO(a0,a1,a2,a3) {\ + t1 = a2.re;\ + t2 = a2.im;\ + t5 = a3.re;\ + t6 = a3.im;\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +/* z[0...8n-1], w[1...2n-1] */ +#define PASS(name)\ +static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ +{\ + FFTDouble t1, t2, t3, t4, t5, t6;\ + int o1 = 2*n;\ + int o2 = 4*n;\ + int o3 = 6*n;\ + const FFTSample *wim = wre+o1;\ + n--;\ +\ + TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + do {\ + z += 2;\ + wre += 2;\ + wim -= 2;\ + TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + } while(--n);\ +} + +PASS(pass) +#if !CONFIG_SMALL +#undef BUTTERFLIES +#define BUTTERFLIES BUTTERFLIES_BIG +PASS(pass_big) +#endif + +#define DECL_FFT(n,n2,n4)\ +static void fft##n(FFTComplex *z)\ +{\ + fft##n2(z);\ + fft##n4(z+n4*2);\ + fft##n4(z+n4*3);\ + pass(z,FFT_NAME(ff_cos_##n),n4/2);\ +} + +static void fft4(FFTComplex *z) +{ + FFTDouble t1, t2, t3, t4, t5, t6, t7, t8; + + BF(t3, t1, z[0].re, z[1].re); + BF(t8, t6, z[3].re, z[2].re); + BF(z[2].re, z[0].re, t1, t6); + BF(t4, t2, z[0].im, z[1].im); + BF(t7, t5, z[2].im, z[3].im); + BF(z[3].im, z[1].im, t4, t8); + BF(z[3].re, z[1].re, t3, t7); + BF(z[2].im, z[0].im, t2, t5); +} + +static void fft8(FFTComplex *z) +{ + FFTDouble t1, t2, t3, t4, t5, t6; + + fft4(z); + + BF(t1, z[5].re, z[4].re, -z[5].re); + BF(t2, z[5].im, z[4].im, -z[5].im); + BF(t5, z[7].re, z[6].re, -z[7].re); + BF(t6, z[7].im, z[6].im, -z[7].im); + + BUTTERFLIES(z[0],z[2],z[4],z[6]); + 
TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); +} + +#if !CONFIG_SMALL +static void fft16(FFTComplex *z) +{ + FFTDouble t1, t2, t3, t4, t5, t6; + FFTSample cos_16_1 = FFT_NAME(ff_cos_16)[1]; + FFTSample cos_16_3 = FFT_NAME(ff_cos_16)[3]; + + fft8(z); + fft4(z+8); + fft4(z+12); + + TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); + TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); + TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3); + TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1); +} +#else +DECL_FFT(16,8,4) +#endif +DECL_FFT(32,16,8) +DECL_FFT(64,32,16) +DECL_FFT(128,64,32) +DECL_FFT(256,128,64) +DECL_FFT(512,256,128) +#if !CONFIG_SMALL +#define pass pass_big +#endif +DECL_FFT(1024,512,256) +DECL_FFT(2048,1024,512) +DECL_FFT(4096,2048,1024) +DECL_FFT(8192,4096,2048) +DECL_FFT(16384,8192,4096) +DECL_FFT(32768,16384,8192) +DECL_FFT(65536,32768,16384) +DECL_FFT(131072,65536,32768) + +static void (* const fft_dispatch[])(FFTComplex*) = { + fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, + fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072 +}; + +static void fft_calc_c(FFTContext *s, FFTComplex *z) +{ + fft_dispatch[s->nbits-2](z); +} +#endif /* !FFT_FLOAT */ diff --git a/media/ffvpx/libavcodec/flac.c b/media/ffvpx/libavcodec/flac.c new file mode 100644 index 0000000000..174b4801be --- /dev/null +++ b/media/ffvpx/libavcodec/flac.c @@ -0,0 +1,225 @@ +/* + * FLAC common code + * Copyright (c) 2009 Justin Ruggles + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/channel_layout.h" +#include "libavutil/crc.h" +#include "libavutil/log.h" +#include "bytestream.h" +#include "get_bits.h" +#include "flac.h" +#include "flacdata.h" +#include "flac_parse.h" + +static const int8_t sample_size_table[] = { 0, 8, 12, 0, 16, 20, 24, 32 }; + +static const AVChannelLayout flac_channel_layouts[8] = { + AV_CHANNEL_LAYOUT_MONO, + AV_CHANNEL_LAYOUT_STEREO, + AV_CHANNEL_LAYOUT_SURROUND, + AV_CHANNEL_LAYOUT_QUAD, + AV_CHANNEL_LAYOUT_5POINT0, + AV_CHANNEL_LAYOUT_5POINT1, + AV_CHANNEL_LAYOUT_6POINT1, + AV_CHANNEL_LAYOUT_7POINT1 +}; + +static int64_t get_utf8(GetBitContext *gb) +{ + int64_t val; + GET_UTF8(val, get_bits(gb, 8), return -1;) + return val; +} + +int ff_flac_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb, + FLACFrameInfo *fi, int log_level_offset) +{ + int bs_code, sr_code, bps_code; + + /* frame sync code */ + if ((get_bits(gb, 15) & 0x7FFF) != 0x7FFC) { + av_log(avctx, AV_LOG_ERROR + log_level_offset, "invalid sync code\n"); + return AVERROR_INVALIDDATA; + } + + /* variable block size stream code */ + fi->is_var_size = get_bits1(gb); + + /* block size and sample rate codes */ + bs_code = get_bits(gb, 4); + sr_code = get_bits(gb, 4); + + /* channels and decorrelation */ + fi->ch_mode = get_bits(gb, 4); + if (fi->ch_mode < FLAC_MAX_CHANNELS) { + fi->channels = fi->ch_mode + 1; + fi->ch_mode = FLAC_CHMODE_INDEPENDENT; + } else if (fi->ch_mode < FLAC_MAX_CHANNELS + FLAC_CHMODE_MID_SIDE) { + fi->channels = 2; + fi->ch_mode -= FLAC_MAX_CHANNELS - 1; + } else { + av_log(avctx, AV_LOG_ERROR + log_level_offset, + "invalid channel mode: %d\n", fi->ch_mode); + return AVERROR_INVALIDDATA; + } + + /* bits per sample */ + 
bps_code = get_bits(gb, 3); + if (bps_code == 3) { + av_log(avctx, AV_LOG_ERROR + log_level_offset, + "invalid sample size code (%d)\n", + bps_code); + return AVERROR_INVALIDDATA; + } + fi->bps = sample_size_table[bps_code]; + + /* reserved bit */ + if (get_bits1(gb)) { + av_log(avctx, AV_LOG_ERROR + log_level_offset, + "broken stream, invalid padding\n"); + return AVERROR_INVALIDDATA; + } + + /* sample or frame count */ + fi->frame_or_sample_num = get_utf8(gb); + if (fi->frame_or_sample_num < 0) { + av_log(avctx, AV_LOG_ERROR + log_level_offset, + "sample/frame number invalid; utf8 fscked\n"); + return AVERROR_INVALIDDATA; + } + + /* blocksize */ + if (bs_code == 0) { + av_log(avctx, AV_LOG_ERROR + log_level_offset, + "reserved blocksize code: 0\n"); + return AVERROR_INVALIDDATA; + } else if (bs_code == 6) { + fi->blocksize = get_bits(gb, 8) + 1; + } else if (bs_code == 7) { + fi->blocksize = get_bits(gb, 16) + 1; + } else { + fi->blocksize = ff_flac_blocksize_table[bs_code]; + } + + /* sample rate */ + if (sr_code < 12) { + fi->samplerate = ff_flac_sample_rate_table[sr_code]; + } else if (sr_code == 12) { + fi->samplerate = get_bits(gb, 8) * 1000; + } else if (sr_code == 13) { + fi->samplerate = get_bits(gb, 16); + } else if (sr_code == 14) { + fi->samplerate = get_bits(gb, 16) * 10; + } else { + av_log(avctx, AV_LOG_ERROR + log_level_offset, + "illegal sample rate code %d\n", + sr_code); + return AVERROR_INVALIDDATA; + } + + /* header CRC-8 check */ + skip_bits(gb, 8); + if (av_crc(av_crc_get_table(AV_CRC_8_ATM), 0, gb->buffer, + get_bits_count(gb)/8)) { + av_log(avctx, AV_LOG_ERROR + log_level_offset, + "header crc mismatch\n"); + return AVERROR_INVALIDDATA; + } + + return 0; +} + +int ff_flac_is_extradata_valid(AVCodecContext *avctx, + uint8_t **streaminfo_start) +{ + if (!avctx->extradata || avctx->extradata_size < FLAC_STREAMINFO_SIZE) { + av_log(avctx, AV_LOG_ERROR, "extradata NULL or too small.\n"); + return 0; + } + if (AV_RL32(avctx->extradata) != 
MKTAG('f','L','a','C')) { + /* extradata contains STREAMINFO only */ + if (avctx->extradata_size != FLAC_STREAMINFO_SIZE) { + /* extradata_size > FLAC_STREAMINFO_SIZE here, so report the positive surplus */ + av_log(avctx, AV_LOG_WARNING, "extradata contains %d bytes too many.\n", + avctx->extradata_size - FLAC_STREAMINFO_SIZE); + } + *streaminfo_start = avctx->extradata; + } else { + if (avctx->extradata_size < 8+FLAC_STREAMINFO_SIZE) { + av_log(avctx, AV_LOG_ERROR, "extradata too small.\n"); + return 0; + } + *streaminfo_start = &avctx->extradata[8]; + } + return 1; +} + +void ff_flac_set_channel_layout(AVCodecContext *avctx, int channels) +{ + if (channels == avctx->ch_layout.nb_channels && + avctx->ch_layout.order != AV_CHANNEL_ORDER_UNSPEC) + return; + + av_channel_layout_uninit(&avctx->ch_layout); + /* channels > 0 keeps the table index (channels - 1) in range */ + if (channels > 0 && channels <= FF_ARRAY_ELEMS(flac_channel_layouts)) + avctx->ch_layout = flac_channel_layouts[channels - 1]; + else + avctx->ch_layout = (AVChannelLayout){ .order = AV_CHANNEL_ORDER_UNSPEC, + .nb_channels = channels }; +} + +int ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s, + const uint8_t *buffer) +{ + GetBitContext gb; + init_get_bits(&gb, buffer, FLAC_STREAMINFO_SIZE*8); + + skip_bits(&gb, 16); /* skip min blocksize */ + s->max_blocksize = get_bits(&gb, 16); + if (s->max_blocksize < FLAC_MIN_BLOCKSIZE) { + av_log(avctx, AV_LOG_WARNING, "invalid max blocksize: %d\n", + s->max_blocksize); + s->max_blocksize = 16; + return AVERROR_INVALIDDATA; + } + + skip_bits(&gb, 24); /* skip min frame size */ + s->max_framesize = get_bits(&gb, 24); + + s->samplerate = get_bits(&gb, 20); + s->channels = get_bits(&gb, 3) + 1; + s->bps = get_bits(&gb, 5) + 1; + + if (s->bps < 4) { + av_log(avctx, AV_LOG_ERROR, "invalid bps: %d\n", s->bps); + s->bps = 16; + return AVERROR_INVALIDDATA; + } + + avctx->sample_rate = s->samplerate; + avctx->bits_per_raw_sample = s->bps; + ff_flac_set_channel_layout(avctx, s->channels); + + s->samples = get_bits64(&gb, 36); + + skip_bits_long(&gb, 64); /* md5 sum */ + skip_bits_long(&gb, 64); /* md5 sum */ 
+ + return 0; +} diff --git a/media/ffvpx/libavcodec/flac.h b/media/ffvpx/libavcodec/flac.h new file mode 100644 index 0000000000..00e631ed20 --- /dev/null +++ b/media/ffvpx/libavcodec/flac.h @@ -0,0 +1,75 @@ +/* + * FLAC (Free Lossless Audio Codec) common stuff + * Copyright (c) 2008 Justin Ruggles + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * FLAC (Free Lossless Audio Codec) common stuff + */ + +#ifndef AVCODEC_FLAC_H +#define AVCODEC_FLAC_H + +#include "libavutil/intreadwrite.h" + +#define FLAC_STREAMINFO_SIZE 34 +#define FLAC_MAX_CHANNELS 8 +#define FLAC_MIN_BLOCKSIZE 16 +#define FLAC_MAX_BLOCKSIZE 65535 +#define FLAC_MIN_FRAME_SIZE 10 + +enum { + FLAC_CHMODE_INDEPENDENT = 0, + FLAC_CHMODE_LEFT_SIDE = 1, + FLAC_CHMODE_RIGHT_SIDE = 2, + FLAC_CHMODE_MID_SIDE = 3, +}; + +enum { + FLAC_METADATA_TYPE_STREAMINFO = 0, + FLAC_METADATA_TYPE_PADDING, + FLAC_METADATA_TYPE_APPLICATION, + FLAC_METADATA_TYPE_SEEKTABLE, + FLAC_METADATA_TYPE_VORBIS_COMMENT, + FLAC_METADATA_TYPE_CUESHEET, + FLAC_METADATA_TYPE_PICTURE, + FLAC_METADATA_TYPE_INVALID = 127 +}; + +/** + * Parse the metadata block parameters from the header. 
+ * @param[in] block_header header data, at least 4 bytes + * @param[out] last indicator for last metadata block + * @param[out] type metadata block type + * @param[out] size metadata block size + */ +static av_always_inline void flac_parse_block_header(const uint8_t *block_header, + int *last, int *type, int *size) +{ + int tmp = *block_header; + if (last) + *last = tmp & 0x80; + if (type) + *type = tmp & 0x7F; + if (size) + *size = AV_RB24(block_header + 1); +} + +#endif /* AVCODEC_FLAC_H */ diff --git a/media/ffvpx/libavcodec/flac_parse.h b/media/ffvpx/libavcodec/flac_parse.h new file mode 100644 index 0000000000..67a7320bea --- /dev/null +++ b/media/ffvpx/libavcodec/flac_parse.h @@ -0,0 +1,89 @@ +/* + * FLAC (Free Lossless Audio Codec) decoder/parser common functions + * Copyright (c) 2008 Justin Ruggles + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * FLAC (Free Lossless Audio Codec) decoder/parser common functions + */ + +#ifndef AVCODEC_FLAC_PARSE_H +#define AVCODEC_FLAC_PARSE_H + +#include "avcodec.h" +#include "get_bits.h" + +typedef struct FLACStreaminfo { + int samplerate; /**< sample rate */ + int channels; /**< number of channels */ + int bps; /**< bits-per-sample */ + int max_blocksize; /**< maximum block size, in samples */ + int max_framesize; /**< maximum frame size, in bytes */ + int64_t samples; /**< total number of samples */ +} FLACStreaminfo; + +typedef struct FLACFrameInfo { + int samplerate; /**< sample rate */ + int channels; /**< number of channels */ + int bps; /**< bits-per-sample */ + int blocksize; /**< block size of the frame */ + int ch_mode; /**< channel decorrelation mode */ + int64_t frame_or_sample_num; /**< frame number or sample number */ + int is_var_size; /**< specifies if the stream uses variable + block sizes or a fixed block size; + also determines the meaning of + frame_or_sample_num */ +} FLACFrameInfo; + +/** + * Parse the Streaminfo metadata block + * @param[out] avctx codec context to set basic stream parameters + * @param[out] s where parsed information is stored + * @param[in] buffer pointer to start of 34-byte streaminfo data + * + * @return negative error code on failure or >= 0 on success + */ +int ff_flac_parse_streaminfo(AVCodecContext *avctx, struct FLACStreaminfo *s, + const uint8_t *buffer); + +/** + * Validate the FLAC extradata. + * @param[in] avctx codec context containing the extradata. + * @param[out] format extradata format. + * @param[out] streaminfo_start pointer to start of 34-byte STREAMINFO data. + * @return 1 if valid, 0 if not valid. 
+ */ +int ff_flac_is_extradata_valid(AVCodecContext *avctx, + uint8_t **streaminfo_start); + +/** + * Validate and decode a frame header. + * @param avctx AVCodecContext to use as av_log() context + * @param gb GetBitContext from which to read frame header + * @param[out] fi frame information + * @param log_level_offset log level offset. can be used to silence error messages. + * @return non-zero on error, 0 if ok + */ +int ff_flac_decode_frame_header(AVCodecContext *avctx, GetBitContext *gb, + FLACFrameInfo *fi, int log_level_offset); + +void ff_flac_set_channel_layout(AVCodecContext *avctx, int channels); + +#endif /* AVCODEC_FLAC_PARSE_H */ diff --git a/media/ffvpx/libavcodec/flacdata.c b/media/ffvpx/libavcodec/flacdata.c new file mode 100644 index 0000000000..d96e3e0966 --- /dev/null +++ b/media/ffvpx/libavcodec/flacdata.c @@ -0,0 +1,33 @@ +/* + * FLAC data + * Copyright (c) 2003 Alex Beregszaszi + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "flacdata.h" + +const int ff_flac_sample_rate_table[16] = +{ 0, + 88200, 176400, 192000, + 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000, + 0, 0, 0, 0 }; + +const int32_t ff_flac_blocksize_table[16] = { + 0, 192, 576<<0, 576<<1, 576<<2, 576<<3, 0, 0, +256<<0, 256<<1, 256<<2, 256<<3, 256<<4, 256<<5, 256<<6, 256<<7 +}; diff --git a/media/ffvpx/libavcodec/flacdata.h b/media/ffvpx/libavcodec/flacdata.h new file mode 100644 index 0000000000..ef21840777 --- /dev/null +++ b/media/ffvpx/libavcodec/flacdata.h @@ -0,0 +1,31 @@ +/* + * FLAC data header + * Copyright (c) 2003 Alex Beregszaszi + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FLACDATA_H +#define AVCODEC_FLACDATA_H + +#include <stdint.h> + +extern const int ff_flac_sample_rate_table[16]; + +extern const int32_t ff_flac_blocksize_table[16]; + +#endif /* AVCODEC_FLACDATA_H */ diff --git a/media/ffvpx/libavcodec/flacdec.c b/media/ffvpx/libavcodec/flacdec.c new file mode 100644 index 0000000000..cc778a8dff --- /dev/null +++ b/media/ffvpx/libavcodec/flacdec.c @@ -0,0 +1,846 @@ +/* + * FLAC (Free Lossless Audio Codec) decoder + * Copyright (c) 2003 Alex Beregszaszi + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * FLAC (Free Lossless Audio Codec) decoder + * @author Alex Beregszaszi + * @see http://flac.sourceforge.net/ + * + * This decoder can be used in 1 of 2 ways: Either raw FLAC data can be fed + * through, starting from the initial 'fLaC' signature; or by passing the + * 34-byte streaminfo structure through avctx->extradata[_size] followed + * by data starting with the 0xFFF8 marker. 
+ */ + +#include <limits.h> + +#include "libavutil/avassert.h" +#include "libavutil/crc.h" +#include "libavutil/opt.h" +#include "avcodec.h" +#include "codec_internal.h" +#include "get_bits.h" +#include "bytestream.h" +#include "golomb.h" +#include "flac.h" +#include "flacdata.h" +#include "flacdsp.h" +#include "flac_parse.h" +#include "thread.h" +#include "unary.h" + + +typedef struct FLACContext { + AVClass *class; + FLACStreaminfo stream_info; + + AVCodecContext *avctx; ///< parent AVCodecContext + GetBitContext gb; ///< GetBitContext initialized to start at the current frame + + int blocksize; ///< number of samples in the current frame + int sample_shift; ///< shift required to make output samples 16-bit or 32-bit + int ch_mode; ///< channel decorrelation type in the current frame + int got_streaminfo; ///< indicates if the STREAMINFO has been read + + int32_t *decoded[FLAC_MAX_CHANNELS]; ///< decoded samples + uint8_t *decoded_buffer; + unsigned int decoded_buffer_size; + int64_t *decoded_33bps; ///< decoded samples for a 33 bps subframe + uint8_t *decoded_buffer_33bps; + unsigned int decoded_buffer_size_33bps; + int buggy_lpc; ///< use workaround for old lavc encoded files + + FLACDSPContext dsp; +} FLACContext; + +static int allocate_buffers(FLACContext *s); + +static void flac_set_bps(FLACContext *s) +{ + enum AVSampleFormat req = s->avctx->request_sample_fmt; + int need32 = s->stream_info.bps > 16; + int want32 = av_get_bytes_per_sample(req) > 2; + int planar = av_sample_fmt_is_planar(req); + + if (need32 || want32) { + if (planar) + s->avctx->sample_fmt = AV_SAMPLE_FMT_S32P; + else + s->avctx->sample_fmt = AV_SAMPLE_FMT_S32; + s->sample_shift = 32 - s->stream_info.bps; + } else { + if (planar) + s->avctx->sample_fmt = AV_SAMPLE_FMT_S16P; + else + s->avctx->sample_fmt = AV_SAMPLE_FMT_S16; + s->sample_shift = 16 - s->stream_info.bps; + } +} + +static av_cold int flac_decode_init(AVCodecContext *avctx) +{ + uint8_t *streaminfo; + int ret; + FLACContext *s = 
avctx->priv_data; + s->avctx = avctx; + + /* for now, the raw FLAC header is allowed to be passed to the decoder as + frame data instead of extradata. */ + if (!avctx->extradata) + return 0; + + if (!ff_flac_is_extradata_valid(avctx, &streaminfo)) + return AVERROR_INVALIDDATA; + + /* initialize based on the demuxer-supplied streamdata header */ + ret = ff_flac_parse_streaminfo(avctx, &s->stream_info, streaminfo); + if (ret < 0) + return ret; + ret = allocate_buffers(s); + if (ret < 0) + return ret; + flac_set_bps(s); + ff_flacdsp_init(&s->dsp, avctx->sample_fmt, + s->stream_info.channels); + s->got_streaminfo = 1; + + return 0; +} + +static void dump_headers(AVCodecContext *avctx, FLACStreaminfo *s) +{ + av_log(avctx, AV_LOG_DEBUG, " Max Blocksize: %d\n", s->max_blocksize); + av_log(avctx, AV_LOG_DEBUG, " Max Framesize: %d\n", s->max_framesize); + av_log(avctx, AV_LOG_DEBUG, " Samplerate: %d\n", s->samplerate); + av_log(avctx, AV_LOG_DEBUG, " Channels: %d\n", s->channels); + av_log(avctx, AV_LOG_DEBUG, " Bits: %d\n", s->bps); +} + +static int allocate_buffers(FLACContext *s) +{ + int buf_size; + int ret; + + av_assert0(s->stream_info.max_blocksize); + + buf_size = av_samples_get_buffer_size(NULL, s->stream_info.channels, + s->stream_info.max_blocksize, + AV_SAMPLE_FMT_S32P, 0); + if (buf_size < 0) + return buf_size; + + av_fast_malloc(&s->decoded_buffer, &s->decoded_buffer_size, buf_size); + if (!s->decoded_buffer) + return AVERROR(ENOMEM); + + ret = av_samples_fill_arrays((uint8_t **)s->decoded, NULL, + s->decoded_buffer, + s->stream_info.channels, + s->stream_info.max_blocksize, + AV_SAMPLE_FMT_S32P, 0); + if (ret >= 0 && s->stream_info.bps == 32 && s->stream_info.channels == 2) { + buf_size = av_samples_get_buffer_size(NULL, 1, + s->stream_info.max_blocksize, + AV_SAMPLE_FMT_S64P, 0); + if (buf_size < 0) + return buf_size; + + av_fast_malloc(&s->decoded_buffer_33bps, &s->decoded_buffer_size_33bps, buf_size); + if (!s->decoded_buffer_33bps) + return 
AVERROR(ENOMEM); + + ret = av_samples_fill_arrays((uint8_t **)&s->decoded_33bps, NULL, + s->decoded_buffer_33bps, + 1, + s->stream_info.max_blocksize, + AV_SAMPLE_FMT_S64P, 0); + + } + return ret < 0 ? ret : 0; +} + +/** + * Parse the STREAMINFO from an inline header. + * @param s the flac decoding context + * @param buf input buffer, starting with the "fLaC" marker + * @param buf_size buffer size + * @return non-zero if metadata is invalid + */ +static int parse_streaminfo(FLACContext *s, const uint8_t *buf, int buf_size) +{ + int metadata_type, metadata_size, ret; + + if (buf_size < FLAC_STREAMINFO_SIZE+8) { + /* need more data */ + return 0; + } + flac_parse_block_header(&buf[4], NULL, &metadata_type, &metadata_size); + if (metadata_type != FLAC_METADATA_TYPE_STREAMINFO || + metadata_size != FLAC_STREAMINFO_SIZE) { + return AVERROR_INVALIDDATA; + } + ret = ff_flac_parse_streaminfo(s->avctx, &s->stream_info, &buf[8]); + if (ret < 0) + return ret; + ret = allocate_buffers(s); + if (ret < 0) + return ret; + flac_set_bps(s); + ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, + s->stream_info.channels); + s->got_streaminfo = 1; + + return 0; +} + +/** + * Determine the size of an inline header. 
+ * @param buf input buffer, starting with the "fLaC" marker + * @param buf_size buffer size + * @return number of bytes in the header, or 0 if more data is needed + */ +static int get_metadata_size(const uint8_t *buf, int buf_size) +{ + int metadata_last, metadata_size; + const uint8_t *buf_end = buf + buf_size; + + buf += 4; + do { + if (buf_end - buf < 4) + return AVERROR_INVALIDDATA; + flac_parse_block_header(buf, &metadata_last, NULL, &metadata_size); + buf += 4; + if (buf_end - buf < metadata_size) { + /* need more data in order to read the complete header */ + return AVERROR_INVALIDDATA; + } + buf += metadata_size; + } while (!metadata_last); + + return buf_size - (buf_end - buf); +} + +static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order) +{ + GetBitContext gb = s->gb; + int i, tmp, partition, method_type, rice_order; + int rice_bits, rice_esc; + int samples; + + method_type = get_bits(&gb, 2); + rice_order = get_bits(&gb, 4); + + samples = s->blocksize >> rice_order; + rice_bits = 4 + method_type; + rice_esc = (1 << rice_bits) - 1; + + decoded += pred_order; + i = pred_order; + + if (method_type > 1) { + av_log(s->avctx, AV_LOG_ERROR, "illegal residual coding method %d\n", + method_type); + return AVERROR_INVALIDDATA; + } + + if (samples << rice_order != s->blocksize) { + av_log(s->avctx, AV_LOG_ERROR, "invalid rice order: %i blocksize %i\n", + rice_order, s->blocksize); + return AVERROR_INVALIDDATA; + } + + if (pred_order > samples) { + av_log(s->avctx, AV_LOG_ERROR, "invalid predictor order: %i > %i\n", + pred_order, samples); + return AVERROR_INVALIDDATA; + } + + for (partition = 0; partition < (1 << rice_order); partition++) { + tmp = get_bits(&gb, rice_bits); + if (tmp == rice_esc) { + tmp = get_bits(&gb, 5); + for (; i < samples; i++) + *decoded++ = get_sbits_long(&gb, tmp); + } else { + int real_limit = (tmp > 1) ? 
(INT_MAX >> (tmp - 1)) + 2 : INT_MAX; + for (; i < samples; i++) { + int v = get_sr_golomb_flac(&gb, tmp, real_limit, 1); + if (v == 0x80000000){ + av_log(s->avctx, AV_LOG_ERROR, "invalid residual\n"); + return AVERROR_INVALIDDATA; + } + + *decoded++ = v; + } + } + i= 0; + } + + s->gb = gb; + + return 0; +} + +static int decode_subframe_fixed(FLACContext *s, int32_t *decoded, + int pred_order, int bps) +{ + const int blocksize = s->blocksize; + unsigned av_uninit(a), av_uninit(b), av_uninit(c), av_uninit(d); + int i; + int ret; + + /* warm up samples */ + for (i = 0; i < pred_order; i++) { + decoded[i] = get_sbits_long(&s->gb, bps); + } + + if ((ret = decode_residuals(s, decoded, pred_order)) < 0) + return ret; + + if (pred_order > 0) + a = decoded[pred_order-1]; + if (pred_order > 1) + b = a - decoded[pred_order-2]; + if (pred_order > 2) + c = b - decoded[pred_order-2] + decoded[pred_order-3]; + if (pred_order > 3) + d = c - decoded[pred_order-2] + 2U*decoded[pred_order-3] - decoded[pred_order-4]; + + switch (pred_order) { + case 0: + break; + case 1: + for (i = pred_order; i < blocksize; i++) + decoded[i] = a += decoded[i]; + break; + case 2: + for (i = pred_order; i < blocksize; i++) + decoded[i] = a += b += decoded[i]; + break; + case 3: + for (i = pred_order; i < blocksize; i++) + decoded[i] = a += b += c += decoded[i]; + break; + case 4: + for (i = pred_order; i < blocksize; i++) + decoded[i] = a += b += c += d += decoded[i]; + break; + default: + av_log(s->avctx, AV_LOG_ERROR, "illegal pred order %d\n", pred_order); + return AVERROR_INVALIDDATA; + } + + return 0; +} + +#define DECODER_SUBFRAME_FIXED_WIDE(residual) { \ + const int blocksize = s->blocksize; \ + int ret; \ + \ + if ((ret = decode_residuals(s, residual, pred_order)) < 0) \ + return ret; \ + \ + switch (pred_order) { \ + case 0: \ + for (int i = pred_order; i < blocksize; i++) \ + decoded[i] = residual[i]; \ + break; \ + case 1: \ + for (int i = pred_order; i < blocksize; i++) \ + decoded[i] = 
(int64_t)residual[i] + (int64_t)decoded[i-1];\ + break; \ + case 2: \ + for (int i = pred_order; i < blocksize; i++) \ + decoded[i] = (int64_t)residual[i] + 2*(int64_t)decoded[i-1] - (int64_t)decoded[i-2]; \ + break; \ + case 3: \ + for (int i = pred_order; i < blocksize; i++) \ + decoded[i] = (int64_t)residual[i] + 3*(int64_t)decoded[i-1] - 3*(int64_t)decoded[i-2] + (int64_t)decoded[i-3]; \ + break; \ + case 4: \ + for (int i = pred_order; i < blocksize; i++) \ + decoded[i] = (int64_t)residual[i] + 4*(int64_t)decoded[i-1] - 6*(int64_t)decoded[i-2] + 4*(int64_t)decoded[i-3] - (int64_t)decoded[i-4]; \ + break; \ + default: \ + av_log(s->avctx, AV_LOG_ERROR, "illegal pred order %d\n", pred_order); \ + return AVERROR_INVALIDDATA; \ + } \ + return 0; \ +} + +static int decode_subframe_fixed_wide(FLACContext *s, int32_t *decoded, + int pred_order, int bps) +{ + /* warm up samples */ + for (int i = 0; i < pred_order; i++) { + decoded[i] = get_sbits_long(&s->gb, bps); + } + DECODER_SUBFRAME_FIXED_WIDE(decoded); +} + + +static int decode_subframe_fixed_33bps(FLACContext *s, int64_t *decoded, + int32_t *residual, int pred_order) +{ + /* warm up samples */ \ + for (int i = 0; i < pred_order; i++) { \ + decoded[i] = get_sbits64(&s->gb, 33); \ + } \ + DECODER_SUBFRAME_FIXED_WIDE(residual); +} + +static void lpc_analyze_remodulate(SUINT32 *decoded, const int coeffs[32], + int order, int qlevel, int len, int bps) +{ + int i, j; + int ebps = 1 << (bps-1); + unsigned sigma = 0; + + for (i = order; i < len; i++) + sigma |= decoded[i] + ebps; + + if (sigma < 2*ebps) + return; + + for (i = len - 1; i >= order; i--) { + int64_t p = 0; + for (j = 0; j < order; j++) + p += coeffs[j] * (int64_t)(int32_t)decoded[i-order+j]; + decoded[i] -= p >> qlevel; + } + for (i = order; i < len; i++, decoded++) { + int32_t p = 0; + for (j = 0; j < order; j++) + p += coeffs[j] * (uint32_t)decoded[j]; + decoded[j] += p >> qlevel; + } +} + +static int decode_subframe_lpc(FLACContext *s, int32_t *decoded, 
int pred_order, + int bps) +{ + int i, ret; + int coeff_prec, qlevel; + int coeffs[32]; + + /* warm up samples */ + for (i = 0; i < pred_order; i++) { + decoded[i] = get_sbits_long(&s->gb, bps); + } + + coeff_prec = get_bits(&s->gb, 4) + 1; + if (coeff_prec == 16) { + av_log(s->avctx, AV_LOG_ERROR, "invalid coeff precision\n"); + return AVERROR_INVALIDDATA; + } + qlevel = get_sbits(&s->gb, 5); + if (qlevel < 0) { + av_log(s->avctx, AV_LOG_ERROR, "qlevel %d not supported, maybe buggy stream\n", + qlevel); + return AVERROR_INVALIDDATA; + } + + for (i = 0; i < pred_order; i++) { + coeffs[pred_order - i - 1] = get_sbits(&s->gb, coeff_prec); + } + + if ((ret = decode_residuals(s, decoded, pred_order)) < 0) + return ret; + + if ( ( s->buggy_lpc && s->stream_info.bps <= 16) + || ( !s->buggy_lpc && bps <= 16 + && bps + coeff_prec + av_log2(pred_order) <= 32)) { + s->dsp.lpc16(decoded, coeffs, pred_order, qlevel, s->blocksize); + } else { + s->dsp.lpc32(decoded, coeffs, pred_order, qlevel, s->blocksize); + if (s->stream_info.bps <= 16) + lpc_analyze_remodulate(decoded, coeffs, pred_order, qlevel, s->blocksize, bps); + } + + return 0; +} + +static int decode_subframe_lpc_33bps(FLACContext *s, int64_t *decoded, + int32_t *residual, int pred_order) +{ + int i, j, ret; + int coeff_prec, qlevel; + int coeffs[32]; + + /* warm up samples */ + for (i = 0; i < pred_order; i++) { + decoded[i] = get_sbits64(&s->gb, 33); + } + + coeff_prec = get_bits(&s->gb, 4) + 1; + if (coeff_prec == 16) { + av_log(s->avctx, AV_LOG_ERROR, "invalid coeff precision\n"); + return AVERROR_INVALIDDATA; + } + qlevel = get_sbits(&s->gb, 5); + if (qlevel < 0) { + av_log(s->avctx, AV_LOG_ERROR, "qlevel %d not supported, maybe buggy stream\n", + qlevel); + return AVERROR_INVALIDDATA; + } + + for (i = 0; i < pred_order; i++) { + coeffs[pred_order - i - 1] = get_sbits(&s->gb, coeff_prec); + } + + if ((ret = decode_residuals(s, residual, pred_order)) < 0) + return ret; + + for (i = pred_order; i < s->blocksize; 
i++, decoded++) { + int64_t sum = 0; + for (j = 0; j < pred_order; j++) + sum += (int64_t)coeffs[j] * decoded[j]; + decoded[j] = residual[i] + (sum >> qlevel); + } + + return 0; +} + +static inline int decode_subframe(FLACContext *s, int channel) +{ + int32_t *decoded = s->decoded[channel]; + int type, wasted = 0; + int bps = s->stream_info.bps; + int i, ret; + + if (channel == 0) { + if (s->ch_mode == FLAC_CHMODE_RIGHT_SIDE) + bps++; + } else { + if (s->ch_mode == FLAC_CHMODE_LEFT_SIDE || s->ch_mode == FLAC_CHMODE_MID_SIDE) + bps++; + } + + if (get_bits1(&s->gb)) { + av_log(s->avctx, AV_LOG_ERROR, "invalid subframe padding\n"); + return AVERROR_INVALIDDATA; + } + type = get_bits(&s->gb, 6); + + if (get_bits1(&s->gb)) { + int left = get_bits_left(&s->gb); + if ( left <= 0 || + (left < bps && !show_bits_long(&s->gb, left)) || + !show_bits_long(&s->gb, bps-1)) { + av_log(s->avctx, AV_LOG_ERROR, + "Invalid number of wasted bits > available bits (%d) - left=%d\n", + bps, left); + return AVERROR_INVALIDDATA; + } + wasted = 1 + get_unary(&s->gb, 1, get_bits_left(&s->gb)); + bps -= wasted; + } + +//FIXME use av_log2 for types + if (type == 0) { + if (bps < 33) { + int32_t tmp = get_sbits_long(&s->gb, bps); + for (i = 0; i < s->blocksize; i++) + decoded[i] = tmp; + } else { + int64_t tmp = get_sbits64(&s->gb, 33); + for (i = 0; i < s->blocksize; i++) + s->decoded_33bps[i] = tmp; + } + } else if (type == 1) { + if (bps < 33) { + for (i = 0; i < s->blocksize; i++) + decoded[i] = get_sbits_long(&s->gb, bps); + } else { + for (i = 0; i < s->blocksize; i++) + s->decoded_33bps[i] = get_sbits64(&s->gb, 33); + } + } else if ((type >= 8) && (type <= 12)) { + int order = type & ~0x8; + if (bps < 33) { + if (bps + order <= 32) { + if ((ret = decode_subframe_fixed(s, decoded, order, bps)) < 0) + return ret; + } else { + if ((ret = decode_subframe_fixed_wide(s, decoded, order, bps)) < 0) + return ret; + } + } else { + if ((ret = decode_subframe_fixed_33bps(s, s->decoded_33bps, decoded, 
order)) < 0) + return ret; + } + } else if (type >= 32) { + if (bps < 33) { + if ((ret = decode_subframe_lpc(s, decoded, (type & ~0x20)+1, bps)) < 0) + return ret; + } else { + if ((ret = decode_subframe_lpc_33bps(s, s->decoded_33bps, decoded, (type & ~0x20)+1)) < 0) + return ret; + } + } else { + av_log(s->avctx, AV_LOG_ERROR, "invalid coding type\n"); + return AVERROR_INVALIDDATA; + } + + if (wasted) { + if (wasted+bps == 33) { + int i; + for (i = 0; i < s->blocksize; i++) + s->decoded_33bps[i] = (uint64_t)decoded[i] << wasted; + } else if (wasted < 32) { + int i; + for (i = 0; i < s->blocksize; i++) + decoded[i] = (unsigned)decoded[i] << wasted; + } + } + + return 0; +} + +static int decode_frame(FLACContext *s) +{ + int i, ret; + GetBitContext *gb = &s->gb; + FLACFrameInfo fi; + + if ((ret = ff_flac_decode_frame_header(s->avctx, gb, &fi, 0)) < 0) { + av_log(s->avctx, AV_LOG_ERROR, "invalid frame header\n"); + return ret; + } + + if ( s->stream_info.channels + && fi.channels != s->stream_info.channels + && s->got_streaminfo) { + s->stream_info.channels = fi.channels; + ff_flac_set_channel_layout(s->avctx, fi.channels); + ret = allocate_buffers(s); + if (ret < 0) + return ret; + } + s->stream_info.channels = fi.channels; + ff_flac_set_channel_layout(s->avctx, fi.channels); + s->ch_mode = fi.ch_mode; + + if (!s->stream_info.bps && !fi.bps) { + av_log(s->avctx, AV_LOG_ERROR, "bps not found in STREAMINFO or frame header\n"); + return AVERROR_INVALIDDATA; + } + if (!fi.bps) { + fi.bps = s->stream_info.bps; + } else if (s->stream_info.bps && fi.bps != s->stream_info.bps) { + av_log(s->avctx, AV_LOG_ERROR, "switching bps mid-stream is not " + "supported\n"); + return AVERROR_INVALIDDATA; + } + + if (!s->stream_info.bps) { + s->stream_info.bps = s->avctx->bits_per_raw_sample = fi.bps; + flac_set_bps(s); + } + + if (!s->stream_info.max_blocksize) + s->stream_info.max_blocksize = FLAC_MAX_BLOCKSIZE; + if (fi.blocksize > s->stream_info.max_blocksize) { + av_log(s->avctx, 
AV_LOG_ERROR, "blocksize %d > %d\n", fi.blocksize, + s->stream_info.max_blocksize); + return AVERROR_INVALIDDATA; + } + s->blocksize = fi.blocksize; + + if (!s->stream_info.samplerate && !fi.samplerate) { + av_log(s->avctx, AV_LOG_ERROR, "sample rate not found in STREAMINFO" + " or frame header\n"); + return AVERROR_INVALIDDATA; + } + if (fi.samplerate == 0) + fi.samplerate = s->stream_info.samplerate; + s->stream_info.samplerate = s->avctx->sample_rate = fi.samplerate; + + if (!s->got_streaminfo) { + ret = allocate_buffers(s); + if (ret < 0) + return ret; + s->got_streaminfo = 1; + dump_headers(s->avctx, &s->stream_info); + } + ff_flacdsp_init(&s->dsp, s->avctx->sample_fmt, + s->stream_info.channels); + +// dump_headers(s->avctx, &s->stream_info); + + /* subframes */ + for (i = 0; i < s->stream_info.channels; i++) { + if ((ret = decode_subframe(s, i)) < 0) + return ret; + } + + align_get_bits(gb); + + /* frame footer */ + skip_bits(gb, 16); /* data crc */ + + return 0; +} + +static void decorrelate_33bps(int ch_mode, int32_t **decoded, int64_t *decoded_33bps, int len) +{ + int i; + if (ch_mode == FLAC_CHMODE_LEFT_SIDE ) { + for (i = 0; i < len; i++) + decoded[1][i] = decoded[0][i] - decoded_33bps[i]; + } else if (ch_mode == FLAC_CHMODE_RIGHT_SIDE ) { + for (i = 0; i < len; i++) + decoded[0][i] = decoded[1][i] + decoded_33bps[i]; + } else if (ch_mode == FLAC_CHMODE_MID_SIDE ) { + for (i = 0; i < len; i++) { + uint64_t a = decoded[0][i]; + int64_t b = decoded_33bps[i]; + a -= b >> 1; + decoded[0][i] = (a + b); + decoded[1][i] = a; + } + } +} + +static int flac_decode_frame(AVCodecContext *avctx, AVFrame *frame, + int *got_frame_ptr, AVPacket *avpkt) +{ + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; + FLACContext *s = avctx->priv_data; + int bytes_read = 0; + int ret; + + *got_frame_ptr = 0; + + if (buf_size > 5 && !memcmp(buf, "\177FLAC", 5)) { + av_log(s->avctx, AV_LOG_DEBUG, "skipping flac header packet 1\n"); + return buf_size; + } + + if 
(buf_size > 0 && (*buf & 0x7F) == FLAC_METADATA_TYPE_VORBIS_COMMENT) { + av_log(s->avctx, AV_LOG_DEBUG, "skipping vorbis comment\n"); + return buf_size; + } + + /* check that there is at least the smallest decodable amount of data. + this amount corresponds to the smallest valid FLAC frame possible. + FF F8 69 02 00 00 9A 00 00 34 */ + if (buf_size < FLAC_MIN_FRAME_SIZE) + return buf_size; + + /* check for inline header */ + if (AV_RB32(buf) == MKBETAG('f','L','a','C')) { + if (!s->got_streaminfo && (ret = parse_streaminfo(s, buf, buf_size))) { + av_log(s->avctx, AV_LOG_ERROR, "invalid header\n"); + return ret; + } + return get_metadata_size(buf, buf_size); + } + + /* decode frame */ + if ((ret = init_get_bits8(&s->gb, buf, buf_size)) < 0) + return ret; + if ((ret = decode_frame(s)) < 0) { + av_log(s->avctx, AV_LOG_ERROR, "decode_frame() failed\n"); + return ret; + } + bytes_read = get_bits_count(&s->gb)/8; + + if ((s->avctx->err_recognition & (AV_EF_CRCCHECK|AV_EF_COMPLIANT)) && + av_crc(av_crc_get_table(AV_CRC_16_ANSI), + 0, buf, bytes_read)) { + av_log(s->avctx, AV_LOG_ERROR, "CRC error at PTS %"PRId64"\n", avpkt->pts); + if (s->avctx->err_recognition & AV_EF_EXPLODE) + return AVERROR_INVALIDDATA; + } + + /* get output buffer */ + frame->nb_samples = s->blocksize; + if ((ret = ff_thread_get_buffer(avctx, frame, 0)) < 0) + return ret; + + if (s->stream_info.bps == 32 && s->ch_mode > 0) { + decorrelate_33bps(s->ch_mode, s->decoded, s->decoded_33bps, s->blocksize); + s->dsp.decorrelate[0](frame->data, s->decoded, s->stream_info.channels, + s->blocksize, s->sample_shift); + } else { + s->dsp.decorrelate[s->ch_mode](frame->data, s->decoded, + s->stream_info.channels, + s->blocksize, s->sample_shift); + } + + if (bytes_read > buf_size) { + av_log(s->avctx, AV_LOG_ERROR, "overread: %d\n", bytes_read - buf_size); + return AVERROR_INVALIDDATA; + } + if (bytes_read < buf_size) { + av_log(s->avctx, AV_LOG_DEBUG, "underread: %d orig size: %d\n", + buf_size - bytes_read, 
buf_size); + } + + *got_frame_ptr = 1; + + return bytes_read; +} + +static av_cold int flac_decode_close(AVCodecContext *avctx) +{ + FLACContext *s = avctx->priv_data; + + av_freep(&s->decoded_buffer); + av_freep(&s->decoded_buffer_33bps); + + return 0; +} + +static const AVOption options[] = { +{ "use_buggy_lpc", "emulate old buggy lavc behavior", offsetof(FLACContext, buggy_lpc), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_AUDIO_PARAM }, +{ NULL }, +}; + +static const AVClass flac_decoder_class = { + .class_name = "FLAC decoder", + .item_name = av_default_item_name, + .option = options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const FFCodec ff_flac_decoder = { + .p.name = "flac", + CODEC_LONG_NAME("FLAC (Free Lossless Audio Codec)"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_FLAC, + .priv_data_size = sizeof(FLACContext), + .init = flac_decode_init, + .close = flac_decode_close, + FF_CODEC_DECODE_CB(flac_decode_frame), + .p.capabilities = AV_CODEC_CAP_CHANNEL_CONF | + AV_CODEC_CAP_DR1 | + AV_CODEC_CAP_FRAME_THREADS, + .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16, + AV_SAMPLE_FMT_S16P, + AV_SAMPLE_FMT_S32, + AV_SAMPLE_FMT_S32P, + AV_SAMPLE_FMT_NONE }, + .p.priv_class = &flac_decoder_class, +}; diff --git a/media/ffvpx/libavcodec/flacdsp.c b/media/ffvpx/libavcodec/flacdsp.c new file mode 100644 index 0000000000..42e231db53 --- /dev/null +++ b/media/ffvpx/libavcodec/flacdsp.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/internal.h" +#include "libavutil/samplefmt.h" +#include "flacdsp.h" +#include "config.h" + +#define SAMPLE_SIZE 16 +#define PLANAR 0 +#include "flacdsp_template.c" + +#undef PLANAR +#define PLANAR 1 +#include "flacdsp_template.c" + +#undef SAMPLE_SIZE +#undef PLANAR +#define SAMPLE_SIZE 32 +#define PLANAR 0 +#include "flacdsp_template.c" + +#undef PLANAR +#define PLANAR 1 +#include "flacdsp_template.c" + +static void flac_lpc_16_c(int32_t *decoded, const int coeffs[32], + int pred_order, int qlevel, int len) +{ + int i, j; + + for (i = pred_order; i < len - 1; i += 2, decoded += 2) { + SUINT c = coeffs[0]; + SUINT d = decoded[0]; + int s0 = 0, s1 = 0; + for (j = 1; j < pred_order; j++) { + s0 += c*d; + d = decoded[j]; + s1 += c*d; + c = coeffs[j]; + } + s0 += c*d; + d = decoded[j] += (SUINT)(s0 >> qlevel); + s1 += c*d; + decoded[j + 1] += (SUINT)(s1 >> qlevel); + } + if (i < len) { + int sum = 0; + for (j = 0; j < pred_order; j++) + sum += coeffs[j] * (SUINT)decoded[j]; + decoded[j] = decoded[j] + (unsigned)(sum >> qlevel); + } +} + +static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32], + int pred_order, int qlevel, int len) +{ + int i, j; + + for (i = pred_order; i < len; i++, decoded++) { + int64_t sum = 0; + for (j = 0; j < pred_order; j++) + sum += (int64_t)coeffs[j] * decoded[j]; + decoded[j] += sum >> qlevel; + } + +} + +av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels) +{ + c->lpc16 
= flac_lpc_16_c; + c->lpc32 = flac_lpc_32_c; + + switch (fmt) { + case AV_SAMPLE_FMT_S32: + c->decorrelate[0] = flac_decorrelate_indep_c_32; + c->decorrelate[1] = flac_decorrelate_ls_c_32; + c->decorrelate[2] = flac_decorrelate_rs_c_32; + c->decorrelate[3] = flac_decorrelate_ms_c_32; + break; + + case AV_SAMPLE_FMT_S32P: + c->decorrelate[0] = flac_decorrelate_indep_c_32p; + c->decorrelate[1] = flac_decorrelate_ls_c_32p; + c->decorrelate[2] = flac_decorrelate_rs_c_32p; + c->decorrelate[3] = flac_decorrelate_ms_c_32p; + break; + + case AV_SAMPLE_FMT_S16: + c->decorrelate[0] = flac_decorrelate_indep_c_16; + c->decorrelate[1] = flac_decorrelate_ls_c_16; + c->decorrelate[2] = flac_decorrelate_rs_c_16; + c->decorrelate[3] = flac_decorrelate_ms_c_16; + break; + + case AV_SAMPLE_FMT_S16P: + c->decorrelate[0] = flac_decorrelate_indep_c_16p; + c->decorrelate[1] = flac_decorrelate_ls_c_16p; + c->decorrelate[2] = flac_decorrelate_rs_c_16p; + c->decorrelate[3] = flac_decorrelate_ms_c_16p; + break; + } + +#if ARCH_ARM + ff_flacdsp_init_arm(c, fmt, channels); +#elif ARCH_X86 + ff_flacdsp_init_x86(c, fmt, channels); +#endif +} diff --git a/media/ffvpx/libavcodec/flacdsp.h b/media/ffvpx/libavcodec/flacdsp.h new file mode 100644 index 0000000000..9f8ed38b66 --- /dev/null +++ b/media/ffvpx/libavcodec/flacdsp.h @@ -0,0 +1,43 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FLACDSP_H +#define AVCODEC_FLACDSP_H + +#include <stdint.h> + +#include "libavutil/samplefmt.h" + +typedef struct FLACDSPContext { + void (*decorrelate[4])(uint8_t **out, int32_t **in, int channels, + int len, int shift); + void (*lpc16)(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + void (*lpc32)(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + void (*lpc16_encode)(int32_t *res, const int32_t *smp, int len, int order, + const int32_t coefs[32], int shift); + void (*lpc32_encode)(int32_t *res, const int32_t *smp, int len, int order, + const int32_t coefs[32], int shift); +} FLACDSPContext; + +void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int channels); +void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels); +void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels); + +#endif /* AVCODEC_FLACDSP_H */ diff --git a/media/ffvpx/libavcodec/flacdsp_lpc_template.c b/media/ffvpx/libavcodec/flacdsp_lpc_template.c new file mode 100644 index 0000000000..dd847d3b32 --- /dev/null +++ b/media/ffvpx/libavcodec/flacdsp_lpc_template.c @@ -0,0 +1,159 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include "libavutil/common.h" +#include "mathops.h" + +#undef FUNC +#undef sum_type +#undef MUL +#undef CLIP +#undef FSUF + +#define FUNC(n) AV_JOIN(n ## _, SAMPLE_SIZE) + +#if SAMPLE_SIZE == 32 +# define sum_type int64_t +# define MUL(a, b) MUL64(a, b) +# define CLIP(x) av_clipl_int32(x) +#else +# define sum_type int32_t +# define MUL(a, b) ((a) * (b)) +# define CLIP(x) (x) +#endif + +#define LPC1(x) { \ + int c = coefs[(x)-1]; \ + p0 += MUL(c, s); \ + s = smp[i-(x)+1]; \ + p1 += MUL(c, s); \ +} + +static av_always_inline void FUNC(lpc_encode_unrolled)(int32_t *res, + const int32_t *smp, int len, int order, + const int32_t *coefs, int shift, int big) +{ + int i; + for (i = order; i < len; i += 2) { + int s = smp[i-order]; + sum_type p0 = 0, p1 = 0; + if (big) { + switch (order) { + case 32: LPC1(32) + case 31: LPC1(31) + case 30: LPC1(30) + case 29: LPC1(29) + case 28: LPC1(28) + case 27: LPC1(27) + case 26: LPC1(26) + case 25: LPC1(25) + case 24: LPC1(24) + case 23: LPC1(23) + case 22: LPC1(22) + case 21: LPC1(21) + case 20: LPC1(20) + case 19: LPC1(19) + case 18: LPC1(18) + case 17: LPC1(17) + case 16: LPC1(16) + case 15: LPC1(15) + case 14: LPC1(14) + case 13: LPC1(13) + case 12: LPC1(12) + case 11: LPC1(11) + case 10: LPC1(10) + case 9: LPC1( 9) + LPC1( 8) + LPC1( 7) + LPC1( 6) + LPC1( 5) + LPC1( 4) + LPC1( 3) + LPC1( 2) + LPC1( 1) + } + } else { + switch (order) { + case 8: LPC1( 8) + case 7: LPC1( 7) + case 6: LPC1( 6) + case 5: LPC1( 5) + case 4: LPC1( 4) + case 3: LPC1( 3) + case 2: LPC1( 2) + case 1: LPC1( 1) + } + } + res[i ] = smp[i ] - CLIP(p0 >> shift); + res[i+1] = smp[i+1] - CLIP(p1 >> shift); + } +} + +static void FUNC(flac_lpc_encode_c)(int32_t *res, const int32_t *smp, int len, + int order, const 
int32_t *coefs, int shift) +{ + int i; + for (i = 0; i < order; i++) + res[i] = smp[i]; +#if CONFIG_SMALL + for (i = order; i < len; i += 2) { + int j; + int s = smp[i]; + sum_type p0 = 0, p1 = 0; + for (j = 0; j < order; j++) { + int c = coefs[j]; + p1 += MUL(c, s); + s = smp[i-j-1]; + p0 += MUL(c, s); + } + res[i ] = smp[i ] - CLIP(p0 >> shift); + res[i+1] = smp[i+1] - CLIP(p1 >> shift); + } +#else + switch (order) { + case 1: FUNC(lpc_encode_unrolled)(res, smp, len, 1, coefs, shift, 0); break; + case 2: FUNC(lpc_encode_unrolled)(res, smp, len, 2, coefs, shift, 0); break; + case 3: FUNC(lpc_encode_unrolled)(res, smp, len, 3, coefs, shift, 0); break; + case 4: FUNC(lpc_encode_unrolled)(res, smp, len, 4, coefs, shift, 0); break; + case 5: FUNC(lpc_encode_unrolled)(res, smp, len, 5, coefs, shift, 0); break; + case 6: FUNC(lpc_encode_unrolled)(res, smp, len, 6, coefs, shift, 0); break; + case 7: FUNC(lpc_encode_unrolled)(res, smp, len, 7, coefs, shift, 0); break; + case 8: FUNC(lpc_encode_unrolled)(res, smp, len, 8, coefs, shift, 0); break; + default: FUNC(lpc_encode_unrolled)(res, smp, len, order, coefs, shift, 1); break; + } +#endif +} + +/* Comment for clarity/de-obfuscation. + * + * for (int i = order; i < len; i++) { + * int32_t p = 0; + * for (int j = 0; j < order; j++) { + * int c = coefs[j]; + * int s = smp[(i-1)-j]; + * p += c*s; + * } + * res[i] = smp[i] - (p >> shift); + * } + * + * The CONFIG_SMALL code above simplifies to this, in the case of SAMPLE_SIZE + * not being equal to 32 (at the present time that means for 16-bit audio). The + * code above does 2 samples per iteration. Commit bfdd5bc (made all the way + * back in 2007) says that way is faster. 
+ */ diff --git a/media/ffvpx/libavcodec/flacdsp_template.c b/media/ffvpx/libavcodec/flacdsp_template.c new file mode 100644 index 0000000000..0a6fe59e28 --- /dev/null +++ b/media/ffvpx/libavcodec/flacdsp_template.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include "libavutil/macros.h" + +#undef FUNC +#undef FSUF +#undef sample +#undef sample_type +#undef OUT +#undef S + +#if SAMPLE_SIZE == 32 +# define sample_type int32_t +#else +# define sample_type int16_t +#endif + +#if PLANAR +# define FSUF AV_JOIN(SAMPLE_SIZE, p) +# define sample sample_type * +# define OUT(n) n +# define S(s, c, i) (s[c][i]) +#else +# define FSUF SAMPLE_SIZE +# define sample sample_type +# define OUT(n) n[0] +# define S(s, c, i) (*s++) +#endif + +#define FUNC(n) AV_JOIN(n ## _, FSUF) + +static void FUNC(flac_decorrelate_indep_c)(uint8_t **out, int32_t **in, + int channels, int len, int shift) +{ + sample *samples = (sample *) OUT(out); + int i, j; + + for (j = 0; j < len; j++) + for (i = 0; i < channels; i++) + S(samples, i, j) = (int)((unsigned)in[i][j] << shift); +} + +static void FUNC(flac_decorrelate_ls_c)(uint8_t **out, int32_t **in, + int channels, int len, int 
shift) +{ + sample *samples = (sample *) OUT(out); + int i; + + for (i = 0; i < len; i++) { + unsigned a = in[0][i]; + unsigned b = in[1][i]; + S(samples, 0, i) = a << shift; + S(samples, 1, i) = (a - b) << shift; + } +} + +static void FUNC(flac_decorrelate_rs_c)(uint8_t **out, int32_t **in, + int channels, int len, int shift) +{ + sample *samples = (sample *) OUT(out); + int i; + + for (i = 0; i < len; i++) { + unsigned a = in[0][i]; + unsigned b = in[1][i]; + S(samples, 0, i) = (a + b) << shift; + S(samples, 1, i) = b << shift; + } +} + +static void FUNC(flac_decorrelate_ms_c)(uint8_t **out, int32_t **in, + int channels, int len, int shift) +{ + sample *samples = (sample *) OUT(out); + int i; + + for (i = 0; i < len; i++) { + unsigned a = in[0][i]; + int b = in[1][i]; + a -= b >> 1; + S(samples, 0, i) = (a + b) << shift; + S(samples, 1, i) = a << shift; + } +} diff --git a/media/ffvpx/libavcodec/frame_thread_encoder.h b/media/ffvpx/libavcodec/frame_thread_encoder.h new file mode 100644 index 0000000000..201cba2a8f --- /dev/null +++ b/media/ffvpx/libavcodec/frame_thread_encoder.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FRAME_THREAD_ENCODER_H +#define AVCODEC_FRAME_THREAD_ENCODER_H + +#include "avcodec.h" + +/** + * Initialize frame thread encoder. + * @note hardware encoders are not supported + */ +int ff_frame_thread_encoder_init(AVCodecContext *avctx); +void ff_frame_thread_encoder_free(AVCodecContext *avctx); +int ff_thread_video_encode_frame(AVCodecContext *avctx, AVPacket *pkt, + AVFrame *frame, int *got_packet_ptr); + +#endif /* AVCODEC_FRAME_THREAD_ENCODER_H */ diff --git a/media/ffvpx/libavcodec/get_bits.h b/media/ffvpx/libavcodec/get_bits.h new file mode 100644 index 0000000000..65dc080ddb --- /dev/null +++ b/media/ffvpx/libavcodec/get_bits.h @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * bitstream reader API header. 
+ */ + +#ifndef AVCODEC_GET_BITS_H +#define AVCODEC_GET_BITS_H + +#include <stdint.h> + +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/avassert.h" + +#include "defs.h" +#include "mathops.h" +#include "vlc.h" + +/* + * Safe bitstream reading: + * optionally, the get_bits API can check to ensure that we + * don't read past input buffer boundaries. This is protected + * with CONFIG_SAFE_BITSTREAM_READER at the global level, and + * then below that with UNCHECKED_BITSTREAM_READER at the per- + * decoder level. This means that decoders that check internally + * can "#define UNCHECKED_BITSTREAM_READER 1" to disable + * overread checks. + * Boundary checking causes a minor performance penalty so for + * applications that won't want/need this, it can be disabled + * globally using "#define CONFIG_SAFE_BITSTREAM_READER 0". + */ +#ifndef UNCHECKED_BITSTREAM_READER +#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER +#endif + +#ifndef CACHED_BITSTREAM_READER +#define CACHED_BITSTREAM_READER 0 +#endif + +#if CACHED_BITSTREAM_READER + +// we always want the LE implementation, to provide get_bits_le() +#define BITSTREAM_LE + +#ifndef BITSTREAM_READER_LE +# define BITSTREAM_BE +# define BITSTREAM_DEFAULT_BE +#endif + +#include "bitstream.h" + +#undef BITSTREAM_LE +#undef BITSTREAM_BE +#undef BITSTREAM_DEFAULT_BE + +typedef BitstreamContext GetBitContext; + +#define get_bits_count bits_tell +#define get_bits_left bits_left +#define skip_bits_long bits_skip +#define skip_bits bits_skip +#define get_bits bits_read_nz +#define get_bitsz bits_read +#define get_bits_long bits_read +#define get_bits1 bits_read_bit +#define get_bits64 bits_read_64 +#define get_xbits bits_read_xbits +#define get_sbits bits_read_signed_nz +#define get_sbits_long bits_read_signed +#define show_bits bits_peek +#define show_bits_long bits_peek +#define init_get_bits bits_init +#define init_get_bits8 bits_init8 +#define align_get_bits bits_align +#define 
get_vlc2 bits_read_vlc + +#define init_get_bits8_le(s, buffer, byte_size) bits_init8_le((BitstreamContextLE*)s, buffer, byte_size) +#define get_bits_le(s, n) bits_read_le((BitstreamContextLE*)s, n) + +#define show_bits1(s) bits_peek(s, 1) +#define skip_bits1(s) bits_skip(s, 1) + +#define skip_1stop_8data_bits bits_skip_1stop_8data + +#else // CACHED_BITSTREAM_READER + +typedef struct GetBitContext { + const uint8_t *buffer, *buffer_end; + int index; + int size_in_bits; + int size_in_bits_plus8; +} GetBitContext; + +static inline unsigned int get_bits(GetBitContext *s, int n); +static inline void skip_bits(GetBitContext *s, int n); +static inline unsigned int show_bits(GetBitContext *s, int n); + +/* Bitstream reader API docs: + * name + * arbitrary name which is used as prefix for the internal variables + * + * gb + * getbitcontext + * + * OPEN_READER(name, gb) + * load gb into local variables + * + * CLOSE_READER(name, gb) + * store local vars in gb + * + * UPDATE_CACHE(name, gb) + * Refill the internal cache from the bitstream. + * After this call at least MIN_CACHE_BITS will be available. + * + * GET_CACHE(name, gb) + * Will output the contents of the internal cache, + * next bit is MSB of 32 or 64 bits (FIXME 64 bits). + * + * SHOW_UBITS(name, gb, num) + * Will return the next num bits. + * + * SHOW_SBITS(name, gb, num) + * Will return the next num bits and do sign extension. + * + * SKIP_BITS(name, gb, num) + * Will skip over the next num bits. + * Note, this is equivalent to SKIP_CACHE; SKIP_COUNTER. + * + * SKIP_CACHE(name, gb, num) + * Will remove the next num bits from the cache (note SKIP_COUNTER + * MUST be called before UPDATE_CACHE / CLOSE_READER). + * + * SKIP_COUNTER(name, gb, num) + * Will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS). + * + * LAST_SKIP_BITS(name, gb, num) + * Like SKIP_BITS, to be used if next call is UPDATE_CACHE or CLOSE_READER. 
+ * + * BITS_LEFT(name, gb) + * Return the number of bits left + * + * For examples see get_bits, show_bits, skip_bits, get_vlc. + */ + +#if defined LONG_BITSTREAM_READER +# define MIN_CACHE_BITS 32 +#else +# define MIN_CACHE_BITS 25 +#endif + +#define OPEN_READER_NOSIZE(name, gb) \ + unsigned int name ## _index = (gb)->index; \ + unsigned int av_unused name ## _cache + +#if UNCHECKED_BITSTREAM_READER +#define OPEN_READER(name, gb) OPEN_READER_NOSIZE(name, gb) + +#define BITS_AVAILABLE(name, gb) 1 +#else +#define OPEN_READER(name, gb) \ + OPEN_READER_NOSIZE(name, gb); \ + unsigned int name ## _size_plus8 = (gb)->size_in_bits_plus8 + +#define BITS_AVAILABLE(name, gb) name ## _index < name ## _size_plus8 +#endif + +#define CLOSE_READER(name, gb) (gb)->index = name ## _index + +# ifdef LONG_BITSTREAM_READER + +# define UPDATE_CACHE_LE(name, gb) name ## _cache = \ + AV_RL64((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7) + +# define UPDATE_CACHE_BE(name, gb) name ## _cache = \ + AV_RB64((gb)->buffer + (name ## _index >> 3)) >> (32 - (name ## _index & 7)) + +#else + +# define UPDATE_CACHE_LE(name, gb) name ## _cache = \ + AV_RL32((gb)->buffer + (name ## _index >> 3)) >> (name ## _index & 7) + +# define UPDATE_CACHE_BE(name, gb) name ## _cache = \ + AV_RB32((gb)->buffer + (name ## _index >> 3)) << (name ## _index & 7) + +#endif + + +#ifdef BITSTREAM_READER_LE + +# define UPDATE_CACHE(name, gb) UPDATE_CACHE_LE(name, gb) + +# define SKIP_CACHE(name, gb, num) name ## _cache >>= (num) + +#else + +# define UPDATE_CACHE(name, gb) UPDATE_CACHE_BE(name, gb) + +# define SKIP_CACHE(name, gb, num) name ## _cache <<= (num) + +#endif + +#if UNCHECKED_BITSTREAM_READER +# define SKIP_COUNTER(name, gb, num) name ## _index += (num) +#else +# define SKIP_COUNTER(name, gb, num) \ + name ## _index = FFMIN(name ## _size_plus8, name ## _index + (num)) +#endif + +#define BITS_LEFT(name, gb) ((int)((gb)->size_in_bits - name ## _index)) + +#define SKIP_BITS(name, gb, num) \ + do { 
\ + SKIP_CACHE(name, gb, num); \ + SKIP_COUNTER(name, gb, num); \ + } while (0) + +#define LAST_SKIP_BITS(name, gb, num) SKIP_COUNTER(name, gb, num) + +#define SHOW_UBITS_LE(name, gb, num) zero_extend(name ## _cache, num) +#define SHOW_SBITS_LE(name, gb, num) sign_extend(name ## _cache, num) + +#define SHOW_UBITS_BE(name, gb, num) NEG_USR32(name ## _cache, num) +#define SHOW_SBITS_BE(name, gb, num) NEG_SSR32(name ## _cache, num) + +#ifdef BITSTREAM_READER_LE +# define SHOW_UBITS(name, gb, num) SHOW_UBITS_LE(name, gb, num) +# define SHOW_SBITS(name, gb, num) SHOW_SBITS_LE(name, gb, num) +#else +# define SHOW_UBITS(name, gb, num) SHOW_UBITS_BE(name, gb, num) +# define SHOW_SBITS(name, gb, num) SHOW_SBITS_BE(name, gb, num) +#endif + +#define GET_CACHE(name, gb) ((uint32_t) name ## _cache) + + +static inline int get_bits_count(const GetBitContext *s) +{ + return s->index; +} + +/** + * Skips the specified number of bits. + * @param n the number of bits to skip, + * For the UNCHECKED_BITSTREAM_READER this must not cause the distance + * from the start to overflow int32_t. Staying within the bitstream + padding + * is sufficient, too. + */ +static inline void skip_bits_long(GetBitContext *s, int n) +{ +#if UNCHECKED_BITSTREAM_READER + s->index += n; +#else + s->index += av_clip(n, -s->index, s->size_in_bits_plus8 - s->index); +#endif +} + +/** + * Read MPEG-1 dc-style VLC (sign bit + mantissa with no MSB). 
+ * if MSB not set it is negative + * @param n length in bits + */ +static inline int get_xbits(GetBitContext *s, int n) +{ + register int sign; + register int32_t cache; + OPEN_READER(re, s); + av_assert2(n>0 && n<=25); + UPDATE_CACHE(re, s); + cache = GET_CACHE(re, s); + sign = ~cache >> 31; + LAST_SKIP_BITS(re, s, n); + CLOSE_READER(re, s); + return (NEG_USR32(sign ^ cache, n) ^ sign) - sign; +} + +static inline int get_xbits_le(GetBitContext *s, int n) +{ + register int sign; + register int32_t cache; + OPEN_READER(re, s); + av_assert2(n>0 && n<=25); + UPDATE_CACHE_LE(re, s); + cache = GET_CACHE(re, s); + sign = sign_extend(~cache, n) >> 31; + LAST_SKIP_BITS(re, s, n); + CLOSE_READER(re, s); + return (zero_extend(sign ^ cache, n) ^ sign) - sign; +} + +static inline int get_sbits(GetBitContext *s, int n) +{ + register int tmp; + OPEN_READER(re, s); + av_assert2(n>0 && n<=25); + UPDATE_CACHE(re, s); + tmp = SHOW_SBITS(re, s, n); + LAST_SKIP_BITS(re, s, n); + CLOSE_READER(re, s); + return tmp; +} + +/** + * Read 1-25 bits. + */ +static inline unsigned int get_bits(GetBitContext *s, int n) +{ + register unsigned int tmp; + OPEN_READER(re, s); + av_assert2(n>0 && n<=25); + UPDATE_CACHE(re, s); + tmp = SHOW_UBITS(re, s, n); + LAST_SKIP_BITS(re, s, n); + CLOSE_READER(re, s); + av_assert2(tmp < UINT64_C(1) << n); + return tmp; +} + +/** + * Read 0-25 bits. + */ +static av_always_inline int get_bitsz(GetBitContext *s, int n) +{ + return n ? get_bits(s, n) : 0; +} + +static inline unsigned int get_bits_le(GetBitContext *s, int n) +{ + register int tmp; + OPEN_READER(re, s); + av_assert2(n>0 && n<=25); + UPDATE_CACHE_LE(re, s); + tmp = SHOW_UBITS_LE(re, s, n); + LAST_SKIP_BITS(re, s, n); + CLOSE_READER(re, s); + return tmp; +} + +/** + * Show 1-25 bits. 
+ */ +static inline unsigned int show_bits(GetBitContext *s, int n) +{ + register unsigned int tmp; + OPEN_READER_NOSIZE(re, s); + av_assert2(n>0 && n<=25); + UPDATE_CACHE(re, s); + tmp = SHOW_UBITS(re, s, n); + return tmp; +} + +static inline void skip_bits(GetBitContext *s, int n) +{ + OPEN_READER(re, s); + LAST_SKIP_BITS(re, s, n); + CLOSE_READER(re, s); +} + +static inline unsigned int get_bits1(GetBitContext *s) +{ + unsigned int index = s->index; + uint8_t result = s->buffer[index >> 3]; +#ifdef BITSTREAM_READER_LE + result >>= index & 7; + result &= 1; +#else + result <<= index & 7; + result >>= 8 - 1; +#endif +#if !UNCHECKED_BITSTREAM_READER + if (s->index < s->size_in_bits_plus8) +#endif + index++; + s->index = index; + + return result; +} + +static inline unsigned int show_bits1(GetBitContext *s) +{ + return show_bits(s, 1); +} + +static inline void skip_bits1(GetBitContext *s) +{ + skip_bits(s, 1); +} + +/** + * Read 0-32 bits. + */ +static inline unsigned int get_bits_long(GetBitContext *s, int n) +{ + av_assert2(n>=0 && n<=32); + if (!n) { + return 0; + } else if (n <= MIN_CACHE_BITS) { + return get_bits(s, n); + } else { +#ifdef BITSTREAM_READER_LE + unsigned ret = get_bits(s, 16); + return ret | (get_bits(s, n - 16) << 16); +#else + unsigned ret = get_bits(s, 16) << (n - 16); + return ret | get_bits(s, n - 16); +#endif + } +} + +/** + * Read 0-64 bits. + */ +static inline uint64_t get_bits64(GetBitContext *s, int n) +{ + if (n <= 32) { + return get_bits_long(s, n); + } else { +#ifdef BITSTREAM_READER_LE + uint64_t ret = get_bits_long(s, 32); + return ret | (uint64_t) get_bits_long(s, n - 32) << 32; +#else + uint64_t ret = (uint64_t) get_bits_long(s, n - 32) << 32; + return ret | get_bits_long(s, 32); +#endif + } +} + +/** + * Read 0-32 bits as a signed integer. 
+ */ +static inline int get_sbits_long(GetBitContext *s, int n) +{ + // sign_extend(x, 0) is undefined + if (!n) + return 0; + + return sign_extend(get_bits_long(s, n), n); +} + +/** + * Read 0-64 bits as a signed integer. + */ +static inline int64_t get_sbits64(GetBitContext *s, int n) +{ + // sign_extend(x, 0) is undefined + if (!n) + return 0; + + return sign_extend64(get_bits64(s, n), n); +} + +/** + * Show 0-32 bits. + */ +static inline unsigned int show_bits_long(GetBitContext *s, int n) +{ + if (n <= MIN_CACHE_BITS) { + return show_bits(s, n); + } else { + GetBitContext gb = *s; + return get_bits_long(&gb, n); + } +} + + +/** + * Initialize GetBitContext. + * @param buffer bitstream buffer, must be AV_INPUT_BUFFER_PADDING_SIZE bytes + * larger than the actual read bits because some optimized bitstream + * readers read 32 or 64 bit at once and could read over the end + * @param bit_size the size of the buffer in bits + * @return 0 on success, AVERROR_INVALIDDATA if the buffer_size would overflow. + */ +static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer, + int bit_size) +{ + int buffer_size; + int ret = 0; + + if (bit_size >= INT_MAX - FFMAX(7, AV_INPUT_BUFFER_PADDING_SIZE*8) || bit_size < 0 || !buffer) { + bit_size = 0; + buffer = NULL; + ret = AVERROR_INVALIDDATA; + } + + buffer_size = (bit_size + 7) >> 3; + + s->buffer = buffer; + s->size_in_bits = bit_size; + s->size_in_bits_plus8 = bit_size + 8; + s->buffer_end = buffer + buffer_size; + s->index = 0; + + return ret; +} + +/** + * Initialize GetBitContext. + * @param buffer bitstream buffer, must be AV_INPUT_BUFFER_PADDING_SIZE bytes + * larger than the actual read bits because some optimized bitstream + * readers read 32 or 64 bit at once and could read over the end + * @param byte_size the size of the buffer in bytes + * @return 0 on success, AVERROR_INVALIDDATA if the buffer_size would overflow. 
+ */ +static inline int init_get_bits8(GetBitContext *s, const uint8_t *buffer, + int byte_size) +{ + if (byte_size > INT_MAX / 8 || byte_size < 0) + byte_size = -1; + return init_get_bits(s, buffer, byte_size * 8); +} + +static inline int init_get_bits8_le(GetBitContext *s, const uint8_t *buffer, + int byte_size) +{ + if (byte_size > INT_MAX / 8 || byte_size < 0) + byte_size = -1; + return init_get_bits(s, buffer, byte_size * 8); +} + +static inline const uint8_t *align_get_bits(GetBitContext *s) +{ + int n = -get_bits_count(s) & 7; + if (n) + skip_bits(s, n); + return s->buffer + (s->index >> 3); +} + +/** + * If the vlc code is invalid and max_depth=1, then no bits will be removed. + * If the vlc code is invalid and max_depth>1, then the number of bits removed + * is undefined. + */ +#define GET_VLC(code, name, gb, table, bits, max_depth) \ + do { \ + int n, nb_bits; \ + unsigned int index; \ + \ + index = SHOW_UBITS(name, gb, bits); \ + code = table[index].sym; \ + n = table[index].len; \ + \ + if (max_depth > 1 && n < 0) { \ + LAST_SKIP_BITS(name, gb, bits); \ + UPDATE_CACHE(name, gb); \ + \ + nb_bits = -n; \ + \ + index = SHOW_UBITS(name, gb, nb_bits) + code; \ + code = table[index].sym; \ + n = table[index].len; \ + if (max_depth > 2 && n < 0) { \ + LAST_SKIP_BITS(name, gb, nb_bits); \ + UPDATE_CACHE(name, gb); \ + \ + nb_bits = -n; \ + \ + index = SHOW_UBITS(name, gb, nb_bits) + code; \ + code = table[index].sym; \ + n = table[index].len; \ + } \ + } \ + SKIP_BITS(name, gb, n); \ + } while (0) + +#define GET_RL_VLC(level, run, name, gb, table, bits, \ + max_depth, need_update) \ + do { \ + int n, nb_bits; \ + unsigned int index; \ + \ + index = SHOW_UBITS(name, gb, bits); \ + level = table[index].level; \ + n = table[index].len; \ + \ + if (max_depth > 1 && n < 0) { \ + SKIP_BITS(name, gb, bits); \ + if (need_update) { \ + UPDATE_CACHE(name, gb); \ + } \ + \ + nb_bits = -n; \ + \ + index = SHOW_UBITS(name, gb, nb_bits) + level; \ + level = 
table[index].level; \ + n = table[index].len; \ + if (max_depth > 2 && n < 0) { \ + LAST_SKIP_BITS(name, gb, nb_bits); \ + if (need_update) { \ + UPDATE_CACHE(name, gb); \ + } \ + nb_bits = -n; \ + \ + index = SHOW_UBITS(name, gb, nb_bits) + level; \ + level = table[index].level; \ + n = table[index].len; \ + } \ + } \ + run = table[index].run; \ + SKIP_BITS(name, gb, n); \ + } while (0) + +/** + * Parse a vlc code. + * @param bits is the number of bits which will be read at once, must be + * identical to nb_bits in init_vlc() + * @param max_depth is the number of times bits bits must be read to completely + * read the longest vlc code + * = (max_vlc_length + bits - 1) / bits + * @returns the code parsed or -1 if no vlc matches + */ +static av_always_inline int get_vlc2(GetBitContext *s, const VLCElem *table, + int bits, int max_depth) +{ + int code; + + OPEN_READER(re, s); + UPDATE_CACHE(re, s); + + GET_VLC(code, re, s, table, bits, max_depth); + + CLOSE_READER(re, s); + + return code; +} + +static inline int decode012(GetBitContext *gb) +{ + int n; + n = get_bits1(gb); + if (n == 0) + return 0; + else + return get_bits1(gb) + 1; +} + +static inline int decode210(GetBitContext *gb) +{ + if (get_bits1(gb)) + return 0; + else + return 2 - get_bits1(gb); +} + +static inline int get_bits_left(GetBitContext *gb) +{ + return gb->size_in_bits - get_bits_count(gb); +} + +static inline int skip_1stop_8data_bits(GetBitContext *gb) +{ + if (get_bits_left(gb) <= 0) + return AVERROR_INVALIDDATA; + + while (get_bits1(gb)) { + skip_bits(gb, 8); + if (get_bits_left(gb) <= 0) + return AVERROR_INVALIDDATA; + } + + return 0; +} + +#endif // CACHED_BITSTREAM_READER + +#endif /* AVCODEC_GET_BITS_H */ diff --git a/media/ffvpx/libavcodec/get_buffer.c b/media/ffvpx/libavcodec/get_buffer.c new file mode 100644 index 0000000000..a04fd878de --- /dev/null +++ b/media/ffvpx/libavcodec/get_buffer.c @@ -0,0 +1,304 @@ +/* + * The default get_buffer2() implementation + * + * This file is part of 
FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/avassert.h" +#include "libavutil/avutil.h" +#include "libavutil/buffer.h" +#include "libavutil/frame.h" +#include "libavutil/hwcontext.h" +#include "libavutil/imgutils.h" +#include "libavutil/mem.h" +#include "libavutil/samplefmt.h" +#include "libavutil/version.h" + +#include "avcodec.h" +#include "internal.h" + +typedef struct FramePool { + /** + * Pools for each data plane. For audio all the planes have the same size, + * so only pools[0] is used. 
+ */ + AVBufferPool *pools[4]; + + /* + * Pool parameters + */ + int format; + int width, height; + int stride_align[AV_NUM_DATA_POINTERS]; + int linesize[4]; + int planes; + int channels; + int samples; +} FramePool; + +static void frame_pool_free(void *opaque, uint8_t *data) +{ + FramePool *pool = (FramePool*)data; + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(pool->pools); i++) + av_buffer_pool_uninit(&pool->pools[i]); + + av_freep(&data); +} + +static AVBufferRef *frame_pool_alloc(void) +{ + FramePool *pool = av_mallocz(sizeof(*pool)); + AVBufferRef *buf; + + if (!pool) + return NULL; + + buf = av_buffer_create((uint8_t*)pool, sizeof(*pool), + frame_pool_free, NULL, 0); + if (!buf) { + av_freep(&pool); + return NULL; + } + + return buf; +} + +static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) +{ + FramePool *pool = avctx->internal->pool ? + (FramePool*)avctx->internal->pool->data : NULL; + AVBufferRef *pool_buf; + int i, ret, ch, planes; + + if (avctx->codec_type == AVMEDIA_TYPE_AUDIO) { + int planar = av_sample_fmt_is_planar(frame->format); + ch = frame->ch_layout.nb_channels; +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + if (!ch) + ch = frame->channels; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + planes = planar ? 
ch : 1; + } + + if (pool && pool->format == frame->format) { + if (avctx->codec_type == AVMEDIA_TYPE_VIDEO && + pool->width == frame->width && pool->height == frame->height) + return 0; + if (avctx->codec_type == AVMEDIA_TYPE_AUDIO && pool->planes == planes && + pool->channels == ch && frame->nb_samples == pool->samples) + return 0; + } + + pool_buf = frame_pool_alloc(); + if (!pool_buf) + return AVERROR(ENOMEM); + pool = (FramePool*)pool_buf->data; + + switch (avctx->codec_type) { + case AVMEDIA_TYPE_VIDEO: { + int linesize[4]; + int w = frame->width; + int h = frame->height; + int unaligned; + ptrdiff_t linesize1[4]; + size_t size[4]; + + avcodec_align_dimensions2(avctx, &w, &h, pool->stride_align); + + do { + // NOTE: do not align linesizes individually, this breaks e.g. assumptions + // that linesize[0] == 2*linesize[1] in the MPEG-encoder for 4:2:2 + ret = av_image_fill_linesizes(linesize, avctx->pix_fmt, w); + if (ret < 0) + goto fail; + // increase alignment of w for next try (rhs gives the lowest bit set in w) + w += w & ~(w - 1); + + unaligned = 0; + for (i = 0; i < 4; i++) + unaligned |= linesize[i] % pool->stride_align[i]; + } while (unaligned); + + for (i = 0; i < 4; i++) + linesize1[i] = linesize[i]; + ret = av_image_fill_plane_sizes(size, avctx->pix_fmt, h, linesize1); + if (ret < 0) + goto fail; + + for (i = 0; i < 4; i++) { + pool->linesize[i] = linesize[i]; + if (size[i]) { + if (size[i] > INT_MAX - (16 + STRIDE_ALIGN - 1)) { + ret = AVERROR(EINVAL); + goto fail; + } + pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, + CONFIG_MEMORY_POISONING ? 
+ NULL : + av_buffer_allocz); + if (!pool->pools[i]) { + ret = AVERROR(ENOMEM); + goto fail; + } + } + } + pool->format = frame->format; + pool->width = frame->width; + pool->height = frame->height; + + break; + } + case AVMEDIA_TYPE_AUDIO: { + ret = av_samples_get_buffer_size(&pool->linesize[0], ch, + frame->nb_samples, frame->format, 0); + if (ret < 0) + goto fail; + + pool->pools[0] = av_buffer_pool_init(pool->linesize[0], NULL); + if (!pool->pools[0]) { + ret = AVERROR(ENOMEM); + goto fail; + } + + pool->format = frame->format; + pool->planes = planes; + pool->channels = ch; + pool->samples = frame->nb_samples; + break; + } + default: av_assert0(0); + } + + av_buffer_unref(&avctx->internal->pool); + avctx->internal->pool = pool_buf; + + return 0; +fail: + av_buffer_unref(&pool_buf); + return ret; +} + +static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame) +{ + FramePool *pool = (FramePool*)avctx->internal->pool->data; + int planes = pool->planes; + int i; + + frame->linesize[0] = pool->linesize[0]; + + if (planes > AV_NUM_DATA_POINTERS) { + frame->extended_data = av_calloc(planes, sizeof(*frame->extended_data)); + frame->nb_extended_buf = planes - AV_NUM_DATA_POINTERS; + frame->extended_buf = av_calloc(frame->nb_extended_buf, + sizeof(*frame->extended_buf)); + if (!frame->extended_data || !frame->extended_buf) { + av_freep(&frame->extended_data); + av_freep(&frame->extended_buf); + return AVERROR(ENOMEM); + } + } else { + frame->extended_data = frame->data; + av_assert0(frame->nb_extended_buf == 0); + } + + for (i = 0; i < FFMIN(planes, AV_NUM_DATA_POINTERS); i++) { + frame->buf[i] = av_buffer_pool_get(pool->pools[0]); + if (!frame->buf[i]) + goto fail; + frame->extended_data[i] = frame->data[i] = frame->buf[i]->data; + } + for (i = 0; i < frame->nb_extended_buf; i++) { + frame->extended_buf[i] = av_buffer_pool_get(pool->pools[0]); + if (!frame->extended_buf[i]) + goto fail; + frame->extended_data[i + AV_NUM_DATA_POINTERS] = 
frame->extended_buf[i]->data; + } + + if (avctx->debug & FF_DEBUG_BUFFERS) + av_log(avctx, AV_LOG_DEBUG, "default_get_buffer called on frame %p", frame); + + return 0; +fail: + av_frame_unref(frame); + return AVERROR(ENOMEM); +} + +static int video_get_buffer(AVCodecContext *s, AVFrame *pic) +{ + FramePool *pool = (FramePool*)s->internal->pool->data; + int i; + + if (pic->data[0] || pic->data[1] || pic->data[2] || pic->data[3]) { + av_log(s, AV_LOG_ERROR, "pic->data[*]!=NULL in avcodec_default_get_buffer\n"); + return -1; + } + + memset(pic->data, 0, sizeof(pic->data)); + pic->extended_data = pic->data; + + for (i = 0; i < 4 && pool->pools[i]; i++) { + pic->linesize[i] = pool->linesize[i]; + + pic->buf[i] = av_buffer_pool_get(pool->pools[i]); + if (!pic->buf[i]) + goto fail; + + pic->data[i] = pic->buf[i]->data; + } + for (; i < AV_NUM_DATA_POINTERS; i++) { + pic->data[i] = NULL; + pic->linesize[i] = 0; + } + + if (s->debug & FF_DEBUG_BUFFERS) + av_log(s, AV_LOG_DEBUG, "default_get_buffer called on pic %p\n", pic); + + return 0; +fail: + av_frame_unref(pic); + return AVERROR(ENOMEM); +} + +int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags) +{ + int ret; + + if (avctx->hw_frames_ctx) { + ret = av_hwframe_get_buffer(avctx->hw_frames_ctx, frame, 0); + frame->width = avctx->coded_width; + frame->height = avctx->coded_height; + return ret; + } + + if ((ret = update_frame_pool(avctx, frame)) < 0) + return ret; + + switch (avctx->codec_type) { + case AVMEDIA_TYPE_VIDEO: + return video_get_buffer(avctx, frame); + case AVMEDIA_TYPE_AUDIO: + return audio_get_buffer(avctx, frame); + default: + return -1; + } +} diff --git a/media/ffvpx/libavcodec/golomb.c b/media/ffvpx/libavcodec/golomb.c new file mode 100644 index 0000000000..f9ca8149eb --- /dev/null +++ b/media/ffvpx/libavcodec/golomb.c @@ -0,0 +1,173 @@ +/* + * exp golomb vlc stuff + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * @brief + * exp golomb vlc stuff + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#include <stdint.h> + +const uint8_t ff_golomb_vlc_len[512]={ +19,17,15,15,13,13,13,13,11,11,11,11,11,11,11,11,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, +7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +const 
uint8_t ff_ue_golomb_vlc_code[512]={ +32,32,32,32,32,32,32,32,31,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30, + 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +const int8_t ff_se_golomb_vlc_code[512]={ + 17, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17, 17, 17, 17, 17, 8, -8, 9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, + 4, 4, 4, 4, -4, -4, -4, -4, 5, 5, 5, 5, -5, -5, -5, -5, 6, 6, 6, 6, -6, -6, -6, -6, 7, 7, 7, 7, -7, -7, -7, -7, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 
-2, -2, -2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + + +const uint8_t ff_ue_golomb_len[256]={ + 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,11, +11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,13, +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,15, +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 
+15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,17, +}; + +const uint8_t ff_interleaved_golomb_vlc_len[256]={ +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +}; + +const uint8_t ff_interleaved_ue_golomb_vlc_code[256]={ + 15,16,7, 7, 17,18,8, 8, 3, 3, 3, 3, 3, 3, 3, 3, + 19,20,9, 9, 21,22,10,10,4, 4, 4, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 23,24,11,11,25,26,12,12,5, 5, 5, 5, 5, 5, 5, 5, + 27,28,13,13,29,30,14,14,6, 6, 6, 6, 6, 6, 6, 6, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const int8_t ff_interleaved_se_golomb_vlc_code[256]={ + 8, -8, 4, 4, 9, -9, -4, -4, 2, 2, 2, 2, 2, 2, 2, 2, + 10,-10, 5, 5, 11,-11, -5, -5, -2, -2, -2, -2, -2, -2, -2, -2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 12,-12, 6, 6, 13,-13, -6, -6, 3, 3, 3, 3, 3, 3, 3, 3, + 14,-14, 7, 7, 15,-15, -7, -7, -3, -3, -3, -3, -3, -3, -3, -3, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]={ +0, 1, 0, 0, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +4, 5, 2, 2, 6, 7, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +8, 9, 4, 4, 10,11,5, 5, 2, 2, 2, 2, 2, 2, 2, 2, +12,13,6, 6, 14,15,7, 7, 3, 3, 3, 3, 3, 3, 3, 3, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}; diff --git a/media/ffvpx/libavcodec/golomb.h b/media/ffvpx/libavcodec/golomb.h new file mode 100644 index 0000000000..164c2583b6 --- /dev/null +++ b/media/ffvpx/libavcodec/golomb.h @@ -0,0 +1,616 @@ +/* + * exp golomb vlc stuff + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2004 Alex Beregszaszi + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * @brief + * exp golomb vlc stuff + * @author Michael Niedermayer <michaelni@gmx.at> and Alex Beregszaszi + */ + +#ifndef AVCODEC_GOLOMB_H +#define AVCODEC_GOLOMB_H + +#include <stdint.h> + +#include "get_bits.h" + +#define INVALID_VLC 0x80000000 + +extern const uint8_t ff_golomb_vlc_len[512]; +extern const uint8_t ff_ue_golomb_vlc_code[512]; +extern const int8_t ff_se_golomb_vlc_code[512]; + +extern const uint8_t ff_interleaved_golomb_vlc_len[256]; +extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256]; +extern const int8_t ff_interleaved_se_golomb_vlc_code[256]; +extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]; + +/** + * Read an unsigned Exp-Golomb code in the range 0 to 8190. + * + * @returns the read value or a negative error code. 
+ */ +static inline int get_ue_golomb(GetBitContext *gb) +{ + unsigned int buf; + +#if CACHED_BITSTREAM_READER + buf = show_bits_long(gb, 32); + + if (buf >= (1 << 27)) { + buf >>= 32 - 9; + skip_bits_long(gb, ff_golomb_vlc_len[buf]); + + return ff_ue_golomb_vlc_code[buf]; + } else { + int log = 2 * av_log2(buf) - 31; + + skip_bits_long(gb, 32 - log); + if (log < 7) + return AVERROR_INVALIDDATA; + buf >>= log; + buf--; + + return buf; + } +#else + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + if (buf >= (1 << 27)) { + buf >>= 32 - 9; + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_ue_golomb_vlc_code[buf]; + } else { + int log = 2 * av_log2(buf) - 31; + LAST_SKIP_BITS(re, gb, 32 - log); + CLOSE_READER(re, gb); + if (log < 7) + return AVERROR_INVALIDDATA; + buf >>= log; + buf--; + + return buf; + } +#endif +} + +/** + * Read an unsigned Exp-Golomb code in the range 0 to UINT32_MAX-1. + */ +static inline unsigned get_ue_golomb_long(GetBitContext *gb) +{ + unsigned buf, log; + + buf = show_bits_long(gb, 32); + log = 31 - av_log2(buf); + skip_bits_long(gb, log); + + return get_bits_long(gb, log + 1) - 1; +} + +/** + * read unsigned exp golomb code, constraint to a max of 31. + * If the value encountered is not in 0..31, the return value + * is outside the range 0..30. 
+ */ +static inline int get_ue_golomb_31(GetBitContext *gb) +{ + unsigned int buf; + +#if CACHED_BITSTREAM_READER + buf = show_bits_long(gb, 32); + + buf >>= 32 - 9; + skip_bits_long(gb, ff_golomb_vlc_len[buf]); +#else + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + buf >>= 32 - 9; + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); +#endif + + return ff_ue_golomb_vlc_code[buf]; +} + +static inline unsigned get_interleaved_ue_golomb(GetBitContext *gb) +{ + uint32_t buf; + +#if CACHED_BITSTREAM_READER + buf = show_bits_long(gb, 32); + + if (buf & 0xAA800000) { + buf >>= 32 - 8; + skip_bits_long(gb, ff_interleaved_golomb_vlc_len[buf]); + + return ff_interleaved_ue_golomb_vlc_code[buf]; + } else { + unsigned ret = 1; + + do { + buf >>= 32 - 8; + skip_bits_long(gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8)); + + if (ff_interleaved_golomb_vlc_len[buf] != 9) { + ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1; + ret |= ff_interleaved_dirac_golomb_vlc_code[buf]; + break; + } + ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf]; + buf = show_bits_long(gb, 32); + } while (get_bits_left(gb) > 0); + + return ret - 1; + } +#else + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + if (buf & 0xAA800000) { + buf >>= 32 - 8; + LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_interleaved_ue_golomb_vlc_code[buf]; + } else { + unsigned ret = 1; + + do { + buf >>= 32 - 8; + LAST_SKIP_BITS(re, gb, + FFMIN(ff_interleaved_golomb_vlc_len[buf], 8)); + + if (ff_interleaved_golomb_vlc_len[buf] != 9) { + ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1; + ret |= ff_interleaved_dirac_golomb_vlc_code[buf]; + break; + } + ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf]; + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + } while (ret<0x8000000U && BITS_AVAILABLE(re, gb)); + + CLOSE_READER(re, gb); + return ret - 1; + } +#endif +} + 
+/** + * read unsigned truncated exp golomb code. + */ +static inline int get_te0_golomb(GetBitContext *gb, int range) +{ + av_assert2(range >= 1); + + if (range == 1) + return 0; + else if (range == 2) + return get_bits1(gb) ^ 1; + else + return get_ue_golomb(gb); +} + +/** + * read unsigned truncated exp golomb code. + */ +static inline int get_te_golomb(GetBitContext *gb, int range) +{ + av_assert2(range >= 1); + + if (range == 2) + return get_bits1(gb) ^ 1; + else + return get_ue_golomb(gb); +} + +/** + * read signed exp golomb code. + */ +static inline int get_se_golomb(GetBitContext *gb) +{ + unsigned int buf; + +#if CACHED_BITSTREAM_READER + buf = show_bits_long(gb, 32); + + if (buf >= (1 << 27)) { + buf >>= 32 - 9; + skip_bits_long(gb, ff_golomb_vlc_len[buf]); + + return ff_se_golomb_vlc_code[buf]; + } else { + int log = 2 * av_log2(buf) - 31; + buf >>= log; + + skip_bits_long(gb, 32 - log); + + if (buf & 1) + buf = -(buf >> 1); + else + buf = (buf >> 1); + + return buf; + } +#else + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + if (buf >= (1 << 27)) { + buf >>= 32 - 9; + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_se_golomb_vlc_code[buf]; + } else { + int log = av_log2(buf), sign; + LAST_SKIP_BITS(re, gb, 31 - log); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + buf >>= log; + + LAST_SKIP_BITS(re, gb, 32 - log); + CLOSE_READER(re, gb); + + sign = -(buf & 1); + buf = ((buf >> 1) ^ sign) - sign; + + return buf; + } +#endif +} + +static inline int get_se_golomb_long(GetBitContext *gb) +{ + unsigned int buf = get_ue_golomb_long(gb); + int sign = (buf & 1) - 1; + return ((buf >> 1) ^ sign) + 1; +} + +static inline int get_interleaved_se_golomb(GetBitContext *gb) +{ + unsigned int buf; + +#if CACHED_BITSTREAM_READER + buf = show_bits_long(gb, 32); + + if (buf & 0xAA800000) { + buf >>= 32 - 8; + skip_bits_long(gb, ff_interleaved_golomb_vlc_len[buf]); + + return 
ff_interleaved_se_golomb_vlc_code[buf]; + } else { + int log; + skip_bits(gb, 8); + buf |= 1 | show_bits(gb, 24); + + if ((buf & 0xAAAAAAAA) == 0) + return INVALID_VLC; + + for (log = 31; (buf & 0x80000000) == 0; log--) + buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30); + + skip_bits_long(gb, 63 - 2 * log - 8); + + return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1; + } +#else + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + if (buf & 0xAA800000) { + buf >>= 32 - 8; + LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_interleaved_se_golomb_vlc_code[buf]; + } else { + int log; + LAST_SKIP_BITS(re, gb, 8); + UPDATE_CACHE(re, gb); + buf |= 1 | (GET_CACHE(re, gb) >> 8); + + if ((buf & 0xAAAAAAAA) == 0) + return INVALID_VLC; + + for (log = 31; (buf & 0x80000000) == 0; log--) + buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30); + + LAST_SKIP_BITS(re, gb, 63 - 2 * log - 8); + CLOSE_READER(re, gb); + + return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1; + } +#endif +} + +static inline int dirac_get_se_golomb(GetBitContext *gb) +{ + uint32_t ret = get_interleaved_ue_golomb(gb); + + if (ret) { + int sign = -get_bits1(gb); + ret = (ret ^ sign) - sign; + } + + return ret; +} + +/** + * read unsigned golomb rice code (ffv1). 
+ */ +static inline int get_ur_golomb(GetBitContext *gb, int k, int limit, + int esc_len) +{ + unsigned int buf; + int log; + +#if CACHED_BITSTREAM_READER + buf = show_bits_long(gb, 32); + + log = av_log2(buf); + + if (log > 31 - limit) { + buf >>= log - k; + buf += (30 - log) << k; + skip_bits_long(gb, 32 + k - log); + + return buf; + } else { + skip_bits_long(gb, limit); + buf = get_bits_long(gb, esc_len); + + return buf + limit - 1; + } +#else + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + log = av_log2(buf); + + if (log > 31 - limit) { + buf >>= log - k; + buf += (30U - log) << k; + LAST_SKIP_BITS(re, gb, 32 + k - log); + CLOSE_READER(re, gb); + + return buf; + } else { + LAST_SKIP_BITS(re, gb, limit); + UPDATE_CACHE(re, gb); + + buf = SHOW_UBITS(re, gb, esc_len); + + LAST_SKIP_BITS(re, gb, esc_len); + CLOSE_READER(re, gb); + + return buf + limit - 1; + } +#endif +} + +/** + * read unsigned golomb rice code (jpegls). + */ +static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit, + int esc_len) +{ + unsigned int buf; + int log; + +#if CACHED_BITSTREAM_READER + buf = show_bits_long(gb, 32); + + log = av_log2(buf); + + if (log - k >= 1 && 32 - log < limit) { + buf >>= log - k; + buf += (30 - log) << k; + skip_bits_long(gb, 32 + k - log); + + return buf; + } else { + int i; + for (i = 0; + i < limit && get_bits1(gb) == 0 && get_bits_left(gb) > 0; + i++); + + if (i < limit - 1) { + buf = get_bits_long(gb, k); + + return buf + (i << k); + } else if (i == limit - 1) { + buf = get_bits_long(gb, esc_len); + + return buf + 1; + } else + return -1; + } +#else + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + + log = av_log2(buf); + + av_assert2(k <= 31); + + if (log - k >= 32 - MIN_CACHE_BITS + (MIN_CACHE_BITS == 32) && + 32 - log < limit) { + buf >>= log - k; + buf += (30U - log) << k; + LAST_SKIP_BITS(re, gb, 32 + k - log); + CLOSE_READER(re, gb); + + return buf; + } else { + int i; + for (i = 
0; i + MIN_CACHE_BITS <= limit && SHOW_UBITS(re, gb, MIN_CACHE_BITS) == 0; i += MIN_CACHE_BITS) { + if (gb->size_in_bits <= re_index) { + CLOSE_READER(re, gb); + return -1; + } + LAST_SKIP_BITS(re, gb, MIN_CACHE_BITS); + UPDATE_CACHE(re, gb); + } + for (; i < limit && SHOW_UBITS(re, gb, 1) == 0; i++) { + SKIP_BITS(re, gb, 1); + } + LAST_SKIP_BITS(re, gb, 1); + UPDATE_CACHE(re, gb); + + if (i < limit - 1) { + if (k) { + if (k > MIN_CACHE_BITS - 1) { + buf = SHOW_UBITS(re, gb, 16) << (k-16); + LAST_SKIP_BITS(re, gb, 16); + UPDATE_CACHE(re, gb); + buf |= SHOW_UBITS(re, gb, k-16); + LAST_SKIP_BITS(re, gb, k-16); + } else { + buf = SHOW_UBITS(re, gb, k); + LAST_SKIP_BITS(re, gb, k); + } + } else { + buf = 0; + } + + buf += ((SUINT)i << k); + } else if (i == limit - 1) { + buf = SHOW_UBITS(re, gb, esc_len); + LAST_SKIP_BITS(re, gb, esc_len); + + buf ++; + } else { + buf = -1; + } + CLOSE_READER(re, gb); + return buf; + } +#endif +} + +/** + * read signed golomb rice code (ffv1). + */ +static inline int get_sr_golomb(GetBitContext *gb, int k, int limit, + int esc_len) +{ + unsigned v = get_ur_golomb(gb, k, limit, esc_len); + return (v >> 1) ^ -(v & 1); +} + +/** + * read signed golomb rice code (flac). + */ +static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, + int esc_len) +{ + unsigned v = get_ur_golomb_jpegls(gb, k, limit, esc_len); + return (v >> 1) ^ -(v & 1); +} + +/** + * read unsigned golomb rice code (shorten). + */ +static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k) +{ + return get_ur_golomb_jpegls(gb, k, INT_MAX, 0); +} + +/** + * read signed golomb rice code (shorten). 
+ */ +static inline int get_sr_golomb_shorten(GetBitContext *gb, int k) +{ + int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0); + return (uvar >> 1) ^ -(uvar & 1); +} + +#ifdef TRACE + +static inline int get_ue(GetBitContext *s, const char *file, const char *func, + int line) +{ + int show = show_bits(s, 24); + int pos = get_bits_count(s); + int i = get_ue_golomb(s); + int len = get_bits_count(s) - pos; + int bits = show >> (24 - len); + + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue @%5d in %s %s:%d\n", + bits, len, i, pos, file, func, line); + + return i; +} + +static inline int get_se(GetBitContext *s, const char *file, const char *func, + int line) +{ + int show = show_bits(s, 24); + int pos = get_bits_count(s); + int i = get_se_golomb(s); + int len = get_bits_count(s) - pos; + int bits = show >> (24 - len); + + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se @%5d in %s %s:%d\n", + bits, len, i, pos, file, func, line); + + return i; +} + +static inline int get_te(GetBitContext *s, int r, char *file, const char *func, + int line) +{ + int show = show_bits(s, 24); + int pos = get_bits_count(s); + int i = get_te0_golomb(s, r); + int len = get_bits_count(s) - pos; + int bits = show >> (24 - len); + + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te @%5d in %s %s:%d\n", + bits, len, i, pos, file, func, line); + + return i; +} + +#define get_ue_golomb(a) get_ue(a, __FILE__, __func__, __LINE__) +#define get_se_golomb(a) get_se(a, __FILE__, __func__, __LINE__) +#define get_te_golomb(a, r) get_te(a, r, __FILE__, __func__, __LINE__) +#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __func__, __LINE__) + +#endif /* TRACE */ +#endif /* AVCODEC_GOLOMB_H */ diff --git a/media/ffvpx/libavcodec/h263dsp.h b/media/ffvpx/libavcodec/h263dsp.h new file mode 100644 index 0000000000..1abea3ca8c --- /dev/null +++ b/media/ffvpx/libavcodec/h263dsp.h @@ -0,0 +1,35 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_H263DSP_H +#define AVCODEC_H263DSP_H + +#include <stdint.h> + +extern const uint8_t ff_h263_loop_filter_strength[32]; + +typedef struct H263DSPContext { + void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale); + void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale); +} H263DSPContext; + +void ff_h263dsp_init(H263DSPContext *ctx); +void ff_h263dsp_init_x86(H263DSPContext *ctx); +void ff_h263dsp_init_mips(H263DSPContext *ctx); + +#endif /* AVCODEC_H263DSP_H */ diff --git a/media/ffvpx/libavcodec/h264chroma.h b/media/ffvpx/libavcodec/h264chroma.h new file mode 100644 index 0000000000..b8f9c8f4fc --- /dev/null +++ b/media/ffvpx/libavcodec/h264chroma.h @@ -0,0 +1,41 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_H264CHROMA_H +#define AVCODEC_H264CHROMA_H + +#include <stddef.h> +#include <stdint.h> + +typedef void (*h264_chroma_mc_func)(uint8_t *dst /*align 8*/, const uint8_t *src /*align 1*/, ptrdiff_t srcStride, int h, int x, int y); + +typedef struct H264ChromaContext { + h264_chroma_mc_func put_h264_chroma_pixels_tab[4]; + h264_chroma_mc_func avg_h264_chroma_pixels_tab[4]; +} H264ChromaContext; + +void ff_h264chroma_init(H264ChromaContext *c, int bit_depth); + +void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth); +void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth); +void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth); +void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth); +void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth); +void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth); + +#endif /* AVCODEC_H264CHROMA_H */ diff --git a/media/ffvpx/libavcodec/h264dsp.h b/media/ffvpx/libavcodec/h264dsp.h new file mode 100644 index 0000000000..e0880c4d88 --- /dev/null +++ b/media/ffvpx/libavcodec/h264dsp.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 DSP functions. + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#ifndef AVCODEC_H264DSP_H +#define AVCODEC_H264DSP_H + +#include <stdint.h> +#include <stddef.h> + +typedef void (*h264_weight_func)(uint8_t *block, ptrdiff_t stride, int height, + int log2_denom, int weight, int offset); +typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, + ptrdiff_t stride, int height, int log2_denom, + int weightd, int weights, int offset); + +/** + * Context for storing H.264 DSP functions + */ +typedef struct H264DSPContext { + /* weighted MC */ + h264_weight_func weight_h264_pixels_tab[4]; + h264_biweight_func biweight_h264_pixels_tab[4]; + + /* loop filter */ + void (*h264_v_loop_filter_luma)(uint8_t *pix /*align 16*/, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_luma)(uint8_t *pix /*align 4 */, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_luma_mbaff)(uint8_t *pix /*align 16*/, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); + /* v/h_loop_filter_luma_intra: align 16 */ + void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); + void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); + void (*h264_h_loop_filter_luma_mbaff_intra)(uint8_t *pix /*align 16*/, + ptrdiff_t stride, int alpha, int beta); + void (*h264_v_loop_filter_chroma)(uint8_t *pix /*align 8*/, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_chroma)(uint8_t *pix /*align 4*/, ptrdiff_t stride, + int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_chroma_mbaff)(uint8_t *pix /*align 8*/, 
+ ptrdiff_t stride, int alpha, int beta, + int8_t *tc0); + void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix /*align 8*/, + ptrdiff_t stride, int alpha, int beta); + void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix /*align 8*/, + ptrdiff_t stride, int alpha, int beta); + void (*h264_h_loop_filter_chroma_mbaff_intra)(uint8_t *pix /*align 8*/, + ptrdiff_t stride, int alpha, int beta); + // h264_loop_filter_strength: simd only. the C version is inlined in h264_loopfilter.c + void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], + int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, + int mask_mv0, int mask_mv1, int field); + + /* IDCT */ + void (*h264_idct_add)(uint8_t *dst /*align 4*/, + int16_t *block /*align 16*/, int stride); + void (*h264_idct8_add)(uint8_t *dst /*align 8*/, + int16_t *block /*align 16*/, int stride); + void (*h264_idct_dc_add)(uint8_t *dst /*align 4*/, + int16_t *block /*align 16*/, int stride); + void (*h264_idct8_dc_add)(uint8_t *dst /*align 8*/, + int16_t *block /*align 16*/, int stride); + + void (*h264_idct_add16)(uint8_t *dst /*align 16*/, const int *blockoffset, + int16_t *block /*align 16*/, int stride, + const uint8_t nnzc[5 * 8]); + void (*h264_idct8_add4)(uint8_t *dst /*align 16*/, const int *blockoffset, + int16_t *block /*align 16*/, int stride, + const uint8_t nnzc[5 * 8]); + void (*h264_idct_add8)(uint8_t **dst /*align 16*/, const int *blockoffset, + int16_t *block /*align 16*/, int stride, + const uint8_t nnzc[15 * 8]); + void (*h264_idct_add16intra)(uint8_t *dst /*align 16*/, const int *blockoffset, + int16_t *block /*align 16*/, + int stride, const uint8_t nnzc[5 * 8]); + void (*h264_luma_dc_dequant_idct)(int16_t *output, + int16_t *input /*align 16*/, int qmul); + void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul); + + /* bypass-transform */ + void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride); + void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t 
*block, int stride); + + /** + * Search buf from the start for up to size bytes. Return the index + * of a zero byte, or >= size if not found. Ideally, use lookahead + * to filter out any zero bytes that are known to not be followed by + * one or more further zero bytes and a one byte. Better still, filter + * out any bytes that form the trailing_zero_8bits syntax element too. + */ + int (*startcode_find_candidate)(const uint8_t *buf, int size); +} H264DSPContext; + +void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); +void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); +void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); +void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); +void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); +void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); +void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); + +#endif /* AVCODEC_H264DSP_H */ diff --git a/media/ffvpx/libavcodec/h264pred.c b/media/ffvpx/libavcodec/h264pred.c new file mode 100644 index 0000000000..25f9995a0b --- /dev/null +++ b/media/ffvpx/libavcodec/h264pred.c @@ -0,0 +1,602 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG-4 part10 prediction functions. + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/intreadwrite.h" +#include "codec_id.h" +#include "h264pred.h" +#include "mathops.h" + +#define BIT_DEPTH 8 +#include "h264pred_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 9 +#include "h264pred_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 10 +#include "h264pred_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 12 +#include "h264pred_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 14 +#include "h264pred_template.c" +#undef BIT_DEPTH + +static void pred4x4_127_dc_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t _stride) +{ + int stride = _stride; + const uint32_t a = 0x7F7F7F7FU; + + AV_WN32A(src + 0 * stride, a); + AV_WN32A(src + 1 * stride, a); + AV_WN32A(src + 2 * stride, a); + AV_WN32A(src + 3 * stride, a); +} + +static void pred4x4_129_dc_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t _stride) +{ + int stride = _stride; + const uint32_t a = 0x81818181U; + + AV_WN32A(src + 0 * stride, a); + AV_WN32A(src + 1 * stride, a); + AV_WN32A(src + 2 * stride, a); + AV_WN32A(src + 3 * stride, a); +} + +static void pred4x4_vertical_vp8_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + const unsigned lt = src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + uint32_t v = PACK_4U8((lt + 2*t0 + t1 + 2) >> 2, + (t0 + 2*t1 + t2 + 
2) >> 2, + (t1 + 2*t2 + t3 + 2) >> 2, + (t2 + 2*t3 + t4 + 2) >> 2); + + AV_WN32A(src+0*stride, v); + AV_WN32A(src+1*stride, v); + AV_WN32A(src+2*stride, v); + AV_WN32A(src+3*stride, v); +} + +static void pred4x4_horizontal_vp8_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + const unsigned lt = src[-1-1*stride]; + LOAD_LEFT_EDGE + + AV_WN32A(src+0*stride, ((lt + 2*l0 + l1 + 2) >> 2)*0x01010101); + AV_WN32A(src+1*stride, ((l0 + 2*l1 + l2 + 2) >> 2)*0x01010101); + AV_WN32A(src+2*stride, ((l1 + 2*l2 + l3 + 2) >> 2)*0x01010101); + AV_WN32A(src+3*stride, ((l2 + 2*l3 + l3 + 2) >> 2)*0x01010101); +} + +static void pred4x4_down_left_svq3_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]=(l1 + t1)>>1; + src[1+0*stride]= + src[0+1*stride]=(l2 + t2)>>1; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]= + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]= + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]= + src[3+2*stride]= + src[2+3*stride]= + src[3+3*stride]=(l3 + t3)>>1; +} + +static void pred4x4_down_left_rv40_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + LOAD_LEFT_EDGE + LOAD_DOWN_LEFT_EDGE + + src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3; + src[1+0*stride]= + src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + l4 + 2*l3 + 2)>>3; + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3 + l5 + 2*l4 + 2)>>3; + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l4 + l6 + 2*l5 + 2)>>3; + src[3+2*stride]= + src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l5 + l7 + 2*l6 + 2)>>3; + src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2; +} + +static void pred4x4_down_left_rv40_nodown_c(uint8_t *src, + const uint8_t *topright, + ptrdiff_t stride) +{ 
+ LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]=(t0 + t2 + 2*t1 + 2 + l0 + l2 + 2*l1 + 2)>>3; + src[1+0*stride]= + src[0+1*stride]=(t1 + t3 + 2*t2 + 2 + l1 + l3 + 2*l2 + 2)>>3; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]=(t2 + t4 + 2*t3 + 2 + l2 + 3*l3 + 2)>>3; + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]=(t3 + t5 + 2*t4 + 2 + l3*4 + 2)>>3; + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]=(t4 + t6 + 2*t5 + 2 + l3*4 + 2)>>3; + src[3+2*stride]= + src[2+3*stride]=(t5 + t7 + 2*t6 + 2 + l3*4 + 2)>>3; + src[3+3*stride]=(t6 + t7 + 1 + 2*l3 + 1)>>2; +} + +static void pred4x4_vertical_left_rv40(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride, + const int l0, const int l1, const int l2, + const int l3, const int l4) +{ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(2*t0 + 2*t1 + l1 + 2*l2 + l3 + 4)>>3; + src[1+0*stride]= + src[0+2*stride]=(t1 + t2 + 1)>>1; + src[2+0*stride]= + src[1+2*stride]=(t2 + t3 + 1)>>1; + src[3+0*stride]= + src[2+2*stride]=(t3 + t4+ 1)>>1; + src[3+2*stride]=(t4 + t5+ 1)>>1; + src[0+1*stride]=(t0 + 2*t1 + t2 + l2 + 2*l3 + l4 + 4)>>3; + src[1+1*stride]= + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[2+1*stride]= + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; + src[3+1*stride]= + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; + src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; +} + +static void pred4x4_vertical_left_rv40_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + LOAD_LEFT_EDGE + LOAD_DOWN_LEFT_EDGE + + pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l4); +} + +static void pred4x4_vertical_left_rv40_nodown_c(uint8_t *src, + const uint8_t *topright, + ptrdiff_t stride) +{ + LOAD_LEFT_EDGE + + pred4x4_vertical_left_rv40(src, topright, stride, l0, l1, l2, l3, l3); +} + +static void pred4x4_vertical_left_vp8_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t0 + t1 + 
1)>>1; + src[1+0*stride]= + src[0+2*stride]=(t1 + t2 + 1)>>1; + src[2+0*stride]= + src[1+2*stride]=(t2 + t3 + 1)>>1; + src[3+0*stride]= + src[2+2*stride]=(t3 + t4 + 1)>>1; + src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[1+1*stride]= + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[2+1*stride]= + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; + src[3+1*stride]= + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; + src[3+2*stride]=(t4 + 2*t5 + t6 + 2)>>2; + src[3+3*stride]=(t5 + 2*t6 + t7 + 2)>>2; +} + +static void pred4x4_horizontal_up_rv40_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + LOAD_LEFT_EDGE + LOAD_DOWN_LEFT_EDGE + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3; + src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3; + src[2+0*stride]= + src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3; + src[3+0*stride]= + src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3; + src[2+1*stride]= + src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3; + src[3+1*stride]= + src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3; + src[3+2*stride]= + src[1+3*stride]=(l3 + 2*l4 + l5 + 2)>>2; + src[0+3*stride]= + src[2+2*stride]=(t6 + t7 + l3 + l4 + 2)>>2; + src[2+3*stride]=(l4 + l5 + 1)>>1; + src[3+3*stride]=(l4 + 2*l5 + l6 + 2)>>2; +} + +static void pred4x4_horizontal_up_rv40_nodown_c(uint8_t *src, + const uint8_t *topright, + ptrdiff_t stride) +{ + LOAD_LEFT_EDGE + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t1 + 2*t2 + t3 + 2*l0 + 2*l1 + 4)>>3; + src[1+0*stride]=(t2 + 2*t3 + t4 + l0 + 2*l1 + l2 + 4)>>3; + src[2+0*stride]= + src[0+1*stride]=(t3 + 2*t4 + t5 + 2*l1 + 2*l2 + 4)>>3; + src[3+0*stride]= + src[1+1*stride]=(t4 + 2*t5 + t6 + l1 + 2*l2 + l3 + 4)>>3; + src[2+1*stride]= + src[0+2*stride]=(t5 + 2*t6 + t7 + 2*l2 + 2*l3 + 4)>>3; + src[3+1*stride]= + src[1+2*stride]=(t6 + 3*t7 + l2 + 3*l3 + 4)>>3; + src[3+2*stride]= + src[1+3*stride]=l3; + src[0+3*stride]= + src[2+2*stride]=(t6 + t7 + 2*l3 + 2)>>2; + 
src[2+3*stride]= + src[3+3*stride]=l3; +} + +static void pred4x4_tm_vp8_c(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride) +{ + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP - src[-1-stride]; + uint8_t *top = src-stride; + int y; + + for (y = 0; y < 4; y++) { + const uint8_t *cm_in = cm + src[-1]; + src[0] = cm_in[top[0]]; + src[1] = cm_in[top[1]]; + src[2] = cm_in[top[2]]; + src[3] = cm_in[top[3]]; + src += stride; + } +} + +static void pred16x16_plane_svq3_c(uint8_t *src, ptrdiff_t stride) +{ + pred16x16_plane_compat_8_c(src, stride, 1, 0); +} + +static void pred16x16_plane_rv40_c(uint8_t *src, ptrdiff_t stride) +{ + pred16x16_plane_compat_8_c(src, stride, 0, 1); +} + +static void pred16x16_tm_vp8_c(uint8_t *src, ptrdiff_t stride) +{ + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP - src[-1-stride]; + uint8_t *top = src-stride; + int y; + + for (y = 0; y < 16; y++) { + const uint8_t *cm_in = cm + src[-1]; + src[0] = cm_in[top[0]]; + src[1] = cm_in[top[1]]; + src[2] = cm_in[top[2]]; + src[3] = cm_in[top[3]]; + src[4] = cm_in[top[4]]; + src[5] = cm_in[top[5]]; + src[6] = cm_in[top[6]]; + src[7] = cm_in[top[7]]; + src[8] = cm_in[top[8]]; + src[9] = cm_in[top[9]]; + src[10] = cm_in[top[10]]; + src[11] = cm_in[top[11]]; + src[12] = cm_in[top[12]]; + src[13] = cm_in[top[13]]; + src[14] = cm_in[top[14]]; + src[15] = cm_in[top[15]]; + src += stride; + } +} + +static void pred8x8_left_dc_rv40_c(uint8_t *src, ptrdiff_t stride) +{ + int i; + unsigned dc0; + + dc0=0; + for(i=0;i<8; i++) + dc0+= src[-1+i*stride]; + dc0= 0x01010101*((dc0 + 4)>>3); + + for(i=0; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= dc0; + } +} + +static void pred8x8_top_dc_rv40_c(uint8_t *src, ptrdiff_t stride) +{ + int i; + unsigned dc0; + + dc0=0; + for(i=0;i<8; i++) + dc0+= src[i-stride]; + dc0= 0x01010101*((dc0 + 4)>>3); + + for(i=0; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= dc0; + } +} + +static void 
pred8x8_dc_rv40_c(uint8_t *src, ptrdiff_t stride) +{ + int i; + unsigned dc0 = 0; + + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride] + src[i-stride]; + dc0+= src[4+i-stride]; + dc0+= src[-1+(i+4)*stride]; + } + dc0= 0x01010101*((dc0 + 8)>>4); + + for(i=0; i<4; i++){ + ((uint32_t*)(src+i*stride))[0]= dc0; + ((uint32_t*)(src+i*stride))[1]= dc0; + } + for(i=4; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= dc0; + ((uint32_t*)(src+i*stride))[1]= dc0; + } +} + +static void pred8x8_tm_vp8_c(uint8_t *src, ptrdiff_t stride) +{ + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP - src[-1-stride]; + uint8_t *top = src-stride; + int y; + + for (y = 0; y < 8; y++) { + const uint8_t *cm_in = cm + src[-1]; + src[0] = cm_in[top[0]]; + src[1] = cm_in[top[1]]; + src[2] = cm_in[top[2]]; + src[3] = cm_in[top[3]]; + src[4] = cm_in[top[4]]; + src[5] = cm_in[top[5]]; + src[6] = cm_in[top[6]]; + src[7] = cm_in[top[7]]; + src += stride; + } +} + +/** + * Set the intra prediction function pointers. + */ +av_cold void ff_h264_pred_init(H264PredContext *h, int codec_id, + const int bit_depth, + int chroma_format_idc) +{ +#undef FUNC +#undef FUNCC +#define FUNC(a, depth) a ## _ ## depth +#define FUNCC(a, depth) a ## _ ## depth ## _c +#define FUNCD(a) a ## _c + +#define H264_PRED(depth) \ + h->pred4x4[VERT_PRED ] = FUNCC(pred4x4_vertical, depth);\ + h->pred4x4[HOR_PRED ] = FUNCC(pred4x4_horizontal, depth);\ + h->pred4x4[DC_PRED ] = FUNCC(pred4x4_dc, depth);\ + h->pred4x4[DIAG_DOWN_LEFT_PRED ] = FUNCC(pred4x4_down_left, depth);\ + h->pred4x4[DIAG_DOWN_RIGHT_PRED] = FUNCC(pred4x4_down_right, depth);\ + h->pred4x4[VERT_RIGHT_PRED ] = FUNCC(pred4x4_vertical_right, depth);\ + h->pred4x4[HOR_DOWN_PRED ] = FUNCC(pred4x4_horizontal_down, depth);\ + h->pred4x4[VERT_LEFT_PRED ] = FUNCC(pred4x4_vertical_left, depth);\ + h->pred4x4[HOR_UP_PRED ] = FUNCC(pred4x4_horizontal_up, depth);\ + h->pred4x4[LEFT_DC_PRED ] = FUNCC(pred4x4_left_dc, depth);\ + h->pred4x4[TOP_DC_PRED ] = FUNCC(pred4x4_top_dc, depth);\ + if 
(depth > 8 || codec_id != AV_CODEC_ID_VP8)\ + h->pred4x4[DC_128_PRED ] = FUNCC(pred4x4_128_dc, depth);\ +\ + h->pred8x8l[VERT_PRED ]= FUNCC(pred8x8l_vertical , depth);\ + h->pred8x8l[HOR_PRED ]= FUNCC(pred8x8l_horizontal , depth);\ + h->pred8x8l[DC_PRED ]= FUNCC(pred8x8l_dc , depth);\ + h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= FUNCC(pred8x8l_down_left , depth);\ + h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= FUNCC(pred8x8l_down_right , depth);\ + h->pred8x8l[VERT_RIGHT_PRED ]= FUNCC(pred8x8l_vertical_right , depth);\ + h->pred8x8l[HOR_DOWN_PRED ]= FUNCC(pred8x8l_horizontal_down , depth);\ + h->pred8x8l[VERT_LEFT_PRED ]= FUNCC(pred8x8l_vertical_left , depth);\ + h->pred8x8l[HOR_UP_PRED ]= FUNCC(pred8x8l_horizontal_up , depth);\ + h->pred8x8l[LEFT_DC_PRED ]= FUNCC(pred8x8l_left_dc , depth);\ + h->pred8x8l[TOP_DC_PRED ]= FUNCC(pred8x8l_top_dc , depth);\ + h->pred8x8l[DC_128_PRED ]= FUNCC(pred8x8l_128_dc , depth);\ +\ + if (chroma_format_idc <= 1) {\ + h->pred8x8[VERT_PRED8x8 ]= FUNCC(pred8x8_vertical , depth);\ + h->pred8x8[HOR_PRED8x8 ]= FUNCC(pred8x8_horizontal , depth);\ + h->pred8x8[PLANE_PRED8x8] = FUNCC(pred8x8_plane, depth);\ + } else {\ + h->pred8x8[VERT_PRED8x8 ]= FUNCC(pred8x16_vertical , depth);\ + h->pred8x8[HOR_PRED8x8 ]= FUNCC(pred8x16_horizontal , depth);\ + h->pred8x8[PLANE_PRED8x8] = FUNCC(pred8x16_plane, depth);\ + }\ + if (depth > 8 || (codec_id != AV_CODEC_ID_RV40 && \ + codec_id != AV_CODEC_ID_VP7 && \ + codec_id != AV_CODEC_ID_VP8)) { \ + if (chroma_format_idc <= 1) {\ + h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x8_dc , depth);\ + h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x8_left_dc , depth);\ + h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x8_top_dc , depth);\ + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\ + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\ + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\ + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\ + } 
else {\ + h->pred8x8[DC_PRED8x8 ]= FUNCC(pred8x16_dc , depth);\ + h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc , depth);\ + h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc , depth);\ + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l0t, depth);\ + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0lt, depth);\ + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l00, depth);\ + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0l0, depth);\ + }\ + }else{\ + h->pred8x8[DC_PRED8x8 ]= FUNCD(pred8x8_dc_rv40);\ + h->pred8x8[LEFT_DC_PRED8x8]= FUNCD(pred8x8_left_dc_rv40);\ + h->pred8x8[TOP_DC_PRED8x8 ]= FUNCD(pred8x8_top_dc_rv40);\ + }\ + if (chroma_format_idc <= 1) {\ + h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x8_128_dc , depth);\ + } else {\ + h->pred8x8[DC_128_PRED8x8 ]= FUNCC(pred8x16_128_dc , depth);\ + }\ +\ + h->pred16x16[DC_PRED8x8 ]= FUNCC(pred16x16_dc , depth);\ + h->pred16x16[VERT_PRED8x8 ]= FUNCC(pred16x16_vertical , depth);\ + h->pred16x16[HOR_PRED8x8 ]= FUNCC(pred16x16_horizontal , depth);\ + h->pred16x16[PLANE_PRED8x8 ]= FUNCC(pred16x16_plane , depth);\ + h->pred16x16[LEFT_DC_PRED8x8]= FUNCC(pred16x16_left_dc , depth);\ + h->pred16x16[TOP_DC_PRED8x8 ]= FUNCC(pred16x16_top_dc , depth);\ + h->pred16x16[DC_128_PRED8x8 ]= FUNCC(pred16x16_128_dc , depth);\ +\ + /* special lossless h/v prediction for H.264 */ \ + h->pred4x4_add [VERT_PRED ]= FUNCC(pred4x4_vertical_add , depth);\ + h->pred4x4_add [ HOR_PRED ]= FUNCC(pred4x4_horizontal_add , depth);\ + h->pred8x8l_add [VERT_PRED ]= FUNCC(pred8x8l_vertical_add , depth);\ + h->pred8x8l_add [ HOR_PRED ]= FUNCC(pred8x8l_horizontal_add , depth);\ + h->pred8x8l_filter_add [VERT_PRED ]= FUNCC(pred8x8l_vertical_filter_add , depth);\ + h->pred8x8l_filter_add [ HOR_PRED ]= FUNCC(pred8x8l_horizontal_filter_add , depth);\ + if (chroma_format_idc <= 1) {\ + h->pred8x8_add[VERT_PRED8x8] = FUNCC(pred8x8_vertical_add, depth);\ + h->pred8x8_add[ HOR_PRED8x8] = 
FUNCC(pred8x8_horizontal_add, depth);\ + } else {\ + h->pred8x8_add [VERT_PRED8x8]= FUNCC(pred8x16_vertical_add , depth);\ + h->pred8x8_add [ HOR_PRED8x8]= FUNCC(pred8x16_horizontal_add , depth);\ + }\ + h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add , depth);\ + h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add , depth);\ + + switch (bit_depth) { + case 9: + H264_PRED(9) + break; + case 10: + H264_PRED(10) + break; + case 12: + H264_PRED(12) + break; + case 14: + H264_PRED(14) + break; + default: + av_assert0(bit_depth<=8); + H264_PRED(8) + switch (codec_id) { + case AV_CODEC_ID_SVQ3: + h->pred4x4[DIAG_DOWN_LEFT_PRED] = FUNCD(pred4x4_down_left_svq3); + h->pred16x16[PLANE_PRED8x8 ] = FUNCD(pred16x16_plane_svq3); + break; + case AV_CODEC_ID_RV40: + h->pred4x4[DIAG_DOWN_LEFT_PRED] = FUNCD(pred4x4_down_left_rv40); + h->pred4x4[VERT_LEFT_PRED ] = FUNCD(pred4x4_vertical_left_rv40); + h->pred4x4[HOR_UP_PRED ] = FUNCD(pred4x4_horizontal_up_rv40); + h->pred4x4[DIAG_DOWN_LEFT_PRED_RV40_NODOWN] = FUNCD(pred4x4_down_left_rv40_nodown); + h->pred4x4[HOR_UP_PRED_RV40_NODOWN] = FUNCD(pred4x4_horizontal_up_rv40_nodown); + h->pred4x4[VERT_LEFT_PRED_RV40_NODOWN] = FUNCD(pred4x4_vertical_left_rv40_nodown); + h->pred16x16[PLANE_PRED8x8 ] = FUNCD(pred16x16_plane_rv40); + break; + case AV_CODEC_ID_VP7: + case AV_CODEC_ID_VP8: + h->pred4x4[VERT_PRED ] = FUNCD(pred4x4_vertical_vp8); + h->pred4x4[HOR_PRED ] = FUNCD(pred4x4_horizontal_vp8); + h->pred4x4[VERT_LEFT_PRED ] = FUNCD(pred4x4_vertical_left_vp8); + h->pred4x4[TM_VP8_PRED ] = FUNCD(pred4x4_tm_vp8); + h->pred4x4[VERT_VP8_PRED ] = FUNCC(pred4x4_vertical, 8); + h->pred4x4[DC_127_PRED ] = FUNCD(pred4x4_127_dc); + h->pred4x4[DC_129_PRED ] = FUNCD(pred4x4_129_dc); + h->pred4x4[HOR_VP8_PRED ] = FUNCC(pred4x4_horizontal, 8); + h->pred8x8[PLANE_PRED8x8 ] = FUNCD(pred8x8_tm_vp8); + h->pred8x8[DC_127_PRED8x8 ] = FUNCC(pred8x8_127_dc, 8); + h->pred8x8[DC_129_PRED8x8 ] = FUNCC(pred8x8_129_dc, 8); + 
h->pred16x16[PLANE_PRED8x8 ] = FUNCD(pred16x16_tm_vp8); + h->pred16x16[DC_127_PRED8x8] = FUNCC(pred16x16_127_dc, 8); + h->pred16x16[DC_129_PRED8x8] = FUNCC(pred16x16_129_dc, 8); + break; + } + break; + } + +#if ARCH_AARCH64 + ff_h264_pred_init_aarch64(h, codec_id, bit_depth, chroma_format_idc); +#elif ARCH_ARM + ff_h264_pred_init_arm(h, codec_id, bit_depth, chroma_format_idc); +#elif ARCH_X86 + ff_h264_pred_init_x86(h, codec_id, bit_depth, chroma_format_idc); +#elif ARCH_MIPS + ff_h264_pred_init_mips(h, codec_id, bit_depth, chroma_format_idc); +#elif ARCH_LOONGARCH + ff_h264_pred_init_loongarch(h, codec_id, bit_depth, chroma_format_idc); +#endif +} diff --git a/media/ffvpx/libavcodec/h264pred.h b/media/ffvpx/libavcodec/h264pred.h new file mode 100644 index 0000000000..cb008548fc --- /dev/null +++ b/media/ffvpx/libavcodec/h264pred.h @@ -0,0 +1,130 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG-4 prediction functions. 
+ * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#ifndef AVCODEC_H264PRED_H +#define AVCODEC_H264PRED_H + +#include <stddef.h> +#include <stdint.h> + +/** + * Prediction types + */ +//@{ +#define VERT_PRED 0 +#define HOR_PRED 1 +#define DC_PRED 2 +#define DIAG_DOWN_LEFT_PRED 3 +#define DIAG_DOWN_RIGHT_PRED 4 +#define VERT_RIGHT_PRED 5 +#define HOR_DOWN_PRED 6 +#define VERT_LEFT_PRED 7 +#define HOR_UP_PRED 8 + +// DC edge (not for VP8) +#define LEFT_DC_PRED 9 +#define TOP_DC_PRED 10 +#define DC_128_PRED 11 + +// RV40 specific +#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN 12 +#define HOR_UP_PRED_RV40_NODOWN 13 +#define VERT_LEFT_PRED_RV40_NODOWN 14 + +// VP8 specific +#define TM_VP8_PRED 9 ///< "True Motion", used instead of plane +#define VERT_VP8_PRED 10 ///< for VP8, #VERT_PRED is the average of + ///< (left col+cur col x2+right col) / 4; + ///< this is the "unaveraged" one +#define HOR_VP8_PRED 14 ///< unaveraged version of #HOR_PRED, see + ///< #VERT_VP8_PRED for details +#define DC_127_PRED 12 +#define DC_129_PRED 13 + +#define DC_PRED8x8 0 +#define HOR_PRED8x8 1 +#define VERT_PRED8x8 2 +#define PLANE_PRED8x8 3 + +// DC edge +#define LEFT_DC_PRED8x8 4 +#define TOP_DC_PRED8x8 5 +#define DC_128_PRED8x8 6 + +// H.264/SVQ3 (8x8) specific +#define ALZHEIMER_DC_L0T_PRED8x8 7 +#define ALZHEIMER_DC_0LT_PRED8x8 8 +#define ALZHEIMER_DC_L00_PRED8x8 9 +#define ALZHEIMER_DC_0L0_PRED8x8 10 + +// VP8 specific +#define DC_127_PRED8x8 7 +#define DC_129_PRED8x8 8 +//@} + +#define PART_NOT_AVAILABLE -2 + +/** + * Context for storing H.264 prediction functions + */ +typedef struct H264PredContext { + void(*pred4x4[9 + 3 + 3])(uint8_t *src, const uint8_t *topright, + ptrdiff_t stride); + void(*pred8x8l[9 + 3])(uint8_t *src, int topleft, int topright, + ptrdiff_t stride); + void(*pred8x8[4 + 3 + 4])(uint8_t *src, ptrdiff_t stride); + void(*pred16x16[4 + 3 + 2])(uint8_t *src, ptrdiff_t stride); + + void(*pred4x4_add[2])(uint8_t *pix /*align 4*/, + int16_t *block /*align 16*/, 
ptrdiff_t stride); + void(*pred8x8l_add[2])(uint8_t *pix /*align 8*/, + int16_t *block /*align 16*/, ptrdiff_t stride); + void(*pred8x8l_filter_add[2])(uint8_t *pix /*align 8*/, + int16_t *block /*align 16*/, int topleft, int topright, ptrdiff_t stride); + void(*pred8x8_add[3])(uint8_t *pix /*align 8*/, + const int *block_offset, + int16_t *block /*align 16*/, ptrdiff_t stride); + void(*pred16x16_add[3])(uint8_t *pix /*align 16*/, + const int *block_offset, + int16_t *block /*align 16*/, ptrdiff_t stride); +} H264PredContext; + +void ff_h264_pred_init(H264PredContext *h, int codec_id, + const int bit_depth, const int chroma_format_idc); +void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, + const int bit_depth, + const int chroma_format_idc); +void ff_h264_pred_init_arm(H264PredContext *h, int codec_id, + const int bit_depth, const int chroma_format_idc); +void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, + const int bit_depth, const int chroma_format_idc); +void ff_h264_pred_init_mips(H264PredContext *h, int codec_id, + const int bit_depth, const int chroma_format_idc); +void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id, + const int bit_depth, const int chroma_format_idc); + +#endif /* AVCODEC_H264PRED_H */ diff --git a/media/ffvpx/libavcodec/h264pred_template.c b/media/ffvpx/libavcodec/h264pred_template.c new file mode 100644 index 0000000000..b5bc942a5e --- /dev/null +++ b/media/ffvpx/libavcodec/h264pred_template.c @@ -0,0 +1,1333 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG-4 part10 prediction functions. + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#include "libavutil/intreadwrite.h" + +#include "mathops.h" + +#include "bit_depth_template.c" + +static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel4 a= AV_RN4PA(src-stride); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); +} + +static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride])); + AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride])); + AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride])); + AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride])); +} + +static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; + const pixel4 a = PIXEL_SPLAT_X4(dc); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); +} + +static void 
FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; + const pixel4 a = PIXEL_SPLAT_X4(dc); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); +} + +static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; + const pixel4 a = PIXEL_SPLAT_X4(dc); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); +} + +static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)); + + AV_WN4PA(src+0*stride, a); + AV_WN4PA(src+1*stride, a); + AV_WN4PA(src+2*stride, a); + AV_WN4PA(src+3*stride, a); +} + + +#define LOAD_TOP_RIGHT_EDGE\ + const unsigned av_unused t4 = topright[0];\ + const unsigned av_unused t5 = topright[1];\ + const unsigned av_unused t6 = topright[2];\ + const unsigned av_unused t7 = topright[3];\ + +#define LOAD_DOWN_LEFT_EDGE\ + const unsigned av_unused l4 = src[-1+4*stride];\ + const unsigned av_unused l5 = src[-1+5*stride];\ + const unsigned av_unused l6 = src[-1+6*stride];\ + const unsigned av_unused l7 = src[-1+7*stride];\ + +#define LOAD_LEFT_EDGE\ + const unsigned av_unused l0 = src[-1+0*stride];\ + const unsigned av_unused l1 = src[-1+1*stride];\ + const unsigned av_unused l2 = src[-1+2*stride];\ + const unsigned av_unused l3 = src[-1+3*stride];\ + +#define LOAD_TOP_EDGE\ + const unsigned av_unused t0 = src[ 0-1*stride];\ + const unsigned av_unused t1 = src[ 
1-1*stride];\ + const unsigned av_unused t2 = src[ 2-1*stride];\ + const unsigned av_unused t3 = src[ 3-1*stride];\ + +static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; + src[0+2*stride]= + src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; + src[0+1*stride]= + src[1+2*stride]= + src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; + src[0+0*stride]= + src[1+1*stride]= + src[2+2*stride]= + src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[1+0*stride]= + src[2+1*stride]= + src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[2+0*stride]= + src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; +} + +static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + const pixel *topright = (const pixel*)_topright; + int stride = _stride>>(sizeof(pixel)-1); + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE +// LOAD_LEFT_EDGE + + src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; + src[1+0*stride]= + src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; + src[3+2*stride]= + src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; + src[3+3*stride]=(t6 + 3*t7 + 2)>>2; +} + +static void FUNCC(pred4x4_vertical_right)(uint8_t *_src, + const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]= + src[1+2*stride]=(lt + t0 + 1)>>1; + src[1+0*stride]= + src[2+2*stride]=(t0 + t1 + 1)>>1; + src[2+0*stride]= + src[3+2*stride]=(t1 + 
t2 + 1)>>1; + src[3+0*stride]=(t2 + t3 + 1)>>1; + src[0+1*stride]= + src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[1+1*stride]= + src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[2+1*stride]= + src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; + src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; +} + +static void FUNCC(pred4x4_vertical_left)(uint8_t *_src, + const uint8_t *_topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + const pixel *topright = (const pixel*)_topright; + int stride = _stride>>(sizeof(pixel)-1); + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t0 + t1 + 1)>>1; + src[1+0*stride]= + src[0+2*stride]=(t1 + t2 + 1)>>1; + src[2+0*stride]= + src[1+2*stride]=(t2 + t3 + 1)>>1; + src[3+0*stride]= + src[2+2*stride]=(t3 + t4+ 1)>>1; + src[3+2*stride]=(t4 + t5+ 1)>>1; + src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[1+1*stride]= + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[2+1*stride]= + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; + src[3+1*stride]= + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; + src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; +} + +static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + LOAD_LEFT_EDGE + + src[0+0*stride]=(l0 + l1 + 1)>>1; + src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; + src[2+0*stride]= + src[0+1*stride]=(l1 + l2 + 1)>>1; + src[3+0*stride]= + src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; + src[2+1*stride]= + src[0+2*stride]=(l2 + l3 + 1)>>1; + src[3+1*stride]= + src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; + src[3+2*stride]= + src[1+3*stride]= + src[0+3*stride]= + src[2+2*stride]= + src[2+3*stride]= + src[3+3*stride]=l3; +} + +static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src, + const uint8_t *topright, + ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const int lt= src[-1-1*stride]; 
+ LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]= + src[2+1*stride]=(lt + l0 + 1)>>1; + src[1+0*stride]= + src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[0+1*stride]= + src[2+2*stride]=(l0 + l1 + 1)>>1; + src[1+1*stride]= + src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; + src[0+2*stride]= + src[2+3*stride]=(l1 + l2+ 1)>>1; + src[1+2*stride]= + src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; + src[0+3*stride]=(l2 + l3 + 1)>>1; + src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; +} + +static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride) +{ + int i; + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0); + const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1); + const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2); + const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3); + + for(i=0; i<16; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, b); + AV_WN4PA(((pixel4*)(src+i*stride))+2, c); + AV_WN4PA(((pixel4*)(src+i*stride))+3, d); + } +} + +static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + for(i=0; i<16; i++){ + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); + + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, a); + AV_WN4PA(((pixel4*)(src+i*stride))+2, a); + AV_WN4PA(((pixel4*)(src+i*stride))+3, a); + } +} + +#define PREDICT_16x16_DC(v)\ + for(i=0; i<16; i++){\ + AV_WN4PA(src+ 0, v);\ + AV_WN4PA(src+ 4, v);\ + AV_WN4PA(src+ 8, v);\ + AV_WN4PA(src+12, v);\ + src += stride;\ + } + +static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride >>= sizeof(pixel)-1; + + for(i=0;i<16; i++){ + dc+= src[-1+i*stride]; + } + + for(i=0;i<16; i++){ + dc+= src[i-stride]; + } + + 
dcsplat = PIXEL_SPLAT_X4((dc+16)>>5); + PREDICT_16x16_DC(dcsplat); +} + +static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride >>= sizeof(pixel)-1; + + for(i=0;i<16; i++){ + dc+= src[-1+i*stride]; + } + + dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); + PREDICT_16x16_DC(dcsplat); +} + +static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i, dc=0; + pixel *src = (pixel*)_src; + pixel4 dcsplat; + stride >>= sizeof(pixel)-1; + + for(i=0;i<16; i++){ + dc+= src[i-stride]; + } + + dcsplat = PIXEL_SPLAT_X4((dc+8)>>4); + PREDICT_16x16_DC(dcsplat); +} + +#define PRED16x16_X(n, v) \ +static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\ +{\ + int i;\ + pixel *src = (pixel*)_src;\ + stride >>= sizeof(pixel)-1;\ + PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\ +} + +PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0) +#if BIT_DEPTH == 8 +PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1) +PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1) +#endif + +static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src, + ptrdiff_t _stride, + const int svq3, + const int rv40) +{ + int i, j, k; + int a; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel * const src0 = src +7-stride; + const pixel * src1 = src +8*stride-1; + const pixel * src2 = src1-2*stride; // == src+6*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + for(k=2; k<=8; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + if(svq3){ + H = ( 5*(H/4) ) / 16; + V = ( 5*(V/4) ) / 16; + + /* required for 100% accuracy */ + i = H; H = V; V = i; + }else if(rv40){ + H = ( H + (H>>2) ) >> 4; + V = ( V + (V>>2) ) >> 4; + }else{ + H = ( 5*H+32 ) >> 6; + V = ( 5*V+32 ) >> 6; + } + + a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); + for(j=16; j>0; --j) { + int b = a; + a += V; + for(i=-16; i<0; i+=4) { + src[16+i] = CLIP((b ) >> 5); + 
src[17+i] = CLIP((b+ H) >> 5); + src[18+i] = CLIP((b+2*H) >> 5); + src[19+i] = CLIP((b+3*H) >> 5); + b += 4*H; + } + src += stride; + } +} + +static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred16x16_plane_compat)(src, stride, 0, 0); +} + +static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride) +{ + int i; + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0); + const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1); + + for(i=0; i<8; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, b); + } +} + +static void FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride) +{ + int i; + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0); + const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1); + + for(i=0; i<16; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, b); + } +} + +static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + for(i=0; i<8; i++){ + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, a); + } +} + +static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + for(i=0; i<16; i++){ + const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]); + AV_WN4PA(((pixel4*)(src+i*stride))+0, a); + AV_WN4PA(((pixel4*)(src+i*stride))+1, a); + } +} + +#define PRED8x8_X(n, v)\ +static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\ +{\ + int i;\ + const pixel4 a = PIXEL_SPLAT_X4(v);\ + pixel *src = (pixel*)_src;\ + stride >>= sizeof(pixel)-1;\ + for(i=0; i<8; i++){\ + AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\ + AV_WN4PA(((pixel4*)(src+i*stride))+1, 
a);\ + }\ +} + +PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0) +#if BIT_DEPTH == 8 +PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1) +PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1) +#endif + +static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride) +{ + FUNCC(pred8x8_128_dc)(_src, stride); + FUNCC(pred8x8_128_dc)(_src+8*stride, stride); +} + +static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + int dc0, dc2; + pixel4 dc0splat, dc2splat; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + dc0=dc2=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride]; + dc2+= src[-1+(i+4)*stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); + + for(i=0; i<4; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat); + } + for(i=4; i<8; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat); + } +} + +static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride) +{ + FUNCC(pred8x8_left_dc)(_src, stride); + FUNCC(pred8x8_left_dc)(_src+8*stride, stride); +} + +static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + int dc0, dc1; + pixel4 dc0splat, dc1splat; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + dc0=dc1=0; + for(i=0;i<4; i++){ + dc0+= src[i-stride]; + dc1+= src[4+i-stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + + for(i=0; i<4; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); + } + for(i=4; i<8; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); + } +} + +static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + int dc0, dc1; + pixel4 dc0splat, dc1splat; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + dc0=dc1=0; + for(i=0;i<4; i++){ + dc0+= src[i-stride]; + dc1+= 
src[4+i-stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + + for(i=0; i<16; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); + } +} + +static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + int dc0, dc1, dc2; + pixel4 dc0splat, dc1splat, dc2splat, dc3splat; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + dc0=dc1=dc2=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride] + src[i-stride]; + dc1+= src[4+i-stride]; + dc2+= src[-1+(i+4)*stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); + dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); + + for(i=0; i<4; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); + } + for(i=4; i<8; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat); + } +} + +static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride) +{ + int i; + int dc0, dc1, dc2, dc3, dc4; + pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat; + pixel *src = (pixel*)_src; + stride >>= sizeof(pixel)-1; + + dc0=dc1=dc2=dc3=dc4=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride] + src[i-stride]; + dc1+= src[4+i-stride]; + dc2+= src[-1+(i+4)*stride]; + dc3+= src[-1+(i+8)*stride]; + dc4+= src[-1+(i+12)*stride]; + } + dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3); + dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2); + dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2); + dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3); + dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2); + dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3); + dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2); + dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3); + + for(i=0; i<4; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat); + } + for(i=4; 
i<8; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat); + } + for(i=8; i<12; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat); + } + for(i=12; i<16; i++){ + AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat); + AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat); + } +} + +//the following 4 function should not be optimized! +static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x8_top_dc)(src, stride); + FUNCC(pred4x4_dc)(src, NULL, stride); +} + +static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x16_top_dc)(src, stride); + FUNCC(pred4x4_dc)(src, NULL, stride); +} + +static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x8_dc)(src, stride); + FUNCC(pred4x4_top_dc)(src, NULL, stride); +} + +static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x16_dc)(src, stride); + FUNCC(pred4x4_top_dc)(src, NULL, stride); +} + +static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x8_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); +} + +static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x16_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride); +} + +static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x8_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src , NULL, stride); + FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride); +} + +static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride) +{ + FUNCC(pred8x16_left_dc)(src, stride); + FUNCC(pred4x4_128_dc)(src , NULL, stride); + FUNCC(pred4x4_128_dc)(src 
+ 4*sizeof(pixel), NULL, stride); +} + +static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride) +{ + int j, k; + int a; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel * const src0 = src +3-stride; + const pixel * src1 = src +4*stride-1; + const pixel * src2 = src1-2*stride; // == src+2*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + for(k=2; k<=4; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + H = ( 17*H+16 ) >> 5; + V = ( 17*V+16 ) >> 5; + + a = 16*(src1[0] + src2[8]+1) - 3*(V+H); + for(j=8; j>0; --j) { + int b = a; + a += V; + src[0] = CLIP((b ) >> 5); + src[1] = CLIP((b+ H) >> 5); + src[2] = CLIP((b+2*H) >> 5); + src[3] = CLIP((b+3*H) >> 5); + src[4] = CLIP((b+4*H) >> 5); + src[5] = CLIP((b+5*H) >> 5); + src[6] = CLIP((b+6*H) >> 5); + src[7] = CLIP((b+7*H) >> 5); + src += stride; + } +} + +static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride) +{ + int j, k; + int a; + INIT_CLIP + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + const pixel * const src0 = src +3-stride; + const pixel * src1 = src +8*stride-1; + const pixel * src2 = src1-2*stride; // == src+6*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + + for (k = 2; k <= 4; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + for (; k <= 8; ++k) { + src1 += stride; src2 -= stride; + V += k*(src1[0] - src2[0]); + } + + H = (17*H+16) >> 5; + V = (5*V+32) >> 6; + + a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H; + for(j=16; j>0; --j) { + int b = a; + a += V; + src[0] = CLIP((b ) >> 5); + src[1] = CLIP((b+ H) >> 5); + src[2] = CLIP((b+2*H) >> 5); + src[3] = CLIP((b+3*H) >> 5); + src[4] = CLIP((b+4*H) >> 5); + src[5] = CLIP((b+5*H) >> 5); + src[6] = CLIP((b+6*H) >> 5); + src[7] = CLIP((b+7*H) >> 5); + src += stride; + } +} + +#define SRC(x,y) 
src[(x)+(y)*stride] +#define PL(y) \ + const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; +#define PREDICT_8x8_LOAD_LEFT \ + const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ + + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ + PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ + const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 + +#define PT(x) \ + const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOP \ + const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ + + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ + PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ + const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \ + + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 + +#define PTR(x) \ + t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOPRIGHT \ + int t8, t9, t10, t11, t12, t13, t14, t15; \ + if(has_topright) { \ + PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ + t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ + } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); + +#define PREDICT_8x8_LOAD_TOPLEFT \ + const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 + +#define PREDICT_8x8_DC(v) \ + int y; \ + for( y = 0; y < 8; y++ ) { \ + AV_WN4PA(((pixel4*)src)+0, v); \ + AV_WN4PA(((pixel4*)src)+1, v); \ + src += stride; \ + } + +static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + + PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1))); +} +static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + + PREDICT_8x8_LOAD_LEFT; + const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3); + PREDICT_8x8_DC(dc); +} +static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = 
(pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + + PREDICT_8x8_LOAD_TOP; + const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3); + PREDICT_8x8_DC(dc); +} +static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOP; + const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7 + +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4); + PREDICT_8x8_DC(dc); +} +static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + pixel4 a; + + PREDICT_8x8_LOAD_LEFT; +#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \ + AV_WN4PA(src+y*stride, a); \ + AV_WN4PA(src+y*stride+4, a); + ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); +#undef ROW +} +static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + int y; + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + pixel4 a, b; + + PREDICT_8x8_LOAD_TOP; + src[0] = t0; + src[1] = t1; + src[2] = t2; + src[3] = t3; + src[4] = t4; + src[5] = t5; + src[6] = t6; + src[7] = t7; + a = AV_RN4PA(((pixel4*)src)+0); + b = AV_RN4PA(((pixel4*)src)+1); + for( y = 1; y < 8; y++ ) { + AV_WN4PA(((pixel4*)(src+y*stride))+0, a); + AV_WN4PA(((pixel4*)(src+y*stride))+1, b); + } +} +static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 
+ 2) >> 2; + SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; + SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; + SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; + SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; + SRC(7,7)= (t14 + 3*t15 + 2) >> 2; +} +static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; + SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(6,0)=SRC(7,1)= (t4 + 
2*t5 + t6 + 2) >> 2; + SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; +} +static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; + SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; + SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; + SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; + SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; + SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; + SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; + SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(7,0)= (t6 + t7 + 1) >> 1; +} +static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l6 + l7 + 1) >> 1; + SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; + SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; + 
SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; + SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; + SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; + SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; + SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; + SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; + SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; + SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; + SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; + SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; + SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; + SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; +} +static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + t1 + 1) >> 1; + SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; + SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; + SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; + SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; + SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; + SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; + SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; + 
SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; + SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; + SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; + SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(7,6)= (t10 + t11 + 1) >> 1; + SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; +} +static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + pixel *src = (pixel*)_src; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_LEFT; + SRC(0,0)= (l0 + l1 + 1) >> 1; + SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; + SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; + SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; + SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; + SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; + SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; + SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; + SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= + SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= + SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= + SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; +} + +static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + int i; + pixel *src = (pixel*)_src; + const dctcoef *block = (const dctcoef*)_block; + pixel pix[8]; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_TOP; + + pix[0] = t0; + pix[1] = t1; + pix[2] = t2; + pix[3] = t3; + pix[4] = t4; + pix[5] = t5; + pix[6] = t6; + 
pix[7] = t7; + + for(i=0; i<8; i++){ + pixel v = pix[i]; + src[0*stride]= v += block[0]; + src[1*stride]= v += block[8]; + src[2*stride]= v += block[16]; + src[3*stride]= v += block[24]; + src[4*stride]= v += block[32]; + src[5*stride]= v += block[40]; + src[6*stride]= v += block[48]; + src[7*stride]= v + block[56]; + src++; + block++; + } + + memset(_block, 0, sizeof(dctcoef) * 64); +} + +static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft, + int has_topright, ptrdiff_t _stride) +{ + int i; + pixel *src = (pixel*)_src; + const dctcoef *block = (const dctcoef*)_block; + pixel pix[8]; + int stride = _stride>>(sizeof(pixel)-1); + PREDICT_8x8_LOAD_LEFT; + + pix[0] = l0; + pix[1] = l1; + pix[2] = l2; + pix[3] = l3; + pix[4] = l4; + pix[5] = l5; + pix[6] = l6; + pix[7] = l7; + + for(i=0; i<8; i++){ + pixel v = pix[i]; + src[0]= v += block[0]; + src[1]= v += block[1]; + src[2]= v += block[2]; + src[3]= v += block[3]; + src[4]= v += block[4]; + src[5]= v += block[5]; + src[6]= v += block[6]; + src[7]= v + block[7]; + src+= stride; + block+= 8; + } + + memset(_block, 0, sizeof(dctcoef) * 64); +} + +#undef PREDICT_8x8_LOAD_LEFT +#undef PREDICT_8x8_LOAD_TOP +#undef PREDICT_8x8_LOAD_TOPLEFT +#undef PREDICT_8x8_LOAD_TOPRIGHT +#undef PREDICT_8x8_DC +#undef PTR +#undef PT +#undef PL +#undef SRC + +static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block, + ptrdiff_t stride) +{ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride >>= sizeof(pixel)-1; + pix -= stride; + for(i=0; i<4; i++){ + pixel v = pix[0]; + pix[1*stride]= v += block[0]; + pix[2*stride]= v += block[4]; + pix[3*stride]= v += block[8]; + pix[4*stride]= v + block[12]; + pix++; + block++; + } + + memset(_block, 0, sizeof(dctcoef) * 16); +} + +static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block, + ptrdiff_t stride) +{ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const 
dctcoef*)_block; + stride >>= sizeof(pixel)-1; + for(i=0; i<4; i++){ + pixel v = pix[-1]; + pix[0]= v += block[0]; + pix[1]= v += block[1]; + pix[2]= v += block[2]; + pix[3]= v + block[3]; + pix+= stride; + block+= 4; + } + + memset(_block, 0, sizeof(dctcoef) * 16); +} + +static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block, + ptrdiff_t stride) +{ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride >>= sizeof(pixel)-1; + pix -= stride; + for(i=0; i<8; i++){ + pixel v = pix[0]; + pix[1*stride]= v += block[0]; + pix[2*stride]= v += block[8]; + pix[3*stride]= v += block[16]; + pix[4*stride]= v += block[24]; + pix[5*stride]= v += block[32]; + pix[6*stride]= v += block[40]; + pix[7*stride]= v += block[48]; + pix[8*stride]= v + block[56]; + pix++; + block++; + } + + memset(_block, 0, sizeof(dctcoef) * 64); +} + +static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block, + ptrdiff_t stride) +{ + int i; + pixel *pix = (pixel*)_pix; + const dctcoef *block = (const dctcoef*)_block; + stride >>= sizeof(pixel)-1; + for(i=0; i<8; i++){ + pixel v = pix[-1]; + pix[0]= v += block[0]; + pix[1]= v += block[1]; + pix[2]= v += block[2]; + pix[3]= v += block[3]; + pix[4]= v += block[4]; + pix[5]= v += block[5]; + pix[6]= v += block[6]; + pix[7]= v + block[7]; + pix+= stride; + block+= 8; + } + + memset(_block, 0, sizeof(dctcoef) * 64); +} + +static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, + int16_t *block, + ptrdiff_t stride) +{ + int i; + for(i=0; i<16; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, + const int *block_offset, + int16_t *block, + ptrdiff_t stride) +{ + int i; + for(i=0; i<16; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int 
*block_offset, + int16_t *block, ptrdiff_t stride) +{ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, + int16_t *block, ptrdiff_t stride) +{ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); + for(i=4; i<8; i++) + FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, + int16_t *block, + ptrdiff_t stride) +{ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); +} + +static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, + const int *block_offset, + int16_t *block, ptrdiff_t stride) +{ + int i; + for(i=0; i<4; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride); + for(i=4; i<8; i++) + FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride); +} diff --git a/media/ffvpx/libavcodec/hpeldsp.h b/media/ffvpx/libavcodec/hpeldsp.h new file mode 100644 index 0000000000..45e81b10a5 --- /dev/null +++ b/media/ffvpx/libavcodec/hpeldsp.h @@ -0,0 +1,107 @@ +/* + * Half-pel DSP functions. + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Half-pel DSP functions. + */ + +#ifndef AVCODEC_HPELDSP_H +#define AVCODEC_HPELDSP_H + +#include <stdint.h> +#include <stddef.h> + +/* add and put pixel (decoding) */ +// blocksizes for hpel_pixels_func are 8x4,8x8 16x8 16x16 +// h for hpel_pixels_func is limited to {width/2, width} but never larger +// than 16 and never smaller than 4 +typedef void (*op_pixels_func)(uint8_t *block /*align width (8 or 16)*/, + const uint8_t *pixels /*align 1*/, + ptrdiff_t line_size, int h); + +/** + * Half-pel DSP context. + */ +typedef struct HpelDSPContext { + /** + * Halfpel motion compensation with rounding (a+b+1)>>1. + * this is an array[4][4] of motion compensation functions for 4 + * horizontal blocksizes (8,16) and the 4 halfpel positions<br> + * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] + * @param block destination where the result is stored + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + op_pixels_func put_pixels_tab[4][4]; + + /** + * Halfpel motion compensation with rounding (a+b+1)>>1. + * This is an array[4][4] of motion compensation functions for 4 + * horizontal blocksizes (8,16) and the 4 halfpel positions<br> + * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] + * @param block destination into which the result is averaged (a+b+1)>>1 + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + op_pixels_func avg_pixels_tab[4][4]; + + /** + * Halfpel motion compensation with no rounding (a+b)>>1. 
+ * this is an array[4][4] of motion compensation functions for 2 + * horizontal blocksizes (8,16) and the 4 halfpel positions<br> + * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] + * @param block destination where the result is stored + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + * @note The size is kept at [4][4] to match the above pixel_tabs and avoid + * out of bounds reads in the motion estimation code. + */ + op_pixels_func put_no_rnd_pixels_tab[4][4]; + + /** + * Halfpel motion compensation with no rounding (a+b)>>1. + * this is an array[4] of motion compensation functions for 1 + * horizontal blocksize (16) and the 4 halfpel positions<br> + * *pixels_tab[0][ xhalfpel + 2*yhalfpel ] + * @param block destination into which the result is averaged (a+b)>>1 + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + op_pixels_func avg_no_rnd_pixels_tab[4]; +} HpelDSPContext; + +void ff_hpeldsp_init(HpelDSPContext *c, int flags); + +void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags); +void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags); +void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags); +void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags); +void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags); +void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags); +void ff_hpeldsp_init_loongarch(HpelDSPContext *c, int flags); + +#endif /* AVCODEC_HPELDSP_H */ diff --git a/media/ffvpx/libavcodec/hwaccel.h b/media/ffvpx/libavcodec/hwaccel.h new file mode 100644 index 0000000000..3aaa92571c --- /dev/null +++ b/media/ffvpx/libavcodec/hwaccel.h @@ -0,0 +1,84 @@ +/* + * This file is part of FFmpeg. 
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HWACCEL_H
+#define AVCODEC_HWACCEL_H
+
+#include "avcodec.h"
+#include "hwaccels.h"
+
+
+#define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
+
+
+/**
+ * Internal wrapper that pairs a public hardware configuration with the
+ * hwaccel (if any) that backs it.
+ */
+typedef struct AVCodecHWConfigInternal {
+    /**
+     * This is the structure which will be returned to the user by
+     * avcodec_get_hw_config().
+     */
+    AVCodecHWConfig public;
+    /**
+     * If this configuration uses a hwaccel, a pointer to it.
+     * If not, NULL.
+     */
+    const AVHWAccel *hwaccel;
+} AVCodecHWConfigInternal;
+
+
+// These macros are used to simplify AVCodecHWConfigInternal definitions.
+
+/**
+ * Expands to a pointer to a constant AVCodecHWConfigInternal compound
+ * literal describing a hwaccel-backed configuration.
+ *
+ * device, frames and ad_hoc are truth values: each non-zero one ORs the
+ * matching AV_CODEC_HW_CONFIG_METHOD_* flag into .methods.
+ * format is pasted onto AV_PIX_FMT_, device_type_ onto AV_HWDEVICE_TYPE_,
+ * and name is the AVHWAccel object whose address is stored in .hwaccel.
+ */
+#define HW_CONFIG_HWACCEL(device, frames, ad_hoc, format, device_type_, name) \
+    &(const AVCodecHWConfigInternal) { \
+        .public = { \
+            .pix_fmt = AV_PIX_FMT_ ## format, \
+            .methods = (device ? AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX : 0) | \
+                       (frames ? AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX : 0) | \
+                       (ad_hoc ? AV_CODEC_HW_CONFIG_METHOD_AD_HOC : 0), \
+            .device_type = AV_HWDEVICE_TYPE_ ## device_type_, \
+        }, \
+        .hwaccel = &name, \
+    }
+
+/**
+ * Expands to a pointer to a constant configuration that has no hwaccel
+ * (.hwaccel is NULL), advertises only AV_CODEC_HW_CONFIG_METHOD_INTERNAL
+ * and uses AV_HWDEVICE_TYPE_NONE. format is pasted onto AV_PIX_FMT_.
+ */
+#define HW_CONFIG_INTERNAL(format) \
+    &(const AVCodecHWConfigInternal) { \
+        .public = { \
+            .pix_fmt = AV_PIX_FMT_ ## format, \
+            .methods = AV_CODEC_HW_CONFIG_METHOD_INTERNAL, \
+            .device_type = AV_HWDEVICE_TYPE_NONE, \
+        }, \
+        .hwaccel = NULL, \
+    }
+
+/*
+ * Per-API shorthands for HW_CONFIG_HWACCEL(). codec is pasted into the
+ * symbol name ff_<codec>_<api>_hwaccel, so that object must be declared
+ * wherever the macro is expanded.
+ */
+#define HWACCEL_DXVA2(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, DXVA2_VLD, DXVA2, ff_ ## codec ## _dxva2_hwaccel)
+#define HWACCEL_D3D11VA2(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
+#define HWACCEL_NVDEC(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
+#define HWACCEL_VAAPI(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
+#define HWACCEL_VDPAU(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, VDPAU, VDPAU, ff_ ## codec ## _vdpau_hwaccel)
+#define HWACCEL_VIDEOTOOLBOX(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, VIDEOTOOLBOX, VIDEOTOOLBOX, ff_ ## codec ## _videotoolbox_hwaccel)
+#define HWACCEL_D3D11VA(codec) \
+    HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel)
+#define HWACCEL_XVMC(codec) \
+    HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel)
+
+#endif /* AVCODEC_HWACCEL_H */
diff --git a/media/ffvpx/libavcodec/hwaccels.h b/media/ffvpx/libavcodec/hwaccels.h
new file mode 100644
index 0000000000..aca55831f3
--- /dev/null
+++ b/media/ffvpx/libavcodec/hwaccels.h
@@ -0,0 +1,85 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_HWACCELS_H +#define AVCODEC_HWACCELS_H + +#include "avcodec.h" + +extern const AVHWAccel ff_av1_d3d11va_hwaccel; +extern const AVHWAccel ff_av1_d3d11va2_hwaccel; +extern const AVHWAccel ff_av1_dxva2_hwaccel; +extern const AVHWAccel ff_av1_nvdec_hwaccel; +extern const AVHWAccel ff_av1_vaapi_hwaccel; +extern const AVHWAccel ff_av1_vdpau_hwaccel; +extern const AVHWAccel ff_h263_vaapi_hwaccel; +extern const AVHWAccel ff_h263_videotoolbox_hwaccel; +extern const AVHWAccel ff_h264_d3d11va_hwaccel; +extern const AVHWAccel ff_h264_d3d11va2_hwaccel; +extern const AVHWAccel ff_h264_dxva2_hwaccel; +extern const AVHWAccel ff_h264_nvdec_hwaccel; +extern const AVHWAccel ff_h264_vaapi_hwaccel; +extern const AVHWAccel ff_h264_vdpau_hwaccel; +extern const AVHWAccel ff_h264_videotoolbox_hwaccel; +extern const AVHWAccel ff_hevc_d3d11va_hwaccel; +extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; +extern const AVHWAccel ff_hevc_dxva2_hwaccel; +extern const AVHWAccel ff_hevc_nvdec_hwaccel; +extern const AVHWAccel ff_hevc_vaapi_hwaccel; +extern const AVHWAccel ff_hevc_vdpau_hwaccel; +extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; +extern const AVHWAccel ff_mjpeg_nvdec_hwaccel; +extern const AVHWAccel ff_mjpeg_vaapi_hwaccel; +extern const AVHWAccel ff_mpeg1_nvdec_hwaccel; +extern const AVHWAccel ff_mpeg1_vdpau_hwaccel; +extern const AVHWAccel ff_mpeg1_videotoolbox_hwaccel; +extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel; +extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel; +extern const AVHWAccel 
ff_mpeg2_nvdec_hwaccel; +extern const AVHWAccel ff_mpeg2_dxva2_hwaccel; +extern const AVHWAccel ff_mpeg2_vaapi_hwaccel; +extern const AVHWAccel ff_mpeg2_vdpau_hwaccel; +extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel; +extern const AVHWAccel ff_mpeg4_nvdec_hwaccel; +extern const AVHWAccel ff_mpeg4_vaapi_hwaccel; +extern const AVHWAccel ff_mpeg4_vdpau_hwaccel; +extern const AVHWAccel ff_mpeg4_videotoolbox_hwaccel; +extern const AVHWAccel ff_prores_videotoolbox_hwaccel; +extern const AVHWAccel ff_vc1_d3d11va_hwaccel; +extern const AVHWAccel ff_vc1_d3d11va2_hwaccel; +extern const AVHWAccel ff_vc1_dxva2_hwaccel; +extern const AVHWAccel ff_vc1_nvdec_hwaccel; +extern const AVHWAccel ff_vc1_vaapi_hwaccel; +extern const AVHWAccel ff_vc1_vdpau_hwaccel; +extern const AVHWAccel ff_vp8_nvdec_hwaccel; +extern const AVHWAccel ff_vp8_vaapi_hwaccel; +extern const AVHWAccel ff_vp9_d3d11va_hwaccel; +extern const AVHWAccel ff_vp9_d3d11va2_hwaccel; +extern const AVHWAccel ff_vp9_dxva2_hwaccel; +extern const AVHWAccel ff_vp9_nvdec_hwaccel; +extern const AVHWAccel ff_vp9_vaapi_hwaccel; +extern const AVHWAccel ff_vp9_vdpau_hwaccel; +extern const AVHWAccel ff_vp9_videotoolbox_hwaccel; +extern const AVHWAccel ff_wmv3_d3d11va_hwaccel; +extern const AVHWAccel ff_wmv3_d3d11va2_hwaccel; +extern const AVHWAccel ff_wmv3_dxva2_hwaccel; +extern const AVHWAccel ff_wmv3_nvdec_hwaccel; +extern const AVHWAccel ff_wmv3_vaapi_hwaccel; +extern const AVHWAccel ff_wmv3_vdpau_hwaccel; + +#endif /* AVCODEC_HWACCELS_H */ diff --git a/media/ffvpx/libavcodec/hwconfig.h b/media/ffvpx/libavcodec/hwconfig.h new file mode 100644 index 0000000000..721424912c --- /dev/null +++ b/media/ffvpx/libavcodec/hwconfig.h @@ -0,0 +1,100 @@ +/* + * This file is part of FFmpeg. 
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HWCONFIG_H
+#define AVCODEC_HWCONFIG_H
+
+#include "avcodec.h"
+#include "hwaccels.h"
+
+
+#define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
+
+
+/**
+ * Internal wrapper that pairs a public hardware configuration with the
+ * hwaccel (if any) that backs it.
+ */
+typedef struct AVCodecHWConfigInternal {
+    /**
+     * This is the structure which will be returned to the user by
+     * avcodec_get_hw_config().
+     */
+    AVCodecHWConfig public;
+    /**
+     * If this configuration uses a hwaccel, a pointer to it.
+     * If not, NULL.
+     */
+    const AVHWAccel *hwaccel;
+} AVCodecHWConfigInternal;
+
+
+// These macros are used to simplify AVCodecHWConfigInternal definitions.
+
+/**
+ * Expands to a pointer to a constant AVCodecHWConfigInternal compound
+ * literal describing a hwaccel-backed configuration.
+ *
+ * device, frames and ad_hoc are truth values: each non-zero one ORs the
+ * matching AV_CODEC_HW_CONFIG_METHOD_* flag into .methods.
+ * format is pasted onto AV_PIX_FMT_, device_type_ onto AV_HWDEVICE_TYPE_,
+ * and name is the AVHWAccel object whose address is stored in .hwaccel.
+ */
+#define HW_CONFIG_HWACCEL(device, frames, ad_hoc, format, device_type_, name) \
+    &(const AVCodecHWConfigInternal) { \
+        .public = { \
+            .pix_fmt = AV_PIX_FMT_ ## format, \
+            .methods = (device ? AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX : 0) | \
+                       (frames ? AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX : 0) | \
+                       (ad_hoc ? AV_CODEC_HW_CONFIG_METHOD_AD_HOC : 0), \
+            .device_type = AV_HWDEVICE_TYPE_ ## device_type_, \
+        }, \
+        .hwaccel = &name, \
+    }
+
+/**
+ * Expands to a pointer to a constant configuration that has no hwaccel
+ * (.hwaccel is NULL), advertises only AV_CODEC_HW_CONFIG_METHOD_INTERNAL
+ * and uses AV_HWDEVICE_TYPE_NONE. format is pasted onto AV_PIX_FMT_.
+ */
+#define HW_CONFIG_INTERNAL(format) \
+    &(const AVCodecHWConfigInternal) { \
+        .public = { \
+            .pix_fmt = AV_PIX_FMT_ ## format, \
+            .methods = AV_CODEC_HW_CONFIG_METHOD_INTERNAL, \
+            .device_type = AV_HWDEVICE_TYPE_NONE, \
+        }, \
+        .hwaccel = NULL, \
+    }
+
+/*
+ * Per-API shorthands for HW_CONFIG_HWACCEL(). codec is pasted into the
+ * symbol name ff_<codec>_<api>_hwaccel, so that object must be declared
+ * wherever the macro is expanded.
+ */
+#define HWACCEL_DXVA2(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, DXVA2_VLD, DXVA2, ff_ ## codec ## _dxva2_hwaccel)
+#define HWACCEL_D3D11VA2(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
+#define HWACCEL_NVDEC(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
+#define HWACCEL_VAAPI(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
+#define HWACCEL_VDPAU(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, VDPAU, VDPAU, ff_ ## codec ## _vdpau_hwaccel)
+#define HWACCEL_VIDEOTOOLBOX(codec) \
+    HW_CONFIG_HWACCEL(1, 1, 1, VIDEOTOOLBOX, VIDEOTOOLBOX, ff_ ## codec ## _videotoolbox_hwaccel)
+#define HWACCEL_D3D11VA(codec) \
+    HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel)
+
+/**
+ * Encoder counterpart of HW_CONFIG_HWACCEL(): the same .public fields are
+ * built from device/frames/ad_hoc/format/device_type_, but .hwaccel is
+ * always NULL.
+ */
+#define HW_CONFIG_ENCODER(device, frames, ad_hoc, format, device_type_) \
+    &(const AVCodecHWConfigInternal) { \
+        .public = { \
+            .pix_fmt = AV_PIX_FMT_ ## format, \
+            .methods = (device ? AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX : 0) | \
+                       (frames ? AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX : 0) | \
+                       (ad_hoc ? AV_CODEC_HW_CONFIG_METHOD_AD_HOC : 0), \
+            .device_type = AV_HWDEVICE_TYPE_ ## device_type_, \
+        }, \
+        .hwaccel = NULL, \
+    }
+
+/* Encoder configuration that sets only the HW_DEVICE_CTX method flag. */
+#define HW_CONFIG_ENCODER_DEVICE(format, device_type_) \
+    HW_CONFIG_ENCODER(1, 0, 0, format, device_type_)
+
+/* Encoder configuration that sets only the HW_FRAMES_CTX method flag. */
+#define HW_CONFIG_ENCODER_FRAMES(format, device_type_) \
+    HW_CONFIG_ENCODER(0, 1, 0, format, device_type_)
+
+#endif /* AVCODEC_HWCONFIG_H */
diff --git a/media/ffvpx/libavcodec/idctdsp.c b/media/ffvpx/libavcodec/idctdsp.c
new file mode 100644
index 0000000000..7216afb094
--- /dev/null
+++ b/media/ffvpx/libavcodec/idctdsp.c
@@ -0,0 +1,315 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "config_components.h"
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "avcodec.h"
+#include "dct.h"
+#include "faanidct.h"
+#include "idctdsp.h"
+#include "simple_idct.h"
+#include "xvididct.h"
+
+/**
+ * Compose a scan table with a permutation: dst[i] = permutation[src[i]].
+ * Every src entry is used as an index into permutation, so it must be < 64.
+ */
+av_cold void ff_permute_scantable(uint8_t dst[64], const uint8_t src[64],
+                                  const uint8_t permutation[64])
+{
+    for (int i = 0; i < 64; i++) {
+        int j = src[i];
+        dst[i] = permutation[j];
+    }
+}
+
+/**
+ * Fill the 64-entry idct_permutation table for perm_type.
+ *
+ * On x86 builds the x86 helper is tried first; a non-zero return from it
+ * means the table has been handled there. A perm_type that no case below
+ * handles is logged as an internal error and the table is left untouched.
+ */
+av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
+                                           enum idct_permutation_type perm_type)
+{
+    int i;
+
+#if ARCH_X86
+    if (ff_init_scantable_permutation_x86(idct_permutation,
+                                          perm_type))
+        return;
+#endif
+
+    switch (perm_type) {
+    case FF_IDCT_PERM_NONE:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = i;
+        break;
+    case FF_IDCT_PERM_LIBMPEG2:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
+        break;
+    case FF_IDCT_PERM_TRANSPOSE:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
+        break;
+    case FF_IDCT_PERM_PARTTRANS:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
+        break;
+    default:
+        av_log(NULL, AV_LOG_ERROR,
+               "Internal error, IDCT permutation not set\n");
+    }
+}
+
+/**
+ * Store an 8x8 block of coefficients as pixels, each passed through
+ * av_clip_uint8().
+ *
+ * @param block     64 coefficients, 8 per row
+ * @param pixels    destination, advanced by line_size after every row
+ * @param line_size distance in bytes between destination rows
+ */
+void ff_put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size)
+{
+    int i;
+
+    /* clip and store one row of 8 pixels per iteration */
+    for (i = 0; i < 8; i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+        pixels[2] = av_clip_uint8(block[2]);
+        pixels[3] = av_clip_uint8(block[3]);
+        pixels[4] = av_clip_uint8(block[4]);
+        pixels[5] = av_clip_uint8(block[5]);
+        pixels[6] = av_clip_uint8(block[6]);
+        pixels[7] = av_clip_uint8(block[7]);
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+/**
+ * 4x4 variant of ff_put_pixels_clamped_c(): only the first 4 coefficients
+ * of the first 4 rows are stored; block rows are still 8 coefficients apart.
+ * line_size is ptrdiff_t so the stride handed over by the idct_put/idct_add
+ * wrappers is not narrowed.
+ */
+static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                  ptrdiff_t line_size)
+{
+    int i;
+
+    /* clip and store one row of 4 pixels per iteration */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+        pixels[2] = av_clip_uint8(block[2]);
+        pixels[3] = av_clip_uint8(block[3]);
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+/**
+ * 2x2 variant of ff_put_pixels_clamped_c(): only the first 2 coefficients
+ * of the first 2 rows are stored; block rows are still 8 coefficients apart.
+ */
+static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                  ptrdiff_t line_size)
+{
+    int i;
+
+    /* clip and store one row of 2 pixels per iteration */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+/**
+ * Store an 8x8 block of signed coefficients as unsigned pixels: values
+ * below -128 become 0, values above 127 become 255, everything else is
+ * offset by +128.
+ */
+static void put_signed_pixels_clamped_c(const int16_t *block,
+                                        uint8_t *av_restrict pixels,
+                                        ptrdiff_t line_size)
+{
+    int i, j;
+
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            if (*block < -128)
+                *pixels = 0;
+            else if (*block > 127)
+                *pixels = 255;
+            else
+                *pixels = (uint8_t) (*block + 128);
+            block++;
+            pixels++;
+        }
+        /* pixels already advanced by 8 inside the row */
+        pixels += (line_size - 8);
+    }
+}
+
+/**
+ * Add an 8x8 block of coefficients to the pixels already in the
+ * destination, passing each sum through av_clip_uint8().
+ *
+ * @param block     64 coefficients, 8 per row
+ * @param pixels    destination, read and rewritten in place
+ * @param line_size distance in bytes between destination rows
+ */
+void ff_add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size)
+{
+    int i;
+
+    /* add, clip and store one row of 8 pixels per iteration */
+    for (i = 0; i < 8; i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
+        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
+        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
+        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
+        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
+        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+/**
+ * 4x4 variant of ff_add_pixels_clamped_c(); block rows are still
+ * 8 coefficients apart.
+ */
+static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                  ptrdiff_t line_size)
+{
+    int i;
+
+    /* add, clip and store one row of 4 pixels per iteration */
+    for(i=0;i<4;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
+        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+/**
+ * 2x2 variant of ff_add_pixels_clamped_c(); block rows are still
+ * 8 coefficients apart.
+ */
+static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
+                                  ptrdiff_t line_size)
+{
+    int i;
+
+    /* add, clip and store one row of 2 pixels per iteration */
+    for(i=0;i<2;i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels += line_size;
+        block += 8;
+    }
+}
+
+/*
+ * Reduced-size idct_put/idct_add wrappers installed for lowres 1..3:
+ * each runs the matching ff_j_rev_dct* in place on block and then stores
+ * (put) or accumulates (add) the result into dest.
+ */
+static void ff_jref_idct4_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    put_pixels_clamped4_c(block, dest, line_size);
+}
+static void ff_jref_idct4_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct4 (block);
+    add_pixels_clamped4_c(block, dest, line_size);
+}
+
+static void ff_jref_idct2_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    put_pixels_clamped2_c(block, dest, line_size);
+}
+static void ff_jref_idct2_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    ff_j_rev_dct2 (block);
+    add_pixels_clamped2_c(block, dest, line_size);
+}
+
+/* 1x1 case: only block[0] is used, rounded by (v + 4) >> 3; line_size is unused. */
+static void ff_jref_idct1_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8((block[0] + 4)>>3);
+}
+static void ff_jref_idct1_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
+{
+    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
+}
+
+/**
+ * Fill an IDCTDSPContext with C implementations chosen from avctx, give
+ * the architecture-specific init (if any) a chance to run, then build
+ * c->idct_permutation from the resulting c->perm_type.
+ *
+ * Selection order: avctx->lowres (1, 2, 3), then bits_per_raw_sample
+ * (9/10, 12), then avctx->idct_algo.
+ *
+ * c->mpeg4_studio_profile is read here and never written, so the caller
+ * has to set it before calling. In that mode only idct_put is provided;
+ * idct_add and idct are set to NULL.
+ */
+av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
+{
+    av_unused const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
+
+    if (avctx->lowres==1) {
+        c->idct_put = ff_jref_idct4_put;
+        c->idct_add = ff_jref_idct4_add;
+        c->idct = ff_j_rev_dct4;
+        c->perm_type = FF_IDCT_PERM_NONE;
+    } else if (avctx->lowres==2) {
+        c->idct_put = ff_jref_idct2_put;
+        c->idct_add = ff_jref_idct2_add;
+        c->idct = ff_j_rev_dct2;
+        c->perm_type = FF_IDCT_PERM_NONE;
+    } else if (avctx->lowres==3) {
+        c->idct_put = ff_jref_idct1_put;
+        c->idct_add = ff_jref_idct1_add;
+        c->idct = ff_j_rev_dct1;
+        c->perm_type = FF_IDCT_PERM_NONE;
+    } else {
+        if (avctx->bits_per_raw_sample == 10 || avctx->bits_per_raw_sample == 9) {
+            /* 10-bit MPEG-4 Simple Studio Profile requires a higher precision IDCT
+               However, it only uses idct_put */
+            if (c->mpeg4_studio_profile) {
+                c->idct_put = ff_simple_idct_put_int32_10bit;
+                c->idct_add = NULL;
+                c->idct = NULL;
+            } else {
+                c->idct_put = ff_simple_idct_put_int16_10bit;
+                c->idct_add = ff_simple_idct_add_int16_10bit;
+                c->idct = ff_simple_idct_int16_10bit;
+            }
+            c->perm_type = FF_IDCT_PERM_NONE;
+        } else if (avctx->bits_per_raw_sample == 12) {
+            c->idct_put = ff_simple_idct_put_int16_12bit;
+            c->idct_add = ff_simple_idct_add_int16_12bit;
+            c->idct = ff_simple_idct_int16_12bit;
+            c->perm_type = FF_IDCT_PERM_NONE;
+        } else {
+            if (avctx->idct_algo == FF_IDCT_INT) {
+                c->idct_put = ff_jref_idct_put;
+                c->idct_add = ff_jref_idct_add;
+                c->idct = ff_j_rev_dct;
+                c->perm_type = FF_IDCT_PERM_LIBMPEG2;
+#if CONFIG_FAANIDCT
+            } else if (avctx->idct_algo == FF_IDCT_FAAN) {
+                c->idct_put = ff_faanidct_put;
+                c->idct_add = ff_faanidct_add;
+                c->idct = ff_faanidct;
+                c->perm_type = FF_IDCT_PERM_NONE;
+#endif /* CONFIG_FAANIDCT */
+            } else { // accurate/default
+                c->idct_put = ff_simple_idct_put_int16_8bit;
+                c->idct_add = ff_simple_idct_add_int16_8bit;
+                c->idct = ff_simple_idct_int16_8bit;
+                c->perm_type = FF_IDCT_PERM_NONE;
+            }
+        }
+    }
+
+    c->put_pixels_clamped = ff_put_pixels_clamped_c;
+    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
+    c->add_pixels_clamped = ff_add_pixels_clamped_c;
+
+    if (CONFIG_MPEG4_DECODER && avctx->idct_algo == FF_IDCT_XVID)
+        ff_xvid_idct_init(c, avctx);
+
+    /* at most one architecture-specific init runs; it receives c after the C defaults are set */
+#if ARCH_AARCH64
+    ff_idctdsp_init_aarch64(c, avctx, high_bit_depth);
+#elif ARCH_ALPHA
+    ff_idctdsp_init_alpha(c, avctx, high_bit_depth);
+#elif ARCH_ARM
+    ff_idctdsp_init_arm(c, avctx, high_bit_depth);
+#elif ARCH_PPC
+    ff_idctdsp_init_ppc(c, avctx, high_bit_depth);
+#elif ARCH_RISCV
+    ff_idctdsp_init_riscv(c, avctx, high_bit_depth);
+#elif ARCH_X86
+    ff_idctdsp_init_x86(c, avctx, high_bit_depth);
+#elif ARCH_MIPS
+    ff_idctdsp_init_mips(c, avctx, high_bit_depth);
+#elif ARCH_LOONGARCH
+    ff_idctdsp_init_loongarch(c, avctx, high_bit_depth);
+#endif
+
+    /* built last, from whatever perm_type is in c at this point */
+    ff_init_scantable_permutation(c->idct_permutation,
+                                  c->perm_type);
+}
diff --git a/media/ffvpx/libavcodec/idctdsp.h b/media/ffvpx/libavcodec/idctdsp.h
new file mode 100644
index 0000000000..7224463349
--- /dev/null
+++ b/media/ffvpx/libavcodec/idctdsp.h
@@ -0,0 +1,117 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_IDCTDSP_H
+#define AVCODEC_IDCTDSP_H
+
+#include <stdint.h>
+
+#include "config.h"
+
+#include "avcodec.h"
+
+/**
+ * Selects which coefficient permutation ff_init_scantable_permutation()
+ * writes into IDCTDSPContext.idct_permutation.
+ */
+enum idct_permutation_type {
+    FF_IDCT_PERM_NONE,
+    FF_IDCT_PERM_LIBMPEG2,
+    FF_IDCT_PERM_SIMPLE,
+    FF_IDCT_PERM_TRANSPOSE,
+    FF_IDCT_PERM_PARTTRANS,
+    FF_IDCT_PERM_SSE2,
+};
+
+/**
+ * Write a permuted copy of the scan table src into dst; all three
+ * arguments are 64-entry tables.
+ */
+void ff_permute_scantable(uint8_t dst[64], const uint8_t src[64],
+                          const uint8_t permutation[64]);
+/**
+ * Fill idct_permutation for the given perm_type.
+ */
+void ff_init_scantable_permutation(uint8_t *idct_permutation,
+                                   enum idct_permutation_type perm_type);
+/**
+ * x86 counterpart of ff_init_scantable_permutation().
+ * NOTE(review): the int return presumably reports whether perm_type was
+ * handled here -- confirm against the x86 implementation.
+ */
+int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
+                                      enum idct_permutation_type perm_type);
+
+/**
+ * Function-pointer table for IDCT and clamped pixel store/add operations,
+ * filled by ff_idctdsp_init().
+ */
+typedef struct IDCTDSPContext {
+    /* pixel ops : interface with DCT */
+    void (*put_pixels_clamped)(const int16_t *block /* align 16 */,
+                               uint8_t *av_restrict pixels /* align 8 */,
+                               ptrdiff_t line_size);
+    void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */,
+                                      uint8_t *av_restrict pixels /* align 8 */,
+                                      ptrdiff_t line_size);
+    void (*add_pixels_clamped)(const int16_t *block /* align 16 */,
+                               uint8_t *av_restrict pixels /* align 8 */,
+                               ptrdiff_t line_size);
+
+    void (*idct)(int16_t *block /* align 16 */);
+
+    /**
+     * block -> idct -> clip to unsigned 8 bit -> dest.
+     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
+     * @param line_size size in bytes of a horizontal line of dest
+     */
+    void (*idct_put)(uint8_t *dest /* align 8 */,
+                     ptrdiff_t line_size, int16_t *block /* align 16 */);
+
+    /**
+     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
+     * @param line_size size in bytes of a horizontal line of dest
+     */
+    void (*idct_add)(uint8_t *dest /* align 8 */,
+                     ptrdiff_t line_size, int16_t *block /* align 16 */);
+
+    /**
+     * IDCT input permutation.
+     * Several optimized IDCTs need a permuted input (relative to the
+     * normal order of the reference IDCT).
+     * This permutation must be performed before the idct_put/add.
+     * Note, normally this can be merged with the zigzag/alternate scan<br>
+     * An example to avoid confusion:
+     * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...)
+     * - (x -> reference DCT -> reference IDCT -> x)
+     * - (x -> reference DCT -> simple_mmx_perm = idct_permutation
+     *    -> simple_idct_mmx -> x)
+     * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant
+     *    -> simple_idct_mmx -> ...)
+     */
+    uint8_t idct_permutation[64];
+    enum idct_permutation_type perm_type;
+
+    int mpeg4_studio_profile;
+} IDCTDSPContext;
+
+void ff_put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size);
+void ff_add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
+                             ptrdiff_t line_size);
+
+void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx);
+
+/* Per-architecture initializers; all share the same signature. */
+void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+                             unsigned high_bit_depth);
+void ff_idctdsp_init_alpha(IDCTDSPContext *c, AVCodecContext *avctx,
+                           unsigned high_bit_depth);
+void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
+                         unsigned high_bit_depth);
+void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
+                         unsigned high_bit_depth);
+void ff_idctdsp_init_riscv(IDCTDSPContext *c, AVCodecContext *avctx,
+                           unsigned high_bit_depth);
+void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
+                         unsigned high_bit_depth);
+void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth);
+void ff_idctdsp_init_loongarch(IDCTDSPContext *c, AVCodecContext *avctx,
+                               unsigned high_bit_depth);
+
+#endif /* AVCODEC_IDCTDSP_H */
diff --git a/media/ffvpx/libavcodec/imgconvert.c b/media/ffvpx/libavcodec/imgconvert.c
new file mode 100644
index 0000000000..96511ac7d6
--- /dev/null
+++
b/media/ffvpx/libavcodec/imgconvert.c
@@ -0,0 +1,48 @@
+/*
+ * Misc image conversion routines
+ * Copyright (c) 2001, 2002, 2003 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * misc image conversion routines
+ */
+
+#include "avcodec.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/pixfmt.h"
+
+/**
+ * Pick a pixel format from an AV_PIX_FMT_NONE-terminated list by folding
+ * av_find_best_pix_fmt_of_2() over it, starting from AV_PIX_FMT_NONE.
+ *
+ * @param pix_fmt_list candidate formats, terminated by AV_PIX_FMT_NONE
+ * @param src_pix_fmt  source format, forwarded to every comparison
+ * @param has_alpha    forwarded to every comparison
+ * @param loss_ptr     may be NULL. If set, its value on entry is the
+ *                     starting loss value handed to every comparison
+ *                     (presumably a mask, see av_find_best_pix_fmt_of_2()),
+ *                     and on return it holds the loss written by the last
+ *                     comparison. For an empty list it is left unchanged.
+ * @return the selected format, or AV_PIX_FMT_NONE if the list is empty
+ */
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
+                                            enum AVPixelFormat src_pix_fmt,
+                                            int has_alpha, int *loss_ptr){
+    enum AVPixelFormat best = AV_PIX_FMT_NONE;
+    /* Value every comparison starts from; 0 when the caller passes no loss_ptr. */
+    const int loss_in = loss_ptr ? *loss_ptr : 0;
+    /* Initialized so that an empty list never stores an indeterminate value. */
+    int loss = loss_in;
+
+    for (int i = 0; pix_fmt_list[i] != AV_PIX_FMT_NONE; i++) {
+        loss = loss_in;
+        best = av_find_best_pix_fmt_of_2(best, pix_fmt_list[i], src_pix_fmt, has_alpha, &loss);
+    }
+
+    if (loss_ptr)
+        *loss_ptr = loss;
+    return best;
+}
+
diff --git a/media/ffvpx/libavcodec/internal.h b/media/ffvpx/libavcodec/internal.h
new file mode 100644
index 0000000000..a283c52e01
--- /dev/null
+++ b/media/ffvpx/libavcodec/internal.h
@@ -0,0 +1,247 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * common internal api header.
+ */
+
+#ifndef AVCODEC_INTERNAL_H
+#define AVCODEC_INTERNAL_H
+
+#include <stdint.h>
+
+#include "libavutil/buffer.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixfmt.h"
+#include "avcodec.h"
+#include "config.h"
+
+#if CONFIG_LCMS2
+# include "fflcms2.h"
+#endif
+
+#define FF_SANE_NB_CHANNELS 512U
+
+/* STRIDE_ALIGN follows the widest HAVE_SIMD_ALIGN_* the build enables; 8 otherwise. */
+#if HAVE_SIMD_ALIGN_64
+# define STRIDE_ALIGN 64 /* AVX-512 */
+#elif HAVE_SIMD_ALIGN_32
+# define STRIDE_ALIGN 32
+#elif HAVE_SIMD_ALIGN_16
+# define STRIDE_ALIGN 16
+#else
+# define STRIDE_ALIGN 8
+#endif
+
+typedef struct AVCodecInternal {
+    /**
+     * When using frame-threaded decoding, this field is set for the first
+     * worker thread (e.g. to decode extradata just once).
+     */
+    int is_copy;
+
+    /**
+     * An audio frame with less than required samples has been submitted (and
+     * potentially padded with silence). Reject all subsequent frames.
+     */
+    int last_audio_frame;
+
+    /**
+     * Audio encoders can set this flag during init to indicate that they
+     * want the small last frame to be padded to a multiple of pad_samples.
+     */
+    int pad_samples;
+
+    AVBufferRef *pool;
+
+    void *thread_ctx;
+
+    /**
+     * This packet is used to hold the packet given to decoders
+     * implementing the .decode API; it is unused by the generic
+     * code for decoders implementing the .receive_frame API and
+     * may be freely used (but not freed) by them with the caveat
+     * that the packet will be unreferenced generically in
+     * avcodec_flush_buffers().
+     */
+    AVPacket *in_pkt;
+    struct AVBSFContext *bsf;
+
+    /**
+     * Properties (timestamps+side data) extracted from the last packet passed
+     * for decoding.
+     */
+    AVPacket *last_pkt_props;
+
+    /**
+     * temporary buffer used for encoders to store their bitstream
+     */
+    uint8_t *byte_buffer;
+    unsigned int byte_buffer_size;
+
+    /**
+     * This is set to AV_PKT_FLAG_KEY for encoders that encode intra-only
+     * formats (i.e. whose codec descriptor has AV_CODEC_PROP_INTRA_ONLY set).
+     * This is used to set said flag generically for said encoders.
+     */
+    int intra_only_flag;
+
+    void *frame_thread_encoder;
+
+    /**
+     * The input frame is stored here for encoders implementing the simple
+     * encode API.
+     *
+     * Not allocated in other cases.
+     */
+    AVFrame *in_frame;
+
+    /**
+     * When the AV_CODEC_FLAG_RECON_FRAME flag is used, the encoder should store
+     * here the reconstructed frame corresponding to the last returned packet.
+     *
+     * Not allocated in other cases.
+     */
+    AVFrame *recon_frame;
+
+    /**
+     * If this is set, then FFCodec->close (if existing) needs to be called
+     * for the parent AVCodecContext.
+     */
+    int needs_close;
+
+    /**
+     * Number of audio samples to skip at the start of the next decoded frame
+     */
+    int skip_samples;
+
+    /**
+     * hwaccel-specific private data
+     */
+    void *hwaccel_priv_data;
+
+    /**
+     * checks API usage: after codec draining, flush is required to resume operation
+     */
+    int draining;
+
+    /**
+     * Temporary buffers for newly received or not yet output packets/frames.
+     */
+    AVPacket *buffer_pkt;
+    AVFrame *buffer_frame;
+    int draining_done;
+
+    int showed_multi_packet_warning;
+
+    /* to prevent infinite loop on errors when draining */
+    int nb_draining_errors;
+
+    /* used when avctx flag AV_CODEC_FLAG_DROPCHANGED is set */
+    int changed_frames_dropped;
+    int initial_format;
+    int initial_width, initial_height;
+    int initial_sample_rate;
+    AVChannelLayout initial_ch_layout;
+
+#if CONFIG_LCMS2
+    FFIccContext icc; /* used to read and write embedded ICC profiles */
+#endif
+} AVCodecInternal;
+
+/**
+ * Return the index into tab at which {a,b} match elements {[0],[1]} of tab.
+ * If there is no such matching pair then size is returned.
+ */
+int ff_match_2uint16(const uint16_t (*tab)[2], int size, int a, int b);
+
+unsigned int ff_toupper4(unsigned int x);
+
+void ff_color_frame(AVFrame *frame, const int color[4]);
+
+/**
+ * Maximum size in bytes of extradata.
+ * This value was chosen such that every bit of the buffer is
+ * addressable by a 32-bit signed integer as used by get_bits.
+ */
+#define FF_MAX_EXTRADATA_SIZE ((1 << 28) - AV_INPUT_BUFFER_PADDING_SIZE)
+
+/**
+ * 2^(x) for integer x
+ *
+ * The result is assembled from an integer bit pattern handed to
+ * av_int2float(); the comments below assume that helper reinterprets the
+ * bits as an IEEE-754 single-precision value.
+ *
+ * @return correctly rounded float
+ */
+static av_always_inline float ff_exp2fi(int x) {
+    /* Normal range */
+    /* x + 127 goes into the bits from 23 upward; x == 128 gives 255 << 23 */
+    if (-126 <= x && x <= 128)
+        return av_int2float((x+127) << 23);
+    /* Too large */
+    else if (x > 128)
+        return INFINITY;
+    /* Subnormal numbers */
+    /* reached for -149 <= x <= -127: a single bit at position x + 149 (0..22) */
+    else if (x > -150)
+        return av_int2float(1 << (x+149));
+    /* Negligibly small */
+    else
+        return 0;
+}
+
+int avpriv_h264_has_num_reorder_frames(AVCodecContext *avctx);
+
+int avpriv_codec_get_cap_skip_frame_fill_param(const AVCodec *codec);
+
+/**
+ * Add a CPB properties side data to an encoding context.
+ */ +AVCPBProperties *ff_add_cpb_side_data(AVCodecContext *avctx); + +/** + * Check AVFrame for S12M timecode side data and allocate and fill TC SEI message with timecode info + * + * @param frame Raw frame to get S12M timecode side data from + * @param rate The frame rate + * @param prefix_len Number of bytes to allocate before SEI message + * @param data Pointer to a variable to store allocated memory + * Upon return the variable will hold NULL on error or if frame has no S12M timecode info. + * Otherwise it will point to prefix_len uninitialized bytes followed by + * *sei_size SEI message + * @param sei_size Pointer to a variable to store generated SEI message length + * @return Zero on success, negative error code on failure + */ +int ff_alloc_timecode_sei(const AVFrame *frame, AVRational rate, size_t prefix_len, + void **data, size_t *sei_size); + +/** + * Get an estimated video bitrate based on frame size, frame rate and coded + * bits per pixel. + */ +int64_t ff_guess_coded_bitrate(AVCodecContext *avctx); + +/** + * Check if a value is in the list. If not, return the default value + * + * @param ctx Context for the log msg + * @param val_name Name of the checked value, for log msg + * @param array_valid_values Array of valid ints, terminated by INT_MAX + * @param default_value Value returned if the checked value is not in the array + * @return Value or default_value. + */ +int ff_int_from_list_or_default(void *ctx, const char * val_name, int val, + const int * array_valid_values, int default_value); + +#endif /* AVCODEC_INTERNAL_H */ diff --git a/media/ffvpx/libavcodec/jfdctfst.c b/media/ffvpx/libavcodec/jfdctfst.c new file mode 100644 index 0000000000..805e05808c --- /dev/null +++ b/media/ffvpx/libavcodec/jfdctfst.c @@ -0,0 +1,331 @@ +/* + * This file is part of the Independent JPEG Group's software.
+ * + * The authors make NO WARRANTY or representation, either express or implied, + * with respect to this software, its quality, accuracy, merchantability, or + * fitness for a particular purpose. This software is provided "AS IS", and + * you, its user, assume the entire risk as to its quality and accuracy. + * + * This software is copyright (C) 1994-1996, Thomas G. Lane. + * All Rights Reserved except as specified below. + * + * Permission is hereby granted to use, copy, modify, and distribute this + * software (or portions thereof) for any purpose, without fee, subject to + * these conditions: + * (1) If any part of the source code for this software is distributed, then + * this README file must be included, with this copyright and no-warranty + * notice unaltered; and any additions, deletions, or changes to the original + * files must be clearly indicated in accompanying documentation. + * (2) If only executable code is distributed, then the accompanying + * documentation must state that "this software is based in part on the work + * of the Independent JPEG Group". + * (3) Permission for use of this software is granted only if the user accepts + * full responsibility for any undesirable consequences; the authors accept + * NO LIABILITY for damages of any kind. + * + * These conditions apply to any software derived from or based on the IJG + * code, not just to the unmodified library. If you use our work, you ought + * to acknowledge us. + * + * Permission is NOT granted for the use of any IJG author's name or company + * name in advertising or publicity relating to this software or products + * derived from it. This software may be referred to only as "the Independent + * JPEG Group's software". + * + * We specifically permit and encourage the use of this software as the basis + * of commercial products, provided that all warranty or liability claims are + * assumed by the product vendor. 
+ * + * This file contains a fast, not so accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). + * + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT + * on each column. Direct algorithms are also available, but they are + * much more complex and seem not to be any faster when reduced to code. + * + * This implementation is based on Arai, Agui, and Nakajima's algorithm for + * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in + * Japanese, but the algorithm is described in the Pennebaker & Mitchell + * JPEG textbook (see REFERENCES section in file README). The following code + * is based directly on figure 4-8 in P&M. + * While an 8-point DCT cannot be done in less than 11 multiplies, it is + * possible to arrange the computation so that many of the multiplies are + * simple scalings of the final outputs. These multiplies can then be + * folded into the multiplications or divisions by the JPEG quantization + * table entries. The AA&N method leaves only 5 multiplies and 29 adds + * to be done in the DCT itself. + * The primary disadvantage of this method is that with fixed-point math, + * accuracy is lost due to imprecise representation of the scaled + * quantization values. The smaller the quantization table entry, the less + * precise the scaled value, so this implementation does worse with high- + * quality-setting files than with low-quality ones. + */ + +/** + * @file + * Independent JPEG Group's fast AAN dct. + */ + +#include <stdint.h> +#include "libavutil/attributes.h" +#include "dct.h" + +#define DCTSIZE 8 +#define GLOBAL(x) x +#define RIGHT_SHIFT(x, n) ((x) >> (n)) + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* Scaling decisions are generally the same as in the LL&M algorithm; + * see jfdctint.c for more details. 
However, we choose to descale + * (right shift) multiplication products as soon as they are formed, + * rather than carrying additional fractional bits into subsequent additions. + * This compromises accuracy slightly, but it lets us save a few shifts. + * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) + * everywhere except in the multiplications proper; this saves a good deal + * of work on 16-bit-int machines. + * + * Again to save a few shifts, the intermediate results between pass 1 and + * pass 2 are not upscaled, but are represented only to integral precision. + * + * A final compromise is to represent the multiplicative constants to only + * 8 fractional bits, rather than 13. This saves some shifting work on some + * machines, and may also reduce the cost of multiplication (since there + * are fewer one-bits in the constants). + */ + +#define CONST_BITS 8 + + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) + */ + +#if CONST_BITS == 8 +#define FIX_0_382683433 ((int32_t) 98) /* FIX(0.382683433) */ +#define FIX_0_541196100 ((int32_t) 139) /* FIX(0.541196100) */ +#define FIX_0_707106781 ((int32_t) 181) /* FIX(0.707106781) */ +#define FIX_1_306562965 ((int32_t) 334) /* FIX(1.306562965) */ +#else +#define FIX_0_382683433 FIX(0.382683433) +#define FIX_0_541196100 FIX(0.541196100) +#define FIX_0_707106781 FIX(0.707106781) +#define FIX_1_306562965 FIX(1.306562965) +#endif + + +/* We can gain a little more speed, with a further compromise in accuracy, + * by omitting the addition in a descaling shift. This yields an incorrectly + * rounded result half the time... 
+ */ + +#ifndef USE_ACCURATE_ROUNDING +#undef DESCALE +#define DESCALE(x,n) RIGHT_SHIFT(x, n) +#endif + + +/* Multiply an int16_t variable by an int32_t constant, and immediately + * descale to yield an int16_t result: the product goes through + * DESCALE(..., CONST_BITS) and is then cast to int16_t. + */ + +#define MULTIPLY(var,const) ((int16_t) DESCALE((var) * (const), CONST_BITS)) + +static av_always_inline void row_fdct(int16_t * data){ + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int z1, z2, z3, z4, z5, z11, z13; + int16_t *dataptr; + int ctr; + + /* Pass 1: process rows. + * Applies the 1-D butterfly in place to each of the DCTSIZE rows of + * DCTSIZE consecutive values starting at data; all eight inputs of a + * row are read into temporaries before any output is stored. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[0] + dataptr[7]; + tmp7 = dataptr[0] - dataptr[7]; + tmp1 = dataptr[1] + dataptr[6]; + tmp6 = dataptr[1] - dataptr[6]; + tmp2 = dataptr[2] + dataptr[5]; + tmp5 = dataptr[2] - dataptr[5]; + tmp3 = dataptr[3] + dataptr[4]; + tmp4 = dataptr[3] - dataptr[4]; + + /* Even part: produces outputs 0, 4, 2 and 6 of the row */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[0] = tmp10 + tmp11; /* phase 3 */ + dataptr[4] = tmp10 - tmp11; + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ + dataptr[2] = tmp13 + z1; /* phase 5 */ + dataptr[6] = tmp13 - z1; + + /* Odd part: produces outputs 5, 3, 1 and 7 of the row */ + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + /* The rotator is modified from fig 4-8 to avoid extra negations. */ + z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + + z11 = tmp7 + z3; /* phase 5 */ + z13 = tmp7 - z3; + + dataptr[5] = z13 + z2; /* phase 6 */ + dataptr[3] = z13 - z2; + dataptr[1] = z11 + z4; + dataptr[7] = z11 - z4; + + dataptr += DCTSIZE; /* advance pointer to next row */ + } +} + +/* + * Perform the forward DCT on one block of samples.
+ */ + +GLOBAL(void) +ff_fdct_ifast (int16_t * data) +{ + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int z1, z2, z3, z4, z5, z11, z13; + int16_t *dataptr; + int ctr; + + row_fdct(data); + + /* Pass 2: process columns. + * row_fdct() has already transformed every row in place; the same + * butterfly is now applied to each of the DCTSIZE columns, whose + * elements are DCTSIZE entries apart in data. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + /* Even part: produces rows 0, 4, 2 and 6 of the column */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ + dataptr[DCTSIZE*4] = tmp10 - tmp11; + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ + dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ + dataptr[DCTSIZE*6] = tmp13 - z1; + + /* Odd part: produces rows 5, 3, 1 and 7 of the column */ + + tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + /* The rotator is modified from fig 4-8 to avoid extra negations. */ + z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + + z11 = tmp7 + z3; /* phase 5 */ + z13 = tmp7 - z3; + + dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ + dataptr[DCTSIZE*3] = z13 - z2; + dataptr[DCTSIZE*1] = z11 + z4; + dataptr[DCTSIZE*7] = z11 - z4; + + dataptr++; /* advance pointer to next column */ + } +} + +/* + * Perform the forward 2-4-8 DCT on one block of samples.
+ */ + +GLOBAL(void) +ff_fdct_ifast248 (int16_t * data) +{ + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int z1; + int16_t *dataptr; + int ctr; + + row_fdct(data); + + /* Pass 2: process columns. + * After row_fdct(), each column is split into the sums (tmp0..tmp3) and + * differences (tmp4..tmp7) of its adjacent row pairs 0/1, 2/3, 4/5 and + * 6/7. The even-part butterfly is run on the sums to produce rows 0, 4, + * 2 and 6, then again on the differences to produce rows 1, 5, 3 and 7. + */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; + tmp4 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + tmp6 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5]; + tmp7 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7]; + + /* Even part, applied to the pair sums tmp0..tmp3 */ + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + tmp13 = tmp0 - tmp3; + + dataptr[DCTSIZE*0] = tmp10 + tmp11; + dataptr[DCTSIZE*4] = tmp10 - tmp11; + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); + dataptr[DCTSIZE*2] = tmp13 + z1; + dataptr[DCTSIZE*6] = tmp13 - z1; + + tmp10 = tmp4 + tmp7; + tmp11 = tmp5 + tmp6; + tmp12 = tmp5 - tmp6; + tmp13 = tmp4 - tmp7; + + dataptr[DCTSIZE*1] = tmp10 + tmp11; + dataptr[DCTSIZE*5] = tmp10 - tmp11; + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); + dataptr[DCTSIZE*3] = tmp13 + z1; + dataptr[DCTSIZE*7] = tmp13 - z1; + + dataptr++; /* advance pointer to next column */ + } +} + + +#undef GLOBAL +#undef CONST_BITS +#undef DESCALE +#undef FIX_0_541196100 +#undef FIX_1_306562965 diff --git a/media/ffvpx/libavcodec/jfdctint.c b/media/ffvpx/libavcodec/jfdctint.c new file mode 100644 index 0000000000..6a39578f88 --- /dev/null +++ b/media/ffvpx/libavcodec/jfdctint.c @@ -0,0 +1,25 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BIT_DEPTH 8 +#include "jfdctint_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 10 +#include "jfdctint_template.c" +#undef BIT_DEPTH diff --git a/media/ffvpx/libavcodec/jfdctint_template.c b/media/ffvpx/libavcodec/jfdctint_template.c new file mode 100644 index 0000000000..67fb77b5e1 --- /dev/null +++ b/media/ffvpx/libavcodec/jfdctint_template.c @@ -0,0 +1,398 @@ +/* + * This file is part of the Independent JPEG Group's software. + * + * The authors make NO WARRANTY or representation, either express or implied, + * with respect to this software, its quality, accuracy, merchantability, or + * fitness for a particular purpose. This software is provided "AS IS", and + * you, its user, assume the entire risk as to its quality and accuracy. + * + * This software is copyright (C) 1991-1996, Thomas G. Lane. + * All Rights Reserved except as specified below. + * + * Permission is hereby granted to use, copy, modify, and distribute this + * software (or portions thereof) for any purpose, without fee, subject to + * these conditions: + * (1) If any part of the source code for this software is distributed, then + * this README file must be included, with this copyright and no-warranty + * notice unaltered; and any additions, deletions, or changes to the original + * files must be clearly indicated in accompanying documentation. 
+ * (2) If only executable code is distributed, then the accompanying + * documentation must state that "this software is based in part on the work + * of the Independent JPEG Group". + * (3) Permission for use of this software is granted only if the user accepts + * full responsibility for any undesirable consequences; the authors accept + * NO LIABILITY for damages of any kind. + * + * These conditions apply to any software derived from or based on the IJG + * code, not just to the unmodified library. If you use our work, you ought + * to acknowledge us. + * + * Permission is NOT granted for the use of any IJG author's name or company + * name in advertising or publicity relating to this software or products + * derived from it. This software may be referred to only as "the Independent + * JPEG Group's software". + * + * We specifically permit and encourage the use of this software as the basis + * of commercial products, provided that all warranty or liability claims are + * assumed by the product vendor. + * + * This file contains a slow-but-accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). + * + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT + * on each column. Direct algorithms are also available, but they are + * much more complex and seem not to be any faster when reduced to code. + * + * This implementation is based on an algorithm described in + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. + * The primary algorithm described there uses 11 multiplies and 29 adds. + * We use their alternate method with 12 multiplies and 32 adds. 
+ * The advantage of this method is that no data path contains more than one + * multiplication; this allows a very simple and accurate implementation in + * scaled fixed-point arithmetic, with a minimal number of shifts. + */ + +/** + * @file + * Independent JPEG Group's slow & accurate dct. + */ + +#include "libavutil/common.h" +#include "dct.h" + +#include "bit_depth_template.c" + +#define DCTSIZE 8 +#define BITS_IN_JSAMPLE BIT_DEPTH +#define GLOBAL(x) x +#define RIGHT_SHIFT(x, n) ((x) >> (n)) +#define MULTIPLY16C16(var,const) ((var)*(const)) +#define DESCALE(x,n) RIGHT_SHIFT((x) + (1 << ((n) - 1)), n) + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 +#error "Sorry, this code only copes with 8x8 DCTs." +#endif + + +/* + * The poop on this scaling stuff is as follows: + * + * Each 1-D DCT step produces outputs which are a factor of sqrt(N) + * larger than the true DCT outputs. The final outputs are therefore + * a factor of N larger than desired; since N=8 this can be cured by + * a simple right shift at the end of the algorithm. The advantage of + * this arrangement is that we save two multiplications per 1-D DCT, + * because the y0 and y4 outputs need not be divided by sqrt(N). + * In the IJG code, this factor of 8 is removed by the quantization step + * (in jcdctmgr.c), NOT in this module. + * + * We have to do addition and subtraction of the integer inputs, which + * is no problem, and multiplication by fractional constants, which is + * a problem to do in integer arithmetic. We multiply all the constants + * by CONST_SCALE and convert them to integer constants (thus retaining + * CONST_BITS bits of precision in the constants). After doing a + * multiplication we have to divide the product by CONST_SCALE, with proper + * rounding, to produce the correct output. This division can be done + * cheaply as a right shift of CONST_BITS bits. 
We postpone shifting + * as long as possible so that partial sums can be added together with + * full fractional precision. + * + * The outputs of the first pass are scaled up by PASS1_BITS bits so that + * they are represented to better-than-integral precision. These outputs + * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word + * with the recommended scaling. (For 12-bit sample data, the intermediate + * array is int32_t anyway.) + * + * To avoid overflow of the 32-bit intermediate results in pass 2, we must + * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis + * shows that the values given below are the most effective. + */ + +#undef CONST_BITS +#undef PASS1_BITS +#undef OUT_SHIFT + +#if BITS_IN_JSAMPLE == 8 +#define CONST_BITS 13 +#define PASS1_BITS 4 /* set this to 2 if 16x16 multiplies are faster */ +#define OUT_SHIFT PASS1_BITS +#else +#define CONST_BITS 13 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#define OUT_SHIFT (PASS1_BITS + 1) +#endif + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) 
+ */ + +#if CONST_BITS == 13 +#define FIX_0_298631336 ((int32_t) 2446) /* FIX(0.298631336) */ +#define FIX_0_390180644 ((int32_t) 3196) /* FIX(0.390180644) */ +#define FIX_0_541196100 ((int32_t) 4433) /* FIX(0.541196100) */ +#define FIX_0_765366865 ((int32_t) 6270) /* FIX(0.765366865) */ +#define FIX_0_899976223 ((int32_t) 7373) /* FIX(0.899976223) */ +#define FIX_1_175875602 ((int32_t) 9633) /* FIX(1.175875602) */ +#define FIX_1_501321110 ((int32_t) 12299) /* FIX(1.501321110) */ +#define FIX_1_847759065 ((int32_t) 15137) /* FIX(1.847759065) */ +#define FIX_1_961570560 ((int32_t) 16069) /* FIX(1.961570560) */ +#define FIX_2_053119869 ((int32_t) 16819) /* FIX(2.053119869) */ +#define FIX_2_562915447 ((int32_t) 20995) /* FIX(2.562915447) */ +#define FIX_3_072711026 ((int32_t) 25172) /* FIX(3.072711026) */ +#else +#define FIX_0_298631336 FIX(0.298631336) +#define FIX_0_390180644 FIX(0.390180644) +#define FIX_0_541196100 FIX(0.541196100) +#define FIX_0_765366865 FIX(0.765366865) +#define FIX_0_899976223 FIX(0.899976223) +#define FIX_1_175875602 FIX(1.175875602) +#define FIX_1_501321110 FIX(1.501321110) +#define FIX_1_847759065 FIX(1.847759065) +#define FIX_1_961570560 FIX(1.961570560) +#define FIX_2_053119869 FIX(2.053119869) +#define FIX_2_562915447 FIX(2.562915447) +#define FIX_3_072711026 FIX(3.072711026) +#endif + + +/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result. + * For 8-bit samples with the recommended scaling, all the variable + * and constant values involved are no more than 16 bits wide, so a + * 16x16->32 bit multiply can be used instead of a full 32x32 multiply. + * For 12-bit samples, a full 32-bit multiplication will be needed. 
+ */ + +#if BITS_IN_JSAMPLE == 8 && CONST_BITS<=13 && PASS1_BITS<=2 +#define MULTIPLY(var,const) MULTIPLY16C16(var,const) +#else +#define MULTIPLY(var,const) ((var) * (const)) +#endif + + +static av_always_inline void FUNC(row_fdct)(int16_t *data) +{ + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int z1, z2, z3, z4, z5; + int16_t *dataptr; + int ctr; + + /* Pass 1: process rows. Each iteration transforms one row of DCTSIZE + * consecutive values in place; all eight inputs are read into + * temporaries before any output is stored. */ + /* Note results are scaled up by sqrt(8) compared to a true DCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[0] + dataptr[7]; + tmp7 = dataptr[0] - dataptr[7]; + tmp1 = dataptr[1] + dataptr[6]; + tmp6 = dataptr[1] - dataptr[6]; + tmp2 = dataptr[2] + dataptr[5]; + tmp5 = dataptr[2] - dataptr[5]; + tmp3 = dataptr[3] + dataptr[4]; + tmp4 = dataptr[3] - dataptr[4]; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[0] = (int16_t) ((tmp10 + tmp11) * (1 << PASS1_BITS)); + dataptr[4] = (int16_t) ((tmp10 - tmp11) * (1 << PASS1_BITS)); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[2] = (int16_t) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + CONST_BITS-PASS1_BITS); + dataptr[6] = (int16_t) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + CONST_BITS-PASS1_BITS); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * cK represents cos(K*pi/16). + * i0..i3 in the paper are tmp4..tmp7 here.
+ */ + + z1 = tmp4 + tmp7; + z2 = tmp5 + tmp6; + z3 = tmp4 + tmp6; + z4 = tmp5 + tmp7; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + dataptr[7] = (int16_t) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); + dataptr[5] = (int16_t) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); + dataptr[3] = (int16_t) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); + dataptr[1] = (int16_t) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } +} + +/* + * Perform the forward DCT on one block of samples. + * + * data points to DCTSIZE*DCTSIZE int16_t values that are overwritten with + * the result: FUNC(row_fdct) transforms the rows first, then the loop + * below transforms the columns. + */ + +GLOBAL(void) +FUNC(ff_jpeg_fdct_islow)(int16_t *data) +{ + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int z1, z2, z3, z4, z5; + int16_t *dataptr; + int ctr; + + FUNC(row_fdct)(data); + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8.
+ */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7]; + tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7]; + tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6]; + tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6]; + tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; + tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; + + /* Even part per LL&M figure 1 --- note that published figure is faulty; + * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". + */ + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + dataptr[DCTSIZE*0] = DESCALE(tmp10 + tmp11, OUT_SHIFT); + dataptr[DCTSIZE*4] = DESCALE(tmp10 - tmp11, OUT_SHIFT); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[DCTSIZE*2] = DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + CONST_BITS + OUT_SHIFT); + dataptr[DCTSIZE*6] = DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + CONST_BITS + OUT_SHIFT); + + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). + * cK represents cos(K*pi/16). + * i0..i3 in the paper are tmp4..tmp7 here.
+ */ + + z1 = tmp4 + tmp7; + z2 = tmp5 + tmp6; + z3 = tmp4 + tmp6; + z4 = tmp5 + tmp7; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + + z3 += z5; + z4 += z5; + + dataptr[DCTSIZE*7] = DESCALE(tmp4 + z1 + z3, CONST_BITS + OUT_SHIFT); + dataptr[DCTSIZE*5] = DESCALE(tmp5 + z2 + z4, CONST_BITS + OUT_SHIFT); + dataptr[DCTSIZE*3] = DESCALE(tmp6 + z2 + z3, CONST_BITS + OUT_SHIFT); + dataptr[DCTSIZE*1] = DESCALE(tmp7 + z1 + z4, CONST_BITS + OUT_SHIFT); + + dataptr++; /* advance pointer to next column */ + } +} + +/* + * The secret of DCT2-4-8 is really simple -- you do the usual 1-D DCT + * on the rows and then, instead of doing the even and odd parts on the + * columns, you do the even part two times: once on the sums and once on + * the differences of adjacent row pairs (0/1, 2/3, 4/5, 6/7). + */ +GLOBAL(void) +FUNC(ff_fdct248_islow)(int16_t *data) +{ + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int tmp10, tmp11, tmp12, tmp13; + int z1; + int16_t *dataptr; + int ctr; + + FUNC(row_fdct)(data); + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8.
+ */ + + dataptr = data; + for (ctr = DCTSIZE-1; ctr >= 0; ctr--) { + tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1]; + tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3]; + tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5]; + tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7]; + tmp4 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*1]; + tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3]; + tmp6 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5]; + tmp7 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7]; + + tmp10 = tmp0 + tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + tmp13 = tmp0 - tmp3; + + dataptr[DCTSIZE*0] = DESCALE(tmp10 + tmp11, OUT_SHIFT); + dataptr[DCTSIZE*4] = DESCALE(tmp10 - tmp11, OUT_SHIFT); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[DCTSIZE*2] = DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + CONST_BITS+OUT_SHIFT); + dataptr[DCTSIZE*6] = DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + CONST_BITS+OUT_SHIFT); + + tmp10 = tmp4 + tmp7; + tmp11 = tmp5 + tmp6; + tmp12 = tmp5 - tmp6; + tmp13 = tmp4 - tmp7; + + dataptr[DCTSIZE*1] = DESCALE(tmp10 + tmp11, OUT_SHIFT); + dataptr[DCTSIZE*5] = DESCALE(tmp10 - tmp11, OUT_SHIFT); + + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); + dataptr[DCTSIZE*3] = DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), + CONST_BITS + OUT_SHIFT); + dataptr[DCTSIZE*7] = DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), + CONST_BITS + OUT_SHIFT); + + dataptr++; /* advance pointer to next column */ + } +} diff --git a/media/ffvpx/libavcodec/jrevdct.c b/media/ffvpx/libavcodec/jrevdct.c new file mode 100644 index 0000000000..7f1863515f --- /dev/null +++ b/media/ffvpx/libavcodec/jrevdct.c @@ -0,0 +1,1172 @@ +/* + * This file is part of the Independent JPEG Group's software. + * + * The authors make NO WARRANTY or representation, either express or implied, + * with respect to this software, its quality, accuracy, merchantability, or + * fitness for a particular purpose.
This software is provided "AS IS", and + * you, its user, assume the entire risk as to its quality and accuracy. + * + * This software is copyright (C) 1991, 1992, Thomas G. Lane. + * All Rights Reserved except as specified below. + * + * Permission is hereby granted to use, copy, modify, and distribute this + * software (or portions thereof) for any purpose, without fee, subject to + * these conditions: + * (1) If any part of the source code for this software is distributed, then + * this README file must be included, with this copyright and no-warranty + * notice unaltered; and any additions, deletions, or changes to the original + * files must be clearly indicated in accompanying documentation. + * (2) If only executable code is distributed, then the accompanying + * documentation must state that "this software is based in part on the work + * of the Independent JPEG Group". + * (3) Permission for use of this software is granted only if the user accepts + * full responsibility for any undesirable consequences; the authors accept + * NO LIABILITY for damages of any kind. + * + * These conditions apply to any software derived from or based on the IJG + * code, not just to the unmodified library. If you use our work, you ought + * to acknowledge us. + * + * Permission is NOT granted for the use of any IJG author's name or company + * name in advertising or publicity relating to this software or products + * derived from it. This software may be referred to only as "the Independent + * JPEG Group's software". + * + * We specifically permit and encourage the use of this software as the basis + * of commercial products, provided that all warranty or liability claims are + * assumed by the product vendor. + * + * This file contains the basic inverse-DCT transformation subroutine. + * + * This implementation is based on an algorithm described in + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT + * Algorithms with 11 Multiplications", Proc. Int'l. 
Conf. on Acoustics, + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. + * The primary algorithm described there uses 11 multiplies and 29 adds. + * We use their alternate method with 12 multiplies and 32 adds. + * The advantage of this method is that no data path contains more than one + * multiplication; this allows a very simple and accurate implementation in + * scaled fixed-point arithmetic, with a minimal number of shifts. + * + * I've made lots of modifications to attempt to take advantage of the + * sparse nature of the DCT matrices we're getting. Although the logic + * is cumbersome, it's straightforward and the resulting code is much + * faster. + * + * A better way to do this would be to pass in the DCT block as a sparse + * matrix, perhaps with the difference cases encoded. + */ + +/** + * @file + * Independent JPEG Group's LLM idct. + */ + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/intreadwrite.h" + +#include "dct.h" +#include "idctdsp.h" + +#define EIGHT_BIT_SAMPLES + +#define DCTSIZE 8 +#define DCTSIZE2 64 + +#define GLOBAL + +#define RIGHT_SHIFT(x, n) ((x) >> (n)) + +typedef int16_t DCTBLOCK[DCTSIZE2]; + +#define CONST_BITS 13 + +/* + * This routine is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* + * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT + * on each column. Direct algorithms are also available, but they are + * much more complex and seem not to be any faster when reduced to code. + * + * The poop on this scaling stuff is as follows: + * + * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) + * larger than the true IDCT outputs. The final outputs are therefore + * a factor of N larger than desired; since N=8 this can be cured by + * a simple right shift at the end of the algorithm. 
The advantage of + * this arrangement is that we save two multiplications per 1-D IDCT, + * because the y0 and y4 inputs need not be divided by sqrt(N). + * + * We have to do addition and subtraction of the integer inputs, which + * is no problem, and multiplication by fractional constants, which is + * a problem to do in integer arithmetic. We multiply all the constants + * by CONST_SCALE and convert them to integer constants (thus retaining + * CONST_BITS bits of precision in the constants). After doing a + * multiplication we have to divide the product by CONST_SCALE, with proper + * rounding, to produce the correct output. This division can be done + * cheaply as a right shift of CONST_BITS bits. We postpone shifting + * as long as possible so that partial sums can be added together with + * full fractional precision. + * + * The outputs of the first pass are scaled up by PASS1_BITS bits so that + * they are represented to better-than-integral precision. These outputs + * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word + * with the recommended scaling. (To scale up 12-bit sample data further, an + * intermediate int32 array would be needed.) + * + * To avoid overflow of the 32-bit intermediate results in pass 2, we must + * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis + * shows that the values given below are the most effective. + */ + +#ifdef EIGHT_BIT_SAMPLES +#define PASS1_BITS 2 +#else +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#endif + +#define ONE ((int32_t) 1) + +#define CONST_SCALE (ONE << CONST_BITS) + +/* Convert a positive real constant to an integer scaled by CONST_SCALE. + * IMPORTANT: if your compiler doesn't do this arithmetic at compile time, + * you will pay a significant penalty in run time. In that case, figure + * the correct integer constant values and insert them by hand. 
+ */ + +/* Actually FIX is no longer used, we precomputed them all */ +#define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5)) + +/* Descale and correctly round an int32_t value that's scaled by N bits. + * We assume RIGHT_SHIFT rounds towards minus infinity, so adding + * the fudge factor is correct for either sign of X. + */ + +#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) + +/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result. + * For 8-bit samples with the recommended scaling, all the variable + * and constant values involved are no more than 16 bits wide, so a + * 16x16->32 bit multiply can be used instead of a full 32x32 multiply; + * this provides a useful speedup on many machines. + * There is no way to specify a 16x16->32 multiply in portable C, but + * some C compilers will do the right thing if you provide the correct + * combination of casts. + * NB: for 12-bit samples, a full 32-bit multiplication will be needed. + */ + +#ifdef EIGHT_BIT_SAMPLES +#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ +#define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const))) +#endif +#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ +#define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const))) +#endif +#endif + +#ifndef MULTIPLY /* default definition */ +#define MULTIPLY(var,const) ((var) * (const)) +#endif + + +/* + Unlike our decoder where we approximate the FIXes, we need to use exact +ones here or successive P-frames will drift too much with Reference frame coding +*/ +#define FIX_0_211164243 1730 +#define FIX_0_275899380 2260 +#define FIX_0_298631336 2446 +#define FIX_0_390180644 3196 +#define FIX_0_509795579 4176 +#define FIX_0_541196100 4433 +#define FIX_0_601344887 4926 +#define FIX_0_765366865 6270 +#define FIX_0_785694958 6436 +#define FIX_0_899976223 7373 +#define FIX_1_061594337 8697 +#define FIX_1_111140466 9102 +#define FIX_1_175875602 9633 +#define FIX_1_306562965 10703 
+#define FIX_1_387039845 11363 +#define FIX_1_451774981 11893 +#define FIX_1_501321110 12299 +#define FIX_1_662939225 13623 +#define FIX_1_847759065 15137 +#define FIX_1_961570560 16069 +#define FIX_2_053119869 16819 +#define FIX_2_172734803 17799 +#define FIX_2_562915447 20995 +#define FIX_3_072711026 25172 + +/* + * Perform the inverse DCT on one block of coefficients. + */ + +void ff_j_rev_dct(DCTBLOCK data) +{ + int32_t tmp0, tmp1, tmp2, tmp3; + int32_t tmp10, tmp11, tmp12, tmp13; + int32_t z1, z2, z3, z4, z5; + int32_t d0, d1, d2, d3, d4, d5, d6, d7; + register int16_t *dataptr; + int rowctr; + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + dataptr = data; + + for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any row in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * row DCT calculations can be simplified this way. + */ + + register uint8_t *idataptr = (uint8_t*)dataptr; + + /* WARNING: we do the same permutation as MMX idct to simplify the + video core */ + d0 = dataptr[0]; + d2 = dataptr[1]; + d4 = dataptr[2]; + d6 = dataptr[3]; + d1 = dataptr[4]; + d3 = dataptr[5]; + d5 = dataptr[6]; + d7 = dataptr[7]; + + if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) { + /* AC terms all zero */ + if (d0) { + /* Compute a 32 bit value to assign. 
*/ + int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS)); + register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16); + + AV_WN32A(&idataptr[ 0], v); + AV_WN32A(&idataptr[ 4], v); + AV_WN32A(&idataptr[ 8], v); + AV_WN32A(&idataptr[12], v); + } + + dataptr += DCTSIZE; /* advance pointer to next row */ + continue; + } + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ +{ + if (d6) { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ + z1 = MULTIPLY(d2 + d6, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); + tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); + + tmp0 = (d0 + d4) * CONST_SCALE; + tmp1 = (d0 - d4) * CONST_SCALE; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ + tmp2 = MULTIPLY(-d6, FIX_1_306562965); + tmp3 = MULTIPLY(d6, FIX_0_541196100); + + tmp0 = (d0 + d4) * CONST_SCALE; + tmp1 = (d0 - d4) * CONST_SCALE; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } + } else { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ + tmp2 = MULTIPLY(d2, FIX_0_541196100); + tmp3 = MULTIPLY(d2, FIX_1_306562965); + + tmp0 = (d0 + d4) * CONST_SCALE; + tmp1 = (d0 - d4) * CONST_SCALE; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ + tmp10 = tmp13 = (d0 + d4) * CONST_SCALE; + tmp11 = tmp12 = (d0 - d4) * CONST_SCALE; + } + } + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + + if (d7) { + if (d5) { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ + z1 = d7 + d1; + z2 = d5 + d3; + z3 = d7 + d3; + z4 = d5 + d1; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-z1, FIX_0_899976223); + z2 = MULTIPLY(-z2, FIX_2_562915447); + z3 = MULTIPLY(-z3, FIX_1_961570560); + z4 = MULTIPLY(-z4, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ + z2 = d5 + d3; + z3 = d7 + d3; + z5 = MULTIPLY(z3 + d5, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + z1 = MULTIPLY(-d7, FIX_0_899976223); + z2 = MULTIPLY(-z2, FIX_2_562915447); + z3 = MULTIPLY(-z3, FIX_1_961570560); + z4 = MULTIPLY(-d5, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 = z1 + z4; + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ + z1 = d7 + d1; + z4 = d5 + d1; + z5 = MULTIPLY(d7 + z4, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-z1, FIX_0_899976223); + z2 = MULTIPLY(-d5, FIX_2_562915447); + z3 = MULTIPLY(-d7, FIX_1_961570560); + z4 = MULTIPLY(-z4, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 = z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ + tmp0 = MULTIPLY(-d7, FIX_0_601344887); + z1 = MULTIPLY(-d7, FIX_0_899976223); + z3 = MULTIPLY(-d7, FIX_1_961570560); + tmp1 = MULTIPLY(-d5, FIX_0_509795579); + z2 = MULTIPLY(-d5, FIX_2_562915447); + z4 = MULTIPLY(-d5, FIX_0_390180644); + z5 = MULTIPLY(d5 + d7, FIX_1_175875602); + 
+ z3 += z5; + z4 += z5; + + tmp0 += z3; + tmp1 += z4; + tmp2 = z2 + z3; + tmp3 = z1 + z4; + } + } + } else { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ + z1 = d7 + d1; + z3 = d7 + d3; + z5 = MULTIPLY(z3 + d1, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-z1, FIX_0_899976223); + z2 = MULTIPLY(-d3, FIX_2_562915447); + z3 = MULTIPLY(-z3, FIX_1_961570560); + z4 = MULTIPLY(-d1, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 = z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ + z3 = d7 + d3; + + tmp0 = MULTIPLY(-d7, FIX_0_601344887); + z1 = MULTIPLY(-d7, FIX_0_899976223); + tmp2 = MULTIPLY(d3, FIX_0_509795579); + z2 = MULTIPLY(-d3, FIX_2_562915447); + z5 = MULTIPLY(z3, FIX_1_175875602); + z3 = MULTIPLY(-z3, FIX_0_785694958); + + tmp0 += z3; + tmp1 = z2 + z5; + tmp2 += z3; + tmp3 = z1 + z5; + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ + z1 = d7 + d1; + z5 = MULTIPLY(z1, FIX_1_175875602); + + z1 = MULTIPLY(z1, FIX_0_275899380); + z3 = MULTIPLY(-d7, FIX_1_961570560); + tmp0 = MULTIPLY(-d7, FIX_1_662939225); + z4 = MULTIPLY(-d1, FIX_0_390180644); + tmp3 = MULTIPLY(d1, FIX_1_111140466); + + tmp0 += z1; + tmp1 = z4 + z5; + tmp2 = z3 + z5; + tmp3 += z1; + } else { + /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ + tmp0 = MULTIPLY(-d7, FIX_1_387039845); + tmp1 = MULTIPLY(d7, FIX_1_175875602); + tmp2 = MULTIPLY(-d7, FIX_0_785694958); + tmp3 = MULTIPLY(d7, FIX_0_275899380); + } + } + } + } else { + if (d5) { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ + z2 = d5 + d3; + z4 = d5 + d1; + z5 = MULTIPLY(d3 + z4, FIX_1_175875602); + + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-d1, FIX_0_899976223); + z2 = MULTIPLY(-z2, FIX_2_562915447); + z3 = 
MULTIPLY(-d3, FIX_1_961570560); + z4 = MULTIPLY(-z4, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 = z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ + z2 = d5 + d3; + + z5 = MULTIPLY(z2, FIX_1_175875602); + tmp1 = MULTIPLY(d5, FIX_1_662939225); + z4 = MULTIPLY(-d5, FIX_0_390180644); + z2 = MULTIPLY(-z2, FIX_1_387039845); + tmp2 = MULTIPLY(d3, FIX_1_111140466); + z3 = MULTIPLY(-d3, FIX_1_961570560); + + tmp0 = z3 + z5; + tmp1 += z2; + tmp2 += z2; + tmp3 = z4 + z5; + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ + z4 = d5 + d1; + + z5 = MULTIPLY(z4, FIX_1_175875602); + z1 = MULTIPLY(-d1, FIX_0_899976223); + tmp3 = MULTIPLY(d1, FIX_0_601344887); + tmp1 = MULTIPLY(-d5, FIX_0_509795579); + z2 = MULTIPLY(-d5, FIX_2_562915447); + z4 = MULTIPLY(z4, FIX_0_785694958); + + tmp0 = z1 + z5; + tmp1 += z4; + tmp2 = z2 + z5; + tmp3 += z4; + } else { + /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ + tmp0 = MULTIPLY(d5, FIX_1_175875602); + tmp1 = MULTIPLY(d5, FIX_0_275899380); + tmp2 = MULTIPLY(-d5, FIX_1_387039845); + tmp3 = MULTIPLY(d5, FIX_0_785694958); + } + } + } else { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ + z5 = d1 + d3; + tmp3 = MULTIPLY(d1, FIX_0_211164243); + tmp2 = MULTIPLY(-d3, FIX_1_451774981); + z1 = MULTIPLY(d1, FIX_1_061594337); + z2 = MULTIPLY(-d3, FIX_2_172734803); + z4 = MULTIPLY(z5, FIX_0_785694958); + z5 = MULTIPLY(z5, FIX_1_175875602); + + tmp0 = z1 - z4; + tmp1 = z2 + z4; + tmp2 += z5; + tmp3 += z5; + } else { + /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ + tmp0 = MULTIPLY(-d3, FIX_0_785694958); + tmp1 = MULTIPLY(-d3, FIX_1_387039845); + tmp2 = MULTIPLY(-d3, FIX_0_275899380); + tmp3 = MULTIPLY(d3, FIX_1_175875602); + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ + tmp0 = MULTIPLY(d1, FIX_0_275899380); + tmp1 = MULTIPLY(d1, FIX_0_785694958); + tmp2 = MULTIPLY(d1, FIX_1_175875602); + tmp3 = MULTIPLY(d1, FIX_1_387039845); 
+ } else { + /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ + tmp0 = tmp1 = tmp2 = tmp3 = 0; + } + } + } + } +} + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); + dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); + dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); + dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); + dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); + dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); + dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); + dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + + dataptr += DCTSIZE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + dataptr = data; + for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { + /* Columns of zeroes can be exploited in the same way as we did with rows. + * However, the row calculation has created many nonzero AC terms, so the + * simplification applies less often (typically 5% to 10% of the time). + * On machines with very fast multiplication, it's possible that the + * test takes more time than it's worth. In that case this section + * may be commented out. + */ + + d0 = dataptr[DCTSIZE*0]; + d1 = dataptr[DCTSIZE*1]; + d2 = dataptr[DCTSIZE*2]; + d3 = dataptr[DCTSIZE*3]; + d4 = dataptr[DCTSIZE*4]; + d5 = dataptr[DCTSIZE*5]; + d6 = dataptr[DCTSIZE*6]; + d7 = dataptr[DCTSIZE*7]; + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). 
*/ + if (d6) { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ + z1 = MULTIPLY(d2 + d6, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); + tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); + + tmp0 = (d0 + d4) * CONST_SCALE; + tmp1 = (d0 - d4) * CONST_SCALE; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ + tmp2 = MULTIPLY(-d6, FIX_1_306562965); + tmp3 = MULTIPLY(d6, FIX_0_541196100); + + tmp0 = (d0 + d4) * CONST_SCALE; + tmp1 = (d0 - d4) * CONST_SCALE; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } + } else { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ + tmp2 = MULTIPLY(d2, FIX_0_541196100); + tmp3 = MULTIPLY(d2, FIX_1_306562965); + + tmp0 = (d0 + d4) * CONST_SCALE; + tmp1 = (d0 - d4) * CONST_SCALE; + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ + tmp10 = tmp13 = (d0 + d4) * CONST_SCALE; + tmp11 = tmp12 = (d0 - d4) * CONST_SCALE; + } + } + + /* Odd part per figure 8; the matrix is unitary and hence its + * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
+ */ + if (d7) { + if (d5) { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ + z1 = d7 + d1; + z2 = d5 + d3; + z3 = d7 + d3; + z4 = d5 + d1; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-z1, FIX_0_899976223); + z2 = MULTIPLY(-z2, FIX_2_562915447); + z3 = MULTIPLY(-z3, FIX_1_961570560); + z4 = MULTIPLY(-z4, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ + z2 = d5 + d3; + z3 = d7 + d3; + z5 = MULTIPLY(z3 + d5, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + z1 = MULTIPLY(-d7, FIX_0_899976223); + z2 = MULTIPLY(-z2, FIX_2_562915447); + z3 = MULTIPLY(-z3, FIX_1_961570560); + z4 = MULTIPLY(-d5, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 = z1 + z4; + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ + z1 = d7 + d1; + z3 = d7; + z4 = d5 + d1; + z5 = MULTIPLY(z3 + z4, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-z1, FIX_0_899976223); + z2 = MULTIPLY(-d5, FIX_2_562915447); + z3 = MULTIPLY(-d7, FIX_1_961570560); + z4 = MULTIPLY(-z4, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 += z2 + z4; + tmp2 = z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ + tmp0 = MULTIPLY(-d7, FIX_0_601344887); + z1 = MULTIPLY(-d7, FIX_0_899976223); + z3 = MULTIPLY(-d7, FIX_1_961570560); + tmp1 = MULTIPLY(-d5, FIX_0_509795579); + z2 = MULTIPLY(-d5, FIX_2_562915447); + z4 = MULTIPLY(-d5, FIX_0_390180644); + z5 = MULTIPLY(d5 + d7, 
FIX_1_175875602); + + z3 += z5; + z4 += z5; + + tmp0 += z3; + tmp1 += z4; + tmp2 = z2 + z3; + tmp3 = z1 + z4; + } + } + } else { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ + z1 = d7 + d1; + z3 = d7 + d3; + z5 = MULTIPLY(z3 + d1, FIX_1_175875602); + + tmp0 = MULTIPLY(d7, FIX_0_298631336); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-z1, FIX_0_899976223); + z2 = MULTIPLY(-d3, FIX_2_562915447); + z3 = MULTIPLY(-z3, FIX_1_961570560); + z4 = MULTIPLY(-d1, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 += z1 + z3; + tmp1 = z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ + z3 = d7 + d3; + + tmp0 = MULTIPLY(-d7, FIX_0_601344887); + z1 = MULTIPLY(-d7, FIX_0_899976223); + tmp2 = MULTIPLY(d3, FIX_0_509795579); + z2 = MULTIPLY(-d3, FIX_2_562915447); + z5 = MULTIPLY(z3, FIX_1_175875602); + z3 = MULTIPLY(-z3, FIX_0_785694958); + + tmp0 += z3; + tmp1 = z2 + z5; + tmp2 += z3; + tmp3 = z1 + z5; + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ + z1 = d7 + d1; + z5 = MULTIPLY(z1, FIX_1_175875602); + + z1 = MULTIPLY(z1, FIX_0_275899380); + z3 = MULTIPLY(-d7, FIX_1_961570560); + tmp0 = MULTIPLY(-d7, FIX_1_662939225); + z4 = MULTIPLY(-d1, FIX_0_390180644); + tmp3 = MULTIPLY(d1, FIX_1_111140466); + + tmp0 += z1; + tmp1 = z4 + z5; + tmp2 = z3 + z5; + tmp3 += z1; + } else { + /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ + tmp0 = MULTIPLY(-d7, FIX_1_387039845); + tmp1 = MULTIPLY(d7, FIX_1_175875602); + tmp2 = MULTIPLY(-d7, FIX_0_785694958); + tmp3 = MULTIPLY(d7, FIX_0_275899380); + } + } + } + } else { + if (d5) { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ + z2 = d5 + d3; + z4 = d5 + d1; + z5 = MULTIPLY(d3 + z4, FIX_1_175875602); + + tmp1 = MULTIPLY(d5, FIX_2_053119869); + tmp2 = MULTIPLY(d3, FIX_3_072711026); + tmp3 = MULTIPLY(d1, FIX_1_501321110); + z1 = MULTIPLY(-d1, FIX_0_899976223); + z2 = MULTIPLY(-z2, 
FIX_2_562915447); + z3 = MULTIPLY(-d3, FIX_1_961570560); + z4 = MULTIPLY(-z4, FIX_0_390180644); + + z3 += z5; + z4 += z5; + + tmp0 = z1 + z3; + tmp1 += z2 + z4; + tmp2 += z2 + z3; + tmp3 += z1 + z4; + } else { + /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ + z2 = d5 + d3; + + z5 = MULTIPLY(z2, FIX_1_175875602); + tmp1 = MULTIPLY(d5, FIX_1_662939225); + z4 = MULTIPLY(-d5, FIX_0_390180644); + z2 = MULTIPLY(-z2, FIX_1_387039845); + tmp2 = MULTIPLY(d3, FIX_1_111140466); + z3 = MULTIPLY(-d3, FIX_1_961570560); + + tmp0 = z3 + z5; + tmp1 += z2; + tmp2 += z2; + tmp3 = z4 + z5; + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ + z4 = d5 + d1; + + z5 = MULTIPLY(z4, FIX_1_175875602); + z1 = MULTIPLY(-d1, FIX_0_899976223); + tmp3 = MULTIPLY(d1, FIX_0_601344887); + tmp1 = MULTIPLY(-d5, FIX_0_509795579); + z2 = MULTIPLY(-d5, FIX_2_562915447); + z4 = MULTIPLY(z4, FIX_0_785694958); + + tmp0 = z1 + z5; + tmp1 += z4; + tmp2 = z2 + z5; + tmp3 += z4; + } else { + /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ + tmp0 = MULTIPLY(d5, FIX_1_175875602); + tmp1 = MULTIPLY(d5, FIX_0_275899380); + tmp2 = MULTIPLY(-d5, FIX_1_387039845); + tmp3 = MULTIPLY(d5, FIX_0_785694958); + } + } + } else { + if (d3) { + if (d1) { + /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ + z5 = d1 + d3; + tmp3 = MULTIPLY(d1, FIX_0_211164243); + tmp2 = MULTIPLY(-d3, FIX_1_451774981); + z1 = MULTIPLY(d1, FIX_1_061594337); + z2 = MULTIPLY(-d3, FIX_2_172734803); + z4 = MULTIPLY(z5, FIX_0_785694958); + z5 = MULTIPLY(z5, FIX_1_175875602); + + tmp0 = z1 - z4; + tmp1 = z2 + z4; + tmp2 += z5; + tmp3 += z5; + } else { + /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ + tmp0 = MULTIPLY(-d3, FIX_0_785694958); + tmp1 = MULTIPLY(-d3, FIX_1_387039845); + tmp2 = MULTIPLY(-d3, FIX_0_275899380); + tmp3 = MULTIPLY(d3, FIX_1_175875602); + } + } else { + if (d1) { + /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ + tmp0 = MULTIPLY(d1, FIX_0_275899380); + tmp1 = MULTIPLY(d1, FIX_0_785694958); + tmp2 = MULTIPLY(d1, FIX_1_175875602); + tmp3 = 
MULTIPLY(d1, FIX_1_387039845); + } else { + /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ + tmp0 = tmp1 = tmp2 = tmp3 = 0; + } + } + } + } + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3, + CONST_BITS+PASS1_BITS+3); + dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3, + CONST_BITS+PASS1_BITS+3); + dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2, + CONST_BITS+PASS1_BITS+3); + dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2, + CONST_BITS+PASS1_BITS+3); + dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1, + CONST_BITS+PASS1_BITS+3); + dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1, + CONST_BITS+PASS1_BITS+3); + dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0, + CONST_BITS+PASS1_BITS+3); + dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0, + CONST_BITS+PASS1_BITS+3); + + dataptr++; /* advance pointer to next column */ + } +} + +#undef DCTSIZE +#define DCTSIZE 4 +#define DCTSTRIDE 8 + +void ff_j_rev_dct4(DCTBLOCK data) +{ + int32_t tmp0, tmp1, tmp2, tmp3; + int32_t tmp10, tmp11, tmp12, tmp13; + int32_t z1; + int32_t d0, d2, d4, d6; + register int16_t *dataptr; + int rowctr; + + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + + data[0] += 4; + + dataptr = data; + + for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { + /* Due to quantization, we will usually find that many of the input + * coefficients are zero, especially the AC terms. We can exploit this + * by short-circuiting the IDCT calculation for any row in which all + * the AC terms are zero. In that case each output is equal to the + * DC coefficient (with scale factor as needed). + * With typical images and quantization tables, half or more of the + * row DCT calculations can be simplified this way. 
+ */ + + register uint8_t *idataptr = (uint8_t*)dataptr; + + d0 = dataptr[0]; + d2 = dataptr[1]; + d4 = dataptr[2]; + d6 = dataptr[3]; + + if ((d2 | d4 | d6) == 0) { + /* AC terms all zero */ + if (d0) { + /* Compute a 32 bit value to assign. */ + int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS)); + register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16); + + AV_WN32A(&idataptr[0], v); + AV_WN32A(&idataptr[4], v); + } + + dataptr += DCTSTRIDE; /* advance pointer to next row */ + continue; + } + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). */ + if (d6) { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ + z1 = MULTIPLY(d2 + d6, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); + tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); + + tmp0 = (d0 + d4) * (1 << CONST_BITS); + tmp1 = (d0 - d4) * (1 << CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ + tmp2 = MULTIPLY(-d6, FIX_1_306562965); + tmp3 = MULTIPLY(d6, FIX_0_541196100); + + tmp0 = (d0 + d4) * (1 << CONST_BITS); + tmp1 = (d0 - d4) * (1 << CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } + } else { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ + tmp2 = MULTIPLY(d2, FIX_0_541196100); + tmp3 = MULTIPLY(d2, FIX_1_306562965); + + tmp0 = (d0 + d4) * (1 << CONST_BITS); + tmp1 = (d0 - d4) * (1 << CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ + tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS); + tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS); + } + } + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS); + dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS); + dataptr[2] = 
(int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS); + dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS); + + dataptr += DCTSTRIDE; /* advance pointer to next row */ + } + + /* Pass 2: process columns. */ + /* Note that we must descale the results by a factor of 8 == 2**3, */ + /* and also undo the PASS1_BITS scaling. */ + + dataptr = data; + for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { + /* Columns of zeroes can be exploited in the same way as we did with rows. + * However, the row calculation has created many nonzero AC terms, so the + * simplification applies less often (typically 5% to 10% of the time). + * On machines with very fast multiplication, it's possible that the + * test takes more time than it's worth. In that case this section + * may be commented out. + */ + + d0 = dataptr[DCTSTRIDE*0]; + d2 = dataptr[DCTSTRIDE*1]; + d4 = dataptr[DCTSTRIDE*2]; + d6 = dataptr[DCTSTRIDE*3]; + + /* Even part: reverse the even part of the forward DCT. */ + /* The rotator is sqrt(2)*c(-6). 
*/ + if (d6) { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ + z1 = MULTIPLY(d2 + d6, FIX_0_541196100); + tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); + tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); + + tmp0 = (d0 + d4) * (1 << CONST_BITS); + tmp1 = (d0 - d4) * (1 << CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ + tmp2 = MULTIPLY(-d6, FIX_1_306562965); + tmp3 = MULTIPLY(d6, FIX_0_541196100); + + tmp0 = (d0 + d4) * (1 << CONST_BITS); + tmp1 = (d0 - d4) * (1 << CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } + } else { + if (d2) { + /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ + tmp2 = MULTIPLY(d2, FIX_0_541196100); + tmp3 = MULTIPLY(d2, FIX_1_306562965); + + tmp0 = (d0 + d4) * (1 << CONST_BITS); + tmp1 = (d0 - d4) * (1 << CONST_BITS); + + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + } else { + /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ + tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS); + tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS); + } + } + + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ + + dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3); + dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3); + dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3); + dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3); + + dataptr++; /* advance pointer to next column */ + } +} + +void ff_j_rev_dct2(DCTBLOCK data){ + int d00, d01, d10, d11; + + data[0] += 4; + d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE]; + d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE]; + d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE]; + d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE]; + + data[0+0*DCTSTRIDE]= (d00 + d10)>>3; + data[1+0*DCTSTRIDE]= (d01 + d11)>>3; + data[0+1*DCTSTRIDE]= (d00 - d10)>>3; + data[1+1*DCTSTRIDE]= (d01 - d11)>>3; +} + 
+void ff_j_rev_dct1(DCTBLOCK data){ + data[0] = (data[0] + 4)>>3; +} + +#undef FIX +#undef CONST_BITS + +void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + ff_j_rev_dct(block); + ff_put_pixels_clamped_c(block, dest, line_size); +} + +void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + ff_j_rev_dct(block); + ff_add_pixels_clamped_c(block, dest, line_size); +} diff --git a/media/ffvpx/libavcodec/libdav1d.c b/media/ffvpx/libavcodec/libdav1d.c new file mode 100644 index 0000000000..2488a709c7 --- /dev/null +++ b/media/ffvpx/libavcodec/libdav1d.c @@ -0,0 +1,642 @@ +/* + * Copyright (c) 2018 Ronald S. Bultje <rsbultje gmail com> + * Copyright (c) 2018 James Almer <jamrial gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <dav1d/dav1d.h> + +#include "libavutil/avassert.h" +#include "libavutil/cpu.h" +#include "libavutil/film_grain_params.h" +#include "libavutil/mastering_display_metadata.h" +#include "libavutil/imgutils.h" +#include "libavutil/opt.h" + +#include "atsc_a53.h" +#include "avcodec.h" +#include "bytestream.h" +#include "codec_internal.h" +#include "decode.h" +#include "internal.h" + +#define FF_DAV1D_VERSION_AT_LEAST(x,y) \ + (DAV1D_API_VERSION_MAJOR > (x) || DAV1D_API_VERSION_MAJOR == (x) && DAV1D_API_VERSION_MINOR >= (y)) + +typedef struct Libdav1dContext { + AVClass *class; + Dav1dContext *c; + AVBufferPool *pool; + int pool_size; + + Dav1dData data; + int tile_threads; + int frame_threads; + int max_frame_delay; + int apply_grain; + int operating_point; + int all_layers; +} Libdav1dContext; + +static const enum AVPixelFormat pix_fmt[][3] = { + [DAV1D_PIXEL_LAYOUT_I400] = { AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12 }, + [DAV1D_PIXEL_LAYOUT_I420] = { AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12 }, + [DAV1D_PIXEL_LAYOUT_I422] = { AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12 }, + [DAV1D_PIXEL_LAYOUT_I444] = { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12 }, +}; + +static const enum AVPixelFormat pix_fmt_rgb[3] = { + AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, +}; + +static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl) +{ + AVCodecContext *c = opaque; + + av_vlog(c, AV_LOG_ERROR, fmt, vl); +} + +static int libdav1d_picture_allocator(Dav1dPicture *p, void *cookie) +{ + Libdav1dContext *dav1d = cookie; + enum AVPixelFormat format = pix_fmt[p->p.layout][p->seq_hdr->hbd]; + int ret, linesize[4], h = FFALIGN(p->p.h, 128), w = FFALIGN(p->p.w, 
128); + uint8_t *aligned_ptr, *data[4]; + AVBufferRef *buf; + + ret = av_image_get_buffer_size(format, w, h, DAV1D_PICTURE_ALIGNMENT); + if (ret < 0) + return ret; + + if (ret != dav1d->pool_size) { + av_buffer_pool_uninit(&dav1d->pool); + // Use twice the amount of required padding bytes for aligned_ptr below. + dav1d->pool = av_buffer_pool_init(ret + DAV1D_PICTURE_ALIGNMENT * 2, NULL); + if (!dav1d->pool) { + dav1d->pool_size = 0; + return AVERROR(ENOMEM); + } + dav1d->pool_size = ret; + } + buf = av_buffer_pool_get(dav1d->pool); + if (!buf) + return AVERROR(ENOMEM); + + // libdav1d requires DAV1D_PICTURE_ALIGNMENT aligned buffers, which av_malloc() + // doesn't guarantee for example when AVX is disabled at configure time. + // Use the extra DAV1D_PICTURE_ALIGNMENT padding bytes in the buffer to align it + // if required. + aligned_ptr = (uint8_t *)FFALIGN((uintptr_t)buf->data, DAV1D_PICTURE_ALIGNMENT); + ret = av_image_fill_arrays(data, linesize, aligned_ptr, format, w, h, + DAV1D_PICTURE_ALIGNMENT); + if (ret < 0) { + av_buffer_unref(&buf); + return ret; + } + + p->data[0] = data[0]; + p->data[1] = data[1]; + p->data[2] = data[2]; + p->stride[0] = linesize[0]; + p->stride[1] = linesize[1]; + p->allocator_data = buf; + + return 0; +} + +static void libdav1d_picture_release(Dav1dPicture *p, void *cookie) +{ + AVBufferRef *buf = p->allocator_data; + + av_buffer_unref(&buf); +} + +static void libdav1d_init_params(AVCodecContext *c, const Dav1dSequenceHeader *seq) +{ + c->profile = seq->profile; + c->level = ((seq->operating_points[0].major_level - 2) << 2) + | seq->operating_points[0].minor_level; + + switch (seq->chr) { + case DAV1D_CHR_VERTICAL: + c->chroma_sample_location = AVCHROMA_LOC_LEFT; + break; + case DAV1D_CHR_COLOCATED: + c->chroma_sample_location = AVCHROMA_LOC_TOPLEFT; + break; + } + c->colorspace = (enum AVColorSpace) seq->mtrx; + c->color_primaries = (enum AVColorPrimaries) seq->pri; + c->color_trc = (enum AVColorTransferCharacteristic) seq->trc; + 
c->color_range = seq->color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; + + if (seq->layout == DAV1D_PIXEL_LAYOUT_I444 && + seq->mtrx == DAV1D_MC_IDENTITY && + seq->pri == DAV1D_COLOR_PRI_BT709 && + seq->trc == DAV1D_TRC_SRGB) + c->pix_fmt = pix_fmt_rgb[seq->hbd]; + else + c->pix_fmt = pix_fmt[seq->layout][seq->hbd]; + + if (seq->num_units_in_tick && seq->time_scale) { + av_reduce(&c->framerate.den, &c->framerate.num, + seq->num_units_in_tick, seq->time_scale, INT_MAX); + if (seq->equal_picture_interval) + c->ticks_per_frame = seq->num_ticks_per_picture; + } + + if (seq->film_grain_present) + c->properties |= FF_CODEC_PROPERTY_FILM_GRAIN; + else + c->properties &= ~FF_CODEC_PROPERTY_FILM_GRAIN; +} + +static av_cold int libdav1d_parse_extradata(AVCodecContext *c) +{ + Dav1dSequenceHeader seq; + size_t offset = 0; + int res; + + if (!c->extradata || c->extradata_size <= 0) + return 0; + + if (c->extradata[0] & 0x80) { + int version = c->extradata[0] & 0x7F; + + if (version != 1 || c->extradata_size < 4) { + int explode = !!(c->err_recognition & AV_EF_EXPLODE); + av_log(c, explode ? AV_LOG_ERROR : AV_LOG_WARNING, + "Error decoding extradata\n"); + return explode ? AVERROR_INVALIDDATA : 0; + } + + // Do nothing if there are no configOBUs to parse + if (c->extradata_size == 4) + return 0; + + offset = 4; + } + + res = dav1d_parse_sequence_header(&seq, c->extradata + offset, + c->extradata_size - offset); + if (res < 0) + return 0; // Assume no seqhdr OBUs are present + + libdav1d_init_params(c, &seq); + res = ff_set_dimensions(c, seq.max_width, seq.max_height); + if (res < 0) + return res; + + return 0; +} + +static av_cold int libdav1d_init(AVCodecContext *c) +{ + Libdav1dContext *dav1d = c->priv_data; + Dav1dSettings s; +#if FF_DAV1D_VERSION_AT_LEAST(6,0) + int threads = c->thread_count; +#else + int threads = (c->thread_count ? 
c->thread_count : av_cpu_count()) * 3 / 2; +#endif + int res; + + av_log(c, AV_LOG_INFO, "libdav1d %s\n", dav1d_version()); + + dav1d_default_settings(&s); + s.logger.cookie = c; + s.logger.callback = libdav1d_log_callback; + s.allocator.cookie = dav1d; + s.allocator.alloc_picture_callback = libdav1d_picture_allocator; + s.allocator.release_picture_callback = libdav1d_picture_release; + s.frame_size_limit = c->max_pixels; + if (dav1d->apply_grain >= 0) + s.apply_grain = dav1d->apply_grain; + else + s.apply_grain = !(c->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN); + + s.all_layers = dav1d->all_layers; + if (dav1d->operating_point >= 0) + s.operating_point = dav1d->operating_point; +#if FF_DAV1D_VERSION_AT_LEAST(6,2) + s.strict_std_compliance = c->strict_std_compliance > 0; +#endif + +#if FF_DAV1D_VERSION_AT_LEAST(6,0) + if (dav1d->frame_threads || dav1d->tile_threads) + s.n_threads = FFMAX(dav1d->frame_threads, dav1d->tile_threads); + else + s.n_threads = FFMIN(threads, DAV1D_MAX_THREADS); + if (dav1d->max_frame_delay > 0 && (c->flags & AV_CODEC_FLAG_LOW_DELAY)) + av_log(c, AV_LOG_WARNING, "Low delay mode requested, forcing max_frame_delay 1\n"); + s.max_frame_delay = (c->flags & AV_CODEC_FLAG_LOW_DELAY) ? 1 : dav1d->max_frame_delay; + av_log(c, AV_LOG_DEBUG, "Using %d threads, %d max_frame_delay\n", + s.n_threads, s.max_frame_delay); +#else + s.n_tile_threads = dav1d->tile_threads + ? dav1d->tile_threads + : FFMIN(floor(sqrt(threads)), DAV1D_MAX_TILE_THREADS); + s.n_frame_threads = dav1d->frame_threads + ? 
dav1d->frame_threads + : FFMIN(ceil(threads / s.n_tile_threads), DAV1D_MAX_FRAME_THREADS); + if (dav1d->max_frame_delay > 0) + s.n_frame_threads = FFMIN(s.n_frame_threads, dav1d->max_frame_delay); + av_log(c, AV_LOG_DEBUG, "Using %d frame threads, %d tile threads\n", + s.n_frame_threads, s.n_tile_threads); +#endif + +#if FF_DAV1D_VERSION_AT_LEAST(6,8) + if (c->skip_frame >= AVDISCARD_NONKEY) + s.decode_frame_type = DAV1D_DECODEFRAMETYPE_KEY; + else if (c->skip_frame >= AVDISCARD_NONINTRA) + s.decode_frame_type = DAV1D_DECODEFRAMETYPE_INTRA; + else if (c->skip_frame >= AVDISCARD_NONREF) + s.decode_frame_type = DAV1D_DECODEFRAMETYPE_REFERENCE; +#endif + + res = libdav1d_parse_extradata(c); + if (res < 0) + return res; + + res = dav1d_open(&dav1d->c, &s); + if (res < 0) + return AVERROR(ENOMEM); + + return 0; +} + +static void libdav1d_flush(AVCodecContext *c) +{ + Libdav1dContext *dav1d = c->priv_data; + + dav1d_data_unref(&dav1d->data); + dav1d_flush(dav1d->c); +} + +typedef struct OpaqueData { + void *pkt_orig_opaque; +#if FF_API_REORDERED_OPAQUE + int64_t reordered_opaque; +#endif +} OpaqueData; + +static void libdav1d_data_free(const uint8_t *data, void *opaque) { + AVBufferRef *buf = opaque; + + av_buffer_unref(&buf); +} + +static void libdav1d_user_data_free(const uint8_t *data, void *opaque) { + AVPacket *pkt = opaque; + av_assert0(data == opaque); + av_free(pkt->opaque); + av_packet_free(&pkt); +} + +static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame) +{ + Libdav1dContext *dav1d = c->priv_data; + Dav1dData *data = &dav1d->data; + Dav1dPicture pic = { 0 }, *p = &pic; + AVPacket *pkt; + OpaqueData *od = NULL; +#if FF_DAV1D_VERSION_AT_LEAST(5,1) + enum Dav1dEventFlags event_flags = 0; +#endif + int res; + + if (!data->sz) { + pkt = av_packet_alloc(); + + if (!pkt) + return AVERROR(ENOMEM); + + res = ff_decode_get_packet(c, pkt); + if (res < 0 && res != AVERROR_EOF) { + av_packet_free(&pkt); + return res; + } + + if (pkt->size) { + res = 
dav1d_data_wrap(data, pkt->data, pkt->size, + libdav1d_data_free, pkt->buf); + if (res < 0) { + av_packet_free(&pkt); + return res; + } + + pkt->buf = NULL; + +FF_DISABLE_DEPRECATION_WARNINGS + if ( +#if FF_API_REORDERED_OPAQUE + c->reordered_opaque != AV_NOPTS_VALUE || +#endif + (pkt->opaque && (c->flags & AV_CODEC_FLAG_COPY_OPAQUE))) { + od = av_mallocz(sizeof(*od)); + if (!od) { + av_packet_free(&pkt); + dav1d_data_unref(data); + return AVERROR(ENOMEM); + } + od->pkt_orig_opaque = pkt->opaque; +#if FF_API_REORDERED_OPAQUE + od->reordered_opaque = c->reordered_opaque; +#endif +FF_ENABLE_DEPRECATION_WARNINGS + } + pkt->opaque = od; + + res = dav1d_data_wrap_user_data(data, (const uint8_t *)pkt, + libdav1d_user_data_free, pkt); + if (res < 0) { + av_free(pkt->opaque); + av_packet_free(&pkt); + dav1d_data_unref(data); + return res; + } + pkt = NULL; + } else { + av_packet_free(&pkt); + if (res >= 0) + return AVERROR(EAGAIN); + } + } + + res = dav1d_send_data(dav1d->c, data); + if (res < 0) { + if (res == AVERROR(EINVAL)) + res = AVERROR_INVALIDDATA; + if (res != AVERROR(EAGAIN)) { + dav1d_data_unref(data); + return res; + } + } + + res = dav1d_get_picture(dav1d->c, p); + if (res < 0) { + if (res == AVERROR(EINVAL)) + res = AVERROR_INVALIDDATA; + else if (res == AVERROR(EAGAIN) && c->internal->draining) + res = AVERROR_EOF; + + return res; + } + + av_assert0(p->data[0] && p->allocator_data); + + // This requires the custom allocator above + frame->buf[0] = av_buffer_ref(p->allocator_data); + if (!frame->buf[0]) { + dav1d_picture_unref(p); + return AVERROR(ENOMEM); + } + + frame->data[0] = p->data[0]; + frame->data[1] = p->data[1]; + frame->data[2] = p->data[2]; + frame->linesize[0] = p->stride[0]; + frame->linesize[1] = p->stride[1]; + frame->linesize[2] = p->stride[1]; + +#if FF_DAV1D_VERSION_AT_LEAST(5,1) + dav1d_get_event_flags(dav1d->c, &event_flags); + if (c->pix_fmt == AV_PIX_FMT_NONE || + event_flags & DAV1D_EVENT_FLAG_NEW_SEQUENCE) +#endif + 
libdav1d_init_params(c, p->seq_hdr); + res = ff_decode_frame_props(c, frame); + if (res < 0) + goto fail; + + frame->width = p->p.w; + frame->height = p->p.h; + if (c->width != p->p.w || c->height != p->p.h) { + res = ff_set_dimensions(c, p->p.w, p->p.h); + if (res < 0) + goto fail; + } + + av_reduce(&frame->sample_aspect_ratio.num, + &frame->sample_aspect_ratio.den, + frame->height * (int64_t)p->frame_hdr->render_width, + frame->width * (int64_t)p->frame_hdr->render_height, + INT_MAX); + ff_set_sar(c, frame->sample_aspect_ratio); + + pkt = (AVPacket *)p->m.user_data.data; + od = pkt->opaque; +#if FF_API_REORDERED_OPAQUE +FF_DISABLE_DEPRECATION_WARNINGS + if (od && od->reordered_opaque != AV_NOPTS_VALUE) + frame->reordered_opaque = od->reordered_opaque; + else + frame->reordered_opaque = AV_NOPTS_VALUE; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + // restore the original user opaque value for + // ff_decode_frame_props_from_pkt() + pkt->opaque = od ? od->pkt_orig_opaque : NULL; + av_freep(&od); + + // match timestamps and packet size + res = ff_decode_frame_props_from_pkt(c, frame, pkt); + pkt->opaque = NULL; + if (res < 0) + goto fail; + + frame->pkt_dts = pkt->pts; + frame->key_frame = p->frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY; + + switch (p->frame_hdr->frame_type) { + case DAV1D_FRAME_TYPE_KEY: + case DAV1D_FRAME_TYPE_INTRA: + frame->pict_type = AV_PICTURE_TYPE_I; + break; + case DAV1D_FRAME_TYPE_INTER: + frame->pict_type = AV_PICTURE_TYPE_P; + break; + case DAV1D_FRAME_TYPE_SWITCH: + frame->pict_type = AV_PICTURE_TYPE_SP; + break; + default: + res = AVERROR_INVALIDDATA; + goto fail; + } + + if (p->mastering_display) { + AVMasteringDisplayMetadata *mastering = av_mastering_display_metadata_create_side_data(frame); + if (!mastering) { + res = AVERROR(ENOMEM); + goto fail; + } + + for (int i = 0; i < 3; i++) { + mastering->display_primaries[i][0] = av_make_q(p->mastering_display->primaries[i][0], 1 << 16); + mastering->display_primaries[i][1] = 
av_make_q(p->mastering_display->primaries[i][1], 1 << 16); + } + mastering->white_point[0] = av_make_q(p->mastering_display->white_point[0], 1 << 16); + mastering->white_point[1] = av_make_q(p->mastering_display->white_point[1], 1 << 16); + + mastering->max_luminance = av_make_q(p->mastering_display->max_luminance, 1 << 8); + mastering->min_luminance = av_make_q(p->mastering_display->min_luminance, 1 << 14); + + mastering->has_primaries = 1; + mastering->has_luminance = 1; + } + if (p->content_light) { + AVContentLightMetadata *light = av_content_light_metadata_create_side_data(frame); + if (!light) { + res = AVERROR(ENOMEM); + goto fail; + } + light->MaxCLL = p->content_light->max_content_light_level; + light->MaxFALL = p->content_light->max_frame_average_light_level; + } + if (p->itut_t35) { + GetByteContext gb; + unsigned int user_identifier; + + bytestream2_init(&gb, p->itut_t35->payload, p->itut_t35->payload_size); + bytestream2_skip(&gb, 1); // terminal provider code + bytestream2_skip(&gb, 1); // terminal provider oriented code + user_identifier = bytestream2_get_be32(&gb); + switch (user_identifier) { + case MKBETAG('G', 'A', '9', '4'): { // closed captions + AVBufferRef *buf = NULL; + + res = ff_parse_a53_cc(&buf, gb.buffer, bytestream2_get_bytes_left(&gb)); + if (res < 0) + goto fail; + if (!res) + break; + + if (!av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_A53_CC, buf)) + av_buffer_unref(&buf); + + c->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; + break; + } + default: // ignore unsupported identifiers + break; + } + } + if (p->frame_hdr->film_grain.present && (!dav1d->apply_grain || + (c->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN))) { + AVFilmGrainParams *fgp = av_film_grain_params_create_side_data(frame); + if (!fgp) { + res = AVERROR(ENOMEM); + goto fail; + } + + fgp->type = AV_FILM_GRAIN_PARAMS_AV1; + fgp->seed = p->frame_hdr->film_grain.data.seed; + fgp->codec.aom.num_y_points = p->frame_hdr->film_grain.data.num_y_points; + 
fgp->codec.aom.chroma_scaling_from_luma = p->frame_hdr->film_grain.data.chroma_scaling_from_luma; + fgp->codec.aom.scaling_shift = p->frame_hdr->film_grain.data.scaling_shift; + fgp->codec.aom.ar_coeff_lag = p->frame_hdr->film_grain.data.ar_coeff_lag; + fgp->codec.aom.ar_coeff_shift = p->frame_hdr->film_grain.data.ar_coeff_shift; + fgp->codec.aom.grain_scale_shift = p->frame_hdr->film_grain.data.grain_scale_shift; + fgp->codec.aom.overlap_flag = p->frame_hdr->film_grain.data.overlap_flag; + fgp->codec.aom.limit_output_range = p->frame_hdr->film_grain.data.clip_to_restricted_range; + + memcpy(&fgp->codec.aom.y_points, &p->frame_hdr->film_grain.data.y_points, + sizeof(fgp->codec.aom.y_points)); + memcpy(&fgp->codec.aom.num_uv_points, &p->frame_hdr->film_grain.data.num_uv_points, + sizeof(fgp->codec.aom.num_uv_points)); + memcpy(&fgp->codec.aom.uv_points, &p->frame_hdr->film_grain.data.uv_points, + sizeof(fgp->codec.aom.uv_points)); + memcpy(&fgp->codec.aom.ar_coeffs_y, &p->frame_hdr->film_grain.data.ar_coeffs_y, + sizeof(fgp->codec.aom.ar_coeffs_y)); + memcpy(&fgp->codec.aom.ar_coeffs_uv[0], &p->frame_hdr->film_grain.data.ar_coeffs_uv[0], + sizeof(fgp->codec.aom.ar_coeffs_uv[0])); + memcpy(&fgp->codec.aom.ar_coeffs_uv[1], &p->frame_hdr->film_grain.data.ar_coeffs_uv[1], + sizeof(fgp->codec.aom.ar_coeffs_uv[1])); + memcpy(&fgp->codec.aom.uv_mult, &p->frame_hdr->film_grain.data.uv_mult, + sizeof(fgp->codec.aom.uv_mult)); + memcpy(&fgp->codec.aom.uv_mult_luma, &p->frame_hdr->film_grain.data.uv_luma_mult, + sizeof(fgp->codec.aom.uv_mult_luma)); + memcpy(&fgp->codec.aom.uv_offset, &p->frame_hdr->film_grain.data.uv_offset, + sizeof(fgp->codec.aom.uv_offset)); + } + + res = 0; +fail: + dav1d_picture_unref(p); + if (res < 0) + av_frame_unref(frame); + return res; +} + +static av_cold int libdav1d_close(AVCodecContext *c) +{ + Libdav1dContext *dav1d = c->priv_data; + + av_buffer_pool_uninit(&dav1d->pool); + dav1d_data_unref(&dav1d->data); + dav1d_close(&dav1d->c); + + return 
0; +} + +#ifndef DAV1D_MAX_FRAME_THREADS +#define DAV1D_MAX_FRAME_THREADS DAV1D_MAX_THREADS +#endif +#ifndef DAV1D_MAX_TILE_THREADS +#define DAV1D_MAX_TILE_THREADS DAV1D_MAX_THREADS +#endif +#ifndef DAV1D_MAX_FRAME_DELAY +#define DAV1D_MAX_FRAME_DELAY DAV1D_MAX_FRAME_THREADS +#endif + +#define OFFSET(x) offsetof(Libdav1dContext, x) +#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM +static const AVOption libdav1d_options[] = { + { "tilethreads", "Tile threads", OFFSET(tile_threads), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, DAV1D_MAX_TILE_THREADS, VD | AV_OPT_FLAG_DEPRECATED }, + { "framethreads", "Frame threads", OFFSET(frame_threads), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, DAV1D_MAX_FRAME_THREADS, VD | AV_OPT_FLAG_DEPRECATED }, + { "max_frame_delay", "Max frame delay", OFFSET(max_frame_delay), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, DAV1D_MAX_FRAME_DELAY, VD }, + { "filmgrain", "Apply Film Grain", OFFSET(apply_grain), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VD | AV_OPT_FLAG_DEPRECATED }, + { "oppoint", "Select an operating point of the scalable bitstream", OFFSET(operating_point), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 31, VD }, + { "alllayers", "Output all spatial layers", OFFSET(all_layers), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VD }, + { NULL } +}; + +static const AVClass libdav1d_class = { + .class_name = "libdav1d decoder", + .item_name = av_default_item_name, + .option = libdav1d_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +const FFCodec ff_libdav1d_decoder = { + .p.name = "libdav1d", + CODEC_LONG_NAME("dav1d AV1 decoder by VideoLAN"), + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_AV1, + .priv_data_size = sizeof(Libdav1dContext), + .init = libdav1d_init, + .close = libdav1d_close, + .flush = libdav1d_flush, + FF_CODEC_RECEIVE_FRAME_CB(libdav1d_receive_frame), + .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_OTHER_THREADS, + .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_SETS_FRAME_PROPS | + FF_CODEC_CAP_AUTO_THREADS, + .p.priv_class = 
&libdav1d_class, + .p.wrapper_name = "libdav1d", +}; diff --git a/media/ffvpx/libavcodec/log2_tab.c b/media/ffvpx/libavcodec/log2_tab.c new file mode 100644 index 0000000000..47a1df03b7 --- /dev/null +++ b/media/ffvpx/libavcodec/log2_tab.c @@ -0,0 +1 @@ +#include "libavutil/log2_tab.c" diff --git a/media/ffvpx/libavcodec/mathops.h b/media/ffvpx/libavcodec/mathops.h new file mode 100644 index 0000000000..a1dc323304 --- /dev/null +++ b/media/ffvpx/libavcodec/mathops.h @@ -0,0 +1,255 @@ +/* + * simple math operations + * Copyright (c) 2001, 2002 Fabrice Bellard + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef AVCODEC_MATHOPS_H +#define AVCODEC_MATHOPS_H + +#include <stdint.h> + +#include "libavutil/attributes_internal.h" +#include "libavutil/common.h" +#include "config.h" + +#define MAX_NEG_CROP 1024 + +extern const uint32_t ff_inverse[257]; +extern const uint8_t ff_log2_run[41]; +extern const uint8_t ff_sqrt_tab[256]; +extern const uint8_t attribute_visibility_hidden ff_crop_tab[256 + 2 * MAX_NEG_CROP]; +extern const uint8_t ff_zigzag_direct[64]; +extern const uint8_t ff_zigzag_scan[16+1]; + +#if ARCH_ARM +# include "arm/mathops.h" +#elif ARCH_AVR32 +# include "avr32/mathops.h" +#elif ARCH_MIPS +# include "mips/mathops.h" +#elif ARCH_PPC +# include "ppc/mathops.h" +#elif ARCH_X86 +# include "x86/mathops.h" +#endif + +/* generic implementation */ + +#ifndef MUL64 +# define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) +#endif + +#ifndef MULL +# define MULL(a,b,s) (MUL64(a, b) >> (s)) +#endif + +#ifndef MULH +static av_always_inline int MULH(int a, int b){ + return MUL64(a, b) >> 32; +} +#endif + +#ifndef UMULH +static av_always_inline unsigned UMULH(unsigned a, unsigned b){ + return ((uint64_t)(a) * (uint64_t)(b))>>32; +} +#endif + +#ifndef MAC64 +# define MAC64(d, a, b) ((d) += MUL64(a, b)) +#endif + +#ifndef MLS64 +# define MLS64(d, a, b) ((d) -= MUL64(a, b)) +#endif + +/* signed 16x16 -> 32 multiply add accumulate */ +#ifndef MAC16 +# define MAC16(rt, ra, rb) rt += (ra) * (rb) +#endif + +/* signed 16x16 -> 32 multiply */ +#ifndef MUL16 +# define MUL16(ra, rb) ((ra) * (rb)) +#endif + +#ifndef MLS16 +# define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb)) +#endif + +/* median of 3 */ +#ifndef mid_pred +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) +{ + if(a>b){ + if(c>b){ + if(c>a) b=a; + else b=c; + } + }else{ + 
if(b>c){ + if(c>a) b=c; + else b=a; + } + } + return b; +} +#endif + +#ifndef median4 +#define median4 median4 +static inline av_const int median4(int a, int b, int c, int d) +{ + if (a < b) { + if (c < d) return (FFMIN(b, d) + FFMAX(a, c)) / 2; + else return (FFMIN(b, c) + FFMAX(a, d)) / 2; + } else { + if (c < d) return (FFMIN(a, d) + FFMAX(b, c)) / 2; + else return (FFMIN(a, c) + FFMAX(b, d)) / 2; + } +} +#endif + +#define FF_SIGNBIT(x) ((x) >> CHAR_BIT * sizeof(x) - 1) + +#ifndef sign_extend +static inline av_const int sign_extend(int val, unsigned bits) +{ + unsigned shift = 8 * sizeof(int) - bits; + union { unsigned u; int s; } v = { (unsigned) val << shift }; + return v.s >> shift; +} +#endif + +#ifndef sign_extend64 +static inline av_const int64_t sign_extend64(int64_t val, unsigned bits) +{ + unsigned shift = 8 * sizeof(int64_t) - bits; + union { uint64_t u; int64_t s; } v = { (uint64_t) val << shift }; + return v.s >> shift; +} +#endif + +#ifndef zero_extend +static inline av_const unsigned zero_extend(unsigned val, unsigned bits) +{ + return (val << ((8 * sizeof(int)) - bits)) >> ((8 * sizeof(int)) - bits); +} +#endif + +#ifndef COPY3_IF_LT +#define COPY3_IF_LT(x, y, a, b, c, d)\ +if ((y) < (x)) {\ + (x) = (y);\ + (a) = (b);\ + (c) = (d);\ +} +#endif + +#ifndef MASK_ABS +#define MASK_ABS(mask, level) do { \ + mask = level >> 31; \ + level = (level ^ mask) - mask; \ + } while (0) +#endif + +#ifndef NEG_SSR32 +# define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s))) +#endif + +#ifndef NEG_USR32 +# define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s))) +#endif + +#if HAVE_BIGENDIAN +# ifndef PACK_2U8 +# define PACK_2U8(a,b) (((a) << 8) | (b)) +# endif +# ifndef PACK_4U8 +# define PACK_4U8(a,b,c,d) (((a) << 24) | ((b) << 16) | ((c) << 8) | (d)) +# endif +# ifndef PACK_2U16 +# define PACK_2U16(a,b) (((a) << 16) | (b)) +# endif +#else +# ifndef PACK_2U8 +# define PACK_2U8(a,b) (((b) << 8) | (a)) +# endif +# ifndef PACK_4U2 +# define PACK_4U8(a,b,c,d) (((d) << 24) | ((c) 
<< 16) | ((b) << 8) | (a)) +# endif +# ifndef PACK_2U16 +# define PACK_2U16(a,b) (((b) << 16) | (a)) +# endif +#endif + +#ifndef PACK_2S8 +# define PACK_2S8(a,b) PACK_2U8((a)&255, (b)&255) +#endif +#ifndef PACK_4S8 +# define PACK_4S8(a,b,c,d) PACK_4U8((a)&255, (b)&255, (c)&255, (d)&255) +#endif +#ifndef PACK_2S16 +# define PACK_2S16(a,b) PACK_2U16((a)&0xffff, (b)&0xffff) +#endif + +#ifndef FASTDIV +# define FASTDIV(a,b) ((uint32_t)((((uint64_t)a) * ff_inverse[b]) >> 32)) +#endif /* FASTDIV */ + +#ifndef ff_sqrt +#define ff_sqrt ff_sqrt +static inline av_const unsigned int ff_sqrt(unsigned int a) +{ + unsigned int b; + + if (a < 255) return (ff_sqrt_tab[a + 1] - 1) >> 4; + else if (a < (1 << 12)) b = ff_sqrt_tab[a >> 4] >> 2; +#if !CONFIG_SMALL + else if (a < (1 << 14)) b = ff_sqrt_tab[a >> 6] >> 1; + else if (a < (1 << 16)) b = ff_sqrt_tab[a >> 8] ; +#endif + else { + int s = av_log2_16bit(a >> 16) >> 1; + unsigned int c = a >> (s + 2); + b = ff_sqrt_tab[c >> (s + 8)]; + b = FASTDIV(c,b) + (b << s); + } + + return b - (a < b * b); +} +#endif + +static inline av_const float ff_sqrf(float a) +{ + return a*a; +} + +static inline int8_t ff_u8_to_s8(uint8_t a) +{ + union { + uint8_t u8; + int8_t s8; + } b; + b.u8 = a; + return b.s8; +} + +#endif /* AVCODEC_MATHOPS_H */ diff --git a/media/ffvpx/libavcodec/mathtables.c b/media/ffvpx/libavcodec/mathtables.c new file mode 100644 index 0000000000..8b0031eb00 --- /dev/null +++ b/media/ffvpx/libavcodec/mathtables.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "mathops.h" + +/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256 + * for a>16909558, is an overestimate by less than 1 part in 1<<24 */ +const uint32_t ff_inverse[257]={ + 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757, + 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154, + 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709, + 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333, + 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367, + 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283, + 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315, + 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085, + 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498, + 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675, + 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441, + 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183, + 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712, + 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400, + 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163, + 35791395, 35495598, 35204650, 
34918434, 34636834, 34359739, 34087043, 33818641, + 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573, + 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737, + 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493, + 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373, + 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368, + 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671, + 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767, + 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740, + 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751, + 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635, + 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593, + 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944, + 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933, + 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575, + 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532, + 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010, + 16777216 +}; + +const uint8_t ff_sqrt_tab[256]={ + 0, 16, 23, 28, 32, 36, 40, 43, 46, 48, 51, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 77, 79, 80, 82, 84, 85, 87, 88, 90, + 91, 92, 94, 95, 96, 98, 99,100,102,103,104,105,107,108,109,110,111,112,114,115,116,117,118,119,120,121,122,123,124,125,126,127, +128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,151,151,152,153,154,155,156,156, +157,158,159,160,160,161,162,163,164,164,165,166,167,168,168,169,170,171,171,172,173,174,174,175,176,176,177,178,179,179,180,181, 
+182,182,183,184,184,185,186,186,187,188,188,189,190,190,191,192,192,193,194,194,195,196,196,197,198,198,199,200,200,201,202,202, +203,204,204,205,205,206,207,207,208,208,209,210,210,211,212,212,213,213,214,215,215,216,216,217,218,218,219,219,220,220,221,222, +222,223,223,224,224,225,226,226,227,227,228,228,229,230,230,231,231,232,232,233,233,234,235,235,236,236,237,237,238,238,239,239, +240,240,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,255,255 +}; + +#define times4(x) x, x, x, x +#define times256(x) times4(times4(times4(times4(times4(x))))) + +const uint8_t ff_crop_tab[256 + 2 * MAX_NEG_CROP] = { +times256(0x00), +0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, +0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, +0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, +0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, +0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F, +0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F, +0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, +0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, +0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, +0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, +0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, +0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, +0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, +0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, +0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 
+0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF, +times256(0xFF) +}; + +const uint8_t ff_zigzag_direct[64] = { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63 +}; + +const uint8_t ff_zigzag_scan[16+1] = { + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, + 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, + 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; + +const uint8_t ff_log2_run[41] = { + 0, 0, 0, 0, 1, 1, 1, 1, + 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, + 8, 9, 10, 11, 12, 13, 14, 15, +16, 17, 18, 19, 20, 21, 22, 23, +24, +}; diff --git a/media/ffvpx/libavcodec/me_cmp.h b/media/ffvpx/libavcodec/me_cmp.h new file mode 100644 index 0000000000..90ea76c891 --- /dev/null +++ b/media/ffvpx/libavcodec/me_cmp.h @@ -0,0 +1,96 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ME_CMP_H +#define AVCODEC_ME_CMP_H + +#include <stdint.h> + +#include "libavutil/attributes_internal.h" + +#include "avcodec.h" + +extern const uint32_t attribute_visibility_hidden ff_square_tab[512]; + + +/* minimum alignment rules ;) + * If you notice errors in the align stuff, need more alignment for some ASM code + * for some CPU or need to use a function with less aligned data then send a mail + * to the ffmpeg-devel mailing list, ... + * + * !warning These alignments might not match reality, (missing attribute((align)) + * stuff somewhere possible). + * I (Michael) did not check them, these are just the alignments which I think + * could be reached easily ... + * + * !future video codecs might need functions with less strict alignment + */ + +struct MpegEncContext; +/* Motion estimation: + * h is limited to { width / 2, width, 2 * width }, + * but never larger than 16 and never smaller than 2. + * Although currently h < 4 is not used as functions with + * width < 8 are neither used nor implemented. 
*/ +typedef int (*me_cmp_func)(struct MpegEncContext *c, + const uint8_t *blk1 /* align width (8 or 16) */, + const uint8_t *blk2 /* align 1 */, ptrdiff_t stride, + int h); + +typedef struct MECmpContext { + int (*sum_abs_dctelem)(const int16_t *block /* align 16 */); + + me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ + me_cmp_func sse[6]; + me_cmp_func hadamard8_diff[6]; + me_cmp_func dct_sad[6]; + me_cmp_func quant_psnr[6]; + me_cmp_func bit[6]; + me_cmp_func rd[6]; + me_cmp_func vsad[6]; + me_cmp_func vsse[6]; + me_cmp_func nsse[6]; + me_cmp_func w53[6]; + me_cmp_func w97[6]; + me_cmp_func dct_max[6]; + me_cmp_func dct264_sad[6]; + + me_cmp_func me_pre_cmp[6]; + me_cmp_func me_cmp[6]; + me_cmp_func me_sub_cmp[6]; + me_cmp_func mb_cmp[6]; + me_cmp_func ildct_cmp[6]; // only width 16 used + me_cmp_func frame_skip_cmp[6]; // only width 8 used + + me_cmp_func pix_abs[2][4]; + me_cmp_func median_sad[6]; +} MECmpContext; + +void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx); + +void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type); + +void ff_dsputil_init_dwt(MECmpContext *c); + +#endif /* AVCODEC_ME_CMP_H */ diff --git a/media/ffvpx/libavcodec/motion_est.h b/media/ffvpx/libavcodec/motion_est.h new file mode 100644 index 0000000000..f6a563b08c --- /dev/null +++ b/media/ffvpx/libavcodec/motion_est.h @@ -0,0 +1,132 @@ +/* + * Motion estimation + * + * This file is part of FFmpeg. 
#ifndef AVCODEC_MOTION_EST_H
#define AVCODEC_MOTION_EST_H

#include <stdint.h>

#include "avcodec.h"
#include "hpeldsp.h"
#include "qpeldsp.h"

/* Only pointers to the encoder context are used below, so a forward
 * declaration is sufficient. */
struct MpegEncContext;

/* MAX_MV is 1024 on IA64 and 4096 everywhere else; MAX_DMV is twice MAX_MV. */
#if ARCH_IA64 // Limit static arrays to avoid gcc failing "short data segment overflowed"
#define MAX_MV 1024
#else
#define MAX_MV 4096
#endif
#define MAX_DMV (2*MAX_MV)
#define ME_MAP_SIZE 64

/* NOTE(review): presumably the selectable motion estimation methods --
 * confirm where these constants are consumed. */
#define FF_ME_ZERO 0
#define FF_ME_EPZS 1
#define FF_ME_XONE 2

/**
 * Motion estimation context.
 */
typedef struct MotionEstContext {
    AVCodecContext *avctx;
    int skip;                       ///< set if ME is skipped for the current MB
    int co_located_mv[4][2];        ///< mv from last P-frame for direct mode ME
    int direct_basis_mv[4][2];
    uint8_t *scratchpad;            /**< data area for the ME algo, so that
                                     * the ME does not need to malloc/free. */
    uint8_t *temp;
    uint32_t *map;                  ///< map to avoid duplicate evaluations
    uint32_t *score_map;            ///< map to store the scores
    unsigned map_generation;
    int pre_penalty_factor;
    int penalty_factor;             /**< an estimate of the bits required to
                                     * code a given mv value, e.g. (1,0) takes
                                     * more bits than (0,0). We have to
                                     * estimate whether any reduction in
                                     * residual is worth the extra bits. */
    int sub_penalty_factor;
    int mb_penalty_factor;
    int flags;
    int sub_flags;
    int mb_flags;
    int pre_pass;                   ///< = 1 for the pre pass
    int dia_size;
    /* NOTE(review): presumably the limits of the allowed search range --
     * confirm against motion_est.c. */
    int xmin;
    int xmax;
    int ymin;
    int ymax;
    /* NOTE(review): presumably the predicted motion vector -- confirm. */
    int pred_x;
    int pred_y;
    const uint8_t *src[4][4];
    const uint8_t *ref[4][4];
    int stride;
    int uvstride;
    /* temp variables for picture complexity calculation */
    int64_t mc_mb_var_sum_temp;
    int64_t mb_var_sum_temp;
    int scene_change_score;

    /* Pointers to tables of half-pel (4 entries per row) and quarter-pel
     * (16 entries per row) function pointers. */
    op_pixels_func(*hpel_put)[4];
    op_pixels_func(*hpel_avg)[4];
    qpel_mc_func(*qpel_put)[16];
    qpel_mc_func(*qpel_avg)[16];
    /* Each row of mv_penalty holds MAX_DMV * 2 + 1 entries. */
    const uint8_t (*mv_penalty)[MAX_DMV * 2 + 1]; ///< bit amount needed to encode a MV
    const uint8_t *current_mv_penalty;
    /* Sub-pel search callback; *mx_ptr / *my_ptr are passed by pointer.
     * NOTE(review): presumably they are refined in place and the return
     * value is the resulting score -- confirm. */
    int (*sub_motion_search)(struct MpegEncContext *s,
                             int *mx_ptr, int *my_ptr, int dmin,
                             int src_index, int ref_index,
                             int size, int h);
} MotionEstContext;

/**
 * Return (x >> 3) plus a correction of 0 or 1 looked up from the low four
 * bits of x.
 *
 * Note that x >> 3 on a negative x relies on the compiler implementing an
 * arithmetic right shift (implementation-defined in C).
 */
static inline int ff_h263_round_chroma(int x)
{
    //FIXME static or not?
    static const uint8_t h263_chroma_roundtab[16] = {
    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
        0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
    };
    return h263_chroma_roundtab[x & 0xf] + (x >> 3);
}

int ff_init_me(struct MpegEncContext *s);

/* Per-macroblock entry points; mb_x / mb_y select the macroblock. */
void ff_estimate_p_frame_motion(struct MpegEncContext *s, int mb_x, int mb_y);
void ff_estimate_b_frame_motion(struct MpegEncContext *s, int mb_x, int mb_y);

int ff_pre_estimate_p_frame_motion(struct MpegEncContext *s,
                                   int mb_x, int mb_y);

int ff_epzs_motion_search(struct MpegEncContext *s, int *mx_ptr, int *my_ptr,
                          int P[10][2], int src_index, int ref_index,
                          const int16_t (*last_mv)[2], int ref_mv_scale,
                          int size, int h);

int ff_get_mb_score(struct MpegEncContext *s, int mx, int my, int src_index,
                    int ref_index, int size, int h, int add_rate);

int ff_get_best_fcode(struct MpegEncContext *s,
                      const int16_t (*mv_table)[2], int type);

void ff_fix_long_p_mvs(struct MpegEncContext *s, int type);
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Architecture-specific sources live in the x86/, arm/ and aarch64/
# subdirectories, each with its own moz.build file, because their file
# names duplicate names used elsewhere. x86 and arm are only entered when
# FFVPX_ASFLAGS is set; aarch64 is entered regardless of FFVPX_ASFLAGS.
if CONFIG['FFVPX_ASFLAGS']:
    if CONFIG['CPU_ARCH'] == 'x86' or CONFIG['CPU_ARCH'] == 'x86_64':
        DIRS += ['x86']
    elif CONFIG['CPU_ARCH'] == 'arm':
        DIRS += ['arm']

if CONFIG['CPU_ARCH'] == 'aarch64':
    DIRS += ['aarch64']

# Sources that are always compiled into the 'mozavcodec' shared library.
SharedLibrary('mozavcodec')
SOURCES += [
    'allcodecs.c',
    'avcodec.c',
    'avdct.c',
    'avpacket.c',
    'bitstream.c',
    'bitstream_filters.c',
    'bsf.c',
    'codec_desc.c',
    'codec_par.c',
    'dct.c',
    'dct32_fixed.c',
    'dct32_float.c',
    'decode.c',
    'encode.c',
    'faandct.c',
    'faanidct.c',
    'fdctdsp.c',
    'fft_fixed_32.c',
    'fft_float.c',
    'fft_init_table.c',
    'flac.c',
    'flacdata.c',
    'flacdec.c',
    'flacdsp.c',
    'get_buffer.c',
    'idctdsp.c',
    'jfdctfst.c',
    'jfdctint.c',
    'jrevdct.c',
    'log2_tab.c',
    'mpegaudio.c',
    'mpegaudiodata.c',
    'mpegaudiodec_common.c',
    'mpegaudiodec_fixed.c',
    'mpegaudiodecheader.c',
    'mpegaudiodsp.c',
    'mpegaudiodsp_data.c',
    'mpegaudiodsp_fixed.c',
    'mpegaudiodsp_float.c',
    'mpegaudiotabs.c',
    'null_bsf.c',
    'options.c',
    'parser.c',
    'parsers.c',
    'profiles.c',
    'pthread.c',
    'pthread_frame.c',
    'pthread_slice.c',
    'rdft.c',
    'reverse.c',
    'simple_idct.c',
    'utils.c',
    'version.c',
    'vlc.c',
    'vorbis_parser.c',
    'xiph.c'
]

# Everything below is left out when MOZ_FFVPX_AUDIOONLY is set: the AV1,
# VP8 and VP9 sources, the dav1d libraries, and the optional VA-API sources.
if not CONFIG['MOZ_FFVPX_AUDIOONLY']:
    SOURCES += [
        'atsc_a53.c',
        'av1_frame_split_bsf.c',
        'av1dec.c',
        'avpicture.c',
        'cbs.c',
        'cbs_av1.c',
        'golomb.c',
        'h264pred.c',
        'imgconvert.c',
        'libdav1d.c',
        'mathtables.c',
        'qsv_api.c',
        'raw.c',
        'videodsp.c',
        'vp8.c',
        'vp8_parser.c',
        'vp8dsp.c',
        'vp9.c',
        'vp9_parser.c',
        'vp9_superframe_split_bsf.c',
        'vp9block.c',
        'vp9data.c',
        'vp9dsp.c',
        'vp9dsp_10bpp.c',
        'vp9dsp_12bpp.c',
        'vp9dsp_8bpp.c',
        'vp9lpf.c',
        'vp9mvs.c',
        'vp9prob.c',
        'vp9recon.c',
        'vpx_rac.c',
    ]
    USE_LIBS += [
        'dav1d',
        'media_libdav1d_asm',
    ]
    # The VA-API sources are only added when MOZ_WAYLAND is set.
    if CONFIG['MOZ_WAYLAND']:
        LOCAL_INCLUDES += ['/media/mozva']
        SOURCES += [
            'vaapi_av1.c',
            'vaapi_decode.c',
            'vaapi_vp8.c',
            'vaapi_vp9.c',
        ]
        USE_LIBS += [
            'mozva'
        ]

# avfft.c is only compiled when MOZ_LIBAV_FFT is set.
if CONFIG['MOZ_LIBAV_FFT']:
    SOURCES += [
        'avfft.c',
    ]

# The symbols file names what the shared library exports.
SYMBOLS_FILE = 'avcodec.symbols'
NoVisibilityFlags()

USE_LIBS += [
    'mozavutil'
]

# Link the math library explicitly everywhere except on Windows targets.
if CONFIG['OS_TARGET'] != 'WINNT':
    OS_LIBS += ['m']

include("../ffvpxcommon.mozbuild")
/**
 * @file
 * MPEG-1/2 tables.
 *
 * This header only declares the tables; the definitions are not part of it.
 */

#ifndef AVCODEC_MPEG12DATA_H
#define AVCODEC_MPEG12DATA_H

#include <stdint.h>
#include "libavutil/rational.h"

/* Default quantization matrices. The intra matrix is declared without a
 * size, so sizeof cannot be applied to it through this header; the
 * non-intra matrix has 64 entries. */
extern const uint16_t ff_mpeg1_default_intra_matrix[];
extern const uint16_t ff_mpeg1_default_non_intra_matrix[64];

/* DC VLC tables for luma and chroma: 12 codes each, with a parallel
 * 12-entry array of bit lengths. */
extern const uint16_t ff_mpeg12_vlc_dc_lum_code[12];
extern const unsigned char ff_mpeg12_vlc_dc_lum_bits[12];
extern const uint16_t ff_mpeg12_vlc_dc_chroma_code[12];
extern const unsigned char ff_mpeg12_vlc_dc_chroma_bits[12];

/* Macroblock-level VLC tables stored as two-byte pairs.
 * NOTE(review): each pair is presumably {code, length} -- confirm against
 * the definitions. */
extern const uint8_t ff_mpeg12_mbAddrIncrTable[36][2];
extern const uint8_t ff_mpeg12_mbPatTable[64][2];

extern const uint8_t ff_mpeg12_mbMotionVectorTable[17][2];

/* Frame rate tables; their lengths are not visible through this header. */
extern const AVRational ff_mpeg12_frame_rate_tab[];
extern const AVRational ff_mpeg2_frame_rate_tab[];

/* Aspect tables with 16 entries each: floats for MPEG-1, rationals for
 * MPEG-2. */
extern const float ff_mpeg1_aspect[16];
extern const AVRational ff_mpeg2_aspect[16];

#endif /* AVCODEC_MPEG12DATA_H */
/**
 * Choose one of five layer 2 table indices (0..4).
 *
 * @param bitrate     total bitrate in kb/s
 * @param nb_channels number of channels; must be non-zero because the
 *                    bitrate is divided by it
 * @param freq        sample rate in Hz (compared against 48000 and 32000)
 * @param lsf         non-zero selects index 4 unconditionally
 * @return the table index, in the range 0..4
 */
int ff_mpa_l2_select_table(int bitrate, int nb_channels, int freq, int lsf)
{
    /* The decision is made on the bitrate available to a single channel. */
    const int per_channel = bitrate / nb_channels;

    if (lsf)
        return 4;

    /* 56..80 kb/s at any rate, or anything from 56 kb/s upwards at 48 kHz. */
    if (per_channel >= 56 && (freq == 48000 || per_channel <= 80))
        return 0;

    if (freq != 48000 && per_channel >= 96)
        return 1;

    if (freq != 32000 && per_channel <= 48)
        return 2;

    return 3;
}
#ifndef AVCODEC_MPEGAUDIO_H
#define AVCODEC_MPEGAUDIO_H

/* USE_FLOATS defaults to 0 (integer build) unless the including file
 * defined it before including this header. */
#ifndef USE_FLOATS
#   define USE_FLOATS 0
#endif

#include <stdint.h>
#include "libavutil/internal.h"

/* max frame size, in samples */
#define MPA_FRAME_SIZE 1152

/* max compressed frame size */
#define MPA_MAX_CODED_FRAME_SIZE 1792

#define MPA_MAX_CHANNELS 2

#define SBLIMIT 32 /* number of subbands */

/* Channel mode values. */
#define MPA_STEREO  0
#define MPA_JSTEREO 1
#define MPA_DUAL    2
#define MPA_MONO    3

/* Both defaults are skipped together when FRAC_BITS is already defined, so
 * a file that predefines FRAC_BITS gets no WFRAC_BITS from this header. */
#ifndef FRAC_BITS
#define FRAC_BITS   23   /* fractional bits for sb_samples and dct */
#define WFRAC_BITS  16   /* fractional bits for window */
#endif

#define IMDCT_SCALAR 1.759

/* The value 1.0 in fixed point with FRAC_BITS fractional bits. */
#define FRAC_ONE    (1 << FRAC_BITS)

/* Scale a by FRAC_ONE and convert to int; the cast truncates toward zero. */
#define FIX(a)   ((int)((a) * FRAC_ONE))

/* Sample types per build flavour:
 *   USE_FLOATS        -> float everywhere
 *   FRAC_BITS <= 15   -> int arithmetic, 16-bit MPA_INT and OUT_INT
 *   otherwise         -> int arithmetic, 32-bit MPA_INT, 16-bit OUT_INT
 * SUINT is not defined in this header. */
#if USE_FLOATS
#   define INTFLOAT float
#   define SUINTFLOAT float
typedef float MPA_INT;
typedef float OUT_INT;
#elif FRAC_BITS <= 15
#   define INTFLOAT int
#   define SUINTFLOAT SUINT
typedef int16_t MPA_INT;
typedef int16_t OUT_INT;
#else
#   define INTFLOAT int
#   define SUINTFLOAT SUINT
typedef int32_t MPA_INT;
typedef int16_t OUT_INT;
#endif

/* bitrate is in kb/s; nb_channels is used as a divisor and must not be 0. */
int ff_mpa_l2_select_table(int bitrate, int nb_channels, int freq, int lsf);

#endif /* AVCODEC_MPEGAUDIO_H */
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGAUDIO_TABLEGEN_H +#define AVCODEC_MPEGAUDIO_TABLEGEN_H + +#include <stdint.h> +#include <math.h> +#include "libavutil/attributes.h" + +#if CONFIG_HARDCODED_TABLES +#define mpegaudio_tableinit() +#include "libavcodec/mpegaudio_tables.h" +#else +#if defined(BUILD_TABLES) || !USE_FLOATS +#define FIXED_TABLE +static uint32_t exp_table_fixed[512]; +static uint32_t expval_table_fixed[512][16]; +#endif + +#if defined(BUILD_TABLES) || USE_FLOATS +#define FLOAT_TABLE +static float exp_table_float[512]; +static float expval_table_float[512][16]; +#endif + +#define IMDCT_SCALAR 1.759 + +static av_cold void mpegaudio_tableinit(void) +{ + int i, value, exponent; + static const double exp2_lut[4] = { + 1.00000000000000000000, /* 2 ^ (0 * 0.25) */ + 1.18920711500272106672, /* 2 ^ (1 * 0.25) */ + M_SQRT2 , /* 2 ^ (2 * 0.25) */ + 1.68179283050742908606, /* 2 ^ (3 * 0.25) */ + }; + double pow43_lut[16]; + double exp2_base = 2.11758236813575084767080625169910490512847900390625e-22; // 2^(-72) + double exp2_val; + + for (i = 0; i < 16; ++i) + pow43_lut[i] = i * cbrt(i); + + for (exponent = 0; exponent < 512; exponent++) { + if (exponent && (exponent & 3) == 0) + exp2_base *= 2; + exp2_val = exp2_base * exp2_lut[exponent & 3] / IMDCT_SCALAR; + for (value = 0; value < 16; value++) { + double f = pow43_lut[value] * exp2_val; +#ifdef FIXED_TABLE + expval_table_fixed[exponent][value] = (f < 0xFFFFFFFF ? 
llrint(f) : 0xFFFFFFFF); +#endif +#ifdef FLOAT_TABLE + expval_table_float[exponent][value] = f; +#endif + } +#ifdef FIXED_TABLE + exp_table_fixed[exponent] = expval_table_fixed[exponent][1]; +#endif +#ifdef FLOAT_TABLE + exp_table_float[exponent] = expval_table_float[exponent][1]; +#endif + } +} +#undef FLOAT_TABLE +#undef FIXED_TABLE +#endif /* CONFIG_HARDCODED_TABLES */ + +#endif /* AVCODEC_MPEGAUDIO_TABLEGEN_H */ diff --git a/media/ffvpx/libavcodec/mpegaudiodata.c b/media/ffvpx/libavcodec/mpegaudiodata.c new file mode 100644 index 0000000000..669590908f --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodata.c @@ -0,0 +1,133 @@ +/* + * MPEG Audio common tables + * copyright (c) 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * mpeg audio layer common tables. 
+ */ + +#include "mpegaudiodata.h" + +/*******************************************************/ +/* layer 2 tables */ + +const int ff_mpa_sblimit_table[5] = { 27 , 30 , 8, 12 , 30 }; + +const int ff_mpa_quant_steps[17] = { + 3, 5, 7, 9, 15, + 31, 63, 127, 255, 511, + 1023, 2047, 4095, 8191, 16383, + 32767, 65535 +}; + +/* we use a negative value if grouped */ +const int ff_mpa_quant_bits[17] = { + -5, -7, 3, -10, 4, + 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, + 15, 16 +}; + +/* encoding tables which give the quantization index. Note how it is + possible to store them efficiently ! */ +static const unsigned char alloc_table_1[] = { + 4, 0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 4, 0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 4, 0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 3, 0, 1, 2, 3, 4, 5, 16, + 2, 0, 1, 16, + 2, 0, 1, 16, + 2, 0, 1, 16, + 2, 0, 1, 16, + 2, 0, 1, 16, + 2, 0, 1, 16, + 2, 0, 1, 16, +}; + +static const unsigned char alloc_table_3[] = { + 4, 0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 4, 0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 
3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, +}; + +static const unsigned char alloc_table_4[] = { + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 3, 0, 1, 3, 4, 5, 6, 7, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, + 2, 0, 1, 3, +}; + +const unsigned char * const ff_mpa_alloc_tables[5] = +{ alloc_table_1, alloc_table_1, alloc_table_3, alloc_table_3, alloc_table_4, }; diff --git a/media/ffvpx/libavcodec/mpegaudiodata.h b/media/ffvpx/libavcodec/mpegaudiodata.h new file mode 100644 index 0000000000..a4148a1ffe --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodata.h @@ -0,0 +1,82 @@ +/* + * MPEG Audio common tables + * copyright (c) 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * @file
 * mpeg audio layer common tables.
 *
 * Declarations only; the definitions live in the corresponding .c files.
 */

#ifndef AVCODEC_MPEGAUDIODATA_H
#define AVCODEC_MPEGAUDIODATA_H

#include <stdint.h>

#include "config.h"

#include "vlc.h"

/* NOTE(review): presumably bit flags within the frame header's mode
 * extension field -- confirm against the header parser. */
#define MODE_EXT_MS_STEREO 2
#define MODE_EXT_I_STEREO  1

extern const uint16_t ff_mpa_bitrate_tab[2][3][15];
extern const uint16_t ff_mpa_freq_tab[3];
/* Layer 2 tables: five allocation table slots, 17 quantizer entries. */
extern const int ff_mpa_sblimit_table[5];
extern const int ff_mpa_quant_steps[17];
extern const int ff_mpa_quant_bits[17];
extern const unsigned char * const ff_mpa_alloc_tables[5];

/* (8191 + 16) * 4 = 32828 entries. The two tables below are read-only when
 * CONFIG_HARDCODED_TABLES is set and writable otherwise.
 * NOTE(review): presumably they are filled at runtime in the writable
 * case -- confirm. */
#define TABLE_4_3_SIZE ((8191 + 16)*4)
#if CONFIG_HARDCODED_TABLES
extern const int8_t   ff_table_4_3_exp  [TABLE_4_3_SIZE];
extern const uint32_t ff_table_4_3_value[TABLE_4_3_SIZE];
#else
extern int8_t         ff_table_4_3_exp  [TABLE_4_3_SIZE];
extern uint32_t       ff_table_4_3_value[TABLE_4_3_SIZE];
#endif

/* VLCs for decoding layer 3 huffman tables */
extern VLC ff_huff_vlc[16];
extern VLC ff_huff_quad_vlc[2];

/* layer3 scale factor size */
extern const uint8_t ff_slen_table[2][16];
/* number of lsf scale factors for a given size */
extern const uint8_t ff_lsf_nsf_table[6][3][4];
extern const uint8_t ff_mpa_huff_data[32][2];

/* band size tables */
extern const uint8_t ff_band_size_long[9][22];
extern const uint8_t ff_band_size_short[9][13];
/* computed from ff_band_size_long */
/* (not const, and one column wider than ff_band_size_long: 23 vs. 22) */
extern uint16_t ff_band_index_long[9][23];

/* Four slots of pointers to writable int16_t tables; the pointer array
 * itself is const. */
extern int16_t *const ff_division_tabs[4];

/* lower 2 bits: modulo 3, higher bits: shift */
extern uint16_t ff_scale_factor_modshift[64];

extern const uint8_t ff_mpa_pretab[2][22];

/* Initialize tables shared between the fixed and
 * floating point MPEG audio decoders. */
void ff_mpegaudiodec_common_init_static(void);

#endif /* AVCODEC_MPEGAUDIODATA_H */
+ */ + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/avassert.h" +#include "libavutil/libm.h" +#include "libavutil/thread.h" + +#include "mpegaudiodata.h" + +#include "mpegaudiodec_common_tablegen.h" + +uint16_t ff_scale_factor_modshift[64]; + +static int16_t division_tab3[1 << 6 ]; +static int16_t division_tab5[1 << 8 ]; +static int16_t division_tab9[1 << 11]; + +int16_t *const ff_division_tabs[4] = { + division_tab3, division_tab5, NULL, division_tab9 +}; + + +/*******************************************************/ +/* layer 3 tables */ + +const uint8_t ff_slen_table[2][16] = { + { 0, 0, 0, 0, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 }, + { 0, 1, 2, 3, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3 }, +}; + +const uint8_t ff_lsf_nsf_table[6][3][4] = { + { { 6, 5, 5, 5 }, { 9, 9, 9, 9 }, { 6, 9, 9, 9 } }, + { { 6, 5, 7, 3 }, { 9, 9, 12, 6 }, { 6, 9, 12, 6 } }, + { { 11, 10, 0, 0 }, { 18, 18, 0, 0 }, { 15, 18, 0, 0 } }, + { { 7, 7, 7, 0 }, { 12, 12, 12, 0 }, { 6, 15, 12, 0 } }, + { { 6, 6, 6, 3 }, { 12, 9, 9, 6 }, { 6, 12, 9, 6 } }, + { { 8, 8, 5, 0 }, { 15, 12, 9, 0 }, { 6, 18, 9, 0 } }, +}; + +/* mpegaudio layer 3 huffman tables */ +VLC ff_huff_vlc[16]; +static VLCElem huff_vlc_tables[128 + 128 + 128 + 130 + 128 + 154 + 166 + 142 + + 204 + 190 + 170 + 542 + 460 + 662 + 414]; +VLC ff_huff_quad_vlc[2]; +static VLCElem huff_quad_vlc_tables[64 + 16]; + +static const uint8_t mpa_hufflens[] = { + /* Huffman table 1 - 4 entries */ + 3, 3, 2, 1, + /* Huffman table 2 - 9 entries */ + 6, 6, 5, 5, 5, 3, 3, 3, 1, + /* Huffman table 3 - 9 entries */ + 6, 6, 5, 5, 5, 3, 2, 2, 2, + /* Huffman table 5 - 16 entries */ + 8, 8, 7, 6, 7, 7, 7, 7, 6, 6, 6, 6, 3, 3, 3, 1, + /* Huffman table 6 - 16 entries */ + 7, 7, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 3, 2, 3, 3, + /* Huffman table 7 - 36 entries */ + 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 9, 9, 8, 9, 9, 8, 8, 7, 7, + 7, 8, 8, 8, 8, 7, 7, 7, 7, 6, 5, 6, 6, 4, 3, 3, 1, + /* Huffman table 8 - 36 entries */ + 11, 11, 10, 9, 10, 10, 9, 9, 9, 8, 
8, 9, 9, 9, 9, 8, 8, 8, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 4, 4, 2, 3, 3, 2, + /* Huffman table 9 - 36 entries */ + 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 7, 7, 7, 8, 8, 7, 7, 7, 7, + 6, 6, 6, 6, 5, 5, 6, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, + /* Huffman table 10 - 64 entries */ + 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 11, 11, 10, 9, 9, 10, + 10, 9, 9, 10, 10, 9, 10, 10, 8, 8, 9, 9, 10, 10, 9, 9, 10, 10, 8, + 8, 8, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 6, + 6, 6, 6, 4, 3, 3, 1, + /* Huffman table 11 - 64 entries */ + 10, 10, 10, 10, 10, 10, 10, 11, 11, 10, 10, 9, 9, 9, 10, 10, 10, 10, 8, + 8, 9, 9, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 8, 7, 8, 8, 7, 7, + 8, 8, 8, 9, 9, 8, 8, 8, 8, 8, 8, 7, 7, 6, 6, 7, 7, 6, 5, + 4, 5, 5, 3, 3, 3, 2, + /* Huffman table 12 - 64 entries */ + 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 8, 8, + 9, 9, 8, 8, 8, 8, 8, 9, 9, 7, 7, 7, 8, 8, 8, 8, 8, 8, 7, + 7, 7, 7, 8, 8, 7, 7, 7, 6, 6, 6, 6, 7, 7, 6, 5, 5, 5, 4, + 4, 5, 5, 4, 3, 3, 3, + /* Huffman table 13 - 256 entries */ + 19, 19, 18, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 15, 15, 16, + 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 15, 16, 16, 14, 14, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 14, 13, 14, + 14, 13, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 14, 14, 12, + 12, 12, 13, 13, 13, 13, 13, 13, 12, 13, 13, 12, 12, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 12, 12, 13, 13, 12, 12, 12, 12, 13, 13, 13, 13, 12, + 13, 13, 12, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 11, + 11, 12, 12, 11, 12, 12, 12, 12, 11, 11, 12, 12, 11, 12, 12, 11, 12, 12, 11, + 12, 12, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 11, 11, + 10, 11, 11, 10, 11, 11, 11, 11, 10, 10, 11, 11, 10, 10, 11, 11, 11, 11, 11, + 11, 9, 9, 10, 10, 10, 10, 10, 11, 11, 9, 9, 9, 10, 10, 9, 9, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, + 9, 8, 8, 9, 9, 9, 9, 9, 9, 8, 7, 8, 8, 8, 8, 7, 7, 
7, 7, + 7, 6, 6, 6, 6, 4, 4, 3, 1, + /* Huffman table 15 - 256 entries */ + 13, 13, 13, 13, 12, 13, 13, 13, 13, 13, 13, 12, 13, 13, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 13, 11, 11, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 12, 11, 11, 11, 11, + 11, 11, 11, 11, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 11, 11, 11, 11, 11, + 11, 10, 11, 11, 11, 11, 11, 11, 10, 10, 11, 11, 10, 10, 10, 10, 11, 11, 10, + 10, 10, 10, 10, 10, 10, 11, 11, 10, 10, 10, 10, 10, 11, 11, 9, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 10, 10, 10, 10, 9, 10, 10, 9, 10, + 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 9, + 9, 9, 9, 10, 10, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 9, 9, 8, 8, 8, 8, 8, 8, 8, 9, 9, 8, 7, 8, 8, 7, 7, + 7, 7, 8, 8, 7, 7, 7, 7, 7, 6, 7, 7, 6, 6, 7, 7, 6, 6, 6, + 5, 5, 5, 5, 5, 3, 4, 4, 3, + /* Huffman table 16 - 256 entries */ + 11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 10, 10, 10, 10, 10, 8, + 10, 10, 9, 9, 9, 9, 10, 16, 17, 17, 15, 15, 16, 16, 14, 15, 15, 14, 14, + 15, 15, 14, 14, 15, 15, 15, 15, 14, 15, 15, 14, 13, 8, 9, 9, 8, 8, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 14, 14, 14, 14, 13, 14, 14, + 13, 13, 13, 14, 14, 14, 14, 13, 13, 14, 14, 13, 14, 14, 12, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 13, 13, 12, 13, + 13, 12, 12, 13, 13, 11, 12, 12, 12, 12, 12, 12, 12, 13, 13, 11, 12, 12, 12, + 12, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 11, 11, 11, 12, 12, + 12, 12, 12, 12, 12, 12, 11, 12, 12, 11, 12, 12, 11, 12, 12, 11, 12, 12, 11, + 10, 10, 11, 11, 11, 11, 11, 11, 10, 10, 11, 11, 10, 10, 11, 11, 11, 11, 11, + 11, 11, 11, 10, 11, 11, 10, 10, 10, 11, 11, 10, 10, 11, 11, 10, 10, 11, 11, + 10, 9, 9, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 10, 9, 10, 10, 9, 9, + 8, 9, 9, 9, 
9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 7, 7, 8, 8, + 7, 6, 6, 6, 6, 4, 4, 3, 1, + /* Huffman table 24 - 256 entries */ + 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 8, 8, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 9, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 4, 11, 11, 11, 11, 12, 12, 11, 10, 11, 11, 10, 10, 10, 10, 11, 11, 10, + 10, 10, 10, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 11, 11, 10, 11, 11, 10, 9, 10, 10, 10, 10, 11, 11, 10, 9, 9, 10, + 10, 9, 10, 10, 10, 10, 9, 9, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 9, 10, 10, 8, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 8, 8, 8, 8, 8, + 8, 9, 9, 7, 8, 8, 7, 7, 7, 7, 7, 8, 8, 7, 7, 6, 6, 7, 7, + 6, 5, 5, 6, 6, 4, 4, 4, 4, +}; + +static const uint8_t mpa_huffsymbols[] = { + /* Huffman table 1 - 4 entries */ + 0x11, 0x01, 0x10, 0x00, + /* Huffman table 2 - 9 entries */ + 0x22, 0x02, 0x12, 0x21, 0x20, 0x11, 0x01, 0x10, 0x00, + /* Huffman table 3 - 9 entries */ + 0x22, 0x02, 0x12, 0x21, 0x20, 0x10, 0x11, 0x01, 0x00, + /* Huffman table 5 - 16 entries */ + 0x33, 0x23, 0x32, 0x31, 0x13, 0x03, 0x30, 0x22, 0x12, 0x21, 0x02, 0x20, + 0x11, 0x01, 0x10, 0x00, + /* Huffman table 6 - 16 entries */ + 0x33, 0x03, 0x23, 0x32, 0x30, 0x13, 0x31, 0x22, 0x02, 0x12, 0x21, 0x20, + 0x01, 0x11, 0x10, 0x00, + /* Huffman table 7 - 36 entries */ + 0x55, 0x45, 0x54, 0x53, 0x35, 0x44, 0x25, 0x52, 0x15, 0x51, 0x05, 0x34, + 0x50, 0x43, 0x33, 0x24, 0x42, 0x14, 0x41, 0x40, 0x04, 0x23, 0x32, 0x03, + 0x13, 0x31, 0x30, 0x22, 0x12, 0x21, 0x02, 0x20, 0x11, 0x01, 0x10, 0x00, + /* Huffman table 8 - 36 entries */ + 0x55, 0x54, 0x45, 0x53, 0x35, 0x44, 0x25, 0x52, 0x05, 0x15, 0x51, 0x34, + 0x43, 0x50, 0x33, 
0x24, 0x42, 0x14, 0x41, 0x04, 0x40, 0x23, 0x32, 0x13, + 0x31, 0x03, 0x30, 0x22, 0x02, 0x20, 0x12, 0x21, 0x11, 0x01, 0x10, 0x00, + /* Huffman table 9 - 36 entries */ + 0x55, 0x45, 0x35, 0x53, 0x54, 0x05, 0x44, 0x25, 0x52, 0x15, 0x51, 0x34, + 0x43, 0x50, 0x04, 0x24, 0x42, 0x33, 0x40, 0x14, 0x41, 0x23, 0x32, 0x13, + 0x31, 0x03, 0x30, 0x22, 0x02, 0x12, 0x21, 0x20, 0x11, 0x01, 0x10, 0x00, + /* Huffman table 10 - 64 entries */ + 0x77, 0x67, 0x76, 0x57, 0x75, 0x66, 0x47, 0x74, 0x56, 0x65, 0x37, 0x73, + 0x46, 0x55, 0x54, 0x63, 0x27, 0x72, 0x64, 0x07, 0x70, 0x62, 0x45, 0x35, + 0x06, 0x53, 0x44, 0x17, 0x71, 0x36, 0x26, 0x25, 0x52, 0x15, 0x51, 0x34, + 0x43, 0x16, 0x61, 0x60, 0x05, 0x50, 0x24, 0x42, 0x33, 0x04, 0x14, 0x41, + 0x40, 0x23, 0x32, 0x03, 0x13, 0x31, 0x30, 0x22, 0x12, 0x21, 0x02, 0x20, + 0x11, 0x01, 0x10, 0x00, + /* Huffman table 11 - 64 entries */ + 0x77, 0x67, 0x76, 0x75, 0x66, 0x47, 0x74, 0x57, 0x55, 0x56, 0x65, 0x37, + 0x73, 0x46, 0x45, 0x54, 0x35, 0x53, 0x27, 0x72, 0x64, 0x07, 0x71, 0x17, + 0x70, 0x36, 0x63, 0x60, 0x44, 0x25, 0x52, 0x05, 0x15, 0x62, 0x26, 0x06, + 0x16, 0x61, 0x51, 0x34, 0x50, 0x43, 0x33, 0x24, 0x42, 0x14, 0x41, 0x04, + 0x40, 0x23, 0x32, 0x13, 0x31, 0x03, 0x30, 0x22, 0x21, 0x12, 0x02, 0x20, + 0x11, 0x01, 0x10, 0x00, + /* Huffman table 12 - 64 entries */ + 0x77, 0x67, 0x76, 0x57, 0x75, 0x66, 0x47, 0x74, 0x65, 0x56, 0x37, 0x73, + 0x55, 0x27, 0x72, 0x46, 0x64, 0x17, 0x71, 0x07, 0x70, 0x36, 0x63, 0x45, + 0x54, 0x44, 0x06, 0x05, 0x26, 0x62, 0x61, 0x16, 0x60, 0x35, 0x53, 0x25, + 0x52, 0x15, 0x51, 0x34, 0x43, 0x50, 0x04, 0x24, 0x42, 0x14, 0x33, 0x41, + 0x23, 0x32, 0x40, 0x03, 0x30, 0x13, 0x31, 0x22, 0x12, 0x21, 0x02, 0x20, + 0x00, 0x11, 0x01, 0x10, + /* Huffman table 13 - 256 entries */ + 0xFE, 0xFC, 0xFD, 0xED, 0xFF, 0xEF, 0xDF, 0xEE, 0xCF, 0xDE, 0xBF, 0xFB, + 0xCE, 0xDC, 0xAF, 0xE9, 0xEC, 0xDD, 0xFA, 0xCD, 0xBE, 0xEB, 0x9F, 0xF9, + 0xEA, 0xBD, 0xDB, 0x8F, 0xF8, 0xCC, 0xAE, 0x9E, 0x8E, 0x7F, 0x7E, 0xF7, + 0xDA, 0xAD, 0xBC, 0xCB, 0xF6, 0x6F, 0xE8, 0x5F, 
0x9D, 0xD9, 0xF5, 0xE7, + 0xAC, 0xBB, 0x4F, 0xF4, 0xCA, 0xE6, 0xF3, 0x3F, 0x8D, 0xD8, 0x2F, 0xF2, + 0x6E, 0x9C, 0x0F, 0xC9, 0x5E, 0xAB, 0x7D, 0xD7, 0x4E, 0xC8, 0xD6, 0x3E, + 0xB9, 0x9B, 0xAA, 0x1F, 0xF1, 0xF0, 0xBA, 0xE5, 0xE4, 0x8C, 0x6D, 0xE3, + 0xE2, 0x2E, 0x0E, 0x1E, 0xE1, 0xE0, 0x5D, 0xD5, 0x7C, 0xC7, 0x4D, 0x8B, + 0xB8, 0xD4, 0x9A, 0xA9, 0x6C, 0xC6, 0x3D, 0xD3, 0x7B, 0x2D, 0xD2, 0x1D, + 0xB7, 0x5C, 0xC5, 0x99, 0x7A, 0xC3, 0xA7, 0x97, 0x4B, 0xD1, 0x0D, 0xD0, + 0x8A, 0xA8, 0x4C, 0xC4, 0x6B, 0xB6, 0x3C, 0x2C, 0xC2, 0x5B, 0xB5, 0x89, + 0x1C, 0xC1, 0x98, 0x0C, 0xC0, 0xB4, 0x6A, 0xA6, 0x79, 0x3B, 0xB3, 0x88, + 0x5A, 0x2B, 0xA5, 0x69, 0xA4, 0x78, 0x87, 0x94, 0x77, 0x76, 0xB2, 0x1B, + 0xB1, 0x0B, 0xB0, 0x96, 0x4A, 0x3A, 0xA3, 0x59, 0x95, 0x2A, 0xA2, 0x1A, + 0xA1, 0x0A, 0x68, 0xA0, 0x86, 0x49, 0x93, 0x39, 0x58, 0x85, 0x67, 0x29, + 0x92, 0x57, 0x75, 0x38, 0x83, 0x66, 0x47, 0x74, 0x56, 0x65, 0x73, 0x19, + 0x91, 0x09, 0x90, 0x48, 0x84, 0x72, 0x46, 0x64, 0x28, 0x82, 0x18, 0x37, + 0x27, 0x17, 0x71, 0x55, 0x07, 0x70, 0x36, 0x63, 0x45, 0x54, 0x26, 0x62, + 0x35, 0x81, 0x08, 0x80, 0x16, 0x61, 0x06, 0x60, 0x53, 0x44, 0x25, 0x52, + 0x05, 0x15, 0x51, 0x34, 0x43, 0x50, 0x24, 0x42, 0x33, 0x14, 0x41, 0x04, + 0x40, 0x23, 0x32, 0x13, 0x31, 0x03, 0x30, 0x22, 0x12, 0x21, 0x02, 0x20, + 0x11, 0x01, 0x10, 0x00, + /* Huffman table 15 - 256 entries */ + 0xFF, 0xEF, 0xFE, 0xDF, 0xEE, 0xFD, 0xCF, 0xFC, 0xDE, 0xED, 0xBF, 0xFB, + 0xCE, 0xEC, 0xDD, 0xAF, 0xFA, 0xBE, 0xEB, 0xCD, 0xDC, 0x9F, 0xF9, 0xEA, + 0xBD, 0xDB, 0x8F, 0xF8, 0xCC, 0x9E, 0xE9, 0x7F, 0xF7, 0xAD, 0xDA, 0xBC, + 0x6F, 0xAE, 0x0F, 0xCB, 0xF6, 0x8E, 0xE8, 0x5F, 0x9D, 0xF5, 0x7E, 0xE7, + 0xAC, 0xCA, 0xBB, 0xD9, 0x8D, 0x4F, 0xF4, 0x3F, 0xF3, 0xD8, 0xE6, 0x2F, + 0xF2, 0x6E, 0xF0, 0x1F, 0xF1, 0x9C, 0xC9, 0x5E, 0xAB, 0xBA, 0xE5, 0x7D, + 0xD7, 0x4E, 0xE4, 0x8C, 0xC8, 0x3E, 0x6D, 0xD6, 0xE3, 0x9B, 0xB9, 0x2E, + 0xAA, 0xE2, 0x1E, 0xE1, 0x0E, 0xE0, 0x5D, 0xD5, 0x7C, 0xC7, 0x4D, 0x8B, + 0xD4, 0xB8, 0x9A, 0xA9, 0x6C, 0xC6, 0x3D, 0xD3, 0xD2, 
0x2D, 0x0D, 0x1D, + 0x7B, 0xB7, 0xD1, 0x5C, 0xD0, 0xC5, 0x8A, 0xA8, 0x4C, 0xC4, 0x6B, 0xB6, + 0x99, 0x0C, 0x3C, 0xC3, 0x7A, 0xA7, 0xA6, 0xC0, 0x0B, 0xC2, 0x2C, 0x5B, + 0xB5, 0x1C, 0x89, 0x98, 0xC1, 0x4B, 0xB4, 0x6A, 0x3B, 0x79, 0xB3, 0x97, + 0x88, 0x2B, 0x5A, 0xB2, 0xA5, 0x1B, 0xB1, 0xB0, 0x69, 0x96, 0x4A, 0xA4, + 0x78, 0x87, 0x3A, 0xA3, 0x59, 0x95, 0x2A, 0xA2, 0x1A, 0xA1, 0x0A, 0xA0, + 0x68, 0x86, 0x49, 0x94, 0x39, 0x93, 0x77, 0x09, 0x58, 0x85, 0x29, 0x67, + 0x76, 0x92, 0x91, 0x19, 0x90, 0x48, 0x84, 0x57, 0x75, 0x38, 0x83, 0x66, + 0x47, 0x28, 0x82, 0x18, 0x81, 0x74, 0x08, 0x80, 0x56, 0x65, 0x37, 0x73, + 0x46, 0x27, 0x72, 0x64, 0x17, 0x55, 0x71, 0x07, 0x70, 0x36, 0x63, 0x45, + 0x54, 0x26, 0x62, 0x16, 0x06, 0x60, 0x35, 0x61, 0x53, 0x44, 0x25, 0x52, + 0x15, 0x51, 0x05, 0x50, 0x34, 0x43, 0x24, 0x42, 0x33, 0x41, 0x14, 0x04, + 0x23, 0x32, 0x40, 0x03, 0x13, 0x31, 0x30, 0x22, 0x12, 0x21, 0x02, 0x20, + 0x11, 0x01, 0x10, 0x00, + /* Huffman table 16 - 256 entries */ + 0xEF, 0xFE, 0xDF, 0xFD, 0xCF, 0xFC, 0xBF, 0xFB, 0xAF, 0xFA, 0x9F, 0xF9, + 0xF8, 0x8F, 0x7F, 0xF7, 0x6F, 0xF6, 0xFF, 0x5F, 0xF5, 0x4F, 0xF4, 0xF3, + 0xF0, 0x3F, 0xCE, 0xEC, 0xDD, 0xDE, 0xE9, 0xEA, 0xD9, 0xEE, 0xED, 0xEB, + 0xBE, 0xCD, 0xDC, 0xDB, 0xAE, 0xCC, 0xAD, 0xDA, 0x7E, 0xAC, 0xCA, 0xC9, + 0x7D, 0x5E, 0xBD, 0xF2, 0x2F, 0x0F, 0x1F, 0xF1, 0x9E, 0xBC, 0xCB, 0x8E, + 0xE8, 0x9D, 0xE7, 0xBB, 0x8D, 0xD8, 0x6E, 0xE6, 0x9C, 0xAB, 0xBA, 0xE5, + 0xD7, 0x4E, 0xE4, 0x8C, 0xC8, 0x3E, 0x6D, 0xD6, 0x9B, 0xB9, 0xAA, 0xE1, + 0xD4, 0xB8, 0xA9, 0x7B, 0xB7, 0xD0, 0xE3, 0x0E, 0xE0, 0x5D, 0xD5, 0x7C, + 0xC7, 0x4D, 0x8B, 0x9A, 0x6C, 0xC6, 0x3D, 0x5C, 0xC5, 0x0D, 0x8A, 0xA8, + 0x99, 0x4C, 0xB6, 0x7A, 0x3C, 0x5B, 0x89, 0x1C, 0xC0, 0x98, 0x79, 0xE2, + 0x2E, 0x1E, 0xD3, 0x2D, 0xD2, 0xD1, 0x3B, 0x97, 0x88, 0x1D, 0xC4, 0x6B, + 0xC3, 0xA7, 0x2C, 0xC2, 0xB5, 0xC1, 0x0C, 0x4B, 0xB4, 0x6A, 0xA6, 0xB3, + 0x5A, 0xA5, 0x2B, 0xB2, 0x1B, 0xB1, 0x0B, 0xB0, 0x69, 0x96, 0x4A, 0xA4, + 0x78, 0x87, 0xA3, 0x3A, 0x59, 0x2A, 0x95, 0x68, 0xA1, 0x86, 
0x77, 0x94, + 0x49, 0x57, 0x67, 0xA2, 0x1A, 0x0A, 0xA0, 0x39, 0x93, 0x58, 0x85, 0x29, + 0x92, 0x76, 0x09, 0x19, 0x91, 0x90, 0x48, 0x84, 0x75, 0x38, 0x83, 0x66, + 0x28, 0x82, 0x47, 0x74, 0x18, 0x81, 0x80, 0x08, 0x56, 0x37, 0x73, 0x65, + 0x46, 0x27, 0x72, 0x64, 0x55, 0x07, 0x17, 0x71, 0x70, 0x36, 0x63, 0x45, + 0x54, 0x26, 0x62, 0x16, 0x61, 0x06, 0x60, 0x53, 0x35, 0x44, 0x25, 0x52, + 0x51, 0x15, 0x05, 0x34, 0x43, 0x50, 0x24, 0x42, 0x33, 0x14, 0x41, 0x04, + 0x40, 0x23, 0x32, 0x13, 0x31, 0x03, 0x30, 0x22, 0x12, 0x21, 0x02, 0x20, + 0x11, 0x01, 0x10, 0x00, + /* Huffman table 24 - 256 entries */ + 0xEF, 0xFE, 0xDF, 0xFD, 0xCF, 0xFC, 0xBF, 0xFB, 0xFA, 0xAF, 0x9F, 0xF9, + 0xF8, 0x8F, 0x7F, 0xF7, 0x6F, 0xF6, 0x5F, 0xF5, 0x4F, 0xF4, 0x3F, 0xF3, + 0x2F, 0xF2, 0xF1, 0x1F, 0xF0, 0x0F, 0xEE, 0xDE, 0xED, 0xCE, 0xEC, 0xDD, + 0xBE, 0xEB, 0xCD, 0xDC, 0xAE, 0xEA, 0xBD, 0xDB, 0xCC, 0x9E, 0xE9, 0xAD, + 0xDA, 0xBC, 0xCB, 0x8E, 0xE8, 0x9D, 0xD9, 0x7E, 0xE7, 0xAC, 0xFF, 0xCA, + 0xBB, 0x8D, 0xD8, 0x0E, 0xE0, 0x0D, 0xE6, 0x6E, 0x9C, 0xC9, 0x5E, 0xBA, + 0xE5, 0xAB, 0x7D, 0xD7, 0xE4, 0x8C, 0xC8, 0x4E, 0x2E, 0x3E, 0x6D, 0xD6, + 0xE3, 0x9B, 0xB9, 0xAA, 0xE2, 0x1E, 0xE1, 0x5D, 0xD5, 0x7C, 0xC7, 0x4D, + 0x8B, 0xB8, 0xD4, 0x9A, 0xA9, 0x6C, 0xC6, 0x3D, 0xD3, 0x2D, 0xD2, 0x1D, + 0x7B, 0xB7, 0xD1, 0x5C, 0xC5, 0x8A, 0xA8, 0x99, 0x4C, 0xC4, 0x6B, 0xB6, + 0xD0, 0x0C, 0x3C, 0xC3, 0x7A, 0xA7, 0x2C, 0xC2, 0x5B, 0xB5, 0x1C, 0x89, + 0x98, 0xC1, 0x4B, 0xC0, 0x0B, 0x3B, 0xB0, 0x0A, 0x1A, 0xB4, 0x6A, 0xA6, + 0x79, 0x97, 0xA0, 0x09, 0x90, 0xB3, 0x88, 0x2B, 0x5A, 0xB2, 0xA5, 0x1B, + 0xB1, 0x69, 0x96, 0xA4, 0x4A, 0x78, 0x87, 0x3A, 0xA3, 0x59, 0x95, 0x2A, + 0xA2, 0xA1, 0x68, 0x86, 0x77, 0x49, 0x94, 0x39, 0x93, 0x58, 0x85, 0x29, + 0x67, 0x76, 0x92, 0x19, 0x91, 0x48, 0x84, 0x57, 0x75, 0x38, 0x83, 0x66, + 0x28, 0x82, 0x18, 0x47, 0x74, 0x81, 0x08, 0x80, 0x56, 0x65, 0x17, 0x07, + 0x70, 0x73, 0x37, 0x27, 0x72, 0x46, 0x64, 0x55, 0x71, 0x36, 0x63, 0x45, + 0x54, 0x26, 0x62, 0x16, 0x61, 0x06, 0x60, 0x35, 0x53, 0x44, 0x25, 
0x52, + 0x15, 0x05, 0x50, 0x51, 0x34, 0x43, 0x24, 0x42, 0x33, 0x14, 0x41, 0x04, + 0x40, 0x23, 0x32, 0x13, 0x31, 0x03, 0x30, 0x22, 0x12, 0x21, 0x02, 0x20, + 0x11, 0x01, 0x10, 0x00, +}; + +static const uint8_t mpa_huff_sizes_minus_one[] = +{ + 3, 8, 8, 15, 15, 35, 35, 35, 63, 63, 63, 255, 255, 255, 255 +}; + +const uint8_t ff_mpa_huff_data[32][2] = { +{ 0, 0 }, +{ 1, 0 }, +{ 2, 0 }, +{ 3, 0 }, +{ 0, 0 }, +{ 4, 0 }, +{ 5, 0 }, +{ 6, 0 }, +{ 7, 0 }, +{ 8, 0 }, +{ 9, 0 }, +{ 10, 0 }, +{ 11, 0 }, +{ 12, 0 }, +{ 0, 0 }, +{ 13, 0 }, +{ 14, 1 }, +{ 14, 2 }, +{ 14, 3 }, +{ 14, 4 }, +{ 14, 6 }, +{ 14, 8 }, +{ 14, 10 }, +{ 14, 13 }, +{ 15, 4 }, +{ 15, 5 }, +{ 15, 6 }, +{ 15, 7 }, +{ 15, 8 }, +{ 15, 9 }, +{ 15, 11 }, +{ 15, 13 }, +}; + + +/* huffman tables for quadrules */ +static const uint8_t mpa_quad_codes[2][16] = { + { 1, 5, 4, 5, 6, 5, 4, 4, 7, 3, 6, 0, 7, 2, 3, 1, }, + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, }, +}; + +static const uint8_t mpa_quad_bits[2][16] = { + { 1, 4, 4, 5, 4, 6, 5, 6, 4, 5, 5, 6, 5, 6, 6, 6, }, + { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, +}; + +const uint8_t ff_band_size_long[9][22] = { +{ 4, 4, 4, 4, 4, 4, 6, 6, 8, 8, 10, + 12, 16, 20, 24, 28, 34, 42, 50, 54, 76, 158, }, /* 44100 */ +{ 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 10, + 12, 16, 18, 22, 28, 34, 40, 46, 54, 54, 192, }, /* 48000 */ +{ 4, 4, 4, 4, 4, 4, 6, 6, 8, 10, 12, + 16, 20, 24, 30, 38, 46, 56, 68, 84, 102, 26, }, /* 32000 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 22050 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 18, 22, 26, 32, 38, 46, 54, 62, 70, 76, 36, }, /* 24000 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 16000 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 11025 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 12000 */ +{ 12, 12, 12, 12, 12, 12, 16, 20, 24, 28, 32, + 
40, 48, 56, 64, 76, 90, 2, 2, 2, 2, 2, }, /* 8000 */ +}; + +const uint8_t ff_band_size_short[9][13] = { +{ 4, 4, 4, 4, 6, 8, 10, 12, 14, 18, 22, 30, 56, }, /* 44100 */ +{ 4, 4, 4, 4, 6, 6, 10, 12, 14, 16, 20, 26, 66, }, /* 48000 */ +{ 4, 4, 4, 4, 6, 8, 12, 16, 20, 26, 34, 42, 12, }, /* 32000 */ +{ 4, 4, 4, 6, 6, 8, 10, 14, 18, 26, 32, 42, 18, }, /* 22050 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 32, 44, 12, }, /* 24000 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 16000 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 11025 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 12000 */ +{ 8, 8, 8, 12, 16, 20, 24, 28, 36, 2, 2, 2, 26, }, /* 8000 */ +}; + +uint16_t ff_band_index_long[9][23]; + +const uint8_t ff_mpa_pretab[2][22] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 2, 0 }, +}; + +static av_cold void mpegaudiodec_common_init_static(void) +{ + const uint8_t *huff_sym = mpa_huffsymbols, *huff_lens = mpa_hufflens; + int offset; + + /* scale factors table for layer 1/2 */ + for (int i = 0; i < 64; i++) { + int shift, mod; + /* 1.0 (i = 3) is normalized to 2 ^ FRAC_BITS */ + shift = i / 3; + mod = i % 3; + ff_scale_factor_modshift[i] = mod | (shift << 2); + } + + /* huffman decode tables */ + offset = 0; + for (int i = 0; i < 15;) { + uint16_t tmp_symbols[256]; + int nb_codes_minus_one = mpa_huff_sizes_minus_one[i]; + int j; + + for (j = 0; j <= nb_codes_minus_one; j++) { + uint8_t high = huff_sym[j] & 0xF0, low = huff_sym[j] & 0xF; + + tmp_symbols[j] = high << 1 | ((high && low) << 4) | low; + } + + ff_huff_vlc[++i].table = huff_vlc_tables + offset; + ff_huff_vlc[i].table_allocated = FF_ARRAY_ELEMS(huff_vlc_tables) - offset; + ff_init_vlc_from_lengths(&ff_huff_vlc[i], 7, j, + huff_lens, 1, tmp_symbols, 2, 2, + 0, INIT_VLC_STATIC_OVERLONG, NULL); + offset += ff_huff_vlc[i].table_size; + huff_lens += j; + huff_sym += j; + } + av_assert0(offset == 
FF_ARRAY_ELEMS(huff_vlc_tables)); + + offset = 0; + for (int i = 0; i < 2; i++) { + int bits = i == 0 ? 6 : 4; + ff_huff_quad_vlc[i].table = huff_quad_vlc_tables + offset; + ff_huff_quad_vlc[i].table_allocated = 1 << bits; + offset += 1 << bits; + init_vlc(&ff_huff_quad_vlc[i], bits, 16, + mpa_quad_bits[i], 1, 1, mpa_quad_codes[i], 1, 1, + INIT_VLC_USE_NEW_STATIC); + } + av_assert0(offset == FF_ARRAY_ELEMS(huff_quad_vlc_tables)); + + for (int i = 0; i < 9; i++) { + int k = 0; + for (int j = 0; j < 22; j++) { + ff_band_index_long[i][j] = k; + k += ff_band_size_long[i][j] >> 1; + } + ff_band_index_long[i][22] = k; + } + + for (int i = 0; i < 4; i++) { + if (ff_mpa_quant_bits[i] < 0) { + for (int j = 0; j < (1 << (-ff_mpa_quant_bits[i] + 1)); j++) { + int val1, val2, val3, steps; + int val = j; + steps = ff_mpa_quant_steps[i]; + val1 = val % steps; + val /= steps; + val2 = val % steps; + val3 = val / steps; + ff_division_tabs[i][j] = val1 + (val2 << 4) + (val3 << 8); + } + } + } + mpegaudiodec_common_tableinit(); +} + +av_cold void ff_mpegaudiodec_common_init_static(void) +{ + static AVOnce init_static_once = AV_ONCE_INIT; + + ff_thread_once(&init_static_once, mpegaudiodec_common_init_static); +} diff --git a/media/ffvpx/libavcodec/mpegaudiodec_common_tablegen.h b/media/ffvpx/libavcodec/mpegaudiodec_common_tablegen.h new file mode 100644 index 0000000000..bf402c9d84 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodec_common_tablegen.h @@ -0,0 +1,72 @@ +/* + * Header file for hardcoded shared mpegaudiodec tables + * + * Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de> + * Copyright (c) 2020 Andreas Rheinhardt <andreas.rheinhardt@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGAUDIODEC_COMMON_TABLEGEN_H +#define AVCODEC_MPEGAUDIODEC_COMMON_TABLEGEN_H + +#include <stdint.h> + +#define TABLE_4_3_SIZE ((8191 + 16)*4) + +#if CONFIG_HARDCODED_TABLES +#define mpegaudiodec_common_tableinit() +#include "libavcodec/mpegaudiodec_common_tables.h" +#else +#include <math.h> +#include "libavutil/attributes.h" + +int8_t ff_table_4_3_exp [TABLE_4_3_SIZE]; +uint32_t ff_table_4_3_value[TABLE_4_3_SIZE]; + +#define FRAC_BITS 23 +#define IMDCT_SCALAR 1.759 + +static av_cold void mpegaudiodec_common_tableinit(void) +{ + static const double exp2_lut[4] = { + 1.00000000000000000000, /* 2 ^ (0 * 0.25) */ + 1.18920711500272106672, /* 2 ^ (1 * 0.25) */ + M_SQRT2 , /* 2 ^ (2 * 0.25) */ + 1.68179283050742908606, /* 2 ^ (3 * 0.25) */ + }; + double pow43_val = 0; + + for (int i = 1; i < TABLE_4_3_SIZE; i++) { + double f, fm; + int e, m; + double value = i / 4; + if ((i & 3) == 0) + pow43_val = value / IMDCT_SCALAR * cbrt(value); + f = pow43_val * exp2_lut[i & 3]; + fm = frexp(f, &e); + m = llrint(fm * (1LL << 31)); + e += FRAC_BITS - 31 + 5 - 100; + + /* normalized to FRAC_BITS */ + ff_table_4_3_value[i] = m; + ff_table_4_3_exp [i] = -e; + } +} + +#endif /* CONFIG_HARDCODED_TABLES */ +#endif /* AVCODEC_MPEGAUDIODEC_COMMON_TABLEGEN_H */ diff --git a/media/ffvpx/libavcodec/mpegaudiodec_fixed.c b/media/ffvpx/libavcodec/mpegaudiodec_fixed.c new file mode 100644 index 0000000000..b5b6822a19 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodec_fixed.c @@ -0,0 +1,148 @@ +/* + * 
Fixed-point MPEG audio decoder + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "config_components.h" +#include "libavutil/samplefmt.h" + +#define USE_FLOATS 0 + +#include "codec_internal.h" +#include "mpegaudio.h" + +#define SHR(a,b) (((int)(a))>>(b)) +/* WARNING: only correct for positive numbers */ +#define FIXR_OLD(a) ((int)((a) * FRAC_ONE + 0.5)) +#define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) +#define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5)) +#define MULH3(x, y, s) MULH((s)*(x), y) +#define MULLx(x, y, s) MULL((int)(x),(y),s) +#define RENAME(a) a ## _fixed +#define OUT_FMT AV_SAMPLE_FMT_S16 +#define OUT_FMT_P AV_SAMPLE_FMT_S16P + +/* Intensity stereo table. See commit b91d46614df189e7905538e7f5c4ed9c7ed0d274 + * (float based mp1/mp2/mp3 decoders.) for how they were created. */ +static const int32_t is_table[2][16] = { + { 0x000000, 0x1B0CB1, 0x2ED9EC, 0x400000, 0x512614, 0x64F34F, 0x800000 }, + { 0x800000, 0x64F34F, 0x512614, 0x400000, 0x2ED9EC, 0x1B0CB1, 0x000000 } +}; + +/* Antialiasing table. See commit ce4a29c066cddfc180979ed86396812f24337985 + * (optimize antialias) for how they were created. 
*/ +static const int32_t csa_table[8][4] = { + { 0x36E129F8, 0xDF128056, 0x15F3AA4E, 0xA831565E }, + { 0x386E75F2, 0xE1CF24A5, 0x1A3D9A97, 0xA960AEB3 }, + { 0x3CC6B73A, 0xEBF19FA6, 0x28B856E0, 0xAF2AE86C }, + { 0x3EEEA054, 0xF45B88BC, 0x334A2910, 0xB56CE868 }, + { 0x3FB6905C, 0xF9F27F18, 0x39A90F74, 0xBA3BEEBC }, + { 0x3FF23F20, 0xFD60D1E4, 0x3D531104, 0xBD6E92C4 }, + { 0x3FFE5932, 0xFF175EE4, 0x3F15B816, 0xBF1905B2 }, + { 0x3FFFE34A, 0xFFC3612F, 0x3FC34479, 0xBFC37DE5 } +}; + +#include "mpegaudiodec_template.c" + +#if CONFIG_MP1_DECODER +const FFCodec ff_mp1_decoder = { + .p.name = "mp1", + CODEC_LONG_NAME("MP1 (MPEG audio layer 1)"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_MP1, + .priv_data_size = sizeof(MPADecodeContext), + .init = decode_init, + FF_CODEC_DECODE_CB(decode_frame), + .p.capabilities = AV_CODEC_CAP_CHANNEL_CONF | + AV_CODEC_CAP_DR1, + .flush = flush, + .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P, + AV_SAMPLE_FMT_S16, + AV_SAMPLE_FMT_NONE }, +}; +#endif +#if CONFIG_MP2_DECODER +const FFCodec ff_mp2_decoder = { + .p.name = "mp2", + CODEC_LONG_NAME("MP2 (MPEG audio layer 2)"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_MP2, + .priv_data_size = sizeof(MPADecodeContext), + .init = decode_init, + FF_CODEC_DECODE_CB(decode_frame), + .p.capabilities = AV_CODEC_CAP_CHANNEL_CONF | + AV_CODEC_CAP_DR1, + .flush = flush, + .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P, + AV_SAMPLE_FMT_S16, + AV_SAMPLE_FMT_NONE }, +}; +#endif +#if CONFIG_MP3_DECODER +const FFCodec ff_mp3_decoder = { + .p.name = "mp3", + CODEC_LONG_NAME("MP3 (MPEG audio layer 3)"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_MP3, + .priv_data_size = sizeof(MPADecodeContext), + .init = decode_init, + FF_CODEC_DECODE_CB(decode_frame), + .p.capabilities = AV_CODEC_CAP_CHANNEL_CONF | + AV_CODEC_CAP_DR1, + .flush = flush, + .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P, + AV_SAMPLE_FMT_S16, + 
AV_SAMPLE_FMT_NONE }, +}; +#endif +#if CONFIG_MP3ADU_DECODER +const FFCodec ff_mp3adu_decoder = { + .p.name = "mp3adu", + CODEC_LONG_NAME("ADU (Application Data Unit) MP3 (MPEG audio layer 3)"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_MP3ADU, + .priv_data_size = sizeof(MPADecodeContext), + .init = decode_init, + FF_CODEC_DECODE_CB(decode_frame_adu), + .p.capabilities = AV_CODEC_CAP_CHANNEL_CONF | + AV_CODEC_CAP_DR1, + .flush = flush, + .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P, + AV_SAMPLE_FMT_S16, + AV_SAMPLE_FMT_NONE }, +}; +#endif +#if CONFIG_MP3ON4_DECODER +const FFCodec ff_mp3on4_decoder = { + .p.name = "mp3on4", + CODEC_LONG_NAME("MP3onMP4"), + .p.type = AVMEDIA_TYPE_AUDIO, + .p.id = AV_CODEC_ID_MP3ON4, + .priv_data_size = sizeof(MP3On4DecodeContext), + .init = decode_init_mp3on4, + .close = decode_close_mp3on4, + FF_CODEC_DECODE_CB(decode_frame_mp3on4), + .p.capabilities = AV_CODEC_CAP_CHANNEL_CONF | + AV_CODEC_CAP_DR1, + .flush = flush_mp3on4, + .p.sample_fmts = (const enum AVSampleFormat[]) { AV_SAMPLE_FMT_S16P, + AV_SAMPLE_FMT_NONE }, + .caps_internal = FF_CODEC_CAP_INIT_CLEANUP, +}; +#endif diff --git a/media/ffvpx/libavcodec/mpegaudiodec_template.c b/media/ffvpx/libavcodec/mpegaudiodec_template.c new file mode 100644 index 0000000000..3e4ee79be6 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodec_template.c @@ -0,0 +1,1899 @@ +/* + * MPEG Audio decoder + * Copyright (c) 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * MPEG Audio decoder + */ + +#include "config_components.h" + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/channel_layout.h" +#include "libavutil/crc.h" +#include "libavutil/float_dsp.h" +#include "libavutil/libm.h" +#include "libavutil/mem_internal.h" +#include "libavutil/thread.h" + +#include "avcodec.h" +#include "decode.h" +#include "get_bits.h" +#include "mathops.h" +#include "mpegaudiodsp.h" + +/* + * TODO: + * - test lsf / mpeg25 extensively. + */ + +#include "mpegaudio.h" +#include "mpegaudiodecheader.h" + +#define BACKSTEP_SIZE 512 +#define EXTRABYTES 24 +#define LAST_BUF_SIZE 2 * BACKSTEP_SIZE + EXTRABYTES + +/* layer 3 "granule" */ +typedef struct GranuleDef { + uint8_t scfsi; + int part2_3_length; + int big_values; + int global_gain; + int scalefac_compress; + uint8_t block_type; + uint8_t switch_point; + int table_select[3]; + int subblock_gain[3]; + uint8_t scalefac_scale; + uint8_t count1table_select; + int region_size[3]; /* number of huffman codes in each region */ + int preflag; + int short_start, long_end; /* long/short band indexes */ + uint8_t scale_factors[40]; + DECLARE_ALIGNED(16, INTFLOAT, sb_hybrid)[SBLIMIT * 18]; /* 576 samples */ +} GranuleDef; + +typedef struct MPADecodeContext { + MPA_DECODE_HEADER + uint8_t last_buf[LAST_BUF_SIZE]; + int last_buf_size; + int extrasize; + /* next header (used in free format parsing) */ + uint32_t free_format_next_header; + GetBitContext gb; + GetBitContext in_gb; + DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; + int synth_buf_offset[MPA_MAX_CHANNELS]; + DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; + 
INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */ + GranuleDef granules[2][2]; /* Used in Layer 3 */ + int adu_mode; ///< 0 for standard mp3, 1 for adu formatted mp3 + int dither_state; + int err_recognition; + AVCodecContext* avctx; + MPADSPContext mpadsp; + void (*butterflies_float)(float *av_restrict v1, float *av_restrict v2, int len); + AVFrame *frame; + uint32_t crc; +} MPADecodeContext; + +#define HEADER_SIZE 4 + +#include "mpegaudiodata.h" + +#include "mpegaudio_tablegen.h" +/* intensity stereo coef table */ +static INTFLOAT is_table_lsf[2][2][16]; + +/* [i][j]: 2^(-j/3) * FRAC_ONE * 2^(i+2) / (2^(i+2) - 1) */ +static int32_t scale_factor_mult[15][3]; +/* mult table for layer 2 group quantization */ + +#define SCALE_GEN(v) \ +{ FIXR_OLD(1.0 * (v)), FIXR_OLD(0.7937005259 * (v)), FIXR_OLD(0.6299605249 * (v)) } + +static const int32_t scale_factor_mult2[3][3] = { + SCALE_GEN(4.0 / 3.0), /* 3 steps */ + SCALE_GEN(4.0 / 5.0), /* 5 steps */ + SCALE_GEN(4.0 / 9.0), /* 9 steps */ +}; + +/** + * Convert region offsets to region sizes and truncate + * size to big_values. 
+ */ +static void region_offset2size(GranuleDef *g) +{ + int i, k, j = 0; + g->region_size[2] = 576 / 2; + /* j is the previous clamped offset, so each size is the difference of two consecutive offsets */ + for (i = 0; i < 3; i++) { + k = FFMIN(g->region_size[i], g->big_values); + g->region_size[i] = k - j; + j = k; + } +} + +/* Preset region_size[0] with a fixed value chosen by block type and sample rate index, and region_size[1] with 576 / 2. */ +static void init_short_region(MPADecodeContext *s, GranuleDef *g) +{ + if (g->block_type == 2) { + if (s->sample_rate_index != 8) + g->region_size[0] = (36 / 2); + else + g->region_size[0] = (72 / 2); + } else { + if (s->sample_rate_index <= 2) + g->region_size[0] = (36 / 2); + else if (s->sample_rate_index != 8) + g->region_size[0] = (54 / 2); + else + g->region_size[0] = (108 / 2); + } + g->region_size[1] = (576 / 2); +} + +/* Take region_size[0] and region_size[1] from ff_band_index_long: ra1 selects the first entry, ra1 + ra2 the second one (clamped to index 22). */ +static void init_long_region(MPADecodeContext *s, GranuleDef *g, + int ra1, int ra2) +{ + int l; + g->region_size[0] = ff_band_index_long[s->sample_rate_index][ra1 + 1]; + /* should not overflow */ + l = FFMIN(ra1 + ra2 + 2, 22); + g->region_size[1] = ff_band_index_long[s->sample_rate_index][ l]; +} + +/* Set g->long_end and g->short_start from the block type and switch point. */ +static void compute_band_indexes(MPADecodeContext *s, GranuleDef *g) +{ + if (g->block_type == 2) { + if (g->switch_point) { + if(s->sample_rate_index == 8) + avpriv_request_sample(s->avctx, "switch point in 8khz"); + /* if switched mode, we handle the 36 first samples as + long blocks. 
For 8000Hz, we handle the 72 first + exponents as long blocks */ + if (s->sample_rate_index <= 2) + g->long_end = 8; + else + g->long_end = 6; + + g->short_start = 3; + } else { + g->long_end = 0; + g->short_start = 0; + } + } else { + g->short_start = 13; + g->long_end = 22; + } +} + +/* layer 1 unscaling */ +/* n = number of bits of the mantissa minus 1 */ +static inline int l1_unscale(int n, int mant, int scale_factor) +{ + int shift, mod; + int64_t val; + + shift = ff_scale_factor_modshift[scale_factor]; + mod = shift & 3; + shift >>= 2; + val = MUL64((int)(mant + (-1U << n) + 1), scale_factor_mult[n-1][mod]); + shift += n; + /* NOTE: at this point, 1 <= shift <= 21 + 15 */ + return (int)((val + (1LL << (shift - 1))) >> shift); +} + +/* layer 2 unscaling of one value taken from a grouped code */ +/* steps = number of quantization steps; steps >> 2 selects the scale_factor_mult2 row */ +static inline int l2_unscale_group(int steps, int mant, int scale_factor) +{ + int shift, mod, val; + + shift = ff_scale_factor_modshift[scale_factor]; + mod = shift & 3; + shift >>= 2; + + val = (mant - (steps >> 1)) * scale_factor_mult2[steps >> 2][mod]; + /* NOTE: at this point, 0 <= shift <= 21 */ + if (shift > 0) + val = (val + (1 << (shift - 1))) >> shift; + return val; +} + +/* compute value^(4/3) * 2^(exponent/4). 
It is normalized to FRAC_BITS */ +static inline int l3_unscale(int value, int exponent) +{ + unsigned int m; + int e; + + e = ff_table_4_3_exp [4 * value + (exponent & 3)]; + m = ff_table_4_3_value[4 * value + (exponent & 3)]; + e -= exponent >> 2; +#ifdef DEBUG + if(e < 1) + av_log(NULL, AV_LOG_WARNING, "l3_unscale: e is %d\n", e); +#endif + if (e > (SUINT)31) + return 0; + m = (m + ((1U << e) >> 1)) >> e; + + return m; +} + +static av_cold void decode_init_static(void) +{ + int i, j; + + /* scale factor multiply for layer 1 */ + for (i = 0; i < 15; i++) { + int n, norm; + n = i + 2; + norm = ((INT64_C(1) << n) * FRAC_ONE) / ((1 << n) - 1); + scale_factor_mult[i][0] = MULLx(norm, FIXR(1.0 * 2.0), FRAC_BITS); + scale_factor_mult[i][1] = MULLx(norm, FIXR(0.7937005259 * 2.0), FRAC_BITS); + scale_factor_mult[i][2] = MULLx(norm, FIXR(0.6299605249 * 2.0), FRAC_BITS); + ff_dlog(NULL, "%d: norm=%x s=%"PRIx32" %"PRIx32" %"PRIx32"\n", i, + (unsigned)norm, + scale_factor_mult[i][0], + scale_factor_mult[i][1], + scale_factor_mult[i][2]); + } + + /* compute n ^ (4/3) and store it in mantissa/exp format */ + + mpegaudio_tableinit(); + + for (i = 0; i < 16; i++) { + double f; + int e, k; + + for (j = 0; j < 2; j++) { + e = -(j + 1) * ((i + 1) >> 1); + f = exp2(e / 4.0); + k = i & 1; + is_table_lsf[j][k ^ 1][i] = FIXR(f); + is_table_lsf[j][k ][i] = FIXR(1.0); + ff_dlog(NULL, "is_table_lsf %d %d: %f %f\n", + i, j, (float) is_table_lsf[j][0][i], + (float) is_table_lsf[j][1][i]); + } + } + RENAME(ff_mpa_synth_init)(); + ff_mpegaudiodec_common_init_static(); +} + +static av_cold int decode_init(AVCodecContext * avctx) +{ + static AVOnce init_static_once = AV_ONCE_INIT; + MPADecodeContext *s = avctx->priv_data; + + s->avctx = avctx; + +#if USE_FLOATS + { + AVFloatDSPContext *fdsp; + fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT); + if (!fdsp) + return AVERROR(ENOMEM); + s->butterflies_float = fdsp->butterflies_float; + av_free(fdsp); + } +#endif + + 
ff_mpadsp_init(&s->mpadsp); + + if (avctx->request_sample_fmt == OUT_FMT && + avctx->codec_id != AV_CODEC_ID_MP3ON4) + avctx->sample_fmt = OUT_FMT; + else + avctx->sample_fmt = OUT_FMT_P; + s->err_recognition = avctx->err_recognition; + + if (avctx->codec_id == AV_CODEC_ID_MP3ADU) + s->adu_mode = 1; + + ff_thread_once(&init_static_once, decode_init_static); + + return 0; +} + +#define C3 FIXHR(0.86602540378443864676/2) +#define C4 FIXHR(0.70710678118654752439/2) //0.5 / cos(pi*(9)/36) +#define C5 FIXHR(0.51763809020504152469/2) //0.5 / cos(pi*(5)/36) +#define C6 FIXHR(1.93185165257813657349/4) //0.5 / cos(pi*(15)/36) + +/* 12 points IMDCT. We compute it "by hand" by factorizing obvious + cases. */ +static void imdct12(INTFLOAT *out, SUINTFLOAT *in) +{ + SUINTFLOAT in0, in1, in2, in3, in4, in5, t1, t2; + + in0 = in[0*3]; + in1 = in[1*3] + in[0*3]; + in2 = in[2*3] + in[1*3]; + in3 = in[3*3] + in[2*3]; + in4 = in[4*3] + in[3*3]; + in5 = in[5*3] + in[4*3]; + in5 += in3; + in3 += in1; + + in2 = MULH3(in2, C3, 2); + in3 = MULH3(in3, C3, 4); + + t1 = in0 - in4; + t2 = MULH3(in1 - in5, C4, 2); + + out[ 7] = + out[10] = t1 + t2; + out[ 1] = + out[ 4] = t1 - t2; + + in0 += SHR(in4, 1); + in4 = in0 + in2; + in5 += 2*in1; + in1 = MULH3(in5 + in3, C5, 1); + out[ 8] = + out[ 9] = in4 + in1; + out[ 2] = + out[ 3] = in4 - in1; + + in0 -= in2; + in5 = MULH3(in5 - in3, C6, 2); + out[ 0] = + out[ 5] = in0 - in5; + out[ 6] = + out[11] = in0 + in5; +} + +static int handle_crc(MPADecodeContext *s, int sec_len) +{ + if (s->error_protection && (s->err_recognition & AV_EF_CRCCHECK)) { + const uint8_t *buf = s->gb.buffer - HEADER_SIZE; + int sec_byte_len = sec_len >> 3; + int sec_rem_bits = sec_len & 7; + const AVCRC *crc_tab = av_crc_get_table(AV_CRC_16_ANSI); + uint8_t tmp_buf[4]; + uint32_t crc_val = av_crc(crc_tab, UINT16_MAX, &buf[2], 2); + crc_val = av_crc(crc_tab, crc_val, &buf[6], sec_byte_len); + + AV_WB32(tmp_buf, + ((buf[6 + sec_byte_len] & (0xFF00U >> sec_rem_bits)) << 24) + + 
((s->crc << 16) >> sec_rem_bits)); + + crc_val = av_crc(crc_tab, crc_val, tmp_buf, 3); + + if (crc_val) { + av_log(s->avctx, AV_LOG_ERROR, "CRC mismatch %X!\n", crc_val); + if (s->err_recognition & AV_EF_EXPLODE) + return AVERROR_INVALIDDATA; + } + } + return 0; +} + +/* return the number of decoded frames */ +static int mp_decode_layer1(MPADecodeContext *s) +{ + int bound, i, v, n, ch, j, mant; + uint8_t allocation[MPA_MAX_CHANNELS][SBLIMIT]; + uint8_t scale_factors[MPA_MAX_CHANNELS][SBLIMIT]; + int ret; + + ret = handle_crc(s, (s->nb_channels == 1) ? 8*16 : 8*32); + if (ret < 0) + return ret; + + if (s->mode == MPA_JSTEREO) + bound = (s->mode_ext + 1) * 4; + else + bound = SBLIMIT; + + /* allocation bits */ + for (i = 0; i < bound; i++) { + for (ch = 0; ch < s->nb_channels; ch++) { + allocation[ch][i] = get_bits(&s->gb, 4); + } + } + for (i = bound; i < SBLIMIT; i++) + allocation[0][i] = get_bits(&s->gb, 4); + + /* scale factors */ + for (i = 0; i < bound; i++) { + for (ch = 0; ch < s->nb_channels; ch++) { + if (allocation[ch][i]) + scale_factors[ch][i] = get_bits(&s->gb, 6); + } + } + for (i = bound; i < SBLIMIT; i++) { + if (allocation[0][i]) { + scale_factors[0][i] = get_bits(&s->gb, 6); + scale_factors[1][i] = get_bits(&s->gb, 6); + } + } + + /* compute samples */ + for (j = 0; j < 12; j++) { + for (i = 0; i < bound; i++) { + for (ch = 0; ch < s->nb_channels; ch++) { + n = allocation[ch][i]; + if (n) { + mant = get_bits(&s->gb, n + 1); + v = l1_unscale(n, mant, scale_factors[ch][i]); + } else { + v = 0; + } + s->sb_samples[ch][j][i] = v; + } + } + for (i = bound; i < SBLIMIT; i++) { + n = allocation[0][i]; + if (n) { + mant = get_bits(&s->gb, n + 1); + v = l1_unscale(n, mant, scale_factors[0][i]); + s->sb_samples[0][j][i] = v; + v = l1_unscale(n, mant, scale_factors[1][i]); + s->sb_samples[1][j][i] = v; + } else { + s->sb_samples[0][j][i] = 0; + s->sb_samples[1][j][i] = 0; + } + } + } + return 12; +} + +static int mp_decode_layer2(MPADecodeContext *s) +{ + 
int sblimit; /* number of used subbands */ + const unsigned char *alloc_table; + int table, bit_alloc_bits, i, j, ch, bound, v; + unsigned char bit_alloc[MPA_MAX_CHANNELS][SBLIMIT]; + unsigned char scale_code[MPA_MAX_CHANNELS][SBLIMIT]; + unsigned char scale_factors[MPA_MAX_CHANNELS][SBLIMIT][3], *sf; + int scale, qindex, bits, steps, k, l, m, b; + int ret; + + /* select decoding table */ + table = ff_mpa_l2_select_table(s->bit_rate / 1000, s->nb_channels, + s->sample_rate, s->lsf); + sblimit = ff_mpa_sblimit_table[table]; + alloc_table = ff_mpa_alloc_tables[table]; + + if (s->mode == MPA_JSTEREO) + bound = (s->mode_ext + 1) * 4; + else + bound = sblimit; + + ff_dlog(s->avctx, "bound=%d sblimit=%d\n", bound, sblimit); + + /* sanity check */ + if (bound > sblimit) + bound = sblimit; + + /* parse bit allocation */ + j = 0; + for (i = 0; i < bound; i++) { + bit_alloc_bits = alloc_table[j]; + for (ch = 0; ch < s->nb_channels; ch++) + bit_alloc[ch][i] = get_bits(&s->gb, bit_alloc_bits); + j += 1 << bit_alloc_bits; + } + for (i = bound; i < sblimit; i++) { + bit_alloc_bits = alloc_table[j]; + v = get_bits(&s->gb, bit_alloc_bits); + bit_alloc[0][i] = v; + bit_alloc[1][i] = v; + j += 1 << bit_alloc_bits; + } + + /* scale codes */ + for (i = 0; i < sblimit; i++) { + for (ch = 0; ch < s->nb_channels; ch++) { + if (bit_alloc[ch][i]) + scale_code[ch][i] = get_bits(&s->gb, 2); + } + } + + ret = handle_crc(s, get_bits_count(&s->gb) - 16); + if (ret < 0) + return ret; + + /* scale factors */ + for (i = 0; i < sblimit; i++) { + for (ch = 0; ch < s->nb_channels; ch++) { + if (bit_alloc[ch][i]) { + sf = scale_factors[ch][i]; + switch (scale_code[ch][i]) { + default: + case 0: + sf[0] = get_bits(&s->gb, 6); + sf[1] = get_bits(&s->gb, 6); + sf[2] = get_bits(&s->gb, 6); + break; + case 2: + sf[0] = get_bits(&s->gb, 6); + sf[1] = sf[0]; + sf[2] = sf[0]; + break; + case 1: + sf[0] = get_bits(&s->gb, 6); + sf[2] = get_bits(&s->gb, 6); + sf[1] = sf[0]; + break; + case 3: + sf[0] = 
get_bits(&s->gb, 6); + sf[2] = get_bits(&s->gb, 6); + sf[1] = sf[2]; + break; + } + } + } + } + + /* samples */ + for (k = 0; k < 3; k++) { + for (l = 0; l < 12; l += 3) { + j = 0; + for (i = 0; i < bound; i++) { + bit_alloc_bits = alloc_table[j]; + for (ch = 0; ch < s->nb_channels; ch++) { + b = bit_alloc[ch][i]; + if (b) { + scale = scale_factors[ch][i][k]; + qindex = alloc_table[j+b]; + bits = ff_mpa_quant_bits[qindex]; + if (bits < 0) { + int v2; + /* 3 values at the same time */ + v = get_bits(&s->gb, -bits); + v2 = ff_division_tabs[qindex][v]; + steps = ff_mpa_quant_steps[qindex]; + + s->sb_samples[ch][k * 12 + l + 0][i] = + l2_unscale_group(steps, v2 & 15, scale); + s->sb_samples[ch][k * 12 + l + 1][i] = + l2_unscale_group(steps, (v2 >> 4) & 15, scale); + s->sb_samples[ch][k * 12 + l + 2][i] = + l2_unscale_group(steps, v2 >> 8 , scale); + } else { + for (m = 0; m < 3; m++) { + v = get_bits(&s->gb, bits); + v = l1_unscale(bits - 1, v, scale); + s->sb_samples[ch][k * 12 + l + m][i] = v; + } + } + } else { + s->sb_samples[ch][k * 12 + l + 0][i] = 0; + s->sb_samples[ch][k * 12 + l + 1][i] = 0; + s->sb_samples[ch][k * 12 + l + 2][i] = 0; + } + } + /* next subband in alloc table */ + j += 1 << bit_alloc_bits; + } + /* XXX: find a way to avoid this duplication of code */ + for (i = bound; i < sblimit; i++) { + bit_alloc_bits = alloc_table[j]; + b = bit_alloc[0][i]; + if (b) { + int mant, scale0, scale1; + scale0 = scale_factors[0][i][k]; + scale1 = scale_factors[1][i][k]; + qindex = alloc_table[j + b]; + bits = ff_mpa_quant_bits[qindex]; + if (bits < 0) { + /* 3 values at the same time */ + v = get_bits(&s->gb, -bits); + steps = ff_mpa_quant_steps[qindex]; + mant = v % steps; + v = v / steps; + s->sb_samples[0][k * 12 + l + 0][i] = + l2_unscale_group(steps, mant, scale0); + s->sb_samples[1][k * 12 + l + 0][i] = + l2_unscale_group(steps, mant, scale1); + mant = v % steps; + v = v / steps; + s->sb_samples[0][k * 12 + l + 1][i] = + l2_unscale_group(steps, mant, 
scale0); + s->sb_samples[1][k * 12 + l + 1][i] = + l2_unscale_group(steps, mant, scale1); + s->sb_samples[0][k * 12 + l + 2][i] = + l2_unscale_group(steps, v, scale0); + s->sb_samples[1][k * 12 + l + 2][i] = + l2_unscale_group(steps, v, scale1); + } else { + for (m = 0; m < 3; m++) { + mant = get_bits(&s->gb, bits); + s->sb_samples[0][k * 12 + l + m][i] = + l1_unscale(bits - 1, mant, scale0); + s->sb_samples[1][k * 12 + l + m][i] = + l1_unscale(bits - 1, mant, scale1); + } + } + } else { + s->sb_samples[0][k * 12 + l + 0][i] = 0; + s->sb_samples[0][k * 12 + l + 1][i] = 0; + s->sb_samples[0][k * 12 + l + 2][i] = 0; + s->sb_samples[1][k * 12 + l + 0][i] = 0; + s->sb_samples[1][k * 12 + l + 1][i] = 0; + s->sb_samples[1][k * 12 + l + 2][i] = 0; + } + /* next subband in alloc table */ + j += 1 << bit_alloc_bits; + } + /* fill remaining samples to zero */ + for (i = sblimit; i < SBLIMIT; i++) { + for (ch = 0; ch < s->nb_channels; ch++) { + s->sb_samples[ch][k * 12 + l + 0][i] = 0; + s->sb_samples[ch][k * 12 + l + 1][i] = 0; + s->sb_samples[ch][k * 12 + l + 2][i] = 0; + } + } + } + } + return 3 * 12; +} + +#define SPLIT(dst,sf,n) \ + if (n == 3) { \ + int m = (sf * 171) >> 9; \ + dst = sf - 3 * m; \ + sf = m; \ + } else if (n == 4) { \ + dst = sf & 3; \ + sf >>= 2; \ + } else if (n == 5) { \ + int m = (sf * 205) >> 10; \ + dst = sf - 5 * m; \ + sf = m; \ + } else if (n == 6) { \ + int m = (sf * 171) >> 10; \ + dst = sf - 6 * m; \ + sf = m; \ + } else { \ + dst = 0; \ + } + +static av_always_inline void lsf_sf_expand(int *slen, int sf, int n1, int n2, + int n3) +{ + SPLIT(slen[3], sf, n3) + SPLIT(slen[2], sf, n2) + SPLIT(slen[1], sf, n1) + slen[0] = sf; +} + +static void exponents_from_scale_factors(MPADecodeContext *s, GranuleDef *g, + int16_t *exponents) +{ + const uint8_t *bstab, *pretab; + int len, i, j, k, l, v0, shift, gain, gains[3]; + int16_t *exp_ptr; + + exp_ptr = exponents; + gain = g->global_gain - 210; + shift = g->scalefac_scale + 1; + + bstab = 
ff_band_size_long[s->sample_rate_index]; + pretab = ff_mpa_pretab[g->preflag]; + for (i = 0; i < g->long_end; i++) { + v0 = gain - ((g->scale_factors[i] + pretab[i]) << shift) + 400; + len = bstab[i]; + for (j = len; j > 0; j--) + *exp_ptr++ = v0; + } + + if (g->short_start < 13) { + bstab = ff_band_size_short[s->sample_rate_index]; + gains[0] = gain - (g->subblock_gain[0] << 3); + gains[1] = gain - (g->subblock_gain[1] << 3); + gains[2] = gain - (g->subblock_gain[2] << 3); + k = g->long_end; + for (i = g->short_start; i < 13; i++) { + len = bstab[i]; + for (l = 0; l < 3; l++) { + v0 = gains[l] - (g->scale_factors[k++] << shift) + 400; + for (j = len; j > 0; j--) + *exp_ptr++ = v0; + } + } + } +} + +static void switch_buffer(MPADecodeContext *s, int *pos, int *end_pos, + int *end_pos2) +{ + if (s->in_gb.buffer && *pos >= s->gb.size_in_bits - s->extrasize * 8) { + s->gb = s->in_gb; + s->in_gb.buffer = NULL; + s->extrasize = 0; + av_assert2((get_bits_count(&s->gb) & 7) == 0); + skip_bits_long(&s->gb, *pos - *end_pos); + *end_pos2 = + *end_pos = *end_pos2 + get_bits_count(&s->gb) - *pos; + *pos = get_bits_count(&s->gb); + } +} + +/* Following is an optimized code for + INTFLOAT v = *src + if(get_bits1(&s->gb)) + v = -v; + *dst = v; +*/ +#if USE_FLOATS +#define READ_FLIP_SIGN(dst,src) \ + v = AV_RN32A(src) ^ (get_bits1(&s->gb) << 31); \ + AV_WN32A(dst, v); +#else +#define READ_FLIP_SIGN(dst,src) \ + v = -get_bits1(&s->gb); \ + *(dst) = (*(src) ^ v) - v; +#endif + +static int huffman_decode(MPADecodeContext *s, GranuleDef *g, + int16_t *exponents, int end_pos2) +{ + int s_index; + int i; + int last_pos, bits_left; + VLC *vlc; + int end_pos = FFMIN(end_pos2, s->gb.size_in_bits - s->extrasize * 8); + + /* low frequencies (called big values) */ + s_index = 0; + for (i = 0; i < 3; i++) { + int j, k, l, linbits; + j = g->region_size[i]; + if (j == 0) + continue; + /* select vlc table */ + k = g->table_select[i]; + l = ff_mpa_huff_data[k][0]; + linbits = 
ff_mpa_huff_data[k][1]; + vlc = &ff_huff_vlc[l]; + + if (!l) { + memset(&g->sb_hybrid[s_index], 0, sizeof(*g->sb_hybrid) * 2 * j); + s_index += 2 * j; + continue; + } + + /* read huffcode and compute each couple */ + for (; j > 0; j--) { + int exponent, x, y; + int v; + int pos = get_bits_count(&s->gb); + + if (pos >= end_pos){ + switch_buffer(s, &pos, &end_pos, &end_pos2); + if (pos >= end_pos) + break; + } + y = get_vlc2(&s->gb, vlc->table, 7, 3); + + if (!y) { + g->sb_hybrid[s_index ] = + g->sb_hybrid[s_index + 1] = 0; + s_index += 2; + continue; + } + + exponent= exponents[s_index]; + + ff_dlog(s->avctx, "region=%d n=%d y=%d exp=%d\n", + i, g->region_size[i] - j, y, exponent); + if (y & 16) { + x = y >> 5; + y = y & 0x0f; + if (x < 15) { + READ_FLIP_SIGN(g->sb_hybrid + s_index, RENAME(expval_table)[exponent] + x) + } else { + x += get_bitsz(&s->gb, linbits); + v = l3_unscale(x, exponent); + if (get_bits1(&s->gb)) + v = -v; + g->sb_hybrid[s_index] = v; + } + if (y < 15) { + READ_FLIP_SIGN(g->sb_hybrid + s_index + 1, RENAME(expval_table)[exponent] + y) + } else { + y += get_bitsz(&s->gb, linbits); + v = l3_unscale(y, exponent); + if (get_bits1(&s->gb)) + v = -v; + g->sb_hybrid[s_index + 1] = v; + } + } else { + x = y >> 5; + y = y & 0x0f; + x += y; + if (x < 15) { + READ_FLIP_SIGN(g->sb_hybrid + s_index + !!y, RENAME(expval_table)[exponent] + x) + } else { + x += get_bitsz(&s->gb, linbits); + v = l3_unscale(x, exponent); + if (get_bits1(&s->gb)) + v = -v; + g->sb_hybrid[s_index+!!y] = v; + } + g->sb_hybrid[s_index + !y] = 0; + } + s_index += 2; + } + } + + /* high frequencies */ + vlc = &ff_huff_quad_vlc[g->count1table_select]; + last_pos = 0; + while (s_index <= 572) { + int pos, code; + pos = get_bits_count(&s->gb); + if (pos >= end_pos) { + if (pos > end_pos2 && last_pos) { + /* some encoders generate an incorrect size for this + part. 
We must go back into the data */ + s_index -= 4; + skip_bits_long(&s->gb, last_pos - pos); + av_log(s->avctx, AV_LOG_INFO, "overread, skip %d enddists: %d %d\n", last_pos - pos, end_pos-pos, end_pos2-pos); + if(s->err_recognition & (AV_EF_BITSTREAM|AV_EF_COMPLIANT)) + s_index=0; + break; + } + switch_buffer(s, &pos, &end_pos, &end_pos2); + if (pos >= end_pos) + break; + } + last_pos = pos; + + code = get_vlc2(&s->gb, vlc->table, vlc->bits, 1); + ff_dlog(s->avctx, "t=%d code=%d\n", g->count1table_select, code); + g->sb_hybrid[s_index + 0] = + g->sb_hybrid[s_index + 1] = + g->sb_hybrid[s_index + 2] = + g->sb_hybrid[s_index + 3] = 0; + while (code) { + static const int idxtab[16] = { 3,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0 }; + int v; + int pos = s_index + idxtab[code]; + code ^= 8 >> idxtab[code]; + READ_FLIP_SIGN(g->sb_hybrid + pos, RENAME(exp_table)+exponents[pos]) + } + s_index += 4; + } + /* skip extension bits */ + bits_left = end_pos2 - get_bits_count(&s->gb); + if (bits_left < 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_COMPLIANT))) { + av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left); + s_index=0; + } else if (bits_left > 0 && (s->err_recognition & (AV_EF_BUFFER|AV_EF_AGGRESSIVE))) { + av_log(s->avctx, AV_LOG_ERROR, "bits_left=%d\n", bits_left); + s_index = 0; + } + memset(&g->sb_hybrid[s_index], 0, sizeof(*g->sb_hybrid) * (576 - s_index)); + skip_bits_long(&s->gb, bits_left); + + i = get_bits_count(&s->gb); + switch_buffer(s, &i, &end_pos, &end_pos2); + + return 0; +} + +/* Reorder short blocks from bitstream order to interleaved order. 
It + would be faster to do it in parsing, but the code would be far more + complicated */
static void reorder_block(MPADecodeContext *s, GranuleDef *g)
{
    INTFLOAT scratch[576];
    INTFLOAT *src;
    int band;

    /* only short blocks (block_type 2) are stored window by window */
    if (g->block_type != 2)
        return;

    /* with a switch point the leading coefficients stay in place:
       36 of them, or 72 when sample_rate_index is 8 */
    if (!g->switch_point)
        src = g->sb_hybrid;
    else if (s->sample_rate_index != 8)
        src = g->sb_hybrid + 36;
    else
        src = g->sb_hybrid + 72;

    for (band = g->short_start; band < 13; band++) {
        int width            = ff_band_size_short[s->sample_rate_index][band];
        INTFLOAT *band_start = src;
        INTFLOAT *out        = scratch;
        int n;

        /* the band holds three runs of `width` values, one per window;
           emit them as (w0, w1, w2) triplets */
        for (n = 0; n < width; n++) {
            *out++ = src[0 * width];
            *out++ = src[1 * width];
            *out++ = src[2 * width];
            src++;
        }
        src += 2 * width;
        memcpy(band_start, scratch, width * 3 * sizeof(*band_start));
    }
}

#define ISQRT2 FIXR(0.70710678118654752440)

+static void compute_stereo(MPADecodeContext *s, GranuleDef *g0, GranuleDef *g1) +{ + int i, j, k, l; + int sf_max, sf, len, non_zero_found; + INTFLOAT *tab0, *tab1, v1, v2; + const INTFLOAT (*is_tab)[16]; + SUINTFLOAT tmp0, tmp1; + int non_zero_found_short[3]; + + /* intensity stereo */ + if (s->mode_ext & MODE_EXT_I_STEREO) { + if (!s->lsf) { + is_tab = is_table; + sf_max = 7; + } else { + is_tab = is_table_lsf[g1->scalefac_compress & 1]; + sf_max = 16; + } + + tab0 = g0->sb_hybrid + 576; + tab1 = g1->sb_hybrid + 576; + + non_zero_found_short[0] = 0; + non_zero_found_short[1] = 0; + non_zero_found_short[2] = 0; + k = (13 - g1->short_start) * 3 + g1->long_end - 3; + for (i = 12; i >= g1->short_start; i--) { + /* for last band, use previous scale factor */ + if (i != 11) + k -= 3; + len = ff_band_size_short[s->sample_rate_index][i]; + for (l = 2; l >= 0; l--) { + tab0 -= len; + tab1 -= len; + if (!non_zero_found_short[l]) { + /* test if non zero band. 
if so, stop doing i-stereo */ + for (j = 0; j < len; j++) { + if (tab1[j] != 0) { + non_zero_found_short[l] = 1; + goto found1; + } + } + sf = g1->scale_factors[k + l]; + if (sf >= sf_max) + goto found1; + + v1 = is_tab[0][sf]; + v2 = is_tab[1][sf]; + for (j = 0; j < len; j++) { + tmp0 = tab0[j]; + tab0[j] = MULLx(tmp0, v1, FRAC_BITS); + tab1[j] = MULLx(tmp0, v2, FRAC_BITS); + } + } else { +found1: + if (s->mode_ext & MODE_EXT_MS_STEREO) { + /* lower part of the spectrum : do ms stereo + if enabled */ + for (j = 0; j < len; j++) { + tmp0 = tab0[j]; + tmp1 = tab1[j]; + tab0[j] = MULLx(tmp0 + tmp1, ISQRT2, FRAC_BITS); + tab1[j] = MULLx(tmp0 - tmp1, ISQRT2, FRAC_BITS); + } + } + } + } + } + + non_zero_found = non_zero_found_short[0] | + non_zero_found_short[1] | + non_zero_found_short[2]; + + for (i = g1->long_end - 1;i >= 0;i--) { + len = ff_band_size_long[s->sample_rate_index][i]; + tab0 -= len; + tab1 -= len; + /* test if non zero band. if so, stop doing i-stereo */ + if (!non_zero_found) { + for (j = 0; j < len; j++) { + if (tab1[j] != 0) { + non_zero_found = 1; + goto found2; + } + } + /* for last band, use previous scale factor */ + k = (i == 21) ? 
20 : i; + sf = g1->scale_factors[k]; + if (sf >= sf_max) + goto found2; + v1 = is_tab[0][sf]; + v2 = is_tab[1][sf]; + for (j = 0; j < len; j++) { + tmp0 = tab0[j]; + tab0[j] = MULLx(tmp0, v1, FRAC_BITS); + tab1[j] = MULLx(tmp0, v2, FRAC_BITS); + } + } else { +found2: + if (s->mode_ext & MODE_EXT_MS_STEREO) { + /* lower part of the spectrum : do ms stereo + if enabled */ + for (j = 0; j < len; j++) { + tmp0 = tab0[j]; + tmp1 = tab1[j]; + tab0[j] = MULLx(tmp0 + tmp1, ISQRT2, FRAC_BITS); + tab1[j] = MULLx(tmp0 - tmp1, ISQRT2, FRAC_BITS); + } + } + } + } + } else if (s->mode_ext & MODE_EXT_MS_STEREO) { + /* ms stereo ONLY */ + /* NOTE: the 1/sqrt(2) normalization factor is included in the + global gain */ +#if USE_FLOATS + s->butterflies_float(g0->sb_hybrid, g1->sb_hybrid, 576); +#else + tab0 = g0->sb_hybrid; + tab1 = g1->sb_hybrid; + for (i = 0; i < 576; i++) { + tmp0 = tab0[i]; + tmp1 = tab1[i]; + tab0[i] = tmp0 + tmp1; + tab1[i] = tmp0 - tmp1; + } +#endif + } +} + +#if USE_FLOATS +#if HAVE_MIPSFPU +# include "mips/compute_antialias_float.h" +#endif /* HAVE_MIPSFPU */ +#else +#if HAVE_MIPSDSP +# include "mips/compute_antialias_fixed.h" +#endif /* HAVE_MIPSDSP */ +#endif /* USE_FLOATS */ + +#ifndef compute_antialias +#if USE_FLOATS +#define AA(j) do { \ + float tmp0 = ptr[-1-j]; \ + float tmp1 = ptr[ j]; \ + ptr[-1-j] = tmp0 * csa_table[j][0] - tmp1 * csa_table[j][1]; \ + ptr[ j] = tmp0 * csa_table[j][1] + tmp1 * csa_table[j][0]; \ + } while (0) +#else +#define AA(j) do { \ + SUINT tmp0 = ptr[-1-j]; \ + SUINT tmp1 = ptr[ j]; \ + SUINT tmp2 = MULH(tmp0 + tmp1, csa_table[j][0]); \ + ptr[-1-j] = 4 * (tmp2 - MULH(tmp1, csa_table[j][2])); \ + ptr[ j] = 4 * (tmp2 + MULH(tmp0, csa_table[j][3])); \ + } while (0) +#endif + +static void compute_antialias(MPADecodeContext *s, GranuleDef *g) +{ + INTFLOAT *ptr; + int n, i; + + /* we antialias only "long" bands */ + if (g->block_type == 2) { + if (!g->switch_point) + return; + /* XXX: check this for 8000Hz case */ + n = 1; + } 
else { + n = SBLIMIT - 1; + } + + ptr = g->sb_hybrid + 18; + for (i = n; i > 0; i--) { + AA(0); + AA(1); + AA(2); + AA(3); + AA(4); + AA(5); + AA(6); + AA(7); + + ptr += 18; + } +} +#endif /* compute_antialias */ + +static void compute_imdct(MPADecodeContext *s, GranuleDef *g, + INTFLOAT *sb_samples, INTFLOAT *mdct_buf) +{ + INTFLOAT *win, *out_ptr, *ptr, *buf, *ptr1; + INTFLOAT out2[12]; + int i, j, mdct_long_end, sblimit; + + /* find last non zero block */ + ptr = g->sb_hybrid + 576; + ptr1 = g->sb_hybrid + 2 * 18; + while (ptr >= ptr1) { + int32_t *p; + ptr -= 6; + p = (int32_t*)ptr; + if (p[0] | p[1] | p[2] | p[3] | p[4] | p[5]) + break; + } + sblimit = ((ptr - g->sb_hybrid) / 18) + 1; + + if (g->block_type == 2) { + /* XXX: check for 8000 Hz */ + if (g->switch_point) + mdct_long_end = 2; + else + mdct_long_end = 0; + } else { + mdct_long_end = sblimit; + } + + s->mpadsp.RENAME(imdct36_blocks)(sb_samples, mdct_buf, g->sb_hybrid, + mdct_long_end, g->switch_point, + g->block_type); + + buf = mdct_buf + 4*18*(mdct_long_end >> 2) + (mdct_long_end & 3); + ptr = g->sb_hybrid + 18 * mdct_long_end; + + for (j = mdct_long_end; j < sblimit; j++) { + /* select frequency inversion */ + win = RENAME(ff_mdct_win)[2 + (4 & -(j & 1))]; + out_ptr = sb_samples + j; + + for (i = 0; i < 6; i++) { + *out_ptr = buf[4*i]; + out_ptr += SBLIMIT; + } + imdct12(out2, ptr + 0); + for (i = 0; i < 6; i++) { + *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*1)]; + buf[4*(i + 6*2)] = MULH3(out2[i + 6], win[i + 6], 1); + out_ptr += SBLIMIT; + } + imdct12(out2, ptr + 1); + for (i = 0; i < 6; i++) { + *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*2)]; + buf[4*(i + 6*0)] = MULH3(out2[i + 6], win[i + 6], 1); + out_ptr += SBLIMIT; + } + imdct12(out2, ptr + 2); + for (i = 0; i < 6; i++) { + buf[4*(i + 6*0)] = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*0)]; + buf[4*(i + 6*1)] = MULH3(out2[i + 6], win[i + 6], 1); + buf[4*(i + 6*2)] = 0; + } + ptr += 18; + buf += (j&3) != 3 ? 
1 : (4*18-3); + } + /* zero bands */ + for (j = sblimit; j < SBLIMIT; j++) { + /* overlap */ + out_ptr = sb_samples + j; + for (i = 0; i < 18; i++) { + *out_ptr = buf[4*i]; + buf[4*i] = 0; + out_ptr += SBLIMIT; + } + buf += (j&3) != 3 ? 1 : (4*18-3); + } +} + +/* main layer3 decoding function */ +static int mp_decode_layer3(MPADecodeContext *s) +{ + int nb_granules, main_data_begin; + int gr, ch, blocksplit_flag, i, j, k, n, bits_pos; + GranuleDef *g; + int16_t exponents[576]; //FIXME try INTFLOAT + int ret; + + /* read side info */ + if (s->lsf) { + ret = handle_crc(s, ((s->nb_channels == 1) ? 8*9 : 8*17)); + main_data_begin = get_bits(&s->gb, 8); + skip_bits(&s->gb, s->nb_channels); + nb_granules = 1; + } else { + ret = handle_crc(s, ((s->nb_channels == 1) ? 8*17 : 8*32)); + main_data_begin = get_bits(&s->gb, 9); + if (s->nb_channels == 2) + skip_bits(&s->gb, 3); + else + skip_bits(&s->gb, 5); + nb_granules = 2; + for (ch = 0; ch < s->nb_channels; ch++) { + s->granules[ch][0].scfsi = 0;/* all scale factors are transmitted */ + s->granules[ch][1].scfsi = get_bits(&s->gb, 4); + } + } + if (ret < 0) + return ret; + + for (gr = 0; gr < nb_granules; gr++) { + for (ch = 0; ch < s->nb_channels; ch++) { + ff_dlog(s->avctx, "gr=%d ch=%d: side_info\n", gr, ch); + g = &s->granules[ch][gr]; + g->part2_3_length = get_bits(&s->gb, 12); + g->big_values = get_bits(&s->gb, 9); + if (g->big_values > 288) { + av_log(s->avctx, AV_LOG_ERROR, "big_values too big\n"); + return AVERROR_INVALIDDATA; + } + + g->global_gain = get_bits(&s->gb, 8); + /* if MS stereo only is selected, we precompute the + 1/sqrt(2) renormalization factor */ + if ((s->mode_ext & (MODE_EXT_MS_STEREO | MODE_EXT_I_STEREO)) == + MODE_EXT_MS_STEREO) + g->global_gain -= 2; + if (s->lsf) + g->scalefac_compress = get_bits(&s->gb, 9); + else + g->scalefac_compress = get_bits(&s->gb, 4); + blocksplit_flag = get_bits1(&s->gb); + if (blocksplit_flag) { + g->block_type = get_bits(&s->gb, 2); + if (g->block_type == 0) { + 
av_log(s->avctx, AV_LOG_ERROR, "invalid block type\n"); + return AVERROR_INVALIDDATA; + } + g->switch_point = get_bits1(&s->gb); + for (i = 0; i < 2; i++) + g->table_select[i] = get_bits(&s->gb, 5); + for (i = 0; i < 3; i++) + g->subblock_gain[i] = get_bits(&s->gb, 3); + init_short_region(s, g); + } else { + int region_address1, region_address2; + g->block_type = 0; + g->switch_point = 0; + for (i = 0; i < 3; i++) + g->table_select[i] = get_bits(&s->gb, 5); + /* compute huffman coded region sizes */ + region_address1 = get_bits(&s->gb, 4); + region_address2 = get_bits(&s->gb, 3); + ff_dlog(s->avctx, "region1=%d region2=%d\n", + region_address1, region_address2); + init_long_region(s, g, region_address1, region_address2); + } + region_offset2size(g); + compute_band_indexes(s, g); + + g->preflag = 0; + if (!s->lsf) + g->preflag = get_bits1(&s->gb); + g->scalefac_scale = get_bits1(&s->gb); + g->count1table_select = get_bits1(&s->gb); + ff_dlog(s->avctx, "block_type=%d switch_point=%d\n", + g->block_type, g->switch_point); + } + } + + if (!s->adu_mode) { + int skip; + const uint8_t *ptr = s->gb.buffer + (get_bits_count(&s->gb) >> 3); + s->extrasize = av_clip((get_bits_left(&s->gb) >> 3) - s->extrasize, 0, + FFMAX(0, LAST_BUF_SIZE - s->last_buf_size)); + av_assert1((get_bits_count(&s->gb) & 7) == 0); + /* now we get bits from the main_data_begin offset */ + ff_dlog(s->avctx, "seekback:%d, lastbuf:%d\n", + main_data_begin, s->last_buf_size); + + memcpy(s->last_buf + s->last_buf_size, ptr, s->extrasize); + s->in_gb = s->gb; + init_get_bits(&s->gb, s->last_buf, (s->last_buf_size + s->extrasize) * 8); + s->last_buf_size <<= 3; + for (gr = 0; gr < nb_granules && (s->last_buf_size >> 3) < main_data_begin; gr++) { + for (ch = 0; ch < s->nb_channels; ch++) { + g = &s->granules[ch][gr]; + s->last_buf_size += g->part2_3_length; + memset(g->sb_hybrid, 0, sizeof(g->sb_hybrid)); + compute_imdct(s, g, &s->sb_samples[ch][18 * gr][0], s->mdct_buf[ch]); + } + } + skip = s->last_buf_size 
- 8 * main_data_begin; + if (skip >= s->gb.size_in_bits - s->extrasize * 8 && s->in_gb.buffer) { + skip_bits_long(&s->in_gb, skip - s->gb.size_in_bits + s->extrasize * 8); + s->gb = s->in_gb; + s->in_gb.buffer = NULL; + s->extrasize = 0; + } else { + skip_bits_long(&s->gb, skip); + } + } else { + gr = 0; + s->extrasize = 0; + } + + for (; gr < nb_granules; gr++) { + for (ch = 0; ch < s->nb_channels; ch++) { + g = &s->granules[ch][gr]; + bits_pos = get_bits_count(&s->gb); + + if (!s->lsf) { + uint8_t *sc; + int slen, slen1, slen2; + + /* MPEG-1 scale factors */ + slen1 = ff_slen_table[0][g->scalefac_compress]; + slen2 = ff_slen_table[1][g->scalefac_compress]; + ff_dlog(s->avctx, "slen1=%d slen2=%d\n", slen1, slen2); + if (g->block_type == 2) { + n = g->switch_point ? 17 : 18; + j = 0; + if (slen1) { + for (i = 0; i < n; i++) + g->scale_factors[j++] = get_bits(&s->gb, slen1); + } else { + for (i = 0; i < n; i++) + g->scale_factors[j++] = 0; + } + if (slen2) { + for (i = 0; i < 18; i++) + g->scale_factors[j++] = get_bits(&s->gb, slen2); + for (i = 0; i < 3; i++) + g->scale_factors[j++] = 0; + } else { + for (i = 0; i < 21; i++) + g->scale_factors[j++] = 0; + } + } else { + sc = s->granules[ch][0].scale_factors; + j = 0; + for (k = 0; k < 4; k++) { + n = k == 0 ? 6 : 5; + if ((g->scfsi & (0x8 >> k)) == 0) { + slen = (k < 2) ? slen1 : slen2; + if (slen) { + for (i = 0; i < n; i++) + g->scale_factors[j++] = get_bits(&s->gb, slen); + } else { + for (i = 0; i < n; i++) + g->scale_factors[j++] = 0; + } + } else { + /* simply copy from last granule */ + for (i = 0; i < n; i++) { + g->scale_factors[j] = sc[j]; + j++; + } + } + } + g->scale_factors[j++] = 0; + } + } else { + int tindex, tindex2, slen[4], sl, sf; + + /* LSF scale factors */ + if (g->block_type == 2) + tindex = g->switch_point ? 
2 : 1; + else + tindex = 0; + + sf = g->scalefac_compress; + if ((s->mode_ext & MODE_EXT_I_STEREO) && ch == 1) { + /* intensity stereo case */ + sf >>= 1; + if (sf < 180) { + lsf_sf_expand(slen, sf, 6, 6, 0); + tindex2 = 3; + } else if (sf < 244) { + lsf_sf_expand(slen, sf - 180, 4, 4, 0); + tindex2 = 4; + } else { + lsf_sf_expand(slen, sf - 244, 3, 0, 0); + tindex2 = 5; + } + } else { + /* normal case */ + if (sf < 400) { + lsf_sf_expand(slen, sf, 5, 4, 4); + tindex2 = 0; + } else if (sf < 500) { + lsf_sf_expand(slen, sf - 400, 5, 4, 0); + tindex2 = 1; + } else { + lsf_sf_expand(slen, sf - 500, 3, 0, 0); + tindex2 = 2; + g->preflag = 1; + } + } + + j = 0; + for (k = 0; k < 4; k++) { + n = ff_lsf_nsf_table[tindex2][tindex][k]; + sl = slen[k]; + if (sl) { + for (i = 0; i < n; i++) + g->scale_factors[j++] = get_bits(&s->gb, sl); + } else { + for (i = 0; i < n; i++) + g->scale_factors[j++] = 0; + } + } + /* XXX: should compute exact size */ + for (; j < 40; j++) + g->scale_factors[j] = 0; + } + + exponents_from_scale_factors(s, g, exponents); + + /* read Huffman coded residue */ + huffman_decode(s, g, exponents, bits_pos + g->part2_3_length); + } /* ch */ + + if (s->mode == MPA_JSTEREO) + compute_stereo(s, &s->granules[0][gr], &s->granules[1][gr]); + + for (ch = 0; ch < s->nb_channels; ch++) { + g = &s->granules[ch][gr]; + + reorder_block(s, g); + compute_antialias(s, g); + compute_imdct(s, g, &s->sb_samples[ch][18 * gr][0], s->mdct_buf[ch]); + } + } /* gr */ + if (get_bits_count(&s->gb) < 0) + skip_bits_long(&s->gb, -get_bits_count(&s->gb)); + return nb_granules * 18; +} + +static int mp_decode_frame(MPADecodeContext *s, OUT_INT **samples, + const uint8_t *buf, int buf_size) +{ + int i, nb_frames, ch, ret; + OUT_INT *samples_ptr; + + init_get_bits(&s->gb, buf + HEADER_SIZE, (buf_size - HEADER_SIZE) * 8); + if (s->error_protection) + s->crc = get_bits(&s->gb, 16); + + switch(s->layer) { + case 1: + s->avctx->frame_size = 384; + nb_frames = mp_decode_layer1(s); + 
break; + case 2: + s->avctx->frame_size = 1152; + nb_frames = mp_decode_layer2(s); + break; + case 3: + s->avctx->frame_size = s->lsf ? 576 : 1152; + default: + nb_frames = mp_decode_layer3(s); + + s->last_buf_size=0; + if (s->in_gb.buffer) { + align_get_bits(&s->gb); + i = (get_bits_left(&s->gb) >> 3) - s->extrasize; + if (i >= 0 && i <= BACKSTEP_SIZE) { + memmove(s->last_buf, s->gb.buffer + (get_bits_count(&s->gb) >> 3), i); + s->last_buf_size=i; + } else + av_log(s->avctx, AV_LOG_ERROR, "invalid old backstep %d\n", i); + s->gb = s->in_gb; + s->in_gb.buffer = NULL; + s->extrasize = 0; + } + + align_get_bits(&s->gb); + av_assert1((get_bits_count(&s->gb) & 7) == 0); + i = (get_bits_left(&s->gb) >> 3) - s->extrasize; + if (i < 0 || i > BACKSTEP_SIZE || nb_frames < 0) { + if (i < 0) + av_log(s->avctx, AV_LOG_ERROR, "invalid new backstep %d\n", i); + i = FFMIN(BACKSTEP_SIZE, buf_size - HEADER_SIZE); + } + av_assert1(i <= buf_size - HEADER_SIZE && i >= 0); + memcpy(s->last_buf + s->last_buf_size, s->gb.buffer + buf_size - HEADER_SIZE - i, i); + s->last_buf_size += i; + } + + if(nb_frames < 0) + return nb_frames; + + /* get output buffer */ + if (!samples) { + av_assert0(s->frame); + s->frame->nb_samples = s->avctx->frame_size; + if ((ret = ff_get_buffer(s->avctx, s->frame, 0)) < 0) + return ret; + samples = (OUT_INT **)s->frame->extended_data; + } + + /* apply the synthesis filter */ + for (ch = 0; ch < s->nb_channels; ch++) { + int sample_stride; + if (s->avctx->sample_fmt == OUT_FMT_P) { + samples_ptr = samples[ch]; + sample_stride = 1; + } else { + samples_ptr = samples[0] + ch; + sample_stride = s->nb_channels; + } + for (i = 0; i < nb_frames; i++) { + RENAME(ff_mpa_synth_filter)(&s->mpadsp, s->synth_buf[ch], + &(s->synth_buf_offset[ch]), + RENAME(ff_mpa_synth_window), + &s->dither_state, samples_ptr, + sample_stride, s->sb_samples[ch][i]); + samples_ptr += 32 * sample_stride; + } + } + + return nb_frames * 32 * sizeof(OUT_INT) * s->nb_channels; +} + +static int 
decode_frame(AVCodecContext *avctx, AVFrame *frame, + int *got_frame_ptr, AVPacket *avpkt) +{ + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; + MPADecodeContext *s = avctx->priv_data; + uint32_t header; + int ret; + + int skipped = 0; + while(buf_size && !*buf){ + buf++; + buf_size--; + skipped++; + } + + if (buf_size < HEADER_SIZE) + return AVERROR_INVALIDDATA; + + header = AV_RB32(buf); + if (header >> 8 == AV_RB32("TAG") >> 8) { + av_log(avctx, AV_LOG_DEBUG, "discarding ID3 tag\n"); + return buf_size + skipped; + } + ret = avpriv_mpegaudio_decode_header((MPADecodeHeader *)s, header); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Header missing\n"); + return AVERROR_INVALIDDATA; + } else if (ret == 1) { + /* free format: prepare to compute frame size */ + s->frame_size = -1; + return AVERROR_INVALIDDATA; + } + /* update codec info */ + av_channel_layout_uninit(&avctx->ch_layout); + avctx->ch_layout = s->nb_channels == 1 ? (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO; + if (!avctx->bit_rate) + avctx->bit_rate = s->bit_rate; + + if (s->frame_size <= 0) { + av_log(avctx, AV_LOG_ERROR, "incomplete frame\n"); + return AVERROR_INVALIDDATA; + } else if (s->frame_size < buf_size) { + av_log(avctx, AV_LOG_DEBUG, "incorrect frame size - multiple frames in buffer?\n"); + buf_size= s->frame_size; + } + + s->frame = frame; + + ret = mp_decode_frame(s, NULL, buf, buf_size); + if (ret >= 0) { + s->frame->nb_samples = avctx->frame_size; + *got_frame_ptr = 1; + avctx->sample_rate = s->sample_rate; + //FIXME maybe move the other codec info stuff from above here too + } else { + av_log(avctx, AV_LOG_ERROR, "Error while decoding MPEG audio frame.\n"); + /* Only return an error if the bad frame makes up the whole packet or + * the error is related to buffer management. + * If there is more data in the packet, just consume the bad frame + * instead of returning an error, which would discard the whole + * packet. 
*/ + *got_frame_ptr = 0; + if (buf_size == avpkt->size || ret != AVERROR_INVALIDDATA) + return ret; + } + s->frame_size = 0; + return buf_size + skipped; +} + +static void mp_flush(MPADecodeContext *ctx) +{ + memset(ctx->synth_buf, 0, sizeof(ctx->synth_buf)); + memset(ctx->mdct_buf, 0, sizeof(ctx->mdct_buf)); + ctx->last_buf_size = 0; + ctx->dither_state = 0; +} + +static void flush(AVCodecContext *avctx) +{ + mp_flush(avctx->priv_data); +} + +#if CONFIG_MP3ADU_DECODER || CONFIG_MP3ADUFLOAT_DECODER +static int decode_frame_adu(AVCodecContext *avctx, AVFrame *frame, + int *got_frame_ptr, AVPacket *avpkt) +{ + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; + MPADecodeContext *s = avctx->priv_data; + uint32_t header; + int len, ret; + + len = buf_size; + + // Discard too short frames + if (buf_size < HEADER_SIZE) { + av_log(avctx, AV_LOG_ERROR, "Packet is too small\n"); + return AVERROR_INVALIDDATA; + } + + + if (len > MPA_MAX_CODED_FRAME_SIZE) + len = MPA_MAX_CODED_FRAME_SIZE; + + // Get header and restore sync word + header = AV_RB32(buf) | 0xffe00000; + + ret = avpriv_mpegaudio_decode_header((MPADecodeHeader *)s, header); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Invalid frame header\n"); + return ret; + } + /* update codec info */ + avctx->sample_rate = s->sample_rate; + av_channel_layout_uninit(&avctx->ch_layout); + avctx->ch_layout = s->nb_channels == 1 ? 
(AVChannelLayout)AV_CHANNEL_LAYOUT_MONO : + (AVChannelLayout)AV_CHANNEL_LAYOUT_STEREO; + if (!avctx->bit_rate) + avctx->bit_rate = s->bit_rate; + + s->frame_size = len; + + s->frame = frame; + + ret = mp_decode_frame(s, NULL, buf, buf_size); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Error while decoding MPEG audio frame.\n"); + return ret; + } + + *got_frame_ptr = 1; + + return buf_size; +} +#endif /* CONFIG_MP3ADU_DECODER || CONFIG_MP3ADUFLOAT_DECODER */ + +#if CONFIG_MP3ON4_DECODER || CONFIG_MP3ON4FLOAT_DECODER + +/** + * Context for MP3On4 decoder + */ +typedef struct MP3On4DecodeContext { + int frames; ///< number of mp3 frames per block (number of mp3 decoder instances) + int syncword; ///< syncword patch + const uint8_t *coff; ///< channel offsets in output buffer + MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance +} MP3On4DecodeContext; + +#include "mpeg4audio.h" + +/* Next 3 arrays are indexed by channel config number (passed via codecdata) */ + +/* number of mp3 decoder instances */ +static const uint8_t mp3Frames[8] = { 0, 1, 1, 2, 3, 3, 4, 5 }; + +/* offsets into output buffer, assume output order is FL FR C LFE BL BR SL SR */ +static const uint8_t chan_offset[8][5] = { + { 0 }, + { 0 }, // C + { 0 }, // FLR + { 2, 0 }, // C FLR + { 2, 0, 3 }, // C FLR BS + { 2, 0, 3 }, // C FLR BLRS + { 2, 0, 4, 3 }, // C FLR BLRS LFE + { 2, 0, 6, 4, 3 }, // C FLR BLRS BLR LFE +}; + +/* mp3on4 channel layouts */ +static const int16_t chan_layout[8] = { + 0, + AV_CH_LAYOUT_MONO, + AV_CH_LAYOUT_STEREO, + AV_CH_LAYOUT_SURROUND, + AV_CH_LAYOUT_4POINT0, + AV_CH_LAYOUT_5POINT0, + AV_CH_LAYOUT_5POINT1, + AV_CH_LAYOUT_7POINT1 +}; + +static av_cold int decode_close_mp3on4(AVCodecContext * avctx) +{ + MP3On4DecodeContext *s = avctx->priv_data; + int i; + + for (i = 0; i < s->frames; i++) + av_freep(&s->mp3decctx[i]); + + return 0; +} + + +static av_cold int decode_init_mp3on4(AVCodecContext * avctx) +{ + MP3On4DecodeContext *s = 
avctx->priv_data; + MPEG4AudioConfig cfg; + int i, ret; + + if ((avctx->extradata_size < 2) || !avctx->extradata) { + av_log(avctx, AV_LOG_ERROR, "Codec extradata missing or too short.\n"); + return AVERROR_INVALIDDATA; + } + + avpriv_mpeg4audio_get_config2(&cfg, avctx->extradata, + avctx->extradata_size, 1, avctx); + if (!cfg.chan_config || cfg.chan_config > 7) { + av_log(avctx, AV_LOG_ERROR, "Invalid channel config number.\n"); + return AVERROR_INVALIDDATA; + } + s->frames = mp3Frames[cfg.chan_config]; + s->coff = chan_offset[cfg.chan_config]; + av_channel_layout_uninit(&avctx->ch_layout); + av_channel_layout_from_mask(&avctx->ch_layout, chan_layout[cfg.chan_config]); + + if (cfg.sample_rate < 16000) + s->syncword = 0xffe00000; + else + s->syncword = 0xfff00000; + + /* Init the first mp3 decoder in standard way, so that all tables get builded + * We replace avctx->priv_data with the context of the first decoder so that + * decode_init() does not have to be changed. + * Other decoders will be initialized here copying data from the first context + */ + // Allocate zeroed memory for the first decoder context + s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext)); + if (!s->mp3decctx[0]) + return AVERROR(ENOMEM); + // Put decoder context in place to make init_decode() happy + avctx->priv_data = s->mp3decctx[0]; + ret = decode_init(avctx); + // Restore mp3on4 context pointer + avctx->priv_data = s; + if (ret < 0) + return ret; + s->mp3decctx[0]->adu_mode = 1; // Set adu mode + + /* Create a separate codec/context for each frame (first is already ok). 
+ * Each frame is 1 or 2 channels - up to 5 frames allowed + */ + for (i = 1; i < s->frames; i++) { + s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext)); + if (!s->mp3decctx[i]) + return AVERROR(ENOMEM); + s->mp3decctx[i]->adu_mode = 1; + s->mp3decctx[i]->avctx = avctx; + s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp; + s->mp3decctx[i]->butterflies_float = s->mp3decctx[0]->butterflies_float; + } + + return 0; +} + + +static void flush_mp3on4(AVCodecContext *avctx) +{ + int i; + MP3On4DecodeContext *s = avctx->priv_data; + + for (i = 0; i < s->frames; i++) + mp_flush(s->mp3decctx[i]); +} + + +static int decode_frame_mp3on4(AVCodecContext *avctx, AVFrame *frame, + int *got_frame_ptr, AVPacket *avpkt) +{ + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; + MP3On4DecodeContext *s = avctx->priv_data; + MPADecodeContext *m; + int fsize, len = buf_size, out_size = 0; + uint32_t header; + OUT_INT **out_samples; + OUT_INT *outptr[2]; + int fr, ch, ret; + + /* get output buffer */ + frame->nb_samples = MPA_FRAME_SIZE; + if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) + return ret; + out_samples = (OUT_INT **)frame->extended_data; + + // Discard too short frames + if (buf_size < HEADER_SIZE) + return AVERROR_INVALIDDATA; + + avctx->bit_rate = 0; + + ch = 0; + for (fr = 0; fr < s->frames; fr++) { + fsize = AV_RB16(buf) >> 4; + fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE); + m = s->mp3decctx[fr]; + av_assert1(m); + + if (fsize < HEADER_SIZE) { + av_log(avctx, AV_LOG_ERROR, "Frame size smaller than header size\n"); + return AVERROR_INVALIDDATA; + } + header = (AV_RB32(buf) & 0x000fffff) | s->syncword; // patch header + + ret = avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Bad header, discard block\n"); + return AVERROR_INVALIDDATA; + } + + if (ch + m->nb_channels > avctx->ch_layout.nb_channels || + s->coff[fr] + m->nb_channels > avctx->ch_layout.nb_channels) { + av_log(avctx, 
AV_LOG_ERROR, "frame channel count exceeds codec " + "channel count\n"); + return AVERROR_INVALIDDATA; + } + ch += m->nb_channels; + + outptr[0] = out_samples[s->coff[fr]]; + if (m->nb_channels > 1) + outptr[1] = out_samples[s->coff[fr] + 1]; + + if ((ret = mp_decode_frame(m, outptr, buf, fsize)) < 0) { + av_log(avctx, AV_LOG_ERROR, "failed to decode channel %d\n", ch); + memset(outptr[0], 0, MPA_FRAME_SIZE*sizeof(OUT_INT)); + if (m->nb_channels > 1) + memset(outptr[1], 0, MPA_FRAME_SIZE*sizeof(OUT_INT)); + ret = m->nb_channels * MPA_FRAME_SIZE*sizeof(OUT_INT); + } + + out_size += ret; + buf += fsize; + len -= fsize; + + avctx->bit_rate += m->bit_rate; + } + if (ch != avctx->ch_layout.nb_channels) { + av_log(avctx, AV_LOG_ERROR, "failed to decode all channels\n"); + return AVERROR_INVALIDDATA; + } + + /* update codec info */ + avctx->sample_rate = s->mp3decctx[0]->sample_rate; + + frame->nb_samples = out_size / (avctx->ch_layout.nb_channels * sizeof(OUT_INT)); + *got_frame_ptr = 1; + + return buf_size; +} +#endif /* CONFIG_MP3ON4_DECODER || CONFIG_MP3ON4FLOAT_DECODER */ diff --git a/media/ffvpx/libavcodec/mpegaudiodecheader.c b/media/ffvpx/libavcodec/mpegaudiodecheader.c new file mode 100644 index 0000000000..ef63befbf4 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodecheader.c @@ -0,0 +1,152 @@ +/* + * MPEG Audio header decoder + * Copyright (c) 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * MPEG Audio header decoder. + */ + +#include "libavutil/macros.h" + +#include "mpegaudio.h" +#include "mpegaudiodata.h" +#include "mpegaudiodecheader.h" + + +int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header) +{ + int sample_rate, frame_size, mpeg25, padding; + int sample_rate_index, bitrate_index; + int ret; + + ret = ff_mpa_check_header(header); + if (ret < 0) + return ret; + + if (header & (1<<20)) { + s->lsf = (header & (1<<19)) ? 0 : 1; + mpeg25 = 0; + } else { + s->lsf = 1; + mpeg25 = 1; + } + + s->layer = 4 - ((header >> 17) & 3); + /* extract frequency */ + sample_rate_index = (header >> 10) & 3; + if (sample_rate_index >= FF_ARRAY_ELEMS(ff_mpa_freq_tab)) + sample_rate_index = 0; + sample_rate = ff_mpa_freq_tab[sample_rate_index] >> (s->lsf + mpeg25); + sample_rate_index += 3 * (s->lsf + mpeg25); + s->sample_rate_index = sample_rate_index; + s->error_protection = ((header >> 16) & 1) ^ 1; + s->sample_rate = sample_rate; + + bitrate_index = (header >> 12) & 0xf; + padding = (header >> 9) & 1; + //extension = (header >> 8) & 1; + s->mode = (header >> 6) & 3; + s->mode_ext = (header >> 4) & 3; + //copyright = (header >> 3) & 1; + //original = (header >> 2) & 1; + //emphasis = header & 3; + + if (s->mode == MPA_MONO) + s->nb_channels = 1; + else + s->nb_channels = 2; + + if (bitrate_index != 0) { + frame_size = ff_mpa_bitrate_tab[s->lsf][s->layer - 1][bitrate_index]; + s->bit_rate = frame_size * 1000; + switch(s->layer) { + case 1: + frame_size = (frame_size * 12000) / sample_rate; + frame_size = (frame_size + padding) * 4; + break; + case 2: + frame_size = (frame_size * 144000) / sample_rate; + frame_size += padding; + break; + default: + case 3: + frame_size = (frame_size * 144000) / (sample_rate 
<< s->lsf); + frame_size += padding; + break; + } + s->frame_size = frame_size; + } else { + /* if no frame size computed, signal it */ + return 1; + } + +#if defined(DEBUG) + ff_dlog(NULL, "layer%d, %d Hz, %d kbits/s, ", + s->layer, s->sample_rate, s->bit_rate); + if (s->nb_channels == 2) { + if (s->layer == 3) { + if (s->mode_ext & MODE_EXT_MS_STEREO) + ff_dlog(NULL, "ms-"); + if (s->mode_ext & MODE_EXT_I_STEREO) + ff_dlog(NULL, "i-"); + } + ff_dlog(NULL, "stereo"); + } else { + ff_dlog(NULL, "mono"); + } + ff_dlog(NULL, "\n"); +#endif + return 0; +} + +int ff_mpa_decode_header(uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate, enum AVCodecID *codec_id) +{ + MPADecodeHeader s1, *s = &s1; + + if (avpriv_mpegaudio_decode_header(s, head) != 0) { + return -1; + } + + switch(s->layer) { + case 1: + *codec_id = AV_CODEC_ID_MP1; + *frame_size = 384; + break; + case 2: + *codec_id = AV_CODEC_ID_MP2; + *frame_size = 1152; + break; + default: + case 3: + if (*codec_id != AV_CODEC_ID_MP3ADU) + *codec_id = AV_CODEC_ID_MP3; + if (s->lsf) + *frame_size = 576; + else + *frame_size = 1152; + break; + } + + *sample_rate = s->sample_rate; + *channels = s->nb_channels; + *bit_rate = s->bit_rate; + return s->frame_size; +} diff --git a/media/ffvpx/libavcodec/mpegaudiodecheader.h b/media/ffvpx/libavcodec/mpegaudiodecheader.h new file mode 100644 index 0000000000..ed5d1f3b33 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodecheader.h @@ -0,0 +1,81 @@ +/* + * MPEG Audio header decoder + * Copyright (c) 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * MPEG Audio header decoder. + */ + +#ifndef AVCODEC_MPEGAUDIODECHEADER_H +#define AVCODEC_MPEGAUDIODECHEADER_H + +#include <stdint.h> +#include "codec_id.h" + +#define MP3_MASK 0xFFFE0CCF + +#define MPA_DECODE_HEADER \ + int frame_size; \ + int error_protection; \ + int layer; \ + int sample_rate; \ + int sample_rate_index; /* between 0 and 8 */ \ + int bit_rate; \ + int nb_channels; \ + int mode; \ + int mode_ext; \ + int lsf; + +typedef struct MPADecodeHeader { + MPA_DECODE_HEADER +} MPADecodeHeader; + +/* header decoding. MUST check the header before because no + consistency check is done there. Return 1 if free format found and + that the frame size must be computed externally */ +int avpriv_mpegaudio_decode_header(MPADecodeHeader *s, uint32_t header); + +/* useful helper to get MPEG audio stream info. 
Return -1 if error in + header, otherwise the coded frame size in bytes */ +int ff_mpa_decode_header(uint32_t head, int *sample_rate, + int *channels, int *frame_size, int *bitrate, enum AVCodecID *codec_id); + +/* fast header check for resync */ +static inline int ff_mpa_check_header(uint32_t header){ + /* header */ + if ((header & 0xffe00000) != 0xffe00000) + return -1; + /* version check */ + if ((header & (3<<19)) == 1<<19) + return -1; + /* layer check */ + if ((header & (3<<17)) == 0) + return -1; + /* bit rate */ + if ((header & (0xf<<12)) == 0xf<<12) + return -1; + /* frequency */ + if ((header & (3<<10)) == 3<<10) + return -1; + return 0; +} + +#endif /* AVCODEC_MPEGAUDIODECHEADER_H */ diff --git a/media/ffvpx/libavcodec/mpegaudiodectab.h b/media/ffvpx/libavcodec/mpegaudiodectab.h new file mode 100644 index 0000000000..accd12b8e2 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodectab.h @@ -0,0 +1,615 @@ +/* + * MPEG Audio decoder + * copyright (c) 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * mpeg audio layer decoder tables. 
+ */ + +#ifndef AVCODEC_MPEGAUDIODECTAB_H +#define AVCODEC_MPEGAUDIODECTAB_H + +#include <stddef.h> +#include <stdint.h> + +#include "mpegaudio.h" + +/*******************************************************/ +/* layer 3 tables */ + +/* layer 3 huffman tables */ +typedef struct HuffTable { + int xsize; + const uint8_t *bits; + const uint16_t *codes; +} HuffTable; + +/* layer3 scale factor size */ +static const uint8_t slen_table[2][16] = { + { 0, 0, 0, 0, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 }, + { 0, 1, 2, 3, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 3 }, +}; + +/* number of lsf scale factors for a given size */ +static const uint8_t lsf_nsf_table[6][3][4] = { + { { 6, 5, 5, 5 }, { 9, 9, 9, 9 }, { 6, 9, 9, 9 } }, + { { 6, 5, 7, 3 }, { 9, 9, 12, 6 }, { 6, 9, 12, 6 } }, + { { 11, 10, 0, 0 }, { 18, 18, 0, 0 }, { 15, 18, 0, 0 } }, + { { 7, 7, 7, 0 }, { 12, 12, 12, 0 }, { 6, 15, 12, 0 } }, + { { 6, 6, 6, 3 }, { 12, 9, 9, 6 }, { 6, 12, 9, 6 } }, + { { 8, 8, 5, 0 }, { 15, 12, 9, 0 }, { 6, 18, 9, 0 } }, +}; + +/* mpegaudio layer 3 huffman tables */ + +static const uint16_t mpa_huffcodes_1[4] = { + 0x0001, 0x0001, 0x0001, 0x0000, +}; + +static const uint8_t mpa_huffbits_1[4] = { + 1, 3, 2, 3, +}; + +static const uint16_t mpa_huffcodes_2[9] = { + 0x0001, 0x0002, 0x0001, 0x0003, 0x0001, 0x0001, 0x0003, 0x0002, + 0x0000, +}; + +static const uint8_t mpa_huffbits_2[9] = { + 1, 3, 6, 3, 3, 5, 5, 5, + 6, +}; + +static const uint16_t mpa_huffcodes_3[9] = { + 0x0003, 0x0002, 0x0001, 0x0001, 0x0001, 0x0001, 0x0003, 0x0002, + 0x0000, +}; + +static const uint8_t mpa_huffbits_3[9] = { + 2, 2, 6, 3, 2, 5, 5, 5, + 6, +}; + +static const uint16_t mpa_huffcodes_5[16] = { + 0x0001, 0x0002, 0x0006, 0x0005, 0x0003, 0x0001, 0x0004, 0x0004, + 0x0007, 0x0005, 0x0007, 0x0001, 0x0006, 0x0001, 0x0001, 0x0000, +}; + +static const uint8_t mpa_huffbits_5[16] = { + 1, 3, 6, 7, 3, 3, 6, 7, + 6, 6, 7, 8, 7, 6, 7, 8, +}; + +static const uint16_t mpa_huffcodes_6[16] = { + 0x0007, 0x0003, 0x0005, 0x0001, 0x0006, 
0x0002, 0x0003, 0x0002, + 0x0005, 0x0004, 0x0004, 0x0001, 0x0003, 0x0003, 0x0002, 0x0000, +}; + +static const uint8_t mpa_huffbits_6[16] = { + 3, 3, 5, 7, 3, 2, 4, 5, + 4, 4, 5, 6, 6, 5, 6, 7, +}; + +static const uint16_t mpa_huffcodes_7[36] = { + 0x0001, 0x0002, 0x000a, 0x0013, 0x0010, 0x000a, 0x0003, 0x0003, + 0x0007, 0x000a, 0x0005, 0x0003, 0x000b, 0x0004, 0x000d, 0x0011, + 0x0008, 0x0004, 0x000c, 0x000b, 0x0012, 0x000f, 0x000b, 0x0002, + 0x0007, 0x0006, 0x0009, 0x000e, 0x0003, 0x0001, 0x0006, 0x0004, + 0x0005, 0x0003, 0x0002, 0x0000, +}; + +static const uint8_t mpa_huffbits_7[36] = { + 1, 3, 6, 8, 8, 9, 3, 4, + 6, 7, 7, 8, 6, 5, 7, 8, + 8, 9, 7, 7, 8, 9, 9, 9, + 7, 7, 8, 9, 9, 10, 8, 8, + 9, 10, 10, 10, +}; + +static const uint16_t mpa_huffcodes_8[36] = { + 0x0003, 0x0004, 0x0006, 0x0012, 0x000c, 0x0005, 0x0005, 0x0001, + 0x0002, 0x0010, 0x0009, 0x0003, 0x0007, 0x0003, 0x0005, 0x000e, + 0x0007, 0x0003, 0x0013, 0x0011, 0x000f, 0x000d, 0x000a, 0x0004, + 0x000d, 0x0005, 0x0008, 0x000b, 0x0005, 0x0001, 0x000c, 0x0004, + 0x0004, 0x0001, 0x0001, 0x0000, +}; + +static const uint8_t mpa_huffbits_8[36] = { + 2, 3, 6, 8, 8, 9, 3, 2, + 4, 8, 8, 8, 6, 4, 6, 8, + 8, 9, 8, 8, 8, 9, 9, 10, + 8, 7, 8, 9, 10, 10, 9, 8, + 9, 9, 11, 11, +}; + +static const uint16_t mpa_huffcodes_9[36] = { + 0x0007, 0x0005, 0x0009, 0x000e, 0x000f, 0x0007, 0x0006, 0x0004, + 0x0005, 0x0005, 0x0006, 0x0007, 0x0007, 0x0006, 0x0008, 0x0008, + 0x0008, 0x0005, 0x000f, 0x0006, 0x0009, 0x000a, 0x0005, 0x0001, + 0x000b, 0x0007, 0x0009, 0x0006, 0x0004, 0x0001, 0x000e, 0x0004, + 0x0006, 0x0002, 0x0006, 0x0000, +}; + +static const uint8_t mpa_huffbits_9[36] = { + 3, 3, 5, 6, 8, 9, 3, 3, + 4, 5, 6, 8, 4, 4, 5, 6, + 7, 8, 6, 5, 6, 7, 7, 8, + 7, 6, 7, 7, 8, 9, 8, 7, + 8, 8, 9, 9, +}; + +static const uint16_t mpa_huffcodes_10[64] = { + 0x0001, 0x0002, 0x000a, 0x0017, 0x0023, 0x001e, 0x000c, 0x0011, + 0x0003, 0x0003, 0x0008, 0x000c, 0x0012, 0x0015, 0x000c, 0x0007, + 0x000b, 0x0009, 0x000f, 0x0015, 0x0020, 0x0028, 
0x0013, 0x0006, + 0x000e, 0x000d, 0x0016, 0x0022, 0x002e, 0x0017, 0x0012, 0x0007, + 0x0014, 0x0013, 0x0021, 0x002f, 0x001b, 0x0016, 0x0009, 0x0003, + 0x001f, 0x0016, 0x0029, 0x001a, 0x0015, 0x0014, 0x0005, 0x0003, + 0x000e, 0x000d, 0x000a, 0x000b, 0x0010, 0x0006, 0x0005, 0x0001, + 0x0009, 0x0008, 0x0007, 0x0008, 0x0004, 0x0004, 0x0002, 0x0000, +}; + +static const uint8_t mpa_huffbits_10[64] = { + 1, 3, 6, 8, 9, 9, 9, 10, + 3, 4, 6, 7, 8, 9, 8, 8, + 6, 6, 7, 8, 9, 10, 9, 9, + 7, 7, 8, 9, 10, 10, 9, 10, + 8, 8, 9, 10, 10, 10, 10, 10, + 9, 9, 10, 10, 11, 11, 10, 11, + 8, 8, 9, 10, 10, 10, 11, 11, + 9, 8, 9, 10, 10, 11, 11, 11, +}; + +static const uint16_t mpa_huffcodes_11[64] = { + 0x0003, 0x0004, 0x000a, 0x0018, 0x0022, 0x0021, 0x0015, 0x000f, + 0x0005, 0x0003, 0x0004, 0x000a, 0x0020, 0x0011, 0x000b, 0x000a, + 0x000b, 0x0007, 0x000d, 0x0012, 0x001e, 0x001f, 0x0014, 0x0005, + 0x0019, 0x000b, 0x0013, 0x003b, 0x001b, 0x0012, 0x000c, 0x0005, + 0x0023, 0x0021, 0x001f, 0x003a, 0x001e, 0x0010, 0x0007, 0x0005, + 0x001c, 0x001a, 0x0020, 0x0013, 0x0011, 0x000f, 0x0008, 0x000e, + 0x000e, 0x000c, 0x0009, 0x000d, 0x000e, 0x0009, 0x0004, 0x0001, + 0x000b, 0x0004, 0x0006, 0x0006, 0x0006, 0x0003, 0x0002, 0x0000, +}; + +static const uint8_t mpa_huffbits_11[64] = { + 2, 3, 5, 7, 8, 9, 8, 9, + 3, 3, 4, 6, 8, 8, 7, 8, + 5, 5, 6, 7, 8, 9, 8, 8, + 7, 6, 7, 9, 8, 10, 8, 9, + 8, 8, 8, 9, 9, 10, 9, 10, + 8, 8, 9, 10, 10, 11, 10, 11, + 8, 7, 7, 8, 9, 10, 10, 10, + 8, 7, 8, 9, 10, 10, 10, 10, +}; + +static const uint16_t mpa_huffcodes_12[64] = { + 0x0009, 0x0006, 0x0010, 0x0021, 0x0029, 0x0027, 0x0026, 0x001a, + 0x0007, 0x0005, 0x0006, 0x0009, 0x0017, 0x0010, 0x001a, 0x000b, + 0x0011, 0x0007, 0x000b, 0x000e, 0x0015, 0x001e, 0x000a, 0x0007, + 0x0011, 0x000a, 0x000f, 0x000c, 0x0012, 0x001c, 0x000e, 0x0005, + 0x0020, 0x000d, 0x0016, 0x0013, 0x0012, 0x0010, 0x0009, 0x0005, + 0x0028, 0x0011, 0x001f, 0x001d, 0x0011, 0x000d, 0x0004, 0x0002, + 0x001b, 0x000c, 0x000b, 0x000f, 0x000a, 0x0007, 0x0004, 
0x0001, + 0x001b, 0x000c, 0x0008, 0x000c, 0x0006, 0x0003, 0x0001, 0x0000, +}; + +static const uint8_t mpa_huffbits_12[64] = { + 4, 3, 5, 7, 8, 9, 9, 9, + 3, 3, 4, 5, 7, 7, 8, 8, + 5, 4, 5, 6, 7, 8, 7, 8, + 6, 5, 6, 6, 7, 8, 8, 8, + 7, 6, 7, 7, 8, 8, 8, 9, + 8, 7, 8, 8, 8, 9, 8, 9, + 8, 7, 7, 8, 8, 9, 9, 10, + 9, 8, 8, 9, 9, 9, 9, 10, +}; + +static const uint16_t mpa_huffcodes_13[256] = { + 0x0001, 0x0005, 0x000e, 0x0015, 0x0022, 0x0033, 0x002e, 0x0047, + 0x002a, 0x0034, 0x0044, 0x0034, 0x0043, 0x002c, 0x002b, 0x0013, + 0x0003, 0x0004, 0x000c, 0x0013, 0x001f, 0x001a, 0x002c, 0x0021, + 0x001f, 0x0018, 0x0020, 0x0018, 0x001f, 0x0023, 0x0016, 0x000e, + 0x000f, 0x000d, 0x0017, 0x0024, 0x003b, 0x0031, 0x004d, 0x0041, + 0x001d, 0x0028, 0x001e, 0x0028, 0x001b, 0x0021, 0x002a, 0x0010, + 0x0016, 0x0014, 0x0025, 0x003d, 0x0038, 0x004f, 0x0049, 0x0040, + 0x002b, 0x004c, 0x0038, 0x0025, 0x001a, 0x001f, 0x0019, 0x000e, + 0x0023, 0x0010, 0x003c, 0x0039, 0x0061, 0x004b, 0x0072, 0x005b, + 0x0036, 0x0049, 0x0037, 0x0029, 0x0030, 0x0035, 0x0017, 0x0018, + 0x003a, 0x001b, 0x0032, 0x0060, 0x004c, 0x0046, 0x005d, 0x0054, + 0x004d, 0x003a, 0x004f, 0x001d, 0x004a, 0x0031, 0x0029, 0x0011, + 0x002f, 0x002d, 0x004e, 0x004a, 0x0073, 0x005e, 0x005a, 0x004f, + 0x0045, 0x0053, 0x0047, 0x0032, 0x003b, 0x0026, 0x0024, 0x000f, + 0x0048, 0x0022, 0x0038, 0x005f, 0x005c, 0x0055, 0x005b, 0x005a, + 0x0056, 0x0049, 0x004d, 0x0041, 0x0033, 0x002c, 0x002b, 0x002a, + 0x002b, 0x0014, 0x001e, 0x002c, 0x0037, 0x004e, 0x0048, 0x0057, + 0x004e, 0x003d, 0x002e, 0x0036, 0x0025, 0x001e, 0x0014, 0x0010, + 0x0035, 0x0019, 0x0029, 0x0025, 0x002c, 0x003b, 0x0036, 0x0051, + 0x0042, 0x004c, 0x0039, 0x0036, 0x0025, 0x0012, 0x0027, 0x000b, + 0x0023, 0x0021, 0x001f, 0x0039, 0x002a, 0x0052, 0x0048, 0x0050, + 0x002f, 0x003a, 0x0037, 0x0015, 0x0016, 0x001a, 0x0026, 0x0016, + 0x0035, 0x0019, 0x0017, 0x0026, 0x0046, 0x003c, 0x0033, 0x0024, + 0x0037, 0x001a, 0x0022, 0x0017, 0x001b, 0x000e, 0x0009, 0x0007, + 0x0022, 0x0020, 
0x001c, 0x0027, 0x0031, 0x004b, 0x001e, 0x0034, + 0x0030, 0x0028, 0x0034, 0x001c, 0x0012, 0x0011, 0x0009, 0x0005, + 0x002d, 0x0015, 0x0022, 0x0040, 0x0038, 0x0032, 0x0031, 0x002d, + 0x001f, 0x0013, 0x000c, 0x000f, 0x000a, 0x0007, 0x0006, 0x0003, + 0x0030, 0x0017, 0x0014, 0x0027, 0x0024, 0x0023, 0x0035, 0x0015, + 0x0010, 0x0017, 0x000d, 0x000a, 0x0006, 0x0001, 0x0004, 0x0002, + 0x0010, 0x000f, 0x0011, 0x001b, 0x0019, 0x0014, 0x001d, 0x000b, + 0x0011, 0x000c, 0x0010, 0x0008, 0x0001, 0x0001, 0x0000, 0x0001, +}; + +static const uint8_t mpa_huffbits_13[256] = { + 1, 4, 6, 7, 8, 9, 9, 10, + 9, 10, 11, 11, 12, 12, 13, 13, + 3, 4, 6, 7, 8, 8, 9, 9, + 9, 9, 10, 10, 11, 12, 12, 12, + 6, 6, 7, 8, 9, 9, 10, 10, + 9, 10, 10, 11, 11, 12, 13, 13, + 7, 7, 8, 9, 9, 10, 10, 10, + 10, 11, 11, 11, 11, 12, 13, 13, + 8, 7, 9, 9, 10, 10, 11, 11, + 10, 11, 11, 12, 12, 13, 13, 14, + 9, 8, 9, 10, 10, 10, 11, 11, + 11, 11, 12, 11, 13, 13, 14, 14, + 9, 9, 10, 10, 11, 11, 11, 11, + 11, 12, 12, 12, 13, 13, 14, 14, + 10, 9, 10, 11, 11, 11, 12, 12, + 12, 12, 13, 13, 13, 14, 16, 16, + 9, 8, 9, 10, 10, 11, 11, 12, + 12, 12, 12, 13, 13, 14, 15, 15, + 10, 9, 10, 10, 11, 11, 11, 13, + 12, 13, 13, 14, 14, 14, 16, 15, + 10, 10, 10, 11, 11, 12, 12, 13, + 12, 13, 14, 13, 14, 15, 16, 17, + 11, 10, 10, 11, 12, 12, 12, 12, + 13, 13, 13, 14, 15, 15, 15, 16, + 11, 11, 11, 12, 12, 13, 12, 13, + 14, 14, 15, 15, 15, 16, 16, 16, + 12, 11, 12, 13, 13, 13, 14, 14, + 14, 14, 14, 15, 16, 15, 16, 16, + 13, 12, 12, 13, 13, 13, 15, 14, + 14, 17, 15, 15, 15, 17, 16, 16, + 12, 12, 13, 14, 14, 14, 15, 14, + 15, 15, 16, 16, 19, 18, 19, 16, +}; + +static const uint16_t mpa_huffcodes_15[256] = { + 0x0007, 0x000c, 0x0012, 0x0035, 0x002f, 0x004c, 0x007c, 0x006c, + 0x0059, 0x007b, 0x006c, 0x0077, 0x006b, 0x0051, 0x007a, 0x003f, + 0x000d, 0x0005, 0x0010, 0x001b, 0x002e, 0x0024, 0x003d, 0x0033, + 0x002a, 0x0046, 0x0034, 0x0053, 0x0041, 0x0029, 0x003b, 0x0024, + 0x0013, 0x0011, 0x000f, 0x0018, 0x0029, 0x0022, 0x003b, 0x0030, + 
0x0028, 0x0040, 0x0032, 0x004e, 0x003e, 0x0050, 0x0038, 0x0021, + 0x001d, 0x001c, 0x0019, 0x002b, 0x0027, 0x003f, 0x0037, 0x005d, + 0x004c, 0x003b, 0x005d, 0x0048, 0x0036, 0x004b, 0x0032, 0x001d, + 0x0034, 0x0016, 0x002a, 0x0028, 0x0043, 0x0039, 0x005f, 0x004f, + 0x0048, 0x0039, 0x0059, 0x0045, 0x0031, 0x0042, 0x002e, 0x001b, + 0x004d, 0x0025, 0x0023, 0x0042, 0x003a, 0x0034, 0x005b, 0x004a, + 0x003e, 0x0030, 0x004f, 0x003f, 0x005a, 0x003e, 0x0028, 0x0026, + 0x007d, 0x0020, 0x003c, 0x0038, 0x0032, 0x005c, 0x004e, 0x0041, + 0x0037, 0x0057, 0x0047, 0x0033, 0x0049, 0x0033, 0x0046, 0x001e, + 0x006d, 0x0035, 0x0031, 0x005e, 0x0058, 0x004b, 0x0042, 0x007a, + 0x005b, 0x0049, 0x0038, 0x002a, 0x0040, 0x002c, 0x0015, 0x0019, + 0x005a, 0x002b, 0x0029, 0x004d, 0x0049, 0x003f, 0x0038, 0x005c, + 0x004d, 0x0042, 0x002f, 0x0043, 0x0030, 0x0035, 0x0024, 0x0014, + 0x0047, 0x0022, 0x0043, 0x003c, 0x003a, 0x0031, 0x0058, 0x004c, + 0x0043, 0x006a, 0x0047, 0x0036, 0x0026, 0x0027, 0x0017, 0x000f, + 0x006d, 0x0035, 0x0033, 0x002f, 0x005a, 0x0052, 0x003a, 0x0039, + 0x0030, 0x0048, 0x0039, 0x0029, 0x0017, 0x001b, 0x003e, 0x0009, + 0x0056, 0x002a, 0x0028, 0x0025, 0x0046, 0x0040, 0x0034, 0x002b, + 0x0046, 0x0037, 0x002a, 0x0019, 0x001d, 0x0012, 0x000b, 0x000b, + 0x0076, 0x0044, 0x001e, 0x0037, 0x0032, 0x002e, 0x004a, 0x0041, + 0x0031, 0x0027, 0x0018, 0x0010, 0x0016, 0x000d, 0x000e, 0x0007, + 0x005b, 0x002c, 0x0027, 0x0026, 0x0022, 0x003f, 0x0034, 0x002d, + 0x001f, 0x0034, 0x001c, 0x0013, 0x000e, 0x0008, 0x0009, 0x0003, + 0x007b, 0x003c, 0x003a, 0x0035, 0x002f, 0x002b, 0x0020, 0x0016, + 0x0025, 0x0018, 0x0011, 0x000c, 0x000f, 0x000a, 0x0002, 0x0001, + 0x0047, 0x0025, 0x0022, 0x001e, 0x001c, 0x0014, 0x0011, 0x001a, + 0x0015, 0x0010, 0x000a, 0x0006, 0x0008, 0x0006, 0x0002, 0x0000, +}; + +static const uint8_t mpa_huffbits_15[256] = { + 3, 4, 5, 7, 7, 8, 9, 9, + 9, 10, 10, 11, 11, 11, 12, 13, + 4, 3, 5, 6, 7, 7, 8, 8, + 8, 9, 9, 10, 10, 10, 11, 11, + 5, 5, 5, 6, 7, 7, 8, 8, + 8, 9, 9, 10, 10, 11, 
11, 11, + 6, 6, 6, 7, 7, 8, 8, 9, + 9, 9, 10, 10, 10, 11, 11, 11, + 7, 6, 7, 7, 8, 8, 9, 9, + 9, 9, 10, 10, 10, 11, 11, 11, + 8, 7, 7, 8, 8, 8, 9, 9, + 9, 9, 10, 10, 11, 11, 11, 12, + 9, 7, 8, 8, 8, 9, 9, 9, + 9, 10, 10, 10, 11, 11, 12, 12, + 9, 8, 8, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 11, 11, 11, 12, + 9, 8, 8, 9, 9, 9, 9, 10, + 10, 10, 10, 11, 11, 12, 12, 12, + 9, 8, 9, 9, 9, 9, 10, 10, + 10, 11, 11, 11, 11, 12, 12, 12, + 10, 9, 9, 9, 10, 10, 10, 10, + 10, 11, 11, 11, 11, 12, 13, 12, + 10, 9, 9, 9, 10, 10, 10, 10, + 11, 11, 11, 11, 12, 12, 12, 13, + 11, 10, 9, 10, 10, 10, 11, 11, + 11, 11, 11, 11, 12, 12, 13, 13, + 11, 10, 10, 10, 10, 11, 11, 11, + 11, 12, 12, 12, 12, 12, 13, 13, + 12, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 13, 13, 12, 13, + 12, 11, 11, 11, 11, 11, 11, 12, + 12, 12, 12, 12, 13, 13, 13, 13, +}; + +static const uint16_t mpa_huffcodes_16[256] = { + 0x0001, 0x0005, 0x000e, 0x002c, 0x004a, 0x003f, 0x006e, 0x005d, + 0x00ac, 0x0095, 0x008a, 0x00f2, 0x00e1, 0x00c3, 0x0178, 0x0011, + 0x0003, 0x0004, 0x000c, 0x0014, 0x0023, 0x003e, 0x0035, 0x002f, + 0x0053, 0x004b, 0x0044, 0x0077, 0x00c9, 0x006b, 0x00cf, 0x0009, + 0x000f, 0x000d, 0x0017, 0x0026, 0x0043, 0x003a, 0x0067, 0x005a, + 0x00a1, 0x0048, 0x007f, 0x0075, 0x006e, 0x00d1, 0x00ce, 0x0010, + 0x002d, 0x0015, 0x0027, 0x0045, 0x0040, 0x0072, 0x0063, 0x0057, + 0x009e, 0x008c, 0x00fc, 0x00d4, 0x00c7, 0x0183, 0x016d, 0x001a, + 0x004b, 0x0024, 0x0044, 0x0041, 0x0073, 0x0065, 0x00b3, 0x00a4, + 0x009b, 0x0108, 0x00f6, 0x00e2, 0x018b, 0x017e, 0x016a, 0x0009, + 0x0042, 0x001e, 0x003b, 0x0038, 0x0066, 0x00b9, 0x00ad, 0x0109, + 0x008e, 0x00fd, 0x00e8, 0x0190, 0x0184, 0x017a, 0x01bd, 0x0010, + 0x006f, 0x0036, 0x0034, 0x0064, 0x00b8, 0x00b2, 0x00a0, 0x0085, + 0x0101, 0x00f4, 0x00e4, 0x00d9, 0x0181, 0x016e, 0x02cb, 0x000a, + 0x0062, 0x0030, 0x005b, 0x0058, 0x00a5, 0x009d, 0x0094, 0x0105, + 0x00f8, 0x0197, 0x018d, 0x0174, 0x017c, 0x0379, 0x0374, 0x0008, + 0x0055, 0x0054, 0x0051, 0x009f, 0x009c, 0x008f, 0x0104, 
0x00f9, + 0x01ab, 0x0191, 0x0188, 0x017f, 0x02d7, 0x02c9, 0x02c4, 0x0007, + 0x009a, 0x004c, 0x0049, 0x008d, 0x0083, 0x0100, 0x00f5, 0x01aa, + 0x0196, 0x018a, 0x0180, 0x02df, 0x0167, 0x02c6, 0x0160, 0x000b, + 0x008b, 0x0081, 0x0043, 0x007d, 0x00f7, 0x00e9, 0x00e5, 0x00db, + 0x0189, 0x02e7, 0x02e1, 0x02d0, 0x0375, 0x0372, 0x01b7, 0x0004, + 0x00f3, 0x0078, 0x0076, 0x0073, 0x00e3, 0x00df, 0x018c, 0x02ea, + 0x02e6, 0x02e0, 0x02d1, 0x02c8, 0x02c2, 0x00df, 0x01b4, 0x0006, + 0x00ca, 0x00e0, 0x00de, 0x00da, 0x00d8, 0x0185, 0x0182, 0x017d, + 0x016c, 0x0378, 0x01bb, 0x02c3, 0x01b8, 0x01b5, 0x06c0, 0x0004, + 0x02eb, 0x00d3, 0x00d2, 0x00d0, 0x0172, 0x017b, 0x02de, 0x02d3, + 0x02ca, 0x06c7, 0x0373, 0x036d, 0x036c, 0x0d83, 0x0361, 0x0002, + 0x0179, 0x0171, 0x0066, 0x00bb, 0x02d6, 0x02d2, 0x0166, 0x02c7, + 0x02c5, 0x0362, 0x06c6, 0x0367, 0x0d82, 0x0366, 0x01b2, 0x0000, + 0x000c, 0x000a, 0x0007, 0x000b, 0x000a, 0x0011, 0x000b, 0x0009, + 0x000d, 0x000c, 0x000a, 0x0007, 0x0005, 0x0003, 0x0001, 0x0003, +}; + +static const uint8_t mpa_huffbits_16[256] = { + 1, 4, 6, 8, 9, 9, 10, 10, + 11, 11, 11, 12, 12, 12, 13, 9, + 3, 4, 6, 7, 8, 9, 9, 9, + 10, 10, 10, 11, 12, 11, 12, 8, + 6, 6, 7, 8, 9, 9, 10, 10, + 11, 10, 11, 11, 11, 12, 12, 9, + 8, 7, 8, 9, 9, 10, 10, 10, + 11, 11, 12, 12, 12, 13, 13, 10, + 9, 8, 9, 9, 10, 10, 11, 11, + 11, 12, 12, 12, 13, 13, 13, 9, + 9, 8, 9, 9, 10, 11, 11, 12, + 11, 12, 12, 13, 13, 13, 14, 10, + 10, 9, 9, 10, 11, 11, 11, 11, + 12, 12, 12, 12, 13, 13, 14, 10, + 10, 9, 10, 10, 11, 11, 11, 12, + 12, 13, 13, 13, 13, 15, 15, 10, + 10, 10, 10, 11, 11, 11, 12, 12, + 13, 13, 13, 13, 14, 14, 14, 10, + 11, 10, 10, 11, 11, 12, 12, 13, + 13, 13, 13, 14, 13, 14, 13, 11, + 11, 11, 10, 11, 12, 12, 12, 12, + 13, 14, 14, 14, 15, 15, 14, 10, + 12, 11, 11, 11, 12, 12, 13, 14, + 14, 14, 14, 14, 14, 13, 14, 11, + 12, 12, 12, 12, 12, 13, 13, 13, + 13, 15, 14, 14, 14, 14, 16, 11, + 14, 12, 12, 12, 13, 13, 14, 14, + 14, 16, 15, 15, 15, 17, 15, 11, + 13, 13, 11, 12, 14, 14, 13, 14, + 
14, 15, 16, 15, 17, 15, 14, 11, + 9, 8, 8, 9, 9, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 8, +}; + +static const uint16_t mpa_huffcodes_24[256] = { + 0x000f, 0x000d, 0x002e, 0x0050, 0x0092, 0x0106, 0x00f8, 0x01b2, + 0x01aa, 0x029d, 0x028d, 0x0289, 0x026d, 0x0205, 0x0408, 0x0058, + 0x000e, 0x000c, 0x0015, 0x0026, 0x0047, 0x0082, 0x007a, 0x00d8, + 0x00d1, 0x00c6, 0x0147, 0x0159, 0x013f, 0x0129, 0x0117, 0x002a, + 0x002f, 0x0016, 0x0029, 0x004a, 0x0044, 0x0080, 0x0078, 0x00dd, + 0x00cf, 0x00c2, 0x00b6, 0x0154, 0x013b, 0x0127, 0x021d, 0x0012, + 0x0051, 0x0027, 0x004b, 0x0046, 0x0086, 0x007d, 0x0074, 0x00dc, + 0x00cc, 0x00be, 0x00b2, 0x0145, 0x0137, 0x0125, 0x010f, 0x0010, + 0x0093, 0x0048, 0x0045, 0x0087, 0x007f, 0x0076, 0x0070, 0x00d2, + 0x00c8, 0x00bc, 0x0160, 0x0143, 0x0132, 0x011d, 0x021c, 0x000e, + 0x0107, 0x0042, 0x0081, 0x007e, 0x0077, 0x0072, 0x00d6, 0x00ca, + 0x00c0, 0x00b4, 0x0155, 0x013d, 0x012d, 0x0119, 0x0106, 0x000c, + 0x00f9, 0x007b, 0x0079, 0x0075, 0x0071, 0x00d7, 0x00ce, 0x00c3, + 0x00b9, 0x015b, 0x014a, 0x0134, 0x0123, 0x0110, 0x0208, 0x000a, + 0x01b3, 0x0073, 0x006f, 0x006d, 0x00d3, 0x00cb, 0x00c4, 0x00bb, + 0x0161, 0x014c, 0x0139, 0x012a, 0x011b, 0x0213, 0x017d, 0x0011, + 0x01ab, 0x00d4, 0x00d0, 0x00cd, 0x00c9, 0x00c1, 0x00ba, 0x00b1, + 0x00a9, 0x0140, 0x012f, 0x011e, 0x010c, 0x0202, 0x0179, 0x0010, + 0x014f, 0x00c7, 0x00c5, 0x00bf, 0x00bd, 0x00b5, 0x00ae, 0x014d, + 0x0141, 0x0131, 0x0121, 0x0113, 0x0209, 0x017b, 0x0173, 0x000b, + 0x029c, 0x00b8, 0x00b7, 0x00b3, 0x00af, 0x0158, 0x014b, 0x013a, + 0x0130, 0x0122, 0x0115, 0x0212, 0x017f, 0x0175, 0x016e, 0x000a, + 0x028c, 0x015a, 0x00ab, 0x00a8, 0x00a4, 0x013e, 0x0135, 0x012b, + 0x011f, 0x0114, 0x0107, 0x0201, 0x0177, 0x0170, 0x016a, 0x0006, + 0x0288, 0x0142, 0x013c, 0x0138, 0x0133, 0x012e, 0x0124, 0x011c, + 0x010d, 0x0105, 0x0200, 0x0178, 0x0172, 0x016c, 0x0167, 0x0004, + 0x026c, 0x012c, 0x0128, 0x0126, 0x0120, 0x011a, 0x0111, 0x010a, + 0x0203, 0x017c, 0x0176, 0x0171, 0x016d, 0x0169, 0x0165, 0x0002, + 
0x0409, 0x0118, 0x0116, 0x0112, 0x010b, 0x0108, 0x0103, 0x017e, + 0x017a, 0x0174, 0x016f, 0x016b, 0x0168, 0x0166, 0x0164, 0x0000, + 0x002b, 0x0014, 0x0013, 0x0011, 0x000f, 0x000d, 0x000b, 0x0009, + 0x0007, 0x0006, 0x0004, 0x0007, 0x0005, 0x0003, 0x0001, 0x0003, +}; + +static const uint8_t mpa_huffbits_24[256] = { + 4, 4, 6, 7, 8, 9, 9, 10, + 10, 11, 11, 11, 11, 11, 12, 9, + 4, 4, 5, 6, 7, 8, 8, 9, + 9, 9, 10, 10, 10, 10, 10, 8, + 6, 5, 6, 7, 7, 8, 8, 9, + 9, 9, 9, 10, 10, 10, 11, 7, + 7, 6, 7, 7, 8, 8, 8, 9, + 9, 9, 9, 10, 10, 10, 10, 7, + 8, 7, 7, 8, 8, 8, 8, 9, + 9, 9, 10, 10, 10, 10, 11, 7, + 9, 7, 8, 8, 8, 8, 9, 9, + 9, 9, 10, 10, 10, 10, 10, 7, + 9, 8, 8, 8, 8, 9, 9, 9, + 9, 10, 10, 10, 10, 10, 11, 7, + 10, 8, 8, 8, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 11, 11, 8, + 10, 9, 9, 9, 9, 9, 9, 9, + 9, 10, 10, 10, 10, 11, 11, 8, + 10, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 11, 11, 11, 8, + 11, 9, 9, 9, 9, 10, 10, 10, + 10, 10, 10, 11, 11, 11, 11, 8, + 11, 10, 9, 9, 9, 10, 10, 10, + 10, 10, 10, 11, 11, 11, 11, 8, + 11, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 11, 11, 11, 11, 11, 8, + 11, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 8, + 12, 10, 10, 10, 10, 10, 10, 11, + 11, 11, 11, 11, 11, 11, 11, 8, + 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 8, 8, 8, 8, 4, +}; + +static const HuffTable mpa_huff_tables[16] = { +{ 1, NULL, NULL }, +{ 2, mpa_huffbits_1, mpa_huffcodes_1 }, +{ 3, mpa_huffbits_2, mpa_huffcodes_2 }, +{ 3, mpa_huffbits_3, mpa_huffcodes_3 }, +{ 4, mpa_huffbits_5, mpa_huffcodes_5 }, +{ 4, mpa_huffbits_6, mpa_huffcodes_6 }, +{ 6, mpa_huffbits_7, mpa_huffcodes_7 }, +{ 6, mpa_huffbits_8, mpa_huffcodes_8 }, +{ 6, mpa_huffbits_9, mpa_huffcodes_9 }, +{ 8, mpa_huffbits_10, mpa_huffcodes_10 }, +{ 8, mpa_huffbits_11, mpa_huffcodes_11 }, +{ 8, mpa_huffbits_12, mpa_huffcodes_12 }, +{ 16, mpa_huffbits_13, mpa_huffcodes_13 }, +{ 16, mpa_huffbits_15, mpa_huffcodes_15 }, +{ 16, mpa_huffbits_16, mpa_huffcodes_16 }, +{ 16, mpa_huffbits_24, mpa_huffcodes_24 }, +}; + +static 
const uint8_t mpa_huff_data[32][2] = { +{ 0, 0 }, +{ 1, 0 }, +{ 2, 0 }, +{ 3, 0 }, +{ 0, 0 }, +{ 4, 0 }, +{ 5, 0 }, +{ 6, 0 }, +{ 7, 0 }, +{ 8, 0 }, +{ 9, 0 }, +{ 10, 0 }, +{ 11, 0 }, +{ 12, 0 }, +{ 0, 0 }, +{ 13, 0 }, +{ 14, 1 }, +{ 14, 2 }, +{ 14, 3 }, +{ 14, 4 }, +{ 14, 6 }, +{ 14, 8 }, +{ 14, 10 }, +{ 14, 13 }, +{ 15, 4 }, +{ 15, 5 }, +{ 15, 6 }, +{ 15, 7 }, +{ 15, 8 }, +{ 15, 9 }, +{ 15, 11 }, +{ 15, 13 }, +}; + + +/* huffman tables for quadruples */ +static const uint8_t mpa_quad_codes[2][16] = { + { 1, 5, 4, 5, 6, 5, 4, 4, 7, 3, 6, 0, 7, 2, 3, 1, }, + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, }, +}; + +static const uint8_t mpa_quad_bits[2][16] = { + { 1, 4, 4, 5, 4, 6, 5, 6, 4, 5, 5, 6, 5, 6, 6, 6, }, + { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, +}; + +/* band size tables */ +static const uint8_t band_size_long[9][22] = { +{ 4, 4, 4, 4, 4, 4, 6, 6, 8, 8, 10, + 12, 16, 20, 24, 28, 34, 42, 50, 54, 76, 158, }, /* 44100 */ +{ 4, 4, 4, 4, 4, 4, 6, 6, 6, 8, 10, + 12, 16, 18, 22, 28, 34, 40, 46, 54, 54, 192, }, /* 48000 */ +{ 4, 4, 4, 4, 4, 4, 6, 6, 8, 10, 12, + 16, 20, 24, 30, 38, 46, 56, 68, 84, 102, 26, }, /* 32000 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 22050 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 18, 22, 26, 32, 38, 46, 52, 64, 70, 76, 36, }, /* 24000 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 16000 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 11025 */ +{ 6, 6, 6, 6, 6, 6, 8, 10, 12, 14, 16, + 20, 24, 28, 32, 38, 46, 52, 60, 68, 58, 54, }, /* 12000 */ +{ 12, 12, 12, 12, 12, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 76, 90, 2, 2, 2, 2, 2, }, /* 8000 */ +}; + +static const uint8_t band_size_short[9][13] = { +{ 4, 4, 4, 4, 6, 8, 10, 12, 14, 18, 22, 30, 56, }, /* 44100 */ +{ 4, 4, 4, 4, 6, 6, 10, 12, 14, 16, 20, 26, 66, }, /* 48000 */ +{ 4, 4, 4, 4, 6, 8, 12, 16, 20, 26, 34, 42, 
12, }, /* 32000 */ +{ 4, 4, 4, 6, 6, 8, 10, 14, 18, 26, 32, 42, 18, }, /* 22050 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 32, 44, 12, }, /* 24000 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 16000 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 11025 */ +{ 4, 4, 4, 6, 8, 10, 12, 14, 18, 24, 30, 40, 18, }, /* 12000 */ +{ 8, 8, 8, 12, 16, 20, 24, 28, 36, 2, 2, 2, 26, }, /* 8000 */ +}; + +static const uint8_t mpa_pretab[2][22] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 2, 0 }, +}; + +/* table for alias reduction (XXX: store it as integer !) */ +static const float ci_table[8] = { + -0.6, -0.535, -0.33, -0.185, -0.095, -0.041, -0.0142, -0.0037, +}; + +#endif /* AVCODEC_MPEGAUDIODECTAB_H */ diff --git a/media/ffvpx/libavcodec/mpegaudiodsp.c b/media/ffvpx/libavcodec/mpegaudiodsp.c new file mode 100644 index 0000000000..5a5a679d91 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodsp.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/thread.h" +#include "mpegaudio.h" +#include "mpegaudiodsp.h" +#include "dct.h" +#include "dct32.h" + +static AVOnce mpadsp_table_init = AV_ONCE_INIT; + +static av_cold void mpadsp_init_tabs(void) +{ + int i, j; + /* compute mdct windows */ + for (i = 0; i < 36; i++) { + for (j = 0; j < 4; j++) { + double d; + + if (j == 2 && i % 3 != 1) + continue; + + d = sin(M_PI * (i + 0.5) / 36.0); + if (j == 1) { + if (i >= 30) d = 0; + else if (i >= 24) d = sin(M_PI * (i - 18 + 0.5) / 12.0); + else if (i >= 18) d = 1; + } else if (j == 3) { + if (i < 6) d = 0; + else if (i < 12) d = sin(M_PI * (i - 6 + 0.5) / 12.0); + else if (i < 18) d = 1; + } + //merge last stage of imdct into the window coefficients + d *= 0.5 * IMDCT_SCALAR / cos(M_PI * (2 * i + 19) / 72); + + if (j == 2) { + ff_mdct_win_float[j][i/3] = d / (1 << 5); + ff_mdct_win_fixed[j][i/3] = d / (1 << 5) * (1LL << 32) + 0.5; + } else { + int idx = i < 18 ? 
i : i + (MDCT_BUF_SIZE/2 - 18); + ff_mdct_win_float[j][idx] = d / (1 << 5); + ff_mdct_win_fixed[j][idx] = d / (1 << 5) * (1LL << 32) + 0.5; + } + } + } + + /* NOTE: we do frequency inversion after the MDCT by changing + the sign of the right window coefs */ + for (j = 0; j < 4; j++) { + for (i = 0; i < MDCT_BUF_SIZE; i += 2) { + ff_mdct_win_float[j + 4][i ] = ff_mdct_win_float[j][i ]; + ff_mdct_win_float[j + 4][i + 1] = -ff_mdct_win_float[j][i + 1]; + ff_mdct_win_fixed[j + 4][i ] = ff_mdct_win_fixed[j][i ]; + ff_mdct_win_fixed[j + 4][i + 1] = -ff_mdct_win_fixed[j][i + 1]; + } + } + +#if ARCH_X86 + ff_mpadsp_init_x86_tabs(); +#endif +} + +av_cold void ff_mpadsp_init(MPADSPContext *s) +{ + DCTContext dct; + + ff_dct_init(&dct, 5, DCT_II); + ff_thread_once(&mpadsp_table_init, &mpadsp_init_tabs); + + s->apply_window_float = ff_mpadsp_apply_window_float; + s->apply_window_fixed = ff_mpadsp_apply_window_fixed; + + s->dct32_float = dct.dct32; + s->dct32_fixed = ff_dct32_fixed; + + s->imdct36_blocks_float = ff_imdct36_blocks_float; + s->imdct36_blocks_fixed = ff_imdct36_blocks_fixed; + +#if ARCH_AARCH64 + ff_mpadsp_init_aarch64(s); +#elif ARCH_ARM + ff_mpadsp_init_arm(s); +#elif ARCH_PPC + ff_mpadsp_init_ppc(s); +#elif ARCH_X86 + ff_mpadsp_init_x86(s); +#endif +#if HAVE_MIPSFPU + ff_mpadsp_init_mipsfpu(s); +#endif +#if HAVE_MIPSDSP + ff_mpadsp_init_mipsdsp(s); +#endif +} diff --git a/media/ffvpx/libavcodec/mpegaudiodsp.h b/media/ffvpx/libavcodec/mpegaudiodsp.h new file mode 100644 index 0000000000..7bc635191a --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodsp.h @@ -0,0 +1,92 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGAUDIODSP_H +#define AVCODEC_MPEGAUDIODSP_H + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/macros.h" + +typedef struct MPADSPContext { + void (*apply_window_float)(float *synth_buf, float *window, + int *dither_state, float *samples, + ptrdiff_t incr); + void (*apply_window_fixed)(int32_t *synth_buf, int32_t *window, + int *dither_state, int16_t *samples, + ptrdiff_t incr); + void (*dct32_float)(float *dst, const float *src); + void (*dct32_fixed)(int *dst, const int *src); + + void (*imdct36_blocks_float)(float *out, float *buf, float *in, + int count, int switch_point, int block_type); + void (*imdct36_blocks_fixed)(int *out, int *buf, int *in, + int count, int switch_point, int block_type); +} MPADSPContext; + +void ff_mpadsp_init(MPADSPContext *s); + +extern int32_t ff_mpa_synth_window_fixed[]; +extern float ff_mpa_synth_window_float[]; + +extern const int32_t ff_mpa_enwindow[257]; + +void ff_mpa_synth_filter_fixed(MPADSPContext *s, + int32_t *synth_buf_ptr, int *synth_buf_offset, + int32_t *window, int *dither_state, + int16_t *samples, ptrdiff_t incr, + int32_t *sb_samples); + +void ff_mpa_synth_filter_float(MPADSPContext *s, + float *synth_buf_ptr, int *synth_buf_offset, + float *window, int *dither_state, + float *samples, ptrdiff_t incr, + float *sb_samples); + +void ff_mpadsp_init_aarch64(MPADSPContext *s); +void ff_mpadsp_init_arm(MPADSPContext *s); +void ff_mpadsp_init_ppc(MPADSPContext *s); +void ff_mpadsp_init_x86(MPADSPContext *s); +void 
ff_mpadsp_init_x86_tabs(void); +void ff_mpadsp_init_mipsfpu(MPADSPContext *s); +void ff_mpadsp_init_mipsdsp(MPADSPContext *s); + +void ff_mpa_synth_init_float(void); +void ff_mpa_synth_init_fixed(void); + +void ff_mpadsp_apply_window_float(float *synth_buf, float *window, + int *dither_state, float *samples, + ptrdiff_t incr); +void ff_mpadsp_apply_window_fixed(int32_t *synth_buf, int32_t *window, + int *dither_state, int16_t *samples, + ptrdiff_t incr); + +void ff_imdct36_blocks_float(float *out, float *buf, float *in, + int count, int switch_point, int block_type); + +void ff_imdct36_blocks_fixed(int *out, int *buf, int *in, + int count, int switch_point, int block_type); + +/** For SSE implementation, MDCT_BUF_SIZE/2 should be 128-bit aligned */ +#define MDCT_BUF_SIZE FFALIGN(36, 2*4) + +extern int ff_mdct_win_fixed[8][MDCT_BUF_SIZE]; +extern float ff_mdct_win_float[8][MDCT_BUF_SIZE]; + +#endif /* AVCODEC_MPEGAUDIODSP_H */ diff --git a/media/ffvpx/libavcodec/mpegaudiodsp_data.c b/media/ffvpx/libavcodec/mpegaudiodsp_data.c new file mode 100644 index 0000000000..4550de9b80 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodsp_data.c @@ -0,0 +1,56 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "mpegaudiodsp.h" + +/* half mpeg encoding window (full precision) */ +const int32_t ff_mpa_enwindow[257] = { + 0, -1, -1, -1, -1, -1, -1, -2, + -2, -2, -2, -3, -3, -4, -4, -5, + -5, -6, -7, -7, -8, -9, -10, -11, + -13, -14, -16, -17, -19, -21, -24, -26, + -29, -31, -35, -38, -41, -45, -49, -53, + -58, -63, -68, -73, -79, -85, -91, -97, + -104, -111, -117, -125, -132, -139, -147, -154, + -161, -169, -176, -183, -190, -196, -202, -208, + 213, 218, 222, 225, 227, 228, 228, 227, + 224, 221, 215, 208, 200, 189, 177, 163, + 146, 127, 106, 83, 57, 29, -2, -36, + -72, -111, -153, -197, -244, -294, -347, -401, + -459, -519, -581, -645, -711, -779, -848, -919, + -991, -1064, -1137, -1210, -1283, -1356, -1428, -1498, + -1567, -1634, -1698, -1759, -1817, -1870, -1919, -1962, + -2001, -2032, -2057, -2075, -2085, -2087, -2080, -2063, + 2037, 2000, 1952, 1893, 1822, 1739, 1644, 1535, + 1414, 1280, 1131, 970, 794, 605, 402, 185, + -45, -288, -545, -814, -1095, -1388, -1692, -2006, + -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788, + -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597, + -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585, + -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750, + -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134, + 6574, 5959, 5288, 4561, 3776, 2935, 2037, 1082, + 70, -998, -2122, -3300, -4533, -5818, -7154, -8540, + -9975,-11455,-12980,-14548,-16155,-17799,-19478,-21189, +-22929,-24694,-26482,-28289,-30112,-31947,-33791,-35640, +-37489,-39336,-41176,-43006,-44821,-46617,-48390,-50137, +-51853,-53534,-55178,-56778,-58333,-59838,-61289,-62684, +-64019,-65290,-66494,-67629,-68692,-69679,-70590,-71420, +-72169,-72835,-73415,-73908,-74313,-74630,-74856,-74992, + 75038, +}; diff --git 
a/media/ffvpx/libavcodec/mpegaudiodsp_fixed.c b/media/ffvpx/libavcodec/mpegaudiodsp_fixed.c new file mode 100644 index 0000000000..83c9d66095 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodsp_fixed.c @@ -0,0 +1,20 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define USE_FLOATS 0 +#include "mpegaudiodsp_template.c" diff --git a/media/ffvpx/libavcodec/mpegaudiodsp_float.c b/media/ffvpx/libavcodec/mpegaudiodsp_float.c new file mode 100644 index 0000000000..c45b136089 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodsp_float.c @@ -0,0 +1,20 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define USE_FLOATS 1 +#include "mpegaudiodsp_template.c" diff --git a/media/ffvpx/libavcodec/mpegaudiodsp_template.c b/media/ffvpx/libavcodec/mpegaudiodsp_template.c new file mode 100644 index 0000000000..fbbd94e486 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiodsp_template.c @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/mem_internal.h" +#include "libavutil/thread.h" + +#include "dct32.h" +#include "mathops.h" +#include "mpegaudiodsp.h" +#include "mpegaudio.h" + +#if USE_FLOATS +#define RENAME(n) n##_float + +static inline float round_sample(float *sum) +{ + float sum1=*sum; + *sum = 0; + return sum1; +} + +#define MACS(rt, ra, rb) rt+=(ra)*(rb) +#define MULS(ra, rb) ((ra)*(rb)) +#define MULH3(x, y, s) ((s)*(y)*(x)) +#define MLSS(rt, ra, rb) rt-=(ra)*(rb) +#define MULLx(x, y, s) ((y)*(x)) +#define FIXHR(x) ((float)(x)) +#define FIXR(x) ((float)(x)) +#define SHR(a,b) ((a)*(1.0f/(1<<(b)))) + +#else + +#define RENAME(n) n##_fixed +#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15) + +static inline int round_sample(int64_t *sum) +{ + int sum1; + sum1 = (int)((*sum) >> OUT_SHIFT); + *sum &= (1<<OUT_SHIFT)-1; + return av_clip_int16(sum1); +} + +# define MULS(ra, rb) MUL64(ra, rb) +# define MACS(rt, ra, rb) MAC64(rt, ra, rb) +# define MLSS(rt, ra, rb) MLS64(rt, ra, rb) +# define MULH3(x, y, s) MULH((s)*(x), y) +# define MULLx(x, y, s) MULL((int)(x),(y),s) +# define SHR(a,b) (((int)(a))>>(b)) +# define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) +# define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5)) +#endif + +/** Window for MDCT. Only the elements in [0,17] and + [MDCT_BUF_SIZE/2, MDCT_BUF_SIZE/2 + 17] are actually used. The rest + is just to preserve alignment for SIMD implementations. 
+*/ +DECLARE_ALIGNED(16, INTFLOAT, RENAME(ff_mdct_win))[8][MDCT_BUF_SIZE]; + +DECLARE_ALIGNED(16, MPA_INT, RENAME(ff_mpa_synth_window))[512+256]; + +#define SUM8(op, sum, w, p) \ +{ \ + op(sum, (w)[0 * 64], (p)[0 * 64]); \ + op(sum, (w)[1 * 64], (p)[1 * 64]); \ + op(sum, (w)[2 * 64], (p)[2 * 64]); \ + op(sum, (w)[3 * 64], (p)[3 * 64]); \ + op(sum, (w)[4 * 64], (p)[4 * 64]); \ + op(sum, (w)[5 * 64], (p)[5 * 64]); \ + op(sum, (w)[6 * 64], (p)[6 * 64]); \ + op(sum, (w)[7 * 64], (p)[7 * 64]); \ +} + +#define SUM8P2(sum1, op1, sum2, op2, w1, w2, p) \ +{ \ + INTFLOAT tmp;\ + tmp = p[0 * 64];\ + op1(sum1, (w1)[0 * 64], tmp);\ + op2(sum2, (w2)[0 * 64], tmp);\ + tmp = p[1 * 64];\ + op1(sum1, (w1)[1 * 64], tmp);\ + op2(sum2, (w2)[1 * 64], tmp);\ + tmp = p[2 * 64];\ + op1(sum1, (w1)[2 * 64], tmp);\ + op2(sum2, (w2)[2 * 64], tmp);\ + tmp = p[3 * 64];\ + op1(sum1, (w1)[3 * 64], tmp);\ + op2(sum2, (w2)[3 * 64], tmp);\ + tmp = p[4 * 64];\ + op1(sum1, (w1)[4 * 64], tmp);\ + op2(sum2, (w2)[4 * 64], tmp);\ + tmp = p[5 * 64];\ + op1(sum1, (w1)[5 * 64], tmp);\ + op2(sum2, (w2)[5 * 64], tmp);\ + tmp = p[6 * 64];\ + op1(sum1, (w1)[6 * 64], tmp);\ + op2(sum2, (w2)[6 * 64], tmp);\ + tmp = p[7 * 64];\ + op1(sum1, (w1)[7 * 64], tmp);\ + op2(sum2, (w2)[7 * 64], tmp);\ +} + +void RENAME(ff_mpadsp_apply_window)(MPA_INT *synth_buf, MPA_INT *window, + int *dither_state, OUT_INT *samples, + ptrdiff_t incr) +{ + register const MPA_INT *w, *w2, *p; + int j; + OUT_INT *samples2; +#if USE_FLOATS + float sum, sum2; +#else + int64_t sum, sum2; +#endif + + /* copy to avoid wrap */ + memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf)); + + samples2 = samples + 31 * incr; + w = window; + w2 = window + 31; + + sum = *dither_state; + p = synth_buf + 16; + SUM8(MACS, sum, w, p); + p = synth_buf + 48; + SUM8(MLSS, sum, w + 32, p); + *samples = round_sample(&sum); + samples += incr; + w++; + + /* we calculate two samples at the same time to avoid one memory + access per two sample */ + 
for(j=1;j<16;j++) { + sum2 = 0; + p = synth_buf + 16 + j; + SUM8P2(sum, MACS, sum2, MLSS, w, w2, p); + p = synth_buf + 48 - j; + SUM8P2(sum, MLSS, sum2, MLSS, w + 32, w2 + 32, p); + + *samples = round_sample(&sum); + samples += incr; + sum += sum2; + *samples2 = round_sample(&sum); + samples2 -= incr; + w++; + w2--; + } + + p = synth_buf + 32; + SUM8(MLSS, sum, w + 32, p); + *samples = round_sample(&sum); + *dither_state= sum; +} + +/* 32 sub band synthesis filter. Input: 32 sub band samples, Output: + 32 samples. */ +void RENAME(ff_mpa_synth_filter)(MPADSPContext *s, MPA_INT *synth_buf_ptr, + int *synth_buf_offset, + MPA_INT *window, int *dither_state, + OUT_INT *samples, ptrdiff_t incr, + MPA_INT *sb_samples) +{ + MPA_INT *synth_buf; + int offset; + + offset = *synth_buf_offset; + synth_buf = synth_buf_ptr + offset; + + s->RENAME(dct32)(synth_buf, sb_samples); + s->RENAME(apply_window)(synth_buf, window, dither_state, samples, incr); + + offset = (offset - 32) & 511; + *synth_buf_offset = offset; +} + +static av_cold void mpa_synth_init(MPA_INT *window) +{ + int i, j; + + /* max = 18760, max sum over all 16 coefs : 44736 */ + for(i=0;i<257;i++) { + INTFLOAT v; + v = ff_mpa_enwindow[i]; +#if USE_FLOATS + v *= 1.0 / (1LL<<(16 + FRAC_BITS)); +#endif + window[i] = v; + if ((i & 63) != 0) + v = -v; + if (i != 0) + window[512 - i] = v; + } + + + // Needed for avoiding shuffles in ASM implementations + for(i=0; i < 8; i++) + for(j=0; j < 16; j++) + window[512+16*i+j] = window[64*i+32-j]; + + for(i=0; i < 8; i++) + for(j=0; j < 16; j++) + window[512+128+16*i+j] = window[64*i+48-j]; +} + +static av_cold void mpa_synth_window_init(void) +{ + mpa_synth_init(RENAME(ff_mpa_synth_window)); +} + +av_cold void RENAME(ff_mpa_synth_init)(void) +{ + static AVOnce init_static_once = AV_ONCE_INIT; + ff_thread_once(&init_static_once, mpa_synth_window_init); +} + +/* cos(pi*i/18) */ +#define C1 FIXHR(0.98480775301220805936/2) +#define C2 FIXHR(0.93969262078590838405/2) +#define C3 
FIXHR(0.86602540378443864676/2) +#define C4 FIXHR(0.76604444311897803520/2) +#define C5 FIXHR(0.64278760968653932632/2) +#define C6 FIXHR(0.5/2) +#define C7 FIXHR(0.34202014332566873304/2) +#define C8 FIXHR(0.17364817766693034885/2) + +/* 0.5 / cos(pi*(2*i+1)/36) */ +static const INTFLOAT icos36[9] = { + FIXR(0.50190991877167369479), + FIXR(0.51763809020504152469), //0 + FIXR(0.55168895948124587824), + FIXR(0.61038729438072803416), + FIXR(0.70710678118654752439), //1 + FIXR(0.87172339781054900991), + FIXR(1.18310079157624925896), + FIXR(1.93185165257813657349), //2 + FIXR(5.73685662283492756461), +}; + +/* 0.5 / cos(pi*(2*i+1)/36), pre-divided before FIXHR: entries 0-5 by 2, entries 6-7 by 4; the ninth initializer is commented out */ +static const INTFLOAT icos36h[9] = { + FIXHR(0.50190991877167369479/2), + FIXHR(0.51763809020504152469/2), //0 + FIXHR(0.55168895948124587824/2), + FIXHR(0.61038729438072803416/2), + FIXHR(0.70710678118654752439/2), //1 + FIXHR(0.87172339781054900991/2), + FIXHR(1.18310079157624925896/4), + FIXHR(1.93185165257813657349/4), //2 +// FIXHR(5.73685662283492756461), +}; + +/* using Lee like decomposition followed by hand coded 9 points DCT */ +static void imdct36(INTFLOAT *out, INTFLOAT *buf, SUINTFLOAT *in, INTFLOAT *win) +{ + int i, j; + SUINTFLOAT t0, t1, t2, t3, s0, s1, s2, s3; + SUINTFLOAT tmp[18], *tmp1, *in1; + + for (i = 17; i >= 1; i--) + in[i] += in[i-1]; + for (i = 17; i >= 3; i -= 2) + in[i] += in[i-2]; + + for (j = 0; j < 2; j++) { + tmp1 = tmp + j; + in1 = in + j; + + t2 = in1[2*4] + in1[2*8] - in1[2*2]; + + t3 = in1[2*0] + SHR(in1[2*6],1); + t1 = in1[2*0] - in1[2*6]; + tmp1[ 6] = t1 - SHR(t2,1); + tmp1[16] = t1 + t2; + + t0 = MULH3(in1[2*2] + in1[2*4] , C2, 2); + t1 = MULH3(in1[2*4] - in1[2*8] , -2*C8, 1); + t2 = MULH3(in1[2*2] + in1[2*8] , -C4, 2); + + tmp1[10] = t3 - t0 - t2; + tmp1[ 2] = t3 + t0 + t1; + tmp1[14] = t3 + t2 - t1; + + tmp1[ 4] = MULH3(in1[2*5] + in1[2*7] - in1[2*1], -C3, 2); + t2 = MULH3(in1[2*1] + in1[2*5], C1, 2); + t3 = MULH3(in1[2*5] - in1[2*7], -2*C7, 1); + t0 = MULH3(in1[2*3], C3, 2); + + t1 = 
MULH3(in1[2*1] + in1[2*7], -C5, 2); + + tmp1[ 0] = t2 + t3 + t0; + tmp1[12] = t2 + t1 - t0; + tmp1[ 8] = t3 - t1 - t0; + } + + i = 0; + for (j = 0; j < 4; j++) { + t0 = tmp[i]; + t1 = tmp[i + 2]; + s0 = t1 + t0; + s2 = t1 - t0; + + t2 = tmp[i + 1]; + t3 = tmp[i + 3]; + s1 = MULH3(t3 + t2, icos36h[ j], 2); + s3 = MULLx(t3 - t2, icos36 [8 - j], FRAC_BITS); + + t0 = s0 + s1; + t1 = s0 - s1; + out[(9 + j) * SBLIMIT] = MULH3(t1, win[ 9 + j], 1) + buf[4*(9 + j)]; + out[(8 - j) * SBLIMIT] = MULH3(t1, win[ 8 - j], 1) + buf[4*(8 - j)]; + buf[4 * ( 9 + j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 9 + j], 1); + buf[4 * ( 8 - j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 8 - j], 1); + + t0 = s2 + s3; + t1 = s2 - s3; + out[(9 + 8 - j) * SBLIMIT] = MULH3(t1, win[ 9 + 8 - j], 1) + buf[4*(9 + 8 - j)]; + out[ j * SBLIMIT] = MULH3(t1, win[ j], 1) + buf[4*( j)]; + buf[4 * ( 9 + 8 - j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 9 + 8 - j], 1); + buf[4 * ( j )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + j], 1); + i += 4; + } + + s0 = tmp[16]; + s1 = MULH3(tmp[17], icos36h[4], 2); + t0 = s0 + s1; + t1 = s0 - s1; + out[(9 + 4) * SBLIMIT] = MULH3(t1, win[ 9 + 4], 1) + buf[4*(9 + 4)]; + out[(8 - 4) * SBLIMIT] = MULH3(t1, win[ 8 - 4], 1) + buf[4*(8 - 4)]; + buf[4 * ( 9 + 4 )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 9 + 4], 1); + buf[4 * ( 8 - 4 )] = MULH3(t0, win[MDCT_BUF_SIZE/2 + 8 - 4], 1); +} + +void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, + int count, int switch_point, int block_type) +{ + int j; + for (j=0 ; j < count; j++) { + /* apply window & overlap with previous buffer */ + + /* select window */ + int win_idx = (switch_point && j < 2) ? 0 : block_type; + INTFLOAT *win = RENAME(ff_mdct_win)[win_idx + (4 & -(j & 1))]; + + imdct36(out, buf, in, win); + + in += 18; + buf += ((j&3) != 3 ? 
1 : (72-3)); + out++; + } +} + diff --git a/media/ffvpx/libavcodec/mpegaudiotab.h b/media/ffvpx/libavcodec/mpegaudiotab.h new file mode 100644 index 0000000000..bb2e5de4ea --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiotab.h @@ -0,0 +1,102 @@ +/* + * mpeg audio layer 2 tables. Most of them come from the mpeg audio + * specification. + * + * Copyright (c) 2000, 2001 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * mpeg audio layer 2 tables. + * Most of them come from the mpeg audio specification. 
+ */ + +#ifndef AVCODEC_MPEGAUDIOTAB_H +#define AVCODEC_MPEGAUDIOTAB_H + +#include <stdint.h> +#include "mpegaudio.h" + +static const int costab32[30] = { + FIX(0.54119610014619701222), + FIX(1.3065629648763763537), + + FIX(0.50979557910415917998), + FIX(2.5629154477415054814), + FIX(0.89997622313641556513), + FIX(0.60134488693504528634), + + FIX(0.5024192861881556782), + FIX(5.1011486186891552563), + FIX(0.78815462345125020249), + FIX(0.64682178335999007679), + FIX(0.56694403481635768927), + FIX(1.0606776859903470633), + FIX(1.7224470982383341955), + FIX(0.52249861493968885462), + + FIX(10.19000812354803287), + FIX(0.674808341455005678), + FIX(1.1694399334328846596), + FIX(0.53104259108978413284), + FIX(2.0577810099534108446), + FIX(0.58293496820613388554), + FIX(0.83934964541552681272), + FIX(0.50547095989754364798), + FIX(3.4076084184687189804), + FIX(0.62250412303566482475), + FIX(0.97256823786196078263), + FIX(0.51544730992262455249), + FIX(1.4841646163141661852), + FIX(0.5531038960344445421), + FIX(0.74453627100229857749), + FIX(0.5006029982351962726), +}; + +static const int bitinv32[32] = { + 0, 16, 8, 24, 4, 20, 12, 28, + 2, 18, 10, 26, 6, 22, 14, 30, + 1, 17, 9, 25, 5, 21, 13, 29, + 3, 19, 11, 27, 7, 23, 15, 31 +}; + + +/* signal to noise ratio of each quantification step (could be + computed from quant_steps[]). The values are dB multiplied by 10 +*/ +static const unsigned short quant_snr[17] = { + 70, 110, 160, 208, + 253, 316, 378, 439, + 499, 559, 620, 680, + 740, 800, 861, 920, + 980 +}; + +/* fixed psycho acoustic model. 
Values of SNR taken from the 'toolame' + project */ +static const float fixed_smr[SBLIMIT] = { + 30, 17, 16, 10, 3, 12, 8, 2.5, + 5, 5, 6, 6, 5, 6, 10, 6, + -4, -10, -21, -30, -42, -55, -68, -75, + -75, -75, -75, -75, -91, -107, -110, -108 +}; + +static const unsigned char nb_scale_factors[4] = { 3, 2, 1, 2 }; + +#endif /* AVCODEC_MPEGAUDIOTAB_H */ diff --git a/media/ffvpx/libavcodec/mpegaudiotabs.c b/media/ffvpx/libavcodec/mpegaudiotabs.c new file mode 100644 index 0000000000..eaa380c808 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiotabs.c @@ -0,0 +1,22 @@ +/* + * MPEG Audio common tables + * copyright (c) 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "mpegaudiotabs.h" diff --git a/media/ffvpx/libavcodec/mpegaudiotabs.h b/media/ffvpx/libavcodec/mpegaudiotabs.h new file mode 100644 index 0000000000..671b83848d --- /dev/null +++ b/media/ffvpx/libavcodec/mpegaudiotabs.h @@ -0,0 +1,39 @@ +/* + * MPEG Audio common tables + * copyright (c) 2002 Fabrice Bellard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGAUDIOTABS_H +#define AVCODEC_MPEGAUDIOTABS_H + +#include <stdint.h> + +const uint16_t ff_mpa_bitrate_tab[2][3][15] = { + { { 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448 }, + { 0, 32, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320, 384 }, + { 0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320 } }, + { { 0, 32, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 192, 224, 256 }, + { 0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160 }, + { 0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160 } + } +}; + +const uint16_t ff_mpa_freq_tab[3] = { 44100, 48000, 32000 }; + +#endif diff --git a/media/ffvpx/libavcodec/mpegpicture.h b/media/ffvpx/libavcodec/mpegpicture.h new file mode 100644 index 0000000000..7919aa402c --- /dev/null +++ b/media/ffvpx/libavcodec/mpegpicture.h @@ -0,0 +1,105 @@ +/* + * Mpeg video formats-related defines and utility functions + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGPICTURE_H +#define AVCODEC_MPEGPICTURE_H + +#include <stdint.h> + +#include "libavutil/frame.h" + +#include "avcodec.h" +#include "motion_est.h" +#include "threadframe.h" + +#define MPEGVIDEO_MAX_PLANES 4 +#define MAX_PICTURE_COUNT 36 +#define EDGE_WIDTH 16 + +typedef struct ScratchpadContext { + uint8_t *edge_emu_buffer; ///< temporary buffer for if MVs point to out-of-frame data + uint8_t *rd_scratchpad; ///< scratchpad for rate distortion mb decision + uint8_t *obmc_scratchpad; + uint8_t *b_scratchpad; ///< scratchpad used for writing into write only buffers +} ScratchpadContext; + +/** + * Picture. 
+ */ +typedef struct Picture { + struct AVFrame *f; + ThreadFrame tf; + + AVBufferRef *qscale_table_buf; + int8_t *qscale_table; + + AVBufferRef *motion_val_buf[2]; + int16_t (*motion_val[2])[2]; + + AVBufferRef *mb_type_buf; + uint32_t *mb_type; ///< types and macros are defined in mpegutils.h + + AVBufferRef *mbskip_table_buf; + uint8_t *mbskip_table; + + AVBufferRef *ref_index_buf[2]; + int8_t *ref_index[2]; + + int alloc_mb_width; ///< mb_width used to allocate tables + int alloc_mb_height; ///< mb_height used to allocate tables + int alloc_mb_stride; ///< mb_stride used to allocate tables + + AVBufferRef *hwaccel_priv_buf; + void *hwaccel_picture_private; ///< Hardware accelerator private data + + int field_picture; ///< whether or not the picture was encoded in separate fields + + int b_frame_score; + int needs_realloc; ///< Picture needs to be reallocated (eg due to a frame size change) + + int reference; + int shared; + + int display_picture_number; + int coded_picture_number; +} Picture; + +/** + * Allocate a Picture. + * The pixels are allocated/set by calling get_buffer() if shared = 0. 
+ */ +int ff_alloc_picture(AVCodecContext *avctx, Picture *pic, MotionEstContext *me, + ScratchpadContext *sc, int shared, int encoding, + int chroma_x_shift, int chroma_y_shift, int out_format, + int mb_stride, int mb_width, int mb_height, int b8_stride, + ptrdiff_t *linesize, ptrdiff_t *uvlinesize); + +int ff_mpeg_framesize_alloc(AVCodecContext *avctx, MotionEstContext *me, + ScratchpadContext *sc, int linesize); + +int ff_mpeg_ref_picture(AVCodecContext *avctx, Picture *dst, Picture *src); +void ff_mpeg_unref_picture(AVCodecContext *avctx, Picture *picture); + +void ff_mpv_picture_free(AVCodecContext *avctx, Picture *pic); +int ff_update_picture_tables(Picture *dst, const Picture *src); + +int ff_find_unused_picture(AVCodecContext *avctx, Picture *picture, int shared); + +#endif /* AVCODEC_MPEGPICTURE_H */ diff --git a/media/ffvpx/libavcodec/mpegutils.h b/media/ffvpx/libavcodec/mpegutils.h new file mode 100644 index 0000000000..386110bb8c --- /dev/null +++ b/media/ffvpx/libavcodec/mpegutils.h @@ -0,0 +1,142 @@ +/* + * Mpeg video formats-related defines and utility functions + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGUTILS_H +#define AVCODEC_MPEGUTILS_H + +#include <stdint.h> + +#include "libavutil/frame.h" + +#include "avcodec.h" + +/** + * Return value for header parsers if frame is not coded. + * */ +#define FRAME_SKIPPED 100 + +/* picture type */ +#define PICT_TOP_FIELD 1 +#define PICT_BOTTOM_FIELD 2 +#define PICT_FRAME 3 + +#define MAX_MB_BYTES (30 * 16 * 16 * 3 / 8 + 120) +#define MAX_FCODE 7 + +/* MB types */ +#define MB_TYPE_INTRA4x4 (1 << 0) +#define MB_TYPE_INTRA16x16 (1 << 1) // FIXME H.264-specific +#define MB_TYPE_INTRA_PCM (1 << 2) // FIXME H.264-specific +#define MB_TYPE_16x16 (1 << 3) +#define MB_TYPE_16x8 (1 << 4) +#define MB_TYPE_8x16 (1 << 5) +#define MB_TYPE_8x8 (1 << 6) +#define MB_TYPE_INTERLACED (1 << 7) +#define MB_TYPE_DIRECT2 (1 << 8) // FIXME +#define MB_TYPE_ACPRED (1 << 9) +#define MB_TYPE_GMC (1 << 10) +#define MB_TYPE_SKIP (1 << 11) +#define MB_TYPE_P0L0 (1 << 12) +#define MB_TYPE_P1L0 (1 << 13) +#define MB_TYPE_P0L1 (1 << 14) +#define MB_TYPE_P1L1 (1 << 15) +#define MB_TYPE_L0 (MB_TYPE_P0L0 | MB_TYPE_P1L0) +#define MB_TYPE_L1 (MB_TYPE_P0L1 | MB_TYPE_P1L1) +#define MB_TYPE_L0L1 (MB_TYPE_L0 | MB_TYPE_L1) +#define MB_TYPE_QUANT (1 << 16) +#define MB_TYPE_CBP (1 << 17) + +#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 // default mb_type if there is just one type + +#define IS_INTRA4x4(a) ((a) & MB_TYPE_INTRA4x4) +#define IS_INTRA16x16(a) ((a) & MB_TYPE_INTRA16x16) +#define IS_PCM(a) ((a) & MB_TYPE_INTRA_PCM) +#define IS_INTRA(a) ((a) & 7) +#define IS_INTER(a) ((a) & (MB_TYPE_16x16 | MB_TYPE_16x8 | \ + MB_TYPE_8x16 | MB_TYPE_8x8)) +#define IS_SKIP(a) ((a) & MB_TYPE_SKIP) +#define IS_INTRA_PCM(a) ((a) & MB_TYPE_INTRA_PCM) +#define IS_INTERLACED(a) ((a) & MB_TYPE_INTERLACED) +#define IS_DIRECT(a) ((a) & 
MB_TYPE_DIRECT2) +#define IS_GMC(a) ((a) & MB_TYPE_GMC) +#define IS_16X16(a) ((a) & MB_TYPE_16x16) +#define IS_16X8(a) ((a) & MB_TYPE_16x8) +#define IS_8X16(a) ((a) & MB_TYPE_8x16) +#define IS_8X8(a) ((a) & MB_TYPE_8x8) +#define IS_SUB_8X8(a) ((a) & MB_TYPE_16x16) // note reused +#define IS_SUB_8X4(a) ((a) & MB_TYPE_16x8) // note reused +#define IS_SUB_4X8(a) ((a) & MB_TYPE_8x16) // note reused +#define IS_SUB_4X4(a) ((a) & MB_TYPE_8x8) // note reused +#define IS_ACPRED(a) ((a) & MB_TYPE_ACPRED) +#define IS_QUANT(a) ((a) & MB_TYPE_QUANT) +#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0 << ((part) + 2 * (list)))) + +// does this mb use listX, note does not work if subMBs +#define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0 | MB_TYPE_P1L0) << (2 * (list)))) + +#define HAS_CBP(a) ((a) & MB_TYPE_CBP) + +/* MB types for encoding */ +#define CANDIDATE_MB_TYPE_INTRA (1 << 0) +#define CANDIDATE_MB_TYPE_INTER (1 << 1) +#define CANDIDATE_MB_TYPE_INTER4V (1 << 2) +#define CANDIDATE_MB_TYPE_SKIPPED (1 << 3) + +#define CANDIDATE_MB_TYPE_DIRECT (1 << 4) +#define CANDIDATE_MB_TYPE_FORWARD (1 << 5) +#define CANDIDATE_MB_TYPE_BACKWARD (1 << 6) +#define CANDIDATE_MB_TYPE_BIDIR (1 << 7) + +#define CANDIDATE_MB_TYPE_INTER_I (1 << 8) +#define CANDIDATE_MB_TYPE_FORWARD_I (1 << 9) +#define CANDIDATE_MB_TYPE_BACKWARD_I (1 << 10) +#define CANDIDATE_MB_TYPE_BIDIR_I (1 << 11) + +#define CANDIDATE_MB_TYPE_DIRECT0 (1 << 12) + +#define INPLACE_OFFSET 16 + +enum OutputFormat { + FMT_MPEG1, + FMT_H261, + FMT_H263, + FMT_MJPEG, + FMT_SPEEDHQ, +}; + + +/** + * Draw a horizontal band if supported. + * + * @param h is the normal height, this will be reduced automatically if needed + */ +void ff_draw_horiz_band(AVCodecContext *avctx, const AVFrame *cur, const AVFrame *last, + int y, int h, int picture_structure, int first_field, + int low_delay); + +/** + * Print debugging info for the given picture. 
+ */ +void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, + const uint8_t *mbskip_table, const uint32_t *mbtype_table, + const int8_t *qscale_table, int16_t (*const motion_val[2])[2], + int mb_width, int mb_height, int mb_stride, int quarter_sample); + +#endif /* AVCODEC_MPEGUTILS_H */ diff --git a/media/ffvpx/libavcodec/mpegvideo.h b/media/ffvpx/libavcodec/mpegvideo.h new file mode 100644 index 0000000000..55828e6102 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegvideo.h @@ -0,0 +1,612 @@ +/* + * Generic DCT based hybrid video encoder + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * mpegvideo header. 
+ */ + +#ifndef AVCODEC_MPEGVIDEO_H +#define AVCODEC_MPEGVIDEO_H + +#include "avcodec.h" +#include "blockdsp.h" +#include "error_resilience.h" +#include "fdctdsp.h" +#include "get_bits.h" +#include "h264chroma.h" +#include "h263dsp.h" +#include "hpeldsp.h" +#include "idctdsp.h" +#include "me_cmp.h" +#include "motion_est.h" +#include "mpegpicture.h" +#include "mpegvideoencdsp.h" +#include "pixblockdsp.h" +#include "put_bits.h" +#include "ratecontrol.h" +#include "mpegutils.h" +#include "qpeldsp.h" +#include "videodsp.h" + +#define MAX_THREADS 32 + +#define MAX_B_FRAMES 16 + +/** + * Scantable. + */ +typedef struct ScanTable { + const uint8_t *scantable; + uint8_t permutated[64]; + uint8_t raster_end[64]; +} ScanTable; + +/** + * MpegEncContext. + */ +typedef struct MpegEncContext { + AVClass *class; + + int y_dc_scale, c_dc_scale; + int ac_pred; + int block_last_index[12]; ///< last non zero coefficient in block + int h263_aic; ///< Advanced INTRA Coding (AIC) + + /* scantables */ + ScanTable inter_scantable; ///< if inter == intra then intra should be used to reduce the cache usage + + /* WARNING: changes above this line require updates to hardcoded + * offsets used in ASM. */ + + ScanTable intra_scantable; + uint8_t permutated_intra_h_scantable[64]; + uint8_t permutated_intra_v_scantable[64]; + + struct AVCodecContext *avctx; + /* The following pointer is intended for codecs sharing code + * between decoder and encoder and in need of a common context to do so. */ + void *private_ctx; + /* the following parameters must be initialized before encoding */ + int width, height;///< picture size. 
must be a multiple of 16 + int gop_size; + int intra_only; ///< if true, only intra pictures are generated + int64_t bit_rate; ///< wanted bit rate + enum OutputFormat out_format; ///< output format + int h263_pred; ///< use MPEG-4/H.263 ac/dc predictions + int pb_frame; ///< PB-frame mode (0 = none, 1 = base, 2 = improved) + +/* the following codec id fields are deprecated in favor of codec_id */ + int h263_plus; ///< H.263+ headers + int h263_flv; ///< use flv H.263 header + + enum AVCodecID codec_id; /* see AV_CODEC_ID_xxx */ + int fixed_qscale; ///< fixed qscale if non zero + int encoding; ///< true if we are encoding (vs decoding) + int max_b_frames; ///< max number of B-frames for encoding + int luma_elim_threshold; + int chroma_elim_threshold; + int workaround_bugs; ///< workaround bugs in encoders which cannot be detected automatically + int codec_tag; ///< internal codec_tag upper case converted from avctx codec_tag + /* the following fields are managed internally by the encoder */ + + /* sequence parameters */ + int context_initialized; + int input_picture_number; ///< used to set pic->display_picture_number, should not be used for/by anything else + int coded_picture_number; ///< used to set pic->coded_picture_number, should not be used for/by anything else + int picture_number; //FIXME remove, unclear definition + int extradata_parsed; + int picture_in_gop_number; ///< 0-> first pic in gop, ... 
+ int mb_width, mb_height; ///< number of MBs horizontally & vertically + int mb_stride; ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11 + int b8_stride; ///< 2*mb_width+1 used for some 8x8 block arrays to allow simple addressing + int h_edge_pos, v_edge_pos;///< horizontal / vertical position of the right/bottom edge (pixel replication) + int mb_num; ///< number of MBs of a picture + ptrdiff_t linesize; ///< line size, in bytes, may be different from width + ptrdiff_t uvlinesize; ///< line size, for chroma in bytes, may be different from width + Picture *picture; ///< main picture buffer + Picture **input_picture; ///< next pictures on display order for encoding + Picture **reordered_input_picture; ///< pointer to the next pictures in coded order for encoding + + int64_t user_specified_pts; ///< last non-zero pts from AVFrame which was passed into avcodec_send_frame() + /** + * pts difference between the first and second input frame, used for + * calculating dts of the first frame when there's a delay */ + int64_t dts_delta; + /** + * reordered pts to be used as dts for the next output frame when there's + * a delay */ + int64_t reordered_pts; + + /** bit output */ + PutBitContext pb; + + int start_mb_y; ///< start mb_y of this thread (so current thread should process start_mb_y <= row < end_mb_y) + int end_mb_y; ///< end mb_y of this thread (so current thread should process start_mb_y <= row < end_mb_y) + struct MpegEncContext *thread_context[MAX_THREADS]; + int slice_context_count; ///< number of used thread_contexts + + /** + * copy of the previous picture structure. + * note, linesize & data, might not match the previous picture (for field pictures) + */ + Picture last_picture; + + /** + * copy of the next picture structure. + * note, linesize & data, might not match the next picture (for field pictures) + */ + Picture next_picture; + + /** + * Reference to the source picture for encoding. 
+ * note, linesize & data, might not match the source picture (for field pictures) + */ + AVFrame *new_picture; + + /** + * copy of the current picture structure. + * note, linesize & data, might not match the current picture (for field pictures) + */ + Picture current_picture; ///< buffer to store the decompressed current picture + + Picture *last_picture_ptr; ///< pointer to the previous picture. + Picture *next_picture_ptr; ///< pointer to the next picture (for bidir pred) + Picture *current_picture_ptr; ///< pointer to the current picture + int skipped_last_frame; + int last_dc[3]; ///< last DC values for MPEG-1 + int16_t *dc_val_base; + int16_t *dc_val[3]; ///< used for MPEG-4 DC prediction, all 3 arrays must be continuous + const uint8_t *y_dc_scale_table; ///< qscale -> y_dc_scale table + const uint8_t *c_dc_scale_table; ///< qscale -> c_dc_scale table + const uint8_t *chroma_qscale_table; ///< qscale -> chroma_qscale (H.263) + uint8_t *coded_block_base; + uint8_t *coded_block; ///< used for coded block pattern prediction (msmpeg4v3, wmv1) + int16_t (*ac_val_base)[16]; + int16_t (*ac_val[3])[16]; ///< used for MPEG-4 AC prediction, all 3 arrays must be continuous + int mb_skipped; ///< MUST BE SET only during DECODING + uint8_t *mbskip_table; /**< used to avoid copy if macroblock skipped (for black regions for example) + and used for B-frame encoding & decoding (contains skip table of next P-frame) */ + uint8_t *mbintra_table; ///< used to avoid setting {ac, dc, cbp}-pred stuff to zero on inter MB decoding + uint8_t *cbp_table; ///< used to store cbp, ac_pred for partitioned decoding + uint8_t *pred_dir_table; ///< used to store pred_dir for partitioned decoding + + ScratchpadContext sc; + + int qscale; ///< QP + int chroma_qscale; ///< chroma QP + unsigned int lambda; ///< Lagrange multiplier used in rate distortion + unsigned int lambda2; ///< (lambda*lambda) >> FF_LAMBDA_SHIFT + int *lambda_table; + int adaptive_quant; ///< use adaptive quantization + int 
dquant; ///< qscale difference to prev qscale + int pict_type; ///< AV_PICTURE_TYPE_I, AV_PICTURE_TYPE_P, AV_PICTURE_TYPE_B, ... + int vbv_delay; + int last_pict_type; //FIXME removes + int last_non_b_pict_type; ///< used for MPEG-4 gmc B-frames & ratecontrol + int droppable; + int last_lambda_for[5]; ///< last lambda for a specific pict type + int skipdct; ///< skip dct and code zero residual + + /* motion compensation */ + int unrestricted_mv; ///< mv can point outside of the coded picture + int h263_long_vectors; ///< use horrible H.263v1 long vector mode + + BlockDSPContext bdsp; + FDCTDSPContext fdsp; + H264ChromaContext h264chroma; + HpelDSPContext hdsp; + IDCTDSPContext idsp; + MECmpContext mecc; + MpegvideoEncDSPContext mpvencdsp; + PixblockDSPContext pdsp; + QpelDSPContext qdsp; + VideoDSPContext vdsp; + H263DSPContext h263dsp; + int f_code; ///< forward MV resolution + int b_code; ///< backward MV resolution for B-frames (MPEG-4) + int16_t (*p_mv_table_base)[2]; + int16_t (*b_forw_mv_table_base)[2]; + int16_t (*b_back_mv_table_base)[2]; + int16_t (*b_bidir_forw_mv_table_base)[2]; + int16_t (*b_bidir_back_mv_table_base)[2]; + int16_t (*b_direct_mv_table_base)[2]; + int16_t (*p_field_mv_table_base)[2]; + int16_t (*b_field_mv_table_base)[2]; + int16_t (*p_mv_table)[2]; ///< MV table (1MV per MB) P-frame encoding + int16_t (*b_forw_mv_table)[2]; ///< MV table (1MV per MB) forward mode B-frame encoding + int16_t (*b_back_mv_table)[2]; ///< MV table (1MV per MB) backward mode B-frame encoding + int16_t (*b_bidir_forw_mv_table)[2]; ///< MV table (1MV per MB) bidir mode B-frame encoding + int16_t (*b_bidir_back_mv_table)[2]; ///< MV table (1MV per MB) bidir mode B-frame encoding + int16_t (*b_direct_mv_table)[2]; ///< MV table (1MV per MB) direct mode B-frame encoding + int16_t (*p_field_mv_table[2][2])[2]; ///< MV table (2MV per MB) interlaced P-frame encoding + int16_t (*b_field_mv_table[2][2][2])[2];///< MV table (4MV per MB) interlaced B-frame encoding + 
uint8_t (*p_field_select_table[2]); ///< Only the first element is allocated + uint8_t (*b_field_select_table[2][2]); ///< Only the first element is allocated + + /* The following fields are encoder-only */ + uint16_t *mb_var; ///< Table for MB variances + uint16_t *mc_mb_var; ///< Table for motion compensated MB variances + uint8_t *mb_mean; ///< Table for MB luminance + int64_t mb_var_sum; ///< sum of MB variance for current frame + int64_t mc_mb_var_sum; ///< motion compensated MB variance for current frame + uint64_t encoding_error[MPEGVIDEO_MAX_PLANES]; + + int motion_est; ///< ME algorithm + int me_penalty_compensation; + int me_pre; ///< prepass for motion estimation + int mv_dir; +#define MV_DIR_FORWARD 1 +#define MV_DIR_BACKWARD 2 +#define MV_DIRECT 4 ///< bidirectional mode where the difference equals the MV of the last P/S/I-Frame (MPEG-4) + int mv_type; +#define MV_TYPE_16X16 0 ///< 1 vector for the whole mb +#define MV_TYPE_8X8 1 ///< 4 vectors (H.263, MPEG-4 4MV) +#define MV_TYPE_16X8 2 ///< 2 vectors, one per 16x8 block +#define MV_TYPE_FIELD 3 ///< 2 vectors, one per field +#define MV_TYPE_DMV 4 ///< 2 vectors, special mpeg2 Dual Prime Vectors + /**motion vectors for a macroblock + first coordinate : 0 = forward 1 = backward + second " : depend on type + third " : 0 = x, 1 = y + */ + int mv[2][4][2]; + int field_select[2][2]; + int last_mv[2][2][2]; ///< last MV, used for MV prediction in MPEG-1 & B-frame MPEG-4 + const uint8_t *fcode_tab; ///< smallest fcode needed for each MV + int16_t direct_scale_mv[2][64]; ///< precomputed to avoid divisions in ff_mpeg4_set_direct_mv + + MotionEstContext me; + + int no_rounding; /**< apply no rounding to motion compensation (MPEG-4, msmpeg4, ...) 
+ for B-frames rounding mode is always 0 */ + + /* macroblock layer */ + int mb_x, mb_y; + int mb_skip_run; + int mb_intra; + uint16_t *mb_type; ///< Table for candidate MB types for encoding (defines in mpegutils.h) + + int block_index[6]; ///< index to current MB in block based arrays with edges + int block_wrap[6]; + uint8_t *dest[3]; + + int *mb_index2xy; ///< mb_index -> mb_x + mb_y*mb_stride + + /** matrix transmitted in the bitstream */ + uint16_t intra_matrix[64]; + uint16_t chroma_intra_matrix[64]; + uint16_t inter_matrix[64]; + uint16_t chroma_inter_matrix[64]; + + int intra_quant_bias; ///< bias for the quantizer + int inter_quant_bias; ///< bias for the quantizer + int min_qcoeff; ///< minimum encodable coefficient + int max_qcoeff; ///< maximum encodable coefficient + int ac_esc_length; ///< num of bits needed to encode the longest esc + uint8_t *intra_ac_vlc_length; + uint8_t *intra_ac_vlc_last_length; + uint8_t *intra_chroma_ac_vlc_length; + uint8_t *intra_chroma_ac_vlc_last_length; + uint8_t *inter_ac_vlc_length; + uint8_t *inter_ac_vlc_last_length; + uint8_t *luma_dc_vlc_length; + + int coded_score[12]; + + /** precomputed matrix (combine qscale and DCT renorm) */ + int (*q_intra_matrix)[64]; + int (*q_chroma_intra_matrix)[64]; + int (*q_inter_matrix)[64]; + /** identical to the above but for MMX & these are not permutated, second 64 entries are bias*/ + uint16_t (*q_intra_matrix16)[2][64]; + uint16_t (*q_chroma_intra_matrix16)[2][64]; + uint16_t (*q_inter_matrix16)[2][64]; + + /* noise reduction */ + int (*dct_error_sum)[64]; + int dct_count[2]; + uint16_t (*dct_offset)[64]; + + /* bit rate control */ + int64_t total_bits; + int frame_bits; ///< bits used for the current frame + int stuffing_bits; ///< bits used for stuffing + int next_lambda; ///< next lambda used for retrying to encode a frame + RateControlContext rc_context; ///< contains stuff only accessed in ratecontrol.c + + /* statistics, used for 2-pass encoding */ + int mv_bits; + int 
header_bits; + int i_tex_bits; + int p_tex_bits; + int i_count; + int skip_count; + int misc_bits; ///< cbp, mb_type + int last_bits; ///< temp var used for calculating the above vars + + /* error concealment / resync */ + int resync_mb_x; ///< x position of last resync marker + int resync_mb_y; ///< y position of last resync marker + GetBitContext last_resync_gb; ///< used to search for the next resync marker + int mb_num_left; ///< number of MBs left in this video packet (for partitioned Slices only) + + /* H.263 specific */ + int gob_index; + int obmc; ///< overlapped block motion compensation + int mb_info; ///< interval for outputting info about mb offsets as side data + int prev_mb_info, last_mb_info; + uint8_t *mb_info_ptr; + int mb_info_size; + int ehc_mode; + + /* H.263+ specific */ + int umvplus; ///< == H.263+ && unrestricted_mv + int h263_aic_dir; ///< AIC direction: 0 = left, 1 = top + int h263_slice_structured; + int alt_inter_vlc; ///< alternative inter vlc + int modified_quant; + int loop_filter; + int custom_pcf; + + /* MPEG-4 specific */ + int studio_profile; + int dct_precision; + ///< number of bits to represent the fractional part of time (encoder only) + int time_increment_bits; + int last_time_base; + int time_base; ///< time in seconds of last I,P,S Frame + int64_t time; ///< time of current frame + int64_t last_non_b_time; + uint16_t pp_time; ///< time distance between the last 2 p,s,i frames + uint16_t pb_time; ///< time distance between the last b and p,s,i frame + uint16_t pp_field_time; + uint16_t pb_field_time; ///< like above, just for interlaced + int mcsel; + int quant_precision; + int quarter_sample; ///< 1->qpel, 0->half pel ME/MC + int data_partitioning; ///< data partitioning flag from header + int partitioned_frame; ///< is current frame partitioned + int low_delay; ///< no reordering needed / has no B-frames + PutBitContext tex_pb; ///< used for data partitioned VOPs + PutBitContext pb2; ///< used for data partitioned VOPs + 
int mpeg_quant; + int padding_bug_score; ///< used to detect the VERY common padding bug in MPEG-4 + + /* divx specific, used to workaround (many) bugs in divx5 */ + int divx_packed; + uint8_t *bitstream_buffer; //Divx 5.01 puts several frames in a single one, this is used to reorder them + int bitstream_buffer_size; + unsigned int allocated_bitstream_buffer_size; + + /* RV10 specific */ + int rv10_version; ///< RV10 version: 0 or 3 + int rv10_first_dc_coded[3]; + + /* MJPEG specific */ + struct MJpegContext *mjpeg_ctx; + int esc_pos; + + /* MSMPEG4 specific */ + int mv_table_index; + int rl_table_index; + int rl_chroma_table_index; + int dc_table_index; + int use_skip_mb_code; + int slice_height; ///< in macroblocks + int first_slice_line; ///< used in MPEG-4 too to handle resync markers + int flipflop_rounding; + int msmpeg4_version; ///< 0=not msmpeg4, 1=mp41, 2=mp42, 3=mp43/divx3 4=wmv1/7 5=wmv2/8 + int per_mb_rl_table; + int esc3_level_length; + int esc3_run_length; + int inter_intra_pred; + int mspel; + + /* decompression specific */ + GetBitContext gb; + + /* MPEG-1 specific */ + int last_mv_dir; ///< last mv_dir, used for B-frame encoding + int vbv_delay_pos; ///< offset of vbv_delay in the bitstream + + /* MPEG-2-specific - I wished not to have to support this mess. 
*/ + int progressive_sequence; + int mpeg_f_code[2][2]; + + // picture structure defines are loaded from mpegutils.h + int picture_structure; + + int intra_dc_precision; + int frame_pred_frame_dct; + int top_field_first; + int concealment_motion_vectors; + int q_scale_type; + int brd_scale; + int intra_vlc_format; + int alternate_scan; + int repeat_first_field; + int chroma_420_type; + int chroma_format; +#define CHROMA_420 1 +#define CHROMA_422 2 +#define CHROMA_444 3 + int chroma_x_shift;//depend on pix_format, that depend on chroma_format + int chroma_y_shift; + + int progressive_frame; + int full_pel[2]; + int interlaced_dct; + int first_field; ///< is 1 for the first field of a field picture 0 otherwise + + /* RTP specific */ + int rtp_mode; + int rtp_payload_size; + + uint8_t *ptr_lastgob; + int16_t (*pblocks[12])[64]; + + int16_t (*block)[64]; ///< points to one of the following blocks + int16_t (*blocks)[12][64]; // for HQ mode we need to keep the best block + int (*decode_mb)(struct MpegEncContext *s, int16_t block[12][64]); // used by some codecs to avoid a switch() + +#define SLICE_OK 0 +#define SLICE_ERROR -1 +#define SLICE_END -2 ///<end marker found +#define SLICE_NOEND -3 ///<no end marker or error found but mb count exceeded + + void (*dct_unquantize_mpeg1_intra)(struct MpegEncContext *s, + int16_t *block/*align 16*/, int n, int qscale); + void (*dct_unquantize_mpeg1_inter)(struct MpegEncContext *s, + int16_t *block/*align 16*/, int n, int qscale); + void (*dct_unquantize_mpeg2_intra)(struct MpegEncContext *s, + int16_t *block/*align 16*/, int n, int qscale); + void (*dct_unquantize_mpeg2_inter)(struct MpegEncContext *s, + int16_t *block/*align 16*/, int n, int qscale); + void (*dct_unquantize_h263_intra)(struct MpegEncContext *s, + int16_t *block/*align 16*/, int n, int qscale); + void (*dct_unquantize_h263_inter)(struct MpegEncContext *s, + int16_t *block/*align 16*/, int n, int qscale); + void (*dct_unquantize_intra)(struct MpegEncContext *s, // 
unquantizer to use (MPEG-4 can use both) + int16_t *block/*align 16*/, int n, int qscale); + void (*dct_unquantize_inter)(struct MpegEncContext *s, // unquantizer to use (MPEG-4 can use both) + int16_t *block/*align 16*/, int n, int qscale); + int (*dct_quantize)(struct MpegEncContext *s, int16_t *block/*align 16*/, int n, int qscale, int *overflow); + int (*fast_dct_quantize)(struct MpegEncContext *s, int16_t *block/*align 16*/, int n, int qscale, int *overflow); + void (*denoise_dct)(struct MpegEncContext *s, int16_t *block); + + int mpv_flags; ///< flags set by private options + int quantizer_noise_shaping; + + /** + * ratecontrol qmin qmax limiting method + * 0-> clipping, 1-> use a nice continuous function to limit qscale within qmin/qmax. + */ + float rc_qsquish; + float rc_qmod_amp; + int rc_qmod_freq; + float rc_initial_cplx; + float rc_buffer_aggressivity; + float border_masking; + int lmin, lmax; + int vbv_ignore_qmax; + + char *rc_eq; + + /* temp buffers for rate control */ + float *cplx_tab, *bits_tab; + + /* flag to indicate a reinitialization is required, e.g. after + * a frame size change */ + int context_reinit; + + ERContext er; + + int error_rate; + + /* temporary frames used by b_frame_strategy = 2 */ + AVFrame *tmp_frames[MAX_B_FRAMES + 2]; + int b_frame_strategy; + int b_sensitivity; + + /* frame skip options for encoding */ + int frame_skip_threshold; + int frame_skip_factor; + int frame_skip_exp; + int frame_skip_cmp; + + int scenechange_threshold; + int noise_reduction; + + int intra_penalty; +} MpegEncContext; + + +/** + * Set the given MpegEncContext to common defaults (same for encoding + * and decoding). The changed fields will not depend upon the prior + * state of the MpegEncContext. 
+ */ +void ff_mpv_common_defaults(MpegEncContext *s); + +int ff_mpv_common_init(MpegEncContext *s); +void ff_mpv_common_init_arm(MpegEncContext *s); +void ff_mpv_common_init_axp(MpegEncContext *s); +void ff_mpv_common_init_neon(MpegEncContext *s); +void ff_mpv_common_init_ppc(MpegEncContext *s); +void ff_mpv_common_init_x86(MpegEncContext *s); +void ff_mpv_common_init_mips(MpegEncContext *s); +/** + * Initialize an MpegEncContext's thread contexts. Presumes that + * slice_context_count is already set and that all the fields + * that are freed/reset in free_duplicate_context() are NULL. + */ +int ff_mpv_init_duplicate_contexts(MpegEncContext *s); +/** + * Initialize and allocates MpegEncContext fields dependent on the resolution. + */ +int ff_mpv_init_context_frame(MpegEncContext *s); +/** + * Frees and resets MpegEncContext fields depending on the resolution + * as well as the slice thread contexts. + * Is used during resolution changes to avoid a full reinitialization of the + * codec. 
+ */ +void ff_mpv_free_context_frame(MpegEncContext *s); + +void ff_mpv_common_end(MpegEncContext *s); + +void ff_clean_intra_table_entries(MpegEncContext *s); + +int ff_update_duplicate_context(MpegEncContext *dst, const MpegEncContext *src); +void ff_set_qscale(MpegEncContext * s, int qscale); + +void ff_mpv_idct_init(MpegEncContext *s); +void ff_init_scantable(const uint8_t *permutation, ScanTable *st, + const uint8_t *src_scantable); +void ff_init_block_index(MpegEncContext *s); + +void ff_mpv_motion(MpegEncContext *s, + uint8_t *dest_y, uint8_t *dest_cb, + uint8_t *dest_cr, int dir, + uint8_t *const *ref_picture, + op_pixels_func (*pix_op)[4], + qpel_mc_func (*qpix_op)[16]); + +static inline void ff_update_block_index(MpegEncContext *s, int bits_per_raw_sample, + int lowres, int chroma_x_shift) +{ + const int bytes_per_pixel = 1 + (bits_per_raw_sample > 8); + const int block_size = (8 * bytes_per_pixel) >> lowres; + + s->block_index[0]+=2; + s->block_index[1]+=2; + s->block_index[2]+=2; + s->block_index[3]+=2; + s->block_index[4]++; + s->block_index[5]++; + s->dest[0]+= 2*block_size; + s->dest[1] += (2 >> chroma_x_shift) * block_size; + s->dest[2] += (2 >> chroma_x_shift) * block_size; +} + +#endif /* AVCODEC_MPEGVIDEO_H */ diff --git a/media/ffvpx/libavcodec/mpegvideodata.h b/media/ffvpx/libavcodec/mpegvideodata.h new file mode 100644 index 0000000000..42c9d6c293 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegvideodata.h @@ -0,0 +1,39 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGVIDEODATA_H +#define AVCODEC_MPEGVIDEODATA_H + +#include <stdint.h> + +#include "libavutil/attributes_internal.h" + +FF_VISIBILITY_PUSH_HIDDEN +/* encoding scans */ +extern const uint8_t ff_alternate_horizontal_scan[64]; +extern const uint8_t ff_alternate_vertical_scan[64]; + +extern const uint8_t ff_mpeg12_dc_scale_table[4][32]; +static const uint8_t *const ff_mpeg1_dc_scale_table = ff_mpeg12_dc_scale_table[0]; + +extern const uint8_t ff_mpeg2_non_linear_qscale[32]; + +extern const uint8_t ff_default_chroma_qscale_table[32]; +FF_VISIBILITY_POP_HIDDEN + +#endif /* AVCODEC_MPEGVIDEODATA_H */ diff --git a/media/ffvpx/libavcodec/mpegvideodsp.h b/media/ffvpx/libavcodec/mpegvideodsp.h new file mode 100644 index 0000000000..293e2548d3 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegvideodsp.h @@ -0,0 +1,47 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGVIDEODSP_H +#define AVCODEC_MPEGVIDEODSP_H + +#include <stdint.h> +/* ff_gmc_c takes exactly the parameter list of the gmc pointer in MpegVideoDSPContext below. NOTE(review): presumably the portable C implementation installed into that slot - confirm in the matching .c file. */ +void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, int shift, int r, + int width, int height); +/* Function-pointer table holding the two global motion compensation routines, gmc1 and gmc. The inline align notes on dst and src come from the original authors. */ +typedef struct MpegVideoDSPContext { + /** + * translational global motion compensation. + */ + void (*gmc1)(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, + int srcStride, int h, int x16, int y16, int rounder); + /** + * global motion compensation. + */ + void (*gmc)(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height); +} MpegVideoDSPContext; +/* Initialisers that receive the context to fill in. NOTE(review): the _ppc and _x86 variants presumably install architecture-specific routines - confirm in their sources. */ +void ff_mpegvideodsp_init(MpegVideoDSPContext *c); +void ff_mpegvideodsp_init_ppc(MpegVideoDSPContext *c); +void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c); + +#endif /* AVCODEC_MPEGVIDEODSP_H */ diff --git a/media/ffvpx/libavcodec/mpegvideoencdsp.h b/media/ffvpx/libavcodec/mpegvideoencdsp.h new file mode 100644 index 0000000000..95084679d9 --- /dev/null +++ b/media/ffvpx/libavcodec/mpegvideoencdsp.h @@ -0,0 +1,58 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_MPEGVIDEOENCDSP_H +#define AVCODEC_MPEGVIDEOENCDSP_H + +#include <stdint.h> + +#include "avcodec.h" +/* NOTE(review): BASIS_SHIFT and RECON_SHIFT are presumably the fixed-point shifts used by the 8x8 basis routines below - this header does not show their use, confirm in the implementation. */ +#define BASIS_SHIFT 16 +#define RECON_SHIFT 6 +/* Two distinct bit values (1 and 2). NOTE(review): presumably OR-ed together and passed as the sides argument of draw_edges - confirm at the call sites. */ +#define EDGE_TOP 1 +#define EDGE_BOTTOM 2 +/* Table of function pointers. The 8x8 basis routines operate on arrays of 64 int16_t values, pix_sum and pix_norm1 take a pixel pointer plus line size and return an int, shrink holds four routines with identical signatures, and draw_edges takes a buffer, its wrap and dimensions and a sides value. */ +typedef struct MpegvideoEncDSPContext { + int (*try_8x8basis)(const int16_t rem[64], const int16_t weight[64], + const int16_t basis[64], int scale); + void (*add_8x8basis)(int16_t rem[64], const int16_t basis[64], int scale); + + int (*pix_sum)(const uint8_t *pix, int line_size); + int (*pix_norm1)(const uint8_t *pix, int line_size); + + void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, + int src_wrap, int width, int height); + + void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, + int w, int h, int sides); +} MpegvideoEncDSPContext; +/* Initialisers taking the table to fill and the codec context. NOTE(review): the _arm, _ppc, _x86 and _mips variants presumably install architecture-specific routines - confirm in their sources. */ +void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, + AVCodecContext *avctx); +void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, + AVCodecContext *avctx); +void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, + AVCodecContext *avctx); +void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, + AVCodecContext *avctx); +void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c, + AVCodecContext *avctx); + +#endif /* AVCODEC_MPEGVIDEOENCDSP_H */ diff --git a/media/ffvpx/libavcodec/null_bsf.c b/media/ffvpx/libavcodec/null_bsf.c new file mode 100644 index 0000000000..28237076fb --- /dev/null +++ b/media/ffvpx/libavcodec/null_bsf.c @@ -0,0 +1,29 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Null bitstream filter -- pass the input through unchanged. + */ + +#include "bsf_internal.h" +/* Filter descriptor registered under the name null. Only the name and the filter callback are set; the callback is ff_bsf_get_packet_ref, declared elsewhere, and every other field keeps its zero default. */ +const FFBitStreamFilter ff_null_bsf = { + .p.name = "null", + .filter = ff_bsf_get_packet_ref, +}; diff --git a/media/ffvpx/libavcodec/options.c b/media/ffvpx/libavcodec/options.c new file mode 100644 index 0000000000..a9b35ee1c3 --- /dev/null +++ b/media/ffvpx/libavcodec/options.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Options definition for AVCodecContext. 
+ */ + +#include "config_components.h" + +#include "avcodec.h" +#include "codec_internal.h" +#include "libavutil/avassert.h" +#include "libavutil/internal.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include <string.h> +/* options_table.h is included with deprecation warnings switched off. NOTE(review): presumably because the option table still references deprecated AVCodecContext fields - confirm. */ +FF_DISABLE_DEPRECATION_WARNINGS +#include "options_table.h" +FF_ENABLE_DEPRECATION_WARNINGS +/* item_name callback of av_codec_context_class: returns the name of the codec attached to the context, or the literal string NULL when ptr is NULL or no codec is set. */ +static const char* context_to_name(void* ptr) { + AVCodecContext *avc= ptr; + + if (avc && avc->codec) + return avc->codec->name; + else + return "NULL"; +} +/* child_next callback: a context has at most one child, its priv_data. It is returned only on the first call (prev is NULL) and only when a codec with a priv_class is attached and priv_data is set; every other call returns NULL. */ +static void *codec_child_next(void *obj, void *prev) +{ + AVCodecContext *s = obj; + if (!prev && s->codec && s->codec->priv_class && s->priv_data) + return s->priv_data; + return NULL; +} +/* child_class_iterate callback: advances av_codec_iterate(iter) and returns the priv_class of the next codec that has one, or NULL once av_codec_iterate yields NULL. The assignment inside the while condition is intentional. */ +static const AVClass *codec_child_class_iterate(void **iter) +{ + const AVCodec *c; + /* find next codec with priv options */ + while (c = av_codec_iterate(iter)) + if (c->priv_class) + return c->priv_class; + return NULL; +} +/* get_category callback: reports the decoder category when a codec is attached and av_codec_is_decoder accepts it, the encoder category in every other case, including when no codec is set. ptr is dereferenced unconditionally, so it must not be NULL. */ +static AVClassCategory get_category(void *ptr) +{ + AVCodecContext* avctx = ptr; + if (avctx->codec && av_codec_is_decoder(avctx->codec)) + return AV_CLASS_CATEGORY_DECODER; + else + return AV_CLASS_CATEGORY_ENCODER; +} +/* AVClass describing AVCodecContext: it ties together the avcodec_options table and the four callbacks defined above. The static category is the encoder one; get_category refines it per context. */ +static const AVClass av_codec_context_class = { + .class_name = "AVCodecContext", + .item_name = context_to_name, + .option = avcodec_options, + .version = LIBAVUTIL_VERSION_INT, + .log_level_offset_offset = offsetof(AVCodecContext, log_level_offset), + .child_next = codec_child_next, + .child_class_iterate = codec_child_class_iterate, + .category = AV_CLASS_CATEGORY_ENCODER, + .get_category = get_category, +}; +/* Zero *s and fill it with defaults for codec. codec may be NULL: codec_type is then AVMEDIA_TYPE_UNKNOWN and codec2 is never dereferenced. Returns 0 on success, or AVERROR(ENOMEM) when the codec private data cannot be allocated; nothing is freed here on that path. A codec default that fails to apply trips av_assert0. */ +static int init_context_defaults(AVCodecContext *s, const AVCodec *codec) +{ + const FFCodec *const codec2 = ffcodec(codec); + int flags=0; + memset(s, 0, sizeof(AVCodecContext)); + + s->av_class = &av_codec_context_class; + + s->codec_type = codec ? 
codec->type : AVMEDIA_TYPE_UNKNOWN; + if (codec) { + s->codec = codec; + s->codec_id = codec->id; + } +/* Pick the option flag matching the media type; flags stays 0 for any other type. The same value is passed for both trailing arguments of av_opt_set_defaults2. */ + if(s->codec_type == AVMEDIA_TYPE_AUDIO) + flags= AV_OPT_FLAG_AUDIO_PARAM; + else if(s->codec_type == AVMEDIA_TYPE_VIDEO) + flags= AV_OPT_FLAG_VIDEO_PARAM; + else if(s->codec_type == AVMEDIA_TYPE_SUBTITLE) + flags= AV_OPT_FLAG_SUBTITLE_PARAM; + av_opt_set_defaults2(s, flags, flags); + + av_channel_layout_uninit(&s->ch_layout); +/* Fields assigned explicitly after the option defaults: the three rationals are set to 0 over 1, the callbacks to the avcodec_default_ implementations, and the pixel and sample formats to their NONE values. */ + s->time_base = (AVRational){0,1}; + s->framerate = (AVRational){ 0, 1 }; + s->pkt_timebase = (AVRational){ 0, 1 }; + s->get_buffer2 = avcodec_default_get_buffer2; + s->get_format = avcodec_default_get_format; + s->get_encode_buffer = avcodec_default_get_encode_buffer; + s->execute = avcodec_default_execute; + s->execute2 = avcodec_default_execute2; + s->sample_aspect_ratio = (AVRational){0,1}; + s->ch_layout.order = AV_CHANNEL_ORDER_UNSPEC; + s->pix_fmt = AV_PIX_FMT_NONE; + s->sw_pix_fmt = AV_PIX_FMT_NONE; + s->sample_fmt = AV_SAMPLE_FMT_NONE; +/* reordered_opaque is only initialised while FF_API_REORDERED_OPAQUE is enabled; the warning macros bracket the access to the field. */ +#if FF_API_REORDERED_OPAQUE +FF_DISABLE_DEPRECATION_WARNINGS + s->reordered_opaque = AV_NOPTS_VALUE; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + if(codec && codec2->priv_data_size){ + s->priv_data = av_mallocz(codec2->priv_data_size); + if (!s->priv_data) + return AVERROR(ENOMEM); + if(codec->priv_class){ + *(const AVClass**)s->priv_data = codec->priv_class; + av_opt_set_defaults(s->priv_data); + } + } + if (codec && codec2->defaults) { + int ret; + const FFCodecDefault *d = codec2->defaults; + while (d->key) { + ret = av_opt_set(s, d->key, d->value, 0); + av_assert0(ret >= 0); + d++; + } + } + return 0; +} +/* Allocate an AVCodecContext with av_malloc and initialise it through init_context_defaults. Returns NULL when the allocation or the initialisation fails; in the latter case the context is released with av_free first. */ +AVCodecContext *avcodec_alloc_context3(const AVCodec *codec) +{ + AVCodecContext *avctx= av_malloc(sizeof(AVCodecContext)); + + if (!avctx) + return NULL; + + if (init_context_defaults(avctx, codec) < 0) { + av_free(avctx); + return NULL; + } + + return avctx; +} +/* Close the context that *pavctx points to, free the buffers listed below and finally release the context itself through av_freep(pavctx). A NULL *pavctx is a no-op; pavctx itself is dereferenced unconditionally. */ +void avcodec_free_context(AVCodecContext **pavctx) +{ + AVCodecContext *avctx = *pavctx; + + if (!avctx) + return; + +
avcodec_close(avctx); +/* Buffers reachable from the context that are released here, after avcodec_close. */ + av_freep(&avctx->extradata); + av_freep(&avctx->subtitle_header); + av_freep(&avctx->intra_matrix); + av_freep(&avctx->inter_matrix); + av_freep(&avctx->rc_override); + av_channel_layout_uninit(&avctx->ch_layout); + + av_freep(pavctx); +} +/* Public accessor returning the address of the file-local av_codec_context_class. */ +const AVClass *avcodec_get_class(void) +{ + return &av_codec_context_class; +} + +#define SROFFSET(x) offsetof(AVSubtitleRect,x) +/* Option table for AVSubtitleRect. x, y, w, h and type are int options limited to 0..INT_MAX; flags and forced are both flag options stored in the same flags field with range 0..1. */ +static const AVOption subtitle_rect_options[]={ +{"x", "", SROFFSET(x), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0}, +{"y", "", SROFFSET(y), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0}, +{"w", "", SROFFSET(w), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0}, +{"h", "", SROFFSET(h), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0}, +{"type", "", SROFFSET(type), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, 0}, +{"flags", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0, "flags"}, +{"forced", "", SROFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = 0}, 0, 1, 0}, +{NULL}, +}; +/* AVClass describing AVSubtitleRect: it carries only a class name, the option table above and the version; item_name is explicitly NULL. */ +static const AVClass av_subtitle_rect_class = { + .class_name = "AVSubtitleRect", + .item_name = NULL, + .option = subtitle_rect_options, + .version = LIBAVUTIL_VERSION_INT, +}; +/* Public accessor returning the address of the file-local av_subtitle_rect_class. */ +const AVClass *avcodec_get_subtitle_rect_class(void) +{ + return &av_subtitle_rect_class; +} diff --git a/media/ffvpx/libavcodec/options_table.h b/media/ffvpx/libavcodec/options_table.h new file mode 100644 index 0000000000..4fea57673a --- /dev/null +++ b/media/ffvpx/libavcodec/options_table.h @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_OPTIONS_TABLE_H +#define AVCODEC_OPTIONS_TABLE_H + +#include "config_components.h" + +#include <float.h> +#include <limits.h> +#include <stdint.h> + +#include "libavutil/opt.h" +#include "avcodec.h" +#include "version_major.h" + +#define OFFSET(x) offsetof(AVCodecContext,x) +#define DEFAULT 0 //should be NAN but it does not work as it is not a constant in glibc as required by ANSI/ISO C +//these names are too long to be readable +#define V AV_OPT_FLAG_VIDEO_PARAM +#define A AV_OPT_FLAG_AUDIO_PARAM +#define S AV_OPT_FLAG_SUBTITLE_PARAM +#define E AV_OPT_FLAG_ENCODING_PARAM +#define D AV_OPT_FLAG_DECODING_PARAM +#define CC AV_OPT_FLAG_CHILD_CONSTS + +#define AV_CODEC_DEFAULT_BITRATE 200*1000 + +static const AVOption avcodec_options[] = { +{"b", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = AV_CODEC_DEFAULT_BITRATE }, 0, INT64_MAX, A|V|E}, +{"ab", "set bitrate (in bits/s)", OFFSET(bit_rate), AV_OPT_TYPE_INT64, {.i64 = 128*1000 }, 0, INT_MAX, A|E}, +{"bt", "Set video bitrate tolerance (in bits/s). In 1-pass mode, bitrate tolerance specifies how far " + "ratecontrol is willing to deviate from the target average bitrate value. This is not related " + "to minimum/maximum bitrate. 
Lowering tolerance too much has an adverse effect on quality.", + OFFSET(bit_rate_tolerance), AV_OPT_TYPE_INT, {.i64 = AV_CODEC_DEFAULT_BITRATE*20 }, 1, INT_MAX, V|E}, +{"flags", NULL, OFFSET(flags), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, UINT_MAX, V|A|S|E|D, "flags"}, +{"unaligned", "allow decoders to produce unaligned output", 0, AV_OPT_TYPE_CONST, { .i64 = AV_CODEC_FLAG_UNALIGNED }, INT_MIN, INT_MAX, V | D, "flags" }, +{"mv4", "use four motion vectors per macroblock (MPEG-4)", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_4MV }, INT_MIN, INT_MAX, V|E, "flags"}, +{"qpel", "use 1/4-pel motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_QPEL }, INT_MIN, INT_MAX, V|E, "flags"}, +{"loop", "use loop filter", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_LOOP_FILTER }, INT_MIN, INT_MAX, V|E, "flags"}, +{"qscale", "use fixed qscale", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_QSCALE }, INT_MIN, INT_MAX, 0, "flags"}, +{"recon_frame", "export reconstructed frames", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_RECON_FRAME}, .unit = "flags"}, +{"copy_opaque", "propagate opaque values", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_COPY_OPAQUE}, .unit = "flags"}, +{"frame_duration", "use frame durations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_FRAME_DURATION}, .unit = "flags"}, +{"pass1", "use internal 2-pass ratecontrol in first pass mode", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PASS1 }, INT_MIN, INT_MAX, 0, "flags"}, +{"pass2", "use internal 2-pass ratecontrol in second pass mode", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PASS2 }, INT_MIN, INT_MAX, 0, "flags"}, +{"gray", "only decode/encode grayscale", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_GRAY }, INT_MIN, INT_MAX, V|E|D, "flags"}, +{"psnr", "error[?] 
variables will be set during encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PSNR }, INT_MIN, INT_MAX, V|E, "flags"}, +{"ildct", "use interlaced DCT", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_INTERLACED_DCT }, INT_MIN, INT_MAX, V|E, "flags"}, +{"low_delay", "force low delay", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_LOW_DELAY }, INT_MIN, INT_MAX, V|D|E, "flags"}, +{"global_header", "place global headers in extradata instead of every keyframe", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_GLOBAL_HEADER }, INT_MIN, INT_MAX, V|A|E, "flags"}, +{"bitexact", "use only bitexact functions (except (I)DCT)", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_BITEXACT }, INT_MIN, INT_MAX, A|V|S|D|E, "flags"}, +{"aic", "H.263 advanced intra coding / MPEG-4 AC prediction", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_AC_PRED }, INT_MIN, INT_MAX, V|E, "flags"}, +{"ilme", "interlaced motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_INTERLACED_ME }, INT_MIN, INT_MAX, V|E, "flags"}, +{"cgop", "closed GOP", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_CLOSED_GOP }, INT_MIN, INT_MAX, V|E, "flags"}, +{"output_corrupt", "Output even potentially corrupted frames", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_OUTPUT_CORRUPT }, INT_MIN, INT_MAX, V|D, "flags"}, +{"drop_changed", "Drop frames whose parameters differ from first decoded frame", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_DROPCHANGED }, INT_MIN, INT_MAX, A|V|D, "flags"}, +{"flags2", NULL, OFFSET(flags2), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT}, 0, UINT_MAX, V|A|E|D|S, "flags2"}, +{"fast", "allow non-spec-compliant speedup tricks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_FAST }, INT_MIN, INT_MAX, V|E, "flags2"}, +{"noout", "skip bitstream encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_NO_OUTPUT }, INT_MIN, INT_MAX, V|E, "flags2"}, +{"ignorecrop", "ignore cropping information from sps", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_IGNORE_CROP }, INT_MIN, INT_MAX, V|D, "flags2"}, +{"local_header", 
"place global headers at every keyframe instead of in extradata", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_LOCAL_HEADER }, INT_MIN, INT_MAX, V|E, "flags2"}, +{"chunks", "Frame data might be split into multiple chunks", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_CHUNKS }, INT_MIN, INT_MAX, V|D, "flags2"}, +{"showall", "Show all frames before the first keyframe", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SHOW_ALL }, INT_MIN, INT_MAX, V|D, "flags2"}, +{"export_mvs", "export motion vectors through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_EXPORT_MVS}, INT_MIN, INT_MAX, V|D, "flags2"}, +{"skip_manual", "do not skip samples and export skip information as frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SKIP_MANUAL}, INT_MIN, INT_MAX, A|D, "flags2"}, +{"ass_ro_flush_noop", "do not reset ASS ReadOrder field on flush", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_RO_FLUSH_NOOP}, INT_MIN, INT_MAX, S|D, "flags2"}, +{"icc_profiles", "generate/parse embedded ICC profiles from/to colorimetry tags", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_ICC_PROFILES}, INT_MIN, INT_MAX, S|D, "flags2"}, +{"export_side_data", "Export metadata as side data", OFFSET(export_side_data), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT}, 0, UINT_MAX, A|V|S|D|E, "export_side_data"}, +{"mvs", "export motion vectors through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_MVS}, INT_MIN, INT_MAX, V|D, "export_side_data"}, +{"prft", "export Producer Reference Time through packet side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_PRFT}, INT_MIN, INT_MAX, A|V|S|E, "export_side_data"}, +{"venc_params", "export video encoding parameters through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS}, INT_MIN, INT_MAX, V|D, "export_side_data"}, +{"film_grain", "export film grain parameters through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_EXPORT_DATA_FILM_GRAIN}, INT_MIN, INT_MAX, V|D, 
"export_side_data"}, +{"time_base", NULL, OFFSET(time_base), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, INT_MAX}, +{"g", "set the group of picture (GOP) size", OFFSET(gop_size), AV_OPT_TYPE_INT, {.i64 = 12 }, INT_MIN, INT_MAX, V|E}, +{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E}, +#if FF_API_OLD_CHANNEL_LAYOUT +{"ac", "set number of audio channels", OFFSET(channels), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E}, +#endif +{"cutoff", "set cutoff bandwidth", OFFSET(cutoff), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|E}, +{"frame_size", NULL, OFFSET(frame_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|E}, +{"frame_number", NULL, OFFSET(frame_num), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, INT_MIN, INT_MAX}, +{"delay", NULL, OFFSET(delay), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX}, +{"qcomp", "video quantizer scale compression (VBR). Constant of ratecontrol equation. " + "Recommended range for default rc_eq: 0.0-1.0", + OFFSET(qcompress), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -FLT_MAX, FLT_MAX, V|E}, +{"qblur", "video quantizer scale blur (VBR)", OFFSET(qblur), AV_OPT_TYPE_FLOAT, {.dbl = 0.5 }, -1, FLT_MAX, V|E}, +{"qmin", "minimum video quantizer scale (VBR)", OFFSET(qmin), AV_OPT_TYPE_INT, {.i64 = 2 }, -1, 69, V|E}, +{"qmax", "maximum video quantizer scale (VBR)", OFFSET(qmax), AV_OPT_TYPE_INT, {.i64 = 31 }, -1, 1024, V|E}, +{"qdiff", "maximum difference between the quantizer scales (VBR)", OFFSET(max_qdiff), AV_OPT_TYPE_INT, {.i64 = 3 }, INT_MIN, INT_MAX, V|E}, +{"bf", "set maximum number of B-frames between non-B-frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E}, +{"b_qfactor", "QP factor between P- and B-frames", OFFSET(b_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E}, +{"codec_tag", NULL, OFFSET(codec_tag), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX}, +{"bug", "work around not autodetected 
encoder bugs", OFFSET(workaround_bugs), AV_OPT_TYPE_FLAGS, {.i64 = FF_BUG_AUTODETECT }, INT_MIN, INT_MAX, V|D, "bug"}, +{"autodetect", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_AUTODETECT }, INT_MIN, INT_MAX, V|D, "bug"}, +{"xvid_ilace", "Xvid interlacing bug (autodetected if FOURCC == XVIX)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_XVID_ILACE }, INT_MIN, INT_MAX, V|D, "bug"}, +{"ump4", "(autodetected if FOURCC == UMP4)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_UMP4 }, INT_MIN, INT_MAX, V|D, "bug"}, +{"no_padding", "padding bug (autodetected)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_NO_PADDING }, INT_MIN, INT_MAX, V|D, "bug"}, +{"amv", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_AMV }, INT_MIN, INT_MAX, V|D, "bug"}, +{"qpel_chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_QPEL_CHROMA }, INT_MIN, INT_MAX, V|D, "bug"}, +{"std_qpel", "old standard qpel (autodetected per FOURCC/version)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_STD_QPEL }, INT_MIN, INT_MAX, V|D, "bug"}, +{"qpel_chroma2", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_QPEL_CHROMA2 }, INT_MIN, INT_MAX, V|D, "bug"}, +{"direct_blocksize", "direct-qpel-blocksize bug (autodetected per FOURCC/version)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_DIRECT_BLOCKSIZE }, INT_MIN, INT_MAX, V|D, "bug"}, +{"edge", "edge padding bug (autodetected per FOURCC/version)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_EDGE }, INT_MIN, INT_MAX, V|D, "bug"}, +{"hpel_chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_HPEL_CHROMA }, INT_MIN, INT_MAX, V|D, "bug"}, +{"dc_clip", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_DC_CLIP }, INT_MIN, INT_MAX, V|D, "bug"}, +{"ms", "work around various bugs in Microsoft's broken decoders", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_MS }, INT_MIN, INT_MAX, V|D, "bug"}, +{"trunc", "truncated frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_TRUNCATED}, INT_MIN, INT_MAX, V|D, "bug"}, +{"iedge", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_IEDGE }, INT_MIN, INT_MAX, V|D, "bug"}, +{"strict", "how strictly to follow the standards", 
OFFSET(strict_std_compliance), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|D|E, "strict"}, +{"very", "strictly conform to a older more strict version of the spec or reference software", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_VERY_STRICT }, INT_MIN, INT_MAX, A|V|D|E, "strict"}, +{"strict", "strictly conform to all the things in the spec no matter what the consequences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_STRICT }, INT_MIN, INT_MAX, A|V|D|E, "strict"}, +{"normal", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_NORMAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"}, +{"unofficial", "allow unofficial extensions", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_UNOFFICIAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"}, +{"experimental", "allow non-standardized experimental things", 0, AV_OPT_TYPE_CONST, {.i64 = FF_COMPLIANCE_EXPERIMENTAL }, INT_MIN, INT_MAX, A|V|D|E, "strict"}, +{"b_qoffset", "QP offset between P- and B-frames", OFFSET(b_quant_offset), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E}, +{"err_detect", "set error detection flags", OFFSET(err_recognition), AV_OPT_TYPE_FLAGS, {.i64 = 0 }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"crccheck", "verify embedded CRCs", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_CRCCHECK }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"bitstream", "detect bitstream specification deviations", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BITSTREAM }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"buffer", "detect improper bitstream length", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_BUFFER }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"explode", "abort decoding on minor error detection", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_EXPLODE }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"ignore_err", "ignore errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_IGNORE_ERR }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"careful", "consider things that violate the spec, are fast to check and have not been seen in the wild as errors", 0, 
AV_OPT_TYPE_CONST, {.i64 = AV_EF_CAREFUL }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"compliant", "consider all spec non compliancies as errors", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_COMPLIANT | AV_EF_CAREFUL }, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"aggressive", "consider things that a sane encoder should not do as an error", 0, AV_OPT_TYPE_CONST, {.i64 = AV_EF_AGGRESSIVE | AV_EF_COMPLIANT | AV_EF_CAREFUL}, INT_MIN, INT_MAX, A|V|S|D|E, "err_detect"}, +{"has_b_frames", NULL, OFFSET(has_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX}, +{"block_align", NULL, OFFSET(block_align), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX}, +{"rc_override_count", NULL, OFFSET(rc_override_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX}, +{"maxrate", "maximum bitrate (in bits/s). Used for VBV together with bufsize.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT_MAX, V|A|E}, +{"minrate", "minimum bitrate (in bits/s). Most useful in setting up a CBR encode. 
It is of little use otherwise.", + OFFSET(rc_min_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E}, +{"bufsize", "set ratecontrol buffer size (in bits)", OFFSET(rc_buffer_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|E}, +{"i_qfactor", "QP factor between P- and I-frames", OFFSET(i_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = -0.8 }, -FLT_MAX, FLT_MAX, V|E}, +{"i_qoffset", "QP offset between P- and I-frames", OFFSET(i_quant_offset), AV_OPT_TYPE_FLOAT, {.dbl = 0.0 }, -FLT_MAX, FLT_MAX, V|E}, +{"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"}, +{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"}, +{"fastint", "fast integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"}, +{"int", "accurate integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_INT }, INT_MIN, INT_MAX, V|E, "dct"}, +{"mmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_MMX }, INT_MIN, INT_MAX, V|E, "dct"}, +{"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_ALTIVEC }, INT_MIN, INT_MAX, V|E, "dct"}, +{"faan", "floating point AAN DCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FAAN }, INT_MIN, INT_MAX, V|E, "dct"}, +{"lumi_mask", "compresses bright areas stronger than medium ones", OFFSET(lumi_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, +{"tcplx_mask", "temporal complexity masking", OFFSET(temporal_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, +{"scplx_mask", "spatial complexity masking", OFFSET(spatial_cplx_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, +{"p_mask", "inter masking", OFFSET(p_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, +{"dark_mask", "compresses dark areas stronger than medium ones", OFFSET(dark_masking), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, -FLT_MAX, FLT_MAX, V|E}, +{"idct", "select IDCT implementation", OFFSET(idct_algo), AV_OPT_TYPE_INT, 
{.i64 = DEFAULT }, 0, INT_MAX, V|E|D, "idct"}, +{"auto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_AUTO }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"int", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_INT }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simple", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplemmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"arm", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ALTIVEC }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplearm", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARM }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplearmv5te", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV5TE }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simplearmv6", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV6 }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"simpleneon", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLENEON }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"xvid", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"xvidmmx", "deprecated, for compatibility only", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"faani", "floating point AAN IDCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"}, +{"simpleauto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEAUTO }, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"slice_count", NULL, OFFSET(slice_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX}, +{"ec", "set error concealment strategy", OFFSET(error_concealment), AV_OPT_TYPE_FLAGS, {.i64 = 3 }, INT_MIN, INT_MAX, V|D, "ec"}, +{"guess_mvs", "iterative motion vector (MV) search (slow)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_GUESS_MVS }, INT_MIN, INT_MAX, V|D, "ec"}, +{"deblock", "use strong deblock filter for damaged MBs", 0, AV_OPT_TYPE_CONST, {.i64 = 
FF_EC_DEBLOCK }, INT_MIN, INT_MAX, V|D, "ec"}, +{"favor_inter", "favor predicting from the previous frame", 0, AV_OPT_TYPE_CONST, {.i64 = FF_EC_FAVOR_INTER }, INT_MIN, INT_MAX, V|D, "ec"}, +{"bits_per_coded_sample", NULL, OFFSET(bits_per_coded_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX}, +{"aspect", "sample aspect ratio", OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E}, +{"sar", "sample aspect ratio", OFFSET(sample_aspect_ratio), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, 10, V|E}, +{"debug", "print specific debug info", OFFSET(debug), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, INT_MAX, V|A|S|E|D, "debug"}, +{"pict", "picture info", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_PICT_INFO }, INT_MIN, INT_MAX, V|D, "debug"}, +{"rc", "rate control", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_RC }, INT_MIN, INT_MAX, V|E, "debug"}, +{"bitstream", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BITSTREAM }, INT_MIN, INT_MAX, V|D, "debug"}, +{"mb_type", "macroblock (MB) type", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MB_TYPE }, INT_MIN, INT_MAX, V|D, "debug"}, +{"qp", "per-block quantization parameter (QP)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_QP }, INT_MIN, INT_MAX, V|D, "debug"}, +{"dct_coeff", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_DCT_COEFF }, INT_MIN, INT_MAX, V|D, "debug"}, +{"green_metadata", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_GREEN_MD }, INT_MIN, INT_MAX, V|D, "debug"}, +{"skip", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_SKIP }, INT_MIN, INT_MAX, V|D, "debug"}, +{"startcode", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_STARTCODE }, INT_MIN, INT_MAX, V|D, "debug"}, +{"er", "error recognition", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_ER }, INT_MIN, INT_MAX, V|D, "debug"}, +{"mmco", "memory management control operations (H.264)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MMCO }, INT_MIN, INT_MAX, V|D, "debug"}, +{"bugs", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUGS }, INT_MIN, INT_MAX, V|D, "debug"}, +{"buffers", "picture 
buffer allocations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUFFERS }, INT_MIN, INT_MAX, V|D, "debug"}, +{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|A|D, "debug"}, +{"nomc", "skip motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_NOMC }, INT_MIN, INT_MAX, V|A|D, "debug"}, +{"dia_size", "diamond type & size for motion estimation", OFFSET(dia_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E}, +{"last_pred", "amount of motion predictors from the previous frame", OFFSET(last_predictor_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E}, +{"pre_dia_size", "diamond type & size for motion estimation pre-pass", OFFSET(pre_dia_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E}, +{"subq", "sub-pel motion estimation quality", OFFSET(me_subpel_quality), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E}, +{"me_range", "limit motion vectors range (1023 for DivX player)", OFFSET(me_range), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E}, +{"global_quality", NULL, OFFSET(global_quality), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E}, +{"slice_flags", NULL, OFFSET(slice_flags), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX}, +{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 2, V|E, "mbd"}, +{"simple", "use mbcmp", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_SIMPLE }, INT_MIN, INT_MAX, V|E, "mbd"}, +{"bits", "use fewest bits", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_BITS }, INT_MIN, INT_MAX, V|E, "mbd"}, +{"rd", "use best rate distortion", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_RD }, INT_MIN, INT_MAX, V|E, "mbd"}, +{"rc_init_occupancy", "number of bits which should be loaded into the rc buffer before decoding starts", OFFSET(rc_initial_buffer_occupancy), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E}, +{"threads", "set the number of 
threads", OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|A|E|D, "threads"}, +{"auto", "autodetect a suitable number of threads to use", 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, INT_MIN, INT_MAX, V|E|D, "threads"}, +{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, -8, 16, V|E}, +{"nssew", "nsse weight", OFFSET(nsse_weight), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E}, +{"skip_top", "number of macroblock rows at the top which are skipped", OFFSET(skip_top), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D}, +{"skip_bottom", "number of macroblock rows at the bottom which are skipped", OFFSET(skip_bottom), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D}, +{"profile", NULL, OFFSET(profile), AV_OPT_TYPE_INT, {.i64 = FF_PROFILE_UNKNOWN }, INT_MIN, INT_MAX, V|A|E|CC, "avctx.profile"}, +{"unknown", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "avctx.profile"}, +{"main10", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_HEVC_MAIN_10 }, INT_MIN, INT_MAX, V|E, "avctx.profile"}, +{"level", NULL, OFFSET(level), AV_OPT_TYPE_INT, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E|CC, "avctx.level"}, +{"unknown", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "avctx.level"}, +{"lowres", "decode at 1= 1/2, 2=1/4, 3=1/8 resolutions", OFFSET(lowres), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|A|D}, +{"cmp", "full-pel ME compare function", OFFSET(me_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"subcmp", "sub-pel ME compare function", OFFSET(me_sub_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"mbcmp", "macroblock compare function", OFFSET(mb_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"ildctcmp", "interlaced DCT compare function", OFFSET(ildct_cmp), AV_OPT_TYPE_INT, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"}, 
+{"precmp", "pre motion estimation compare function", OFFSET(me_pre_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"sad", "sum of absolute differences, fast", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"sse", "sum of squared errors", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SSE }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"satd", "sum of absolute Hadamard transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_SATD }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"dct", "sum of absolute DCT transformed differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCT }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"psnr", "sum of squared quantization errors (avoid, low quality)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_PSNR }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"bit", "number of bits needed for the block", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_BIT }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"rd", "rate distortion optimal, slow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_RD }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"zero", "0", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_ZERO }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"vsad", "sum of absolute vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSAD }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"vsse", "sum of squared vertical differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_VSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"nsse", "noise preserving sum of squared differences", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_NSSE }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +#if CONFIG_SNOW_ENCODER +{"w53", "5/3 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W53 }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"w97", "9/7 wavelet, only used in snow", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_W97 }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +#endif +{"dctmax", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_CHROMA 
}, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"msad", "sum of absolute differences, median predicted", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_MEDIAN_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"}, +{"mblmin", "minimum macroblock Lagrange factor (VBR)", OFFSET(mb_lmin), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 2 }, 1, FF_LAMBDA_MAX, V|E}, +{"mblmax", "maximum macroblock Lagrange factor (VBR)", OFFSET(mb_lmax), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 31 }, 1, FF_LAMBDA_MAX, V|E}, +{"skip_loop_filter", "skip loop filtering process for the selected frames", OFFSET(skip_loop_filter), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"skip_idct" , "skip IDCT/dequantization for the selected frames", OFFSET(skip_idct), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"skip_frame" , "skip decoding for the selected frames", OFFSET(skip_frame), AV_OPT_TYPE_INT, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"none" , "discard no frame", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONE }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"default" , "discard useless frames", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_DEFAULT }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"noref" , "discard all non-reference frames", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONREF }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"bidir" , "discard all bidirectional frames", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_BIDIR }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"nokey" , "discard all frames except keyframes", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONKEY }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"nointra" , "discard all frames except I frames", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_NONINTRA}, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"all" , "discard all frames", 0, AV_OPT_TYPE_CONST, {.i64 = AVDISCARD_ALL }, INT_MIN, INT_MAX, V|D, "avdiscard"}, +{"bidir_refine", "refine the two motion vectors used in bidirectional macroblocks", 
OFFSET(bidir_refine), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, 4, V|E}, +{"keyint_min", "minimum interval between IDR-frames", OFFSET(keyint_min), AV_OPT_TYPE_INT, {.i64 = 25 }, INT_MIN, INT_MAX, V|E}, +{"refs", "reference frames to consider for motion compensation", OFFSET(refs), AV_OPT_TYPE_INT, {.i64 = 1 }, INT_MIN, INT_MAX, V|E}, +{"trellis", "rate-distortion optimal quantization", OFFSET(trellis), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E}, +{"mv0_threshold", NULL, OFFSET(mv0_threshold), AV_OPT_TYPE_INT, {.i64 = 256 }, 0, INT_MAX, V|E}, +{"compression_level", NULL, OFFSET(compression_level), AV_OPT_TYPE_INT, {.i64 = FF_COMPRESSION_DEFAULT }, INT_MIN, INT_MAX, V|A|E}, +{"bits_per_raw_sample", NULL, OFFSET(bits_per_raw_sample), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX}, +{"ch_layout", NULL, OFFSET(ch_layout), AV_OPT_TYPE_CHLAYOUT, {.str = NULL }, 0, 0, A|E|D, "ch_layout"}, +#if FF_API_OLD_CHANNEL_LAYOUT +{"channel_layout", NULL, OFFSET(channel_layout), AV_OPT_TYPE_CHANNEL_LAYOUT, {.i64 = DEFAULT }, 0, UINT64_MAX, A|E|D, "channel_layout"}, +{"request_channel_layout", NULL, OFFSET(request_channel_layout), AV_OPT_TYPE_CHANNEL_LAYOUT, {.i64 = DEFAULT }, 0, UINT64_MAX, A|D, "request_channel_layout"}, +#endif +{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), AV_OPT_TYPE_FLOAT, {.dbl = 0 }, 0.0, FLT_MAX, V|E}, +{"rc_min_vbv_use", NULL, OFFSET(rc_min_vbv_overflow_use), AV_OPT_TYPE_FLOAT, {.dbl = 3 }, 0.0, FLT_MAX, V|E}, +{"ticks_per_frame", NULL, OFFSET(ticks_per_frame), AV_OPT_TYPE_INT, {.i64 = 1 }, 1, INT_MAX, A|V|E|D}, +{"color_primaries", "color primaries", OFFSET(color_primaries), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_UNSPECIFIED }, 1, INT_MAX, V|E|D, "color_primaries_type"}, +{"bt709", "BT.709", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"unknown", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"bt470m", 
"BT.470 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470M }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"bt470bg", "BT.470 BG", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"smpte170m", "SMPTE 170 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE170M }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"smpte240m", "SMPTE 240 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE240M }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"film", "Film", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_FILM }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"bt2020", "BT.2020", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"smpte428", "SMPTE 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE428 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"smpte428_1", "SMPTE 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE428 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"smpte431", "SMPTE 431-2", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE431 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"smpte432", "SMPTE 422-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_SMPTE432 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"jedec-p22", "JEDEC P22", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_JEDEC_P22 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"ebu3213", "EBU 3213-E", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_EBU3213 }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_primaries_type"}, +{"color_trc", "color transfer characteristics", OFFSET(color_trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_UNSPECIFIED }, 1, INT_MAX, V|E|D, "color_trc_type"}, +{"bt709", "BT.709", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"unknown", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = 
AVCOL_TRC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"gamma22", "BT.470 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA22 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"gamma28", "BT.470 BG", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_GAMMA28 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"smpte170m", "SMPTE 170 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE170M }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"smpte240m", "SMPTE 240 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE240M }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"linear", "Linear", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LINEAR }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"log100", "Log", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"log316", "Log square root", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG_SQRT }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"iec61966-2-4", "IEC 61966-2-4", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_4 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"bt1361e", "BT.1361", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT1361_ECG }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"iec61966-2-1", "IEC 61966-2-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"bt2020-10", "BT.2020 - 10 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"bt2020-12", "BT.2020 - 12 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"smpte2084", "SMPTE 2084", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE2084 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"smpte428", "SMPTE 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE428 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"arib-std-b67", "ARIB STD-B67", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_ARIB_STD_B67 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"unspecified", "Unspecified", 0, 
AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"log", "Log", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"log_sqrt", "Log square root", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_LOG_SQRT }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"iec61966_2_4", "IEC 61966-2-4", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_4 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"bt1361", "BT.1361", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT1361_ECG }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"iec61966_2_1", "IEC 61966-2-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_IEC61966_2_1 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"bt2020_10bit", "BT.2020 - 10 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"bt2020_12bit", "BT.2020 - 12 bit", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_12 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"smpte428_1", "SMPTE 428-1", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_SMPTE428 }, INT_MIN, INT_MAX, V|E|D, "color_trc_type"}, +{"colorspace", "color space", OFFSET(colorspace), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_UNSPECIFIED }, 0, INT_MAX, V|E|D, "colorspace_type"}, +{"rgb", "RGB", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_RGB }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"bt709", "BT.709", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709 }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"unknown", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"fcc", "FCC", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_FCC }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"bt470bg", "BT.470 BG", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT470BG }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"smpte170m", "SMPTE 170 M", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE170M }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"smpte240m", "SMPTE 240 M", 
0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE240M }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"ycgco", "YCGCO", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_YCGCO }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"bt2020nc", "BT.2020 NCL", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"bt2020c", "BT.2020 CL", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_CL }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"smpte2085", "SMPTE 2085", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE2085 }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"chroma-derived-nc", "Chroma-derived NCL", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_CHROMA_DERIVED_NCL }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"chroma-derived-c", "Chroma-derived CL", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_CHROMA_DERIVED_CL }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"ictcp", "ICtCp", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_ICTCP }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"ycocg", "YCGCO", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_YCGCO }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"bt2020_ncl", "BT.2020 NCL", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"bt2020_cl", "BT.2020 CL", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_CL }, INT_MIN, INT_MAX, V|E|D, "colorspace_type"}, +{"color_range", "color range", OFFSET(color_range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_UNSPECIFIED }, 0, INT_MAX, V|E|D, "color_range_type"}, +{"unknown", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_range_type"}, +{"tv", "MPEG (219*2^(n-8))", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG }, INT_MIN, INT_MAX, V|E|D, "color_range_type"}, +{"pc", "JPEG (2^n-1)", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG }, INT_MIN, 
INT_MAX, V|E|D, "color_range_type"}, +{"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "color_range_type"}, +{"mpeg", "MPEG (219*2^(n-8))", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG }, INT_MIN, INT_MAX, V|E|D, "color_range_type"}, +{"jpeg", "JPEG (2^n-1)", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG }, INT_MIN, INT_MAX, V|E|D, "color_range_type"}, +{"chroma_sample_location", "chroma sample location", OFFSET(chroma_sample_location), AV_OPT_TYPE_INT, {.i64 = AVCHROMA_LOC_UNSPECIFIED }, 0, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"unknown", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"left", "Left", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_LEFT }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"center", "Center", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_CENTER }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"topleft", "Top-left", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_TOPLEFT }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"top", "Top", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_TOP }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"bottomleft", "Bottom-left", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_BOTTOMLEFT }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"bottom", "Bottom", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_BOTTOM }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"unspecified", "Unspecified", 0, AV_OPT_TYPE_CONST, {.i64 = AVCHROMA_LOC_UNSPECIFIED }, INT_MIN, INT_MAX, V|E|D, "chroma_sample_location_type"}, +{"log_level_offset", "set the log level offset", OFFSET(log_level_offset), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX }, +{"slices", "set the number of slices, used in parallelized encoding", OFFSET(slices), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E}, +{"thread_type", "select multithreading 
type", OFFSET(thread_type), AV_OPT_TYPE_FLAGS, {.i64 = FF_THREAD_SLICE|FF_THREAD_FRAME }, 0, INT_MAX, V|A|E|D, "thread_type"}, +{"slice", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_SLICE }, INT_MIN, INT_MAX, V|E|D, "thread_type"}, +{"frame", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_THREAD_FRAME }, INT_MIN, INT_MAX, V|E|D, "thread_type"}, +{"audio_service_type", "audio service type", OFFSET(audio_service_type), AV_OPT_TYPE_INT, {.i64 = AV_AUDIO_SERVICE_TYPE_MAIN }, 0, AV_AUDIO_SERVICE_TYPE_NB-1, A|E, "audio_service_type"}, +{"ma", "Main Audio Service", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_MAIN }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"ef", "Effects", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_EFFECTS }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"vi", "Visually Impaired", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_VISUALLY_IMPAIRED }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"hi", "Hearing Impaired", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_HEARING_IMPAIRED }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"di", "Dialogue", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_DIALOGUE }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"co", "Commentary", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_COMMENTARY }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"em", "Emergency", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_EMERGENCY }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"vo", "Voice Over", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_VOICE_OVER }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"ka", "Karaoke", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE }, INT_MIN, INT_MAX, A|E, "audio_service_type"}, +{"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, INT_MAX, A|D, "request_sample_fmt"}, +{"pkt_timebase", NULL, OFFSET(pkt_timebase), 
AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0}, +{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, S|D}, +{"sub_charenc_mode", "set input text subtitles character encoding mode", OFFSET(sub_charenc_mode), AV_OPT_TYPE_FLAGS, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, -1, INT_MAX, S|D, "sub_charenc_mode"}, +{"do_nothing", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_DO_NOTHING}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"}, +{"auto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"}, +{"pre_decoder", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_PRE_DECODER}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"}, +{"ignore", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_IGNORE}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"}, +{"apply_cropping", NULL, OFFSET(apply_cropping), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, V | D }, +{"skip_alpha", "Skip processing alpha", OFFSET(skip_alpha), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, V|D }, +{"field_order", "Field order", OFFSET(field_order), AV_OPT_TYPE_INT, {.i64 = AV_FIELD_UNKNOWN }, 0, 5, V|D|E, "field_order" }, +{"progressive", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_PROGRESSIVE }, 0, 0, V|D|E, "field_order" }, +{"tt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TT }, 0, 0, V|D|E, "field_order" }, +{"bb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BB }, 0, 0, V|D|E, "field_order" }, +{"tb", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_TB }, 0, 0, V|D|E, "field_order" }, +{"bt", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AV_FIELD_BT }, 0, 0, V|D|E, "field_order" }, +{"dump_separator", "set information dump field separator", OFFSET(dump_separator), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, A|V|S|D|E}, +{"codec_whitelist", "List of decoders that are allowed to be used", OFFSET(codec_whitelist), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, A|V|S|D }, +{"pixel_format", "set pixel format", 
OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64=AV_PIX_FMT_NONE}, -1, INT_MAX, 0 }, +{"video_size", "set video size", OFFSET(width), AV_OPT_TYPE_IMAGE_SIZE, {.str=NULL}, 0, INT_MAX, 0 }, +{"max_pixels", "Maximum number of pixels", OFFSET(max_pixels), AV_OPT_TYPE_INT64, {.i64 = INT_MAX }, 0, INT_MAX, A|V|S|D|E }, +{"max_samples", "Maximum number of samples", OFFSET(max_samples), AV_OPT_TYPE_INT64, {.i64 = INT_MAX }, 0, INT_MAX, A|D|E }, +{"hwaccel_flags", NULL, OFFSET(hwaccel_flags), AV_OPT_TYPE_FLAGS, {.i64 = AV_HWACCEL_FLAG_IGNORE_LEVEL }, 0, UINT_MAX, V|D, "hwaccel_flags"}, +{"ignore_level", "ignore level even if the codec level used is unknown or higher than the maximum supported level reported by the hardware driver", 0, AV_OPT_TYPE_CONST, { .i64 = AV_HWACCEL_FLAG_IGNORE_LEVEL }, INT_MIN, INT_MAX, V | D, "hwaccel_flags" }, +{"allow_high_depth", "allow to output YUV pixel formats with a different chroma sampling than 4:2:0 and/or other than 8 bits per component", 0, AV_OPT_TYPE_CONST, {.i64 = AV_HWACCEL_FLAG_ALLOW_HIGH_DEPTH }, INT_MIN, INT_MAX, V | D, "hwaccel_flags"}, +{"allow_profile_mismatch", "attempt to decode anyway if HW accelerated decoder's supported profiles do not exactly match the stream", 0, AV_OPT_TYPE_CONST, {.i64 = AV_HWACCEL_FLAG_ALLOW_PROFILE_MISMATCH }, INT_MIN, INT_MAX, V | D, "hwaccel_flags"}, +{"unsafe_output", "allow potentially unsafe hwaccel frame output that might require special care to process successfully", 0, AV_OPT_TYPE_CONST, {.i64 = AV_HWACCEL_FLAG_UNSAFE_OUTPUT }, INT_MIN, INT_MAX, V | D, "hwaccel_flags"}, +{"extra_hw_frames", "Number of extra hardware frames to allocate for the user", OFFSET(extra_hw_frames), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, V|D }, +{"discard_damaged_percentage", "Percentage of damaged samples to discard a frame", OFFSET(discard_damaged_percentage), AV_OPT_TYPE_INT, {.i64 = 95 }, 0, 100, V|D }, +{NULL}, +}; + +#undef A +#undef V +#undef S +#undef E +#undef D +#undef CC +#undef DEFAULT +#undef OFFSET 
+ +#endif /* AVCODEC_OPTIONS_TABLE_H */ diff --git a/media/ffvpx/libavcodec/packet.h b/media/ffvpx/libavcodec/packet.h new file mode 100644 index 0000000000..f28e7e7011 --- /dev/null +++ b/media/ffvpx/libavcodec/packet.h @@ -0,0 +1,731 @@ +/* + * AVPacket public API + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PACKET_H +#define AVCODEC_PACKET_H + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/buffer.h" +#include "libavutil/dict.h" +#include "libavutil/rational.h" +#include "libavutil/version.h" + +#include "libavcodec/version_major.h" + +/** + * @defgroup lavc_packet AVPacket + * + * Types and functions for working with AVPacket. + * @{ + */ +enum AVPacketSideDataType { + /** + * An AV_PKT_DATA_PALETTE side data packet contains exactly AVPALETTE_SIZE + * bytes worth of palette. This side data signals that a new palette is + * present. + */ + AV_PKT_DATA_PALETTE, + + /** + * The AV_PKT_DATA_NEW_EXTRADATA is used to notify the codec or the format + * that the extradata buffer was changed and the receiving side should + * act upon it appropriately. 
The new extradata is embedded in the side + * data buffer and should be immediately used for processing the current + * frame or packet. + */ + AV_PKT_DATA_NEW_EXTRADATA, + + /** + * An AV_PKT_DATA_PARAM_CHANGE side data packet is laid out as follows: + * @code + * u32le param_flags + * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT) + * s32le channel_count + * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT) + * u64le channel_layout + * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE) + * s32le sample_rate + * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS) + * s32le width + * s32le height + * @endcode + */ + AV_PKT_DATA_PARAM_CHANGE, + + /** + * An AV_PKT_DATA_H263_MB_INFO side data packet contains a number of + * structures with info about macroblocks relevant to splitting the + * packet into smaller packets on macroblock edges (e.g. as for RFC 2190). + * That is, it does not necessarily contain info about all macroblocks, + * as long as the distance between macroblocks in the info is smaller + * than the target payload size. + * Each MB info structure is 12 bytes, and is laid out as follows: + * @code + * u32le bit offset from the start of the packet + * u8 current quantizer at the start of the macroblock + * u8 GOB number + * u16le macroblock address within the GOB + * u8 horizontal MV predictor + * u8 vertical MV predictor + * u8 horizontal MV predictor for block number 3 + * u8 vertical MV predictor for block number 3 + * @endcode + */ + AV_PKT_DATA_H263_MB_INFO, + + /** + * This side data should be associated with an audio stream and contains + * ReplayGain information in form of the AVReplayGain struct. + */ + AV_PKT_DATA_REPLAYGAIN, + + /** + * This side data contains a 3x3 transformation matrix describing an affine + * transformation that needs to be applied to the decoded video frames for + * correct presentation. + * + * See libavutil/display.h for a detailed description of the data. 
+ */ + AV_PKT_DATA_DISPLAYMATRIX, + + /** + * This side data should be associated with a video stream and contains + * Stereoscopic 3D information in form of the AVStereo3D struct. + */ + AV_PKT_DATA_STEREO3D, + + /** + * This side data should be associated with an audio stream and corresponds + * to enum AVAudioServiceType. + */ + AV_PKT_DATA_AUDIO_SERVICE_TYPE, + + /** + * This side data contains quality related information from the encoder. + * @code + * u32le quality factor of the compressed frame. Allowed range is between 1 (good) and FF_LAMBDA_MAX (bad). + * u8 picture type + * u8 error count + * u16 reserved + * u64le[error count] sum of squared differences between encoder in and output + * @endcode + */ + AV_PKT_DATA_QUALITY_STATS, + + /** + * This side data contains an integer value representing the stream index + * of a "fallback" track. A fallback track indicates an alternate + * track to use when the current track can not be decoded for some reason. + * e.g. no decoder available for codec. + */ + AV_PKT_DATA_FALLBACK_TRACK, + + /** + * This side data corresponds to the AVCPBProperties struct. + */ + AV_PKT_DATA_CPB_PROPERTIES, + + /** + * Recommmends skipping the specified number of samples + * @code + * u32le number of samples to skip from start of this packet + * u32le number of samples to skip from end of this packet + * u8 reason for start skip + * u8 reason for end skip (0=padding silence, 1=convergence) + * @endcode + */ + AV_PKT_DATA_SKIP_SAMPLES, + + /** + * An AV_PKT_DATA_JP_DUALMONO side data packet indicates that + * the packet may contain "dual mono" audio specific to Japanese DTV + * and if it is true, recommends only the selected channel to be used. + * @code + * u8 selected channels (0=main/left, 1=sub/right, 2=both) + * @endcode + */ + AV_PKT_DATA_JP_DUALMONO, + + /** + * A list of zero terminated key/value strings. There is no end marker for + * the list, so it is required to rely on the side data size to stop. 
+ */ + AV_PKT_DATA_STRINGS_METADATA, + + /** + * Subtitle event position + * @code + * u32le x1 + * u32le y1 + * u32le x2 + * u32le y2 + * @endcode + */ + AV_PKT_DATA_SUBTITLE_POSITION, + + /** + * Data found in BlockAdditional element of matroska container. There is + * no end marker for the data, so it is required to rely on the side data + * size to recognize the end. 8 byte id (as found in BlockAddId) followed + * by data. + */ + AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL, + + /** + * The optional first identifier line of a WebVTT cue. + */ + AV_PKT_DATA_WEBVTT_IDENTIFIER, + + /** + * The optional settings (rendering instructions) that immediately + * follow the timestamp specifier of a WebVTT cue. + */ + AV_PKT_DATA_WEBVTT_SETTINGS, + + /** + * A list of zero terminated key/value strings. There is no end marker for + * the list, so it is required to rely on the side data size to stop. This + * side data includes updated metadata which appeared in the stream. + */ + AV_PKT_DATA_METADATA_UPDATE, + + /** + * MPEGTS stream ID as uint8_t, this is required to pass the stream ID + * information from the demuxer to the corresponding muxer. + */ + AV_PKT_DATA_MPEGTS_STREAM_ID, + + /** + * Mastering display metadata (based on SMPTE-2086:2014). This metadata + * should be associated with a video stream and contains data in the form + * of the AVMasteringDisplayMetadata struct. + */ + AV_PKT_DATA_MASTERING_DISPLAY_METADATA, + + /** + * This side data should be associated with a video stream and corresponds + * to the AVSphericalMapping structure. + */ + AV_PKT_DATA_SPHERICAL, + + /** + * Content light level (based on CTA-861.3). This metadata should be + * associated with a video stream and contains data in the form of the + * AVContentLightMetadata struct. + */ + AV_PKT_DATA_CONTENT_LIGHT_LEVEL, + + /** + * ATSC A53 Part 4 Closed Captions. This metadata should be associated with + * a video stream. A53 CC bitstream is stored as uint8_t in AVPacketSideData.data. 
+ * The number of bytes of CC data is AVPacketSideData.size. + */ + AV_PKT_DATA_A53_CC, + + /** + * This side data is encryption initialization data. + * The format is not part of ABI, use av_encryption_init_info_* methods to + * access. + */ + AV_PKT_DATA_ENCRYPTION_INIT_INFO, + + /** + * This side data contains encryption info for how to decrypt the packet. + * The format is not part of ABI, use av_encryption_info_* methods to access. + */ + AV_PKT_DATA_ENCRYPTION_INFO, + + /** + * Active Format Description data consisting of a single byte as specified + * in ETSI TS 101 154 using AVActiveFormatDescription enum. + */ + AV_PKT_DATA_AFD, + + /** + * Producer Reference Time data corresponding to the AVProducerReferenceTime struct, + * usually exported by some encoders (on demand through the prft flag set in the + * AVCodecContext export_side_data field). + */ + AV_PKT_DATA_PRFT, + + /** + * ICC profile data consisting of an opaque octet buffer following the + * format described by ISO 15076-1. + */ + AV_PKT_DATA_ICC_PROFILE, + + /** + * DOVI configuration + * ref: + * dolby-vision-bitstreams-within-the-iso-base-media-file-format-v2.1.2, section 2.2 + * dolby-vision-bitstreams-in-mpeg-2-transport-stream-multiplex-v1.2, section 3.3 + * Tags are stored in struct AVDOVIDecoderConfigurationRecord. + */ + AV_PKT_DATA_DOVI_CONF, + + /** + * Timecode which conforms to SMPTE ST 12-1:2014. The data is an array of 4 uint32_t + * where the first uint32_t describes how many (1-3) of the other timecodes are used. + * The timecode format is described in the documentation of av_timecode_get_smpte_from_framenum() + * function in libavutil/timecode.h. + */ + AV_PKT_DATA_S12M_TIMECODE, + + /** + * HDR10+ dynamic metadata associated with a video frame. The metadata is in + * the form of the AVDynamicHDRPlus struct and contains + * information for color volume transform - application 4 of + * SMPTE 2094-40:2016 standard. 
/**
 * One piece of side data attached to a packet: a typed, sized byte buffer.
 */
typedef struct AVPacketSideData {
    uint8_t *data;                  ///< side data payload bytes
    size_t size;                    ///< number of bytes stored in data
    enum AVPacketSideDataType type; ///< which AV_PKT_DATA_* kind data holds
} AVPacketSideData;
typedef struct AVPacket {
    /**
     * A reference to the reference-counted buffer where the packet data is
     * stored.
     * May be NULL, then the packet data is not reference-counted.
     */
    AVBufferRef *buf;
    /**
     * Presentation timestamp in AVStream->time_base units; the time at which
     * the decompressed packet will be presented to the user.
     * Can be AV_NOPTS_VALUE if it is not stored in the file.
     * pts MUST be larger or equal to dts as presentation cannot happen before
     * decompression, unless one wants to view hex dumps. Some formats misuse
     * the terms dts and pts/cts to mean something different. Such timestamps
     * must be converted to true pts/dts before they are stored in AVPacket.
     */
    int64_t pts;
    /**
     * Decompression timestamp in AVStream->time_base units; the time at which
     * the packet is decompressed.
     * Can be AV_NOPTS_VALUE if it is not stored in the file.
     */
    int64_t dts;
    /**
     * Compressed packet payload. Whether it is reference-counted depends on
     * the buf field.
     */
    uint8_t *data;
    /**
     * Size of data in bytes.
     */
    int size;
    /* NOTE(review): presumably the index of the stream this packet belongs
     * to -- confirm against the demuxer/muxer documentation. */
    int stream_index;
    /**
     * A combination of AV_PKT_FLAG values
     */
    int flags;
    /**
     * Additional packet data that can be provided by the container.
     * Packet can contain several types of side information.
     */
    AVPacketSideData *side_data;
    /* NOTE(review): presumably the number of entries in side_data -- confirm. */
    int side_data_elems;

    /**
     * Duration of this packet in AVStream->time_base units, 0 if unknown.
     * Equals next_pts - this_pts in presentation order.
     */
    int64_t duration;

    int64_t pos; ///< byte position in stream, -1 if unknown

    /**
     * for some private data of the user
     */
    void *opaque;

    /**
     * AVBufferRef for free use by the API user. FFmpeg will never check the
     * contents of the buffer ref. FFmpeg calls av_buffer_unref() on it when
     * the packet is unreferenced. av_packet_copy_props() calls create a new
     * reference with av_buffer_ref() for the target packet's opaque_ref field.
     *
     * This is unrelated to the opaque field, although it serves a similar
     * purpose.
     */
    AVBufferRef *opaque_ref;

    /**
     * Time base of the packet's timestamps.
     * In the future, this field may be set on packets output by encoders or
     * demuxers, but its value will be by default ignored on input to decoders
     * or muxers.
     */
    AVRational time_base;
} AVPacket;
+ * + * @see av_new_packet + */ +AVPacket *av_packet_alloc(void); + +/** + * Create a new packet that references the same data as src. + * + * This is a shortcut for av_packet_alloc()+av_packet_ref(). + * + * @return newly created AVPacket on success, NULL on error. + * + * @see av_packet_alloc + * @see av_packet_ref + */ +AVPacket *av_packet_clone(const AVPacket *src); + +/** + * Free the packet, if the packet is reference counted, it will be + * unreferenced first. + * + * @param pkt packet to be freed. The pointer will be set to NULL. + * @note passing NULL is a no-op. + */ +void av_packet_free(AVPacket **pkt); + +#if FF_API_INIT_PACKET +/** + * Initialize optional fields of a packet with default values. + * + * Note, this does not touch the data and size members, which have to be + * initialized separately. + * + * @param pkt packet + * + * @see av_packet_alloc + * @see av_packet_unref + * + * @deprecated This function is deprecated. Once it's removed, + sizeof(AVPacket) will not be a part of the ABI anymore. + */ +attribute_deprecated +void av_init_packet(AVPacket *pkt); +#endif + +/** + * Allocate the payload of a packet and initialize its fields with + * default values. + * + * @param pkt packet + * @param size wanted payload size + * @return 0 if OK, AVERROR_xxx otherwise + */ +int av_new_packet(AVPacket *pkt, int size); + +/** + * Reduce packet size, correctly zeroing padding + * + * @param pkt packet + * @param size new size + */ +void av_shrink_packet(AVPacket *pkt, int size); + +/** + * Increase packet size, correctly zeroing padding + * + * @param pkt packet + * @param grow_by number of bytes by which to increase the size of the packet + */ +int av_grow_packet(AVPacket *pkt, int grow_by); + +/** + * Initialize a reference-counted packet from av_malloc()ed data. + * + * @param pkt packet to be initialized. This function will set the data, size, + * and buf fields, all others are left untouched. 
+ * @param data Data allocated by av_malloc() to be used as packet data. If this + * function returns successfully, the data is owned by the underlying AVBuffer. + * The caller may not access the data through other means. + * @param size size of data in bytes, without the padding. I.e. the full buffer + * size is assumed to be size + AV_INPUT_BUFFER_PADDING_SIZE. + * + * @return 0 on success, a negative AVERROR on error + */ +int av_packet_from_data(AVPacket *pkt, uint8_t *data, int size); + +/** + * Allocate new information of a packet. + * + * @param pkt packet + * @param type side information type + * @param size side information size + * @return pointer to fresh allocated data or NULL otherwise + */ +uint8_t* av_packet_new_side_data(AVPacket *pkt, enum AVPacketSideDataType type, + size_t size); + +/** + * Wrap an existing array as a packet side data. + * + * @param pkt packet + * @param type side information type + * @param data the side data array. It must be allocated with the av_malloc() + * family of functions. The ownership of the data is transferred to + * pkt. + * @param size side information size + * @return a non-negative number on success, a negative AVERROR code on + * failure. On failure, the packet is unchanged and the data remains + * owned by the caller. + */ +int av_packet_add_side_data(AVPacket *pkt, enum AVPacketSideDataType type, + uint8_t *data, size_t size); + +/** + * Shrink the already allocated side data buffer + * + * @param pkt packet + * @param type side information type + * @param size new side information size + * @return 0 on success, < 0 on failure + */ +int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type, + size_t size); + +/** + * Get side information from packet. + * + * @param pkt packet + * @param type desired side information type + * @param size If supplied, *size will be set to the size of the side data + * or to zero if the desired side data is not present. 
+ * @return pointer to data if present or NULL otherwise + */ +uint8_t* av_packet_get_side_data(const AVPacket *pkt, enum AVPacketSideDataType type, + size_t *size); + +const char *av_packet_side_data_name(enum AVPacketSideDataType type); + +/** + * Pack a dictionary for use in side_data. + * + * @param dict The dictionary to pack. + * @param size pointer to store the size of the returned data + * @return pointer to data if successful, NULL otherwise + */ +uint8_t *av_packet_pack_dictionary(AVDictionary *dict, size_t *size); +/** + * Unpack a dictionary from side_data. + * + * @param data data from side_data + * @param size size of the data + * @param dict the metadata storage dictionary + * @return 0 on success, < 0 on failure + */ +int av_packet_unpack_dictionary(const uint8_t *data, size_t size, + AVDictionary **dict); + +/** + * Convenience function to free all the side data stored. + * All the other fields stay untouched. + * + * @param pkt packet + */ +void av_packet_free_side_data(AVPacket *pkt); + +/** + * Setup a new reference to the data described by a given packet + * + * If src is reference-counted, setup dst as a new reference to the + * buffer in src. Otherwise allocate a new buffer in dst and copy the + * data from src into it. + * + * All the other fields are copied from src. + * + * @see av_packet_unref + * + * @param dst Destination packet. Will be completely overwritten. + * @param src Source packet + * + * @return 0 on success, a negative AVERROR on error. On error, dst + * will be blank (as if returned by av_packet_alloc()). + */ +int av_packet_ref(AVPacket *dst, const AVPacket *src); + +/** + * Wipe the packet. + * + * Unreference the buffer referenced by the packet and reset the + * remaining packet fields to their default values. + * + * @param pkt The packet to be unreferenced. + */ +void av_packet_unref(AVPacket *pkt); + +/** + * Move every field in src to dst and reset src. 
+ * + * @see av_packet_unref + * + * @param src Source packet, will be reset + * @param dst Destination packet + */ +void av_packet_move_ref(AVPacket *dst, AVPacket *src); + +/** + * Copy only "properties" fields from src to dst. + * + * Properties for the purpose of this function are all the fields + * beside those related to the packet data (buf, data, size) + * + * @param dst Destination packet + * @param src Source packet + * + * @return 0 on success AVERROR on failure. + */ +int av_packet_copy_props(AVPacket *dst, const AVPacket *src); + +/** + * Ensure the data described by a given packet is reference counted. + * + * @note This function does not ensure that the reference will be writable. + * Use av_packet_make_writable instead for that purpose. + * + * @see av_packet_ref + * @see av_packet_make_writable + * + * @param pkt packet whose data should be made reference counted. + * + * @return 0 on success, a negative AVERROR on error. On failure, the + * packet is unchanged. + */ +int av_packet_make_refcounted(AVPacket *pkt); + +/** + * Create a writable reference for the data described by a given packet, + * avoiding data copy if possible. + * + * @param pkt Packet whose data should be made writable. + * + * @return 0 on success, a negative AVERROR on failure. On failure, the + * packet is unchanged. + */ +int av_packet_make_writable(AVPacket *pkt); + +/** + * Convert valid timing fields (timestamps / durations) in a packet from one + * timebase to another. Timestamps with unknown values (AV_NOPTS_VALUE) will be + * ignored. 
/**
 * One node of a PacketList: the stored packet plus a link to the next node.
 */
typedef struct PacketListEntry {
    struct PacketListEntry *next;
    AVPacket pkt;
} PacketListEntry;

/**
 * Singly linked list of packets, tracked by its first (head) and last (tail)
 * node. Packets are appended with avpriv_packet_list_put() and the oldest one
 * is removed with avpriv_packet_list_get().
 */
typedef struct PacketList {
    PacketListEntry *head, *tail;
} PacketList;
/**
 * Remove the oldest AVPacket in the list and return it.
 *
 * @note The pkt will be overwritten completely on success. The caller
 *       owns the packet and must unref it by itself.
 *
 * @param list A pointer to a PacketList struct
 * @param pkt  Pointer to an AVPacket struct
 * @return 0 on success, and a packet is returned. AVERROR(EAGAIN) if
 *         the list was empty.
 */
int avpriv_packet_list_get(PacketList *list, AVPacket *pkt);
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include <stdint.h> +#include <string.h> + +#include "libavutil/avassert.h" +#include "libavutil/mem.h" + +#include "parser.h" + +AVCodecParserContext *av_parser_init(int codec_id) +{ + AVCodecParserContext *s = NULL; + const AVCodecParser *parser; + void *i = 0; + int ret; + + if (codec_id == AV_CODEC_ID_NONE) + return NULL; + + while ((parser = av_parser_iterate(&i))) { + if (parser->codec_ids[0] == codec_id || + parser->codec_ids[1] == codec_id || + parser->codec_ids[2] == codec_id || + parser->codec_ids[3] == codec_id || + parser->codec_ids[4] == codec_id || + parser->codec_ids[5] == codec_id || + parser->codec_ids[6] == codec_id) + goto found; + } + return NULL; + +found: + s = av_mallocz(sizeof(AVCodecParserContext)); + if (!s) + goto err_out; + s->parser = parser; + s->priv_data = av_mallocz(parser->priv_data_size); + if (!s->priv_data) + goto err_out; + s->fetch_timestamp=1; + s->pict_type = AV_PICTURE_TYPE_I; + if (parser->parser_init) { + ret = parser->parser_init(s); + if (ret != 0) + goto err_out; + } + s->key_frame = -1; + s->dts_sync_point = INT_MIN; + s->dts_ref_dts_delta = INT_MIN; + s->pts_dts_delta = INT_MIN; + s->format = -1; + + return s; + +err_out: + if (s) + av_freep(&s->priv_data); + av_free(s); + return NULL; +} + +void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy) +{ + int i; + + if (!fuzzy) { + s->dts = + s->pts = AV_NOPTS_VALUE; + s->pos = -1; + s->offset = 0; + } + for (i = 0; i < AV_PARSER_PTS_NB; i++) { + if (s->cur_offset + off >= s->cur_frame_offset[i] && + (s->frame_offset < s->cur_frame_offset[i] || + (!s->frame_offset && !s->next_frame_offset)) && // first field/frame + // check disabled since MPEG-TS does not send complete PES packets + 
/**
 * Feed one input buffer to the parser callback and update the offset and
 * timestamp bookkeeping kept in the parser context.
 *
 * @param s            parser context
 * @param avctx        codec context; its codec_id must be one the parser lists
 * @param poutbuf      receives the output buffer from the parser callback;
 *                     set to NULL when *poutbuf_size is 0
 * @param poutbuf_size receives the output size from the parser callback
 * @param buf          input data; replaced by a zeroed padding buffer when
 *                     buf_size is 0
 * @param buf_size     input size in bytes
 * @param pts          pts recorded for this input buffer
 * @param dts          dts recorded for this input buffer
 * @param pos          position recorded for this input buffer; on the first
 *                     call it also seeds cur_offset and next_frame_offset
 * @return the value returned by the parser callback, clamped to be
 *         non-negative; cur_offset is advanced by the same amount
 */
int av_parser_parse2(AVCodecParserContext *s, AVCodecContext *avctx,
                     uint8_t **poutbuf, int *poutbuf_size,
                     const uint8_t *buf, int buf_size,
                     int64_t pts, int64_t dts, int64_t pos)
{
    int index, i;
    uint8_t dummy_buf[AV_INPUT_BUFFER_PADDING_SIZE];

    av_assert1(avctx->codec_id != AV_CODEC_ID_NONE);

    /* Parsers only work for the specified codec ids. */
    av_assert1(avctx->codec_id == s->parser->codec_ids[0] ||
               avctx->codec_id == s->parser->codec_ids[1] ||
               avctx->codec_id == s->parser->codec_ids[2] ||
               avctx->codec_id == s->parser->codec_ids[3] ||
               avctx->codec_id == s->parser->codec_ids[4] ||
               avctx->codec_id == s->parser->codec_ids[5] ||
               avctx->codec_id == s->parser->codec_ids[6]);

    /* First call only: start both offsets at the caller-supplied position. */
    if (!(s->flags & PARSER_FLAG_FETCHED_OFFSET)) {
        s->next_frame_offset =
        s->cur_offset        = pos;
        s->flags            |= PARSER_FLAG_FETCHED_OFFSET;
    }

    if (buf_size == 0) {
        /* padding is always necessary even if EOF, so we add it here */
        memset(dummy_buf, 0, sizeof(dummy_buf));
        buf = dummy_buf;
    } else if (s->cur_offset + buf_size != s->cur_frame_end[s->cur_frame_start_index]) { /* skip remainder packets */
        /* add a new packet descriptor */
        /* The mask wraps the slot index; it relies on AV_PARSER_PTS_NB being
         * a power of two. */
        i = (s->cur_frame_start_index + 1) & (AV_PARSER_PTS_NB - 1);
        s->cur_frame_start_index = i;
        s->cur_frame_offset[i] = s->cur_offset;
        s->cur_frame_end[i] = s->cur_offset + buf_size;
        s->cur_frame_pts[i] = pts;
        s->cur_frame_dts[i] = dts;
        s->cur_frame_pos[i] = pos;
    }

    /* Set after a frame was output (see below): remember the previous
     * timestamps, then look up the ones for the new frame. */
    if (s->fetch_timestamp) {
        s->fetch_timestamp = 0;
        s->last_pts = s->pts;
        s->last_dts = s->dts;
        s->last_pos = s->pos;
        ff_fetch_timestamp(s, 0, 0, 0);
    }
    /* WARNING: the returned index can be negative */
    index = s->parser->parser_parse(s, avctx, (const uint8_t **) poutbuf,
                                    poutbuf_size, buf, buf_size);
    av_assert0(index > -0x20000000); // The API does not allow returning AVERROR codes
    /* Copy a field from the parser to the codec context only when the parser
     * has a positive value and the codec context has none yet. */
#define FILL(name) if(s->name > 0 && avctx->name <= 0) avctx->name = s->name
    if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
        FILL(field_order);
    }

    /* update the file pointer */
    if (*poutbuf_size) {
        /* fill the data for the current frame */
        s->frame_offset = s->next_frame_offset;

        /* offset of the next frame */
        s->next_frame_offset = s->cur_offset + index;
        s->fetch_timestamp = 1;
    } else {
        /* Don't return a pointer to dummy_buf. */
        *poutbuf = NULL;
    }
    /* A negative index is used above for next_frame_offset, but the caller
     * and cur_offset only ever see a non-negative value. */
    if (index < 0)
        index = 0;
    s->cur_offset += index;
    return index;
}
*/ + for (; pc->overread > 0; pc->overread--) + pc->buffer[pc->index++] = pc->buffer[pc->overread_index++]; + + if (next > *buf_size) + return AVERROR(EINVAL); + + /* flush remaining if EOF */ + if (!*buf_size && next == END_NOT_FOUND) + next = 0; + + pc->last_index = pc->index; + + /* copy into buffer end return */ + if (next == END_NOT_FOUND) { + void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size, + *buf_size + pc->index + + AV_INPUT_BUFFER_PADDING_SIZE); + + if (!new_buffer) { + av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", *buf_size + pc->index + AV_INPUT_BUFFER_PADDING_SIZE); + pc->index = 0; + return AVERROR(ENOMEM); + } + pc->buffer = new_buffer; + memcpy(&pc->buffer[pc->index], *buf, *buf_size); + pc->index += *buf_size; + return -1; + } + + av_assert0(next >= 0 || pc->buffer); + + *buf_size = + pc->overread_index = pc->index + next; + + /* append to buffer */ + if (pc->index) { + void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size, + next + pc->index + + AV_INPUT_BUFFER_PADDING_SIZE); + if (!new_buffer) { + av_log(NULL, AV_LOG_ERROR, "Failed to reallocate parser buffer to %d\n", next + pc->index + AV_INPUT_BUFFER_PADDING_SIZE); + pc->overread_index = + pc->index = 0; + return AVERROR(ENOMEM); + } + pc->buffer = new_buffer; + if (next > -AV_INPUT_BUFFER_PADDING_SIZE) + memcpy(&pc->buffer[pc->index], *buf, + next + AV_INPUT_BUFFER_PADDING_SIZE); + pc->index = 0; + *buf = pc->buffer; + } + + if (next < -8) { + pc->overread += -8 - next; + next = -8; + } + /* store overread bytes */ + for (; next < 0; next++) { + pc->state = pc->state << 8 | pc->buffer[pc->last_index + next]; + pc->state64 = pc->state64 << 8 | pc->buffer[pc->last_index + next]; + pc->overread++; + } + + if (pc->overread) { + ff_dlog(NULL, "overread %d, state:%"PRIX32" next:%d index:%d o_index:%d\n", + pc->overread, pc->state, next, pc->index, pc->overread_index); + ff_dlog(NULL, "%X %X %X %X\n", + (*buf)[0], (*buf)[1], (*buf)[2], (*buf)[3]); 
/**
 * State used by ff_combine_frame() to assemble complete frames from
 * arbitrarily split input.
 */
typedef struct ParseContext{
    uint8_t *buffer;            ///< accumulation buffer, (re)allocated by ff_combine_frame()
    int index;                  ///< number of bytes currently stored in buffer
    int last_index;             ///< value of index saved by ff_combine_frame() before it handles new input
    unsigned int buffer_size;   ///< allocated size of buffer, maintained through av_fast_realloc()
    uint32_t state;             ///< contains the last few bytes in MSB order
    int frame_start_found;      /* NOTE(review): not touched by ff_combine_frame(); presumably owned by the individual parsers -- confirm */
    int overread;               ///< the number of bytes which were irreversibly read from the next frame
    int overread_index;         ///< the index into ParseContext.buffer of the overread bytes
    uint64_t state64;           ///< contains the last 8 bytes in MSB order
} ParseContext;
/*
 * NULL-terminated table of the parsers compiled into this build.
 * This file is included by parsers.c, where av_parser_iterate() indexes the
 * table and stops at the NULL entry.
 */
static const AVCodecParser * const parser_list[] = {
#if CONFIG_VP8_PARSER
    &ff_vp8_parser,
#endif
#if CONFIG_VP9_PARSER
    &ff_vp9_parser,
#endif
    NULL };
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "avcodec.h" + +extern const AVCodecParser ff_aac_parser; +extern const AVCodecParser ff_aac_latm_parser; +extern const AVCodecParser ff_ac3_parser; +extern const AVCodecParser ff_adx_parser; +extern const AVCodecParser ff_amr_parser; +extern const AVCodecParser ff_av1_parser; +extern const AVCodecParser ff_avs2_parser; +extern const AVCodecParser ff_avs3_parser; +extern const AVCodecParser ff_bmp_parser; +extern const AVCodecParser ff_cavsvideo_parser; +extern const AVCodecParser ff_cook_parser; +extern const AVCodecParser ff_cri_parser; +extern const AVCodecParser ff_dca_parser; +extern const AVCodecParser ff_dirac_parser; +extern const AVCodecParser ff_dnxhd_parser; +extern const AVCodecParser ff_dolby_e_parser; +extern const AVCodecParser ff_dpx_parser; +extern const AVCodecParser ff_dvaudio_parser; +extern const AVCodecParser ff_dvbsub_parser; +extern const AVCodecParser ff_dvdsub_parser; +extern const AVCodecParser ff_dvd_nav_parser; +extern const AVCodecParser ff_flac_parser; +extern const AVCodecParser ff_ftr_parser; +extern const AVCodecParser ff_g723_1_parser; +extern const AVCodecParser ff_g729_parser; +extern const AVCodecParser ff_gif_parser; +extern const AVCodecParser ff_gsm_parser; +extern const AVCodecParser ff_h261_parser; +extern const AVCodecParser ff_h263_parser; +extern const AVCodecParser ff_h264_parser; +extern const AVCodecParser ff_hevc_parser; +extern const AVCodecParser ff_hdr_parser; +extern const AVCodecParser ff_ipu_parser; +extern const AVCodecParser ff_jpeg2000_parser; +extern const AVCodecParser ff_misc4_parser; +extern const AVCodecParser ff_mjpeg_parser; +extern const AVCodecParser ff_mlp_parser; +extern const AVCodecParser ff_mpeg4video_parser; +extern const AVCodecParser 
ff_mpegaudio_parser; +extern const AVCodecParser ff_mpegvideo_parser; +extern const AVCodecParser ff_opus_parser; +extern const AVCodecParser ff_png_parser; +extern const AVCodecParser ff_pnm_parser; +extern const AVCodecParser ff_qoi_parser; +extern const AVCodecParser ff_rv30_parser; +extern const AVCodecParser ff_rv40_parser; +extern const AVCodecParser ff_sbc_parser; +extern const AVCodecParser ff_sipr_parser; +extern const AVCodecParser ff_tak_parser; +extern const AVCodecParser ff_vc1_parser; +extern const AVCodecParser ff_vorbis_parser; +extern const AVCodecParser ff_vp3_parser; +extern const AVCodecParser ff_vp8_parser; +extern const AVCodecParser ff_vp9_parser; +extern const AVCodecParser ff_webp_parser; +extern const AVCodecParser ff_xbm_parser; +extern const AVCodecParser ff_xma_parser; +extern const AVCodecParser ff_xwd_parser; + +#include "libavcodec/parser_list.c" + +const AVCodecParser *av_parser_iterate(void **opaque) +{ + uintptr_t i = (uintptr_t)*opaque; + const AVCodecParser *p = parser_list[i]; + + if (p) + *opaque = (void*)(i + 1); + + return p; +} diff --git a/media/ffvpx/libavcodec/pixblockdsp.h b/media/ffvpx/libavcodec/pixblockdsp.h new file mode 100644 index 0000000000..9b002aa3d6 --- /dev/null +++ b/media/ffvpx/libavcodec/pixblockdsp.h @@ -0,0 +1,62 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * Table of function pointers operating on uint8_t pixel rows and an int16_t
 * block. The alignment each pointer requires is noted next to the parameter;
 * the *_unaligned variants place no alignment requirement on the pixel
 * pointers.
 *
 * NOTE(review): by name, get_pixels loads pixels into block and diff_pixels
 * stores the difference of s1 and s2 -- confirm against the C reference
 * implementation. The table is presumably filled in by ff_pixblockdsp_init()
 * and the per-architecture ff_pixblockdsp_init_*() functions declared below.
 */
typedef struct PixblockDSPContext {
    void (*get_pixels)(int16_t *av_restrict block /* align 16 */,
                       const uint8_t *pixels /* align 8 */,
                       ptrdiff_t stride);
    void (*get_pixels_unaligned)(int16_t *av_restrict block /* align 16 */,
                       const uint8_t *pixels,
                       ptrdiff_t stride);
    void (*diff_pixels)(int16_t *av_restrict block /* align 16 */,
                        const uint8_t *s1 /* align 8 */,
                        const uint8_t *s2 /* align 8 */,
                        ptrdiff_t stride);
    void (*diff_pixels_unaligned)(int16_t *av_restrict block /* align 16 */,
                        const uint8_t *s1,
                        const uint8_t *s2,
                        ptrdiff_t stride);

} PixblockDSPContext;
@@ -0,0 +1,185 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "avcodec.h" +#include "profiles.h" + +#if !CONFIG_SMALL + +const AVProfile ff_aac_profiles[] = { + { FF_PROFILE_AAC_LOW, "LC" }, + { FF_PROFILE_AAC_HE, "HE-AAC" }, + { FF_PROFILE_AAC_HE_V2, "HE-AACv2" }, + { FF_PROFILE_AAC_LD, "LD" }, + { FF_PROFILE_AAC_ELD, "ELD" }, + { FF_PROFILE_AAC_MAIN, "Main" }, + { FF_PROFILE_AAC_SSR, "SSR" }, + { FF_PROFILE_AAC_LTP, "LTP" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_dca_profiles[] = { + { FF_PROFILE_DTS, "DTS" }, + { FF_PROFILE_DTS_ES, "DTS-ES" }, + { FF_PROFILE_DTS_96_24, "DTS 96/24" }, + { FF_PROFILE_DTS_HD_HRA, "DTS-HD HRA" }, + { FF_PROFILE_DTS_HD_MA, "DTS-HD MA" }, + { FF_PROFILE_DTS_EXPRESS, "DTS Express" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_dnxhd_profiles[] = { + { FF_PROFILE_DNXHD, "DNXHD"}, + { FF_PROFILE_DNXHR_LB, "DNXHR LB"}, + { FF_PROFILE_DNXHR_SQ, "DNXHR SQ"}, + { FF_PROFILE_DNXHR_HQ, "DNXHR HQ" }, + { FF_PROFILE_DNXHR_HQX, "DNXHR HQX"}, + { FF_PROFILE_DNXHR_444, "DNXHR 444"}, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_h264_profiles[] = { + { FF_PROFILE_H264_BASELINE, "Baseline" }, + { FF_PROFILE_H264_CONSTRAINED_BASELINE, "Constrained Baseline" }, + { 
FF_PROFILE_H264_MAIN, "Main" }, + { FF_PROFILE_H264_EXTENDED, "Extended" }, + { FF_PROFILE_H264_HIGH, "High" }, + { FF_PROFILE_H264_HIGH_10, "High 10" }, + { FF_PROFILE_H264_HIGH_10_INTRA, "High 10 Intra" }, + { FF_PROFILE_H264_HIGH_422, "High 4:2:2" }, + { FF_PROFILE_H264_HIGH_422_INTRA, "High 4:2:2 Intra" }, + { FF_PROFILE_H264_HIGH_444, "High 4:4:4" }, + { FF_PROFILE_H264_HIGH_444_PREDICTIVE, "High 4:4:4 Predictive" }, + { FF_PROFILE_H264_HIGH_444_INTRA, "High 4:4:4 Intra" }, + { FF_PROFILE_H264_CAVLC_444, "CAVLC 4:4:4" }, + { FF_PROFILE_H264_MULTIVIEW_HIGH, "Multiview High" }, + { FF_PROFILE_H264_STEREO_HIGH, "Stereo High" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_vvc_profiles[] = { + { FF_PROFILE_VVC_MAIN_10, "Main 10" }, + { FF_PROFILE_VVC_MAIN_10_444, "Main 10 4:4:4" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_hevc_profiles[] = { + { FF_PROFILE_HEVC_MAIN, "Main" }, + { FF_PROFILE_HEVC_MAIN_10, "Main 10" }, + { FF_PROFILE_HEVC_MAIN_STILL_PICTURE, "Main Still Picture" }, + { FF_PROFILE_HEVC_REXT, "Rext" }, + { FF_PROFILE_HEVC_SCC, "Scc" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_jpeg2000_profiles[] = { + { FF_PROFILE_JPEG2000_CSTREAM_RESTRICTION_0, "JPEG 2000 codestream restriction 0" }, + { FF_PROFILE_JPEG2000_CSTREAM_RESTRICTION_1, "JPEG 2000 codestream restriction 1" }, + { FF_PROFILE_JPEG2000_CSTREAM_NO_RESTRICTION, "JPEG 2000 no codestream restrictions" }, + { FF_PROFILE_JPEG2000_DCINEMA_2K, "JPEG 2000 digital cinema 2K" }, + { FF_PROFILE_JPEG2000_DCINEMA_4K, "JPEG 2000 digital cinema 4K" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_mpeg2_video_profiles[] = { + { FF_PROFILE_MPEG2_422, "4:2:2" }, + { FF_PROFILE_MPEG2_HIGH, "High" }, + { FF_PROFILE_MPEG2_SS, "Spatially Scalable" }, + { FF_PROFILE_MPEG2_SNR_SCALABLE, "SNR Scalable" }, + { FF_PROFILE_MPEG2_MAIN, "Main" }, + { FF_PROFILE_MPEG2_SIMPLE, "Simple" }, + { FF_PROFILE_RESERVED, "Reserved" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile 
ff_mpeg4_video_profiles[] = { + { FF_PROFILE_MPEG4_SIMPLE, "Simple Profile" }, + { FF_PROFILE_MPEG4_SIMPLE_SCALABLE, "Simple Scalable Profile" }, + { FF_PROFILE_MPEG4_CORE, "Core Profile" }, + { FF_PROFILE_MPEG4_MAIN, "Main Profile" }, + { FF_PROFILE_MPEG4_N_BIT, "N-bit Profile" }, + { FF_PROFILE_MPEG4_SCALABLE_TEXTURE, "Scalable Texture Profile" }, + { FF_PROFILE_MPEG4_SIMPLE_FACE_ANIMATION, "Simple Face Animation Profile" }, + { FF_PROFILE_MPEG4_BASIC_ANIMATED_TEXTURE, "Basic Animated Texture Profile" }, + { FF_PROFILE_MPEG4_HYBRID, "Hybrid Profile" }, + { FF_PROFILE_MPEG4_ADVANCED_REAL_TIME, "Advanced Real Time Simple Profile" }, + { FF_PROFILE_MPEG4_CORE_SCALABLE, "Core Scalable Profile" }, + { FF_PROFILE_MPEG4_ADVANCED_CODING, "Advanced Coding Profile" }, + { FF_PROFILE_MPEG4_ADVANCED_CORE, "Advanced Core Profile" }, + { FF_PROFILE_MPEG4_ADVANCED_SCALABLE_TEXTURE, "Advanced Scalable Texture Profile" }, + { FF_PROFILE_MPEG4_SIMPLE_STUDIO, "Simple Studio Profile" }, + { FF_PROFILE_MPEG4_ADVANCED_SIMPLE, "Advanced Simple Profile" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_vc1_profiles[] = { + { FF_PROFILE_VC1_SIMPLE, "Simple" }, + { FF_PROFILE_VC1_MAIN, "Main" }, + { FF_PROFILE_VC1_COMPLEX, "Complex" }, + { FF_PROFILE_VC1_ADVANCED, "Advanced" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_vp9_profiles[] = { + { FF_PROFILE_VP9_0, "Profile 0" }, + { FF_PROFILE_VP9_1, "Profile 1" }, + { FF_PROFILE_VP9_2, "Profile 2" }, + { FF_PROFILE_VP9_3, "Profile 3" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_av1_profiles[] = { + { FF_PROFILE_AV1_MAIN, "Main" }, + { FF_PROFILE_AV1_HIGH, "High" }, + { FF_PROFILE_AV1_PROFESSIONAL, "Professional" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_sbc_profiles[] = { + { FF_PROFILE_SBC_MSBC, "mSBC" }, + { FF_PROFILE_UNKNOWN }, +}; + +const AVProfile ff_prores_profiles[] = { + { FF_PROFILE_PRORES_PROXY, "Proxy" }, + { FF_PROFILE_PRORES_LT, "LT" }, + { FF_PROFILE_PRORES_STANDARD, "Standard" }, + { 
FF_PROFILE_PRORES_HQ, "HQ" }, + { FF_PROFILE_PRORES_4444, "4444" }, + { FF_PROFILE_PRORES_XQ, "XQ" }, + { FF_PROFILE_UNKNOWN } +}; + +const AVProfile ff_mjpeg_profiles[] = { + { FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT, "Baseline" }, + { FF_PROFILE_MJPEG_HUFFMAN_EXTENDED_SEQUENTIAL_DCT, "Sequential" }, + { FF_PROFILE_MJPEG_HUFFMAN_PROGRESSIVE_DCT, "Progressive" }, + { FF_PROFILE_MJPEG_HUFFMAN_LOSSLESS, "Lossless" }, + { FF_PROFILE_MJPEG_JPEG_LS, "JPEG LS" }, + { FF_PROFILE_UNKNOWN } +}; + +const AVProfile ff_arib_caption_profiles[] = { + { FF_PROFILE_ARIB_PROFILE_A, "Profile A" }, + { FF_PROFILE_ARIB_PROFILE_C, "Profile C" }, + { FF_PROFILE_UNKNOWN } +}; + +#endif /* !CONFIG_SMALL */ diff --git a/media/ffvpx/libavcodec/profiles.h b/media/ffvpx/libavcodec/profiles.h new file mode 100644 index 0000000000..41a19aa9ad --- /dev/null +++ b/media/ffvpx/libavcodec/profiles.h @@ -0,0 +1,76 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PROFILES_H +#define AVCODEC_PROFILES_H + +#include "avcodec.h" +#include "libavutil/opt.h" + +#define FF_AVCTX_PROFILE_OPTION(name, description, type, value) \ + {name, description, 0, AV_OPT_TYPE_CONST, {.i64 = value }, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM | AV_OPT_FLAG_## type ##_PARAM, "avctx.profile"}, + +#define FF_AAC_PROFILE_OPTS \ + FF_AVCTX_PROFILE_OPTION("aac_main", NULL, AUDIO, FF_PROFILE_AAC_MAIN)\ + FF_AVCTX_PROFILE_OPTION("aac_low", NULL, AUDIO, FF_PROFILE_AAC_LOW)\ + FF_AVCTX_PROFILE_OPTION("aac_ssr", NULL, AUDIO, FF_PROFILE_AAC_SSR)\ + FF_AVCTX_PROFILE_OPTION("aac_ltp", NULL, AUDIO, FF_PROFILE_AAC_LTP)\ + FF_AVCTX_PROFILE_OPTION("aac_he", NULL, AUDIO, FF_PROFILE_AAC_HE)\ + FF_AVCTX_PROFILE_OPTION("aac_he_v2", NULL, AUDIO, FF_PROFILE_AAC_HE_V2)\ + FF_AVCTX_PROFILE_OPTION("aac_ld", NULL, AUDIO, FF_PROFILE_AAC_LD)\ + FF_AVCTX_PROFILE_OPTION("aac_eld", NULL, AUDIO, FF_PROFILE_AAC_ELD)\ + FF_AVCTX_PROFILE_OPTION("mpeg2_aac_low", NULL, AUDIO, FF_PROFILE_MPEG2_AAC_LOW)\ + FF_AVCTX_PROFILE_OPTION("mpeg2_aac_he", NULL, AUDIO, FF_PROFILE_MPEG2_AAC_HE)\ + +#define FF_MPEG4_PROFILE_OPTS \ + FF_AVCTX_PROFILE_OPTION("mpeg4_sp", NULL, VIDEO, FF_PROFILE_MPEG4_SIMPLE)\ + FF_AVCTX_PROFILE_OPTION("mpeg4_core", NULL, VIDEO, FF_PROFILE_MPEG4_CORE)\ + FF_AVCTX_PROFILE_OPTION("mpeg4_main", NULL, VIDEO, FF_PROFILE_MPEG4_MAIN)\ + FF_AVCTX_PROFILE_OPTION("mpeg4_asp", NULL, VIDEO, FF_PROFILE_MPEG4_ADVANCED_SIMPLE)\ + +#define FF_MPEG2_PROFILE_OPTS \ + FF_AVCTX_PROFILE_OPTION("422", NULL, VIDEO, FF_PROFILE_MPEG2_422)\ + FF_AVCTX_PROFILE_OPTION("high", NULL, VIDEO, FF_PROFILE_MPEG2_HIGH)\ + FF_AVCTX_PROFILE_OPTION("ss", NULL, VIDEO, FF_PROFILE_MPEG2_SS)\ + FF_AVCTX_PROFILE_OPTION("snr", NULL, VIDEO, FF_PROFILE_MPEG2_SNR_SCALABLE)\ 
+ FF_AVCTX_PROFILE_OPTION("main", NULL, VIDEO, FF_PROFILE_MPEG2_MAIN)\ + FF_AVCTX_PROFILE_OPTION("simple", NULL, VIDEO, FF_PROFILE_MPEG2_SIMPLE)\ + +#define FF_AV1_PROFILE_OPTS \ + FF_AVCTX_PROFILE_OPTION("main", NULL, VIDEO, FF_PROFILE_AV1_MAIN)\ + FF_AVCTX_PROFILE_OPTION("high", NULL, VIDEO, FF_PROFILE_AV1_HIGH)\ + FF_AVCTX_PROFILE_OPTION("professional", NULL, VIDEO, FF_PROFILE_AV1_PROFESSIONAL)\ + +extern const AVProfile ff_aac_profiles[]; +extern const AVProfile ff_dca_profiles[]; +extern const AVProfile ff_dnxhd_profiles[]; +extern const AVProfile ff_h264_profiles[]; +extern const AVProfile ff_hevc_profiles[]; +extern const AVProfile ff_vvc_profiles[]; +extern const AVProfile ff_jpeg2000_profiles[]; +extern const AVProfile ff_mpeg2_video_profiles[]; +extern const AVProfile ff_mpeg4_video_profiles[]; +extern const AVProfile ff_vc1_profiles[]; +extern const AVProfile ff_vp9_profiles[]; +extern const AVProfile ff_av1_profiles[]; +extern const AVProfile ff_sbc_profiles[]; +extern const AVProfile ff_prores_profiles[]; +extern const AVProfile ff_mjpeg_profiles[]; +extern const AVProfile ff_arib_caption_profiles[]; + +#endif /* AVCODEC_PROFILES_H */ diff --git a/media/ffvpx/libavcodec/pthread.c b/media/ffvpx/libavcodec/pthread.c new file mode 100644 index 0000000000..ca84b81391 --- /dev/null +++ b/media/ffvpx/libavcodec/pthread.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2004 Roman Shaposhnik + * Copyright (c) 2008 Alexander Strange (astrange@ithinksw.com) + * + * Many thanks to Steven M. Schultz for providing clever ideas and + * to Michael Niedermayer <michaelni@gmx.at> for writing initial + * implementation. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Multithreading support functions + * @see doc/multithreading.txt + */ + +#include "libavutil/thread.h" + +#include "avcodec.h" +#include "codec_internal.h" +#include "pthread_internal.h" +#include "thread.h" + +/** + * Set the threading algorithms used. + * + * Threading requires more than one thread. + * Frame threading requires entire frames to be passed to the codec, + * and introduces extra decoding delay, so is incompatible with low_delay. + * + * @param avctx The context. + */ +static void validate_thread_parameters(AVCodecContext *avctx) +{ + int frame_threading_supported = (avctx->codec->capabilities & AV_CODEC_CAP_FRAME_THREADS) + && !(avctx->flags & AV_CODEC_FLAG_LOW_DELAY) + && !(avctx->flags2 & AV_CODEC_FLAG2_CHUNKS); + if (avctx->thread_count == 1) { + avctx->active_thread_type = 0; + } else if (frame_threading_supported && (avctx->thread_type & FF_THREAD_FRAME)) { + avctx->active_thread_type = FF_THREAD_FRAME; + } else if (avctx->codec->capabilities & AV_CODEC_CAP_SLICE_THREADS && + avctx->thread_type & FF_THREAD_SLICE) { + avctx->active_thread_type = FF_THREAD_SLICE; + } else if (!(ffcodec(avctx->codec)->caps_internal & FF_CODEC_CAP_AUTO_THREADS)) { + avctx->thread_count = 1; + avctx->active_thread_type = 0; + } + + if (avctx->thread_count > MAX_AUTO_THREADS) + av_log(avctx, AV_LOG_WARNING, + "Application has requested %d threads. 
Using a thread count greater than %d is not recommended.\n", + avctx->thread_count, MAX_AUTO_THREADS); +} + +int ff_thread_init(AVCodecContext *avctx) +{ + validate_thread_parameters(avctx); + + if (avctx->active_thread_type&FF_THREAD_SLICE) + return ff_slice_thread_init(avctx); + else if (avctx->active_thread_type&FF_THREAD_FRAME) + return ff_frame_thread_init(avctx); + + return 0; +} + +void ff_thread_free(AVCodecContext *avctx) +{ + if (avctx->active_thread_type&FF_THREAD_FRAME) + ff_frame_thread_free(avctx, avctx->thread_count); + else + ff_slice_thread_free(avctx); +} + +av_cold void ff_pthread_free(void *obj, const unsigned offsets[]) +{ + unsigned cnt = *(unsigned*)((char*)obj + offsets[0]); + const unsigned *cur_offset = offsets; + + *(unsigned*)((char*)obj + offsets[0]) = 0; + + for (; *(++cur_offset) != THREAD_SENTINEL && cnt; cnt--) + pthread_mutex_destroy((pthread_mutex_t*)((char*)obj + *cur_offset)); + for (; *(++cur_offset) != THREAD_SENTINEL && cnt; cnt--) + pthread_cond_destroy ((pthread_cond_t *)((char*)obj + *cur_offset)); +} + +av_cold int ff_pthread_init(void *obj, const unsigned offsets[]) +{ + const unsigned *cur_offset = offsets; + unsigned cnt = 0; + int err = 0; /* neither init loop runs when both offset lists are empty; report success instead of returning an indeterminate value */ + +#define PTHREAD_INIT_LOOP(type) \ + for (; *(++cur_offset) != THREAD_SENTINEL; cnt++) { \ + pthread_ ## type ## _t *dst = (void*)((char*)obj + *cur_offset); \ + err = pthread_ ## type ## _init(dst, NULL); \ + if (err) { \ + err = AVERROR(err); \ + goto fail; \ + } \ + } + PTHREAD_INIT_LOOP(mutex) + PTHREAD_INIT_LOOP(cond) + +fail: + *(unsigned*)((char*)obj + offsets[0]) = cnt; + return err; +} diff --git a/media/ffvpx/libavcodec/pthread_frame.c b/media/ffvpx/libavcodec/pthread_frame.c new file mode 100644 index 0000000000..d9d5afaa82 --- /dev/null +++ b/media/ffvpx/libavcodec/pthread_frame.c @@ -0,0 +1,977 @@ +/* + * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Frame multithreading support functions + * @see doc/multithreading.txt + */ + +#include "config.h" + +#include <stdatomic.h> +#include <stdint.h> + +#include "avcodec.h" +#include "codec_internal.h" +#include "decode.h" +#include "hwconfig.h" +#include "internal.h" +#include "pthread_internal.h" +#include "thread.h" +#include "threadframe.h" +#include "version_major.h" + +#include "libavutil/avassert.h" +#include "libavutil/buffer.h" +#include "libavutil/common.h" +#include "libavutil/cpu.h" +#include "libavutil/frame.h" +#include "libavutil/internal.h" +#include "libavutil/log.h" +#include "libavutil/mem.h" +#include "libavutil/opt.h" +#include "libavutil/thread.h" + +enum { + ///< Set when the thread is awaiting a packet. + STATE_INPUT_READY, + ///< Set before the codec has called ff_thread_finish_setup(). + STATE_SETTING_UP, + /** + * Set when the codec calls get_buffer(). + * State is returned to STATE_SETTING_UP afterwards. + */ + STATE_GET_BUFFER, + /** + * Set when the codec calls get_format(). + * State is returned to STATE_SETTING_UP afterwards. + */ + STATE_GET_FORMAT, + ///< Set after the codec has called ff_thread_finish_setup(). 
+ STATE_SETUP_FINISHED, +}; + +enum { + UNINITIALIZED, ///< Thread has not been created, AVCodec->close mustn't be called + NEEDS_CLOSE, ///< FFCodec->close needs to be called + INITIALIZED, ///< Thread has been properly set up +}; + +/** + * Context used by codec threads and stored in their AVCodecInternal thread_ctx. + */ +typedef struct PerThreadContext { + struct FrameThreadContext *parent; + + pthread_t thread; + int thread_init; + unsigned pthread_init_cnt;///< Number of successfully initialized mutexes/conditions + pthread_cond_t input_cond; ///< Used to wait for a new packet from the main thread. + pthread_cond_t progress_cond; ///< Used by child threads to wait for progress to change. + pthread_cond_t output_cond; ///< Used by the main thread to wait for frames to finish. + + pthread_mutex_t mutex; ///< Mutex used to protect the contents of the PerThreadContext. + pthread_mutex_t progress_mutex; ///< Mutex used to protect frame progress values and progress_cond. + + AVCodecContext *avctx; ///< Context used to decode packets passed to this thread. + + AVPacket *avpkt; ///< Input packet (for decoding) or output (for encoding). + + AVFrame *frame; ///< Output frame (for decoding) or input (for encoding). + int got_frame; ///< The output of got_picture_ptr from the last avcodec_decode_video() call. + int result; ///< The result of the last codec decode/encode() call. + + atomic_int state; + + int die; ///< Set when the thread should exit. + + int hwaccel_serializing; + int async_serializing; + + atomic_int debug_threads; ///< Set if the FF_DEBUG_THREADS option is set. +} PerThreadContext; + +/** + * Context stored in the client AVCodecInternal thread_ctx. + */ +typedef struct FrameThreadContext { + PerThreadContext *threads; ///< The contexts for each thread. + PerThreadContext *prev_thread; ///< The last thread submit_packet() was called on. 
+ + unsigned pthread_init_cnt; ///< Number of successfully initialized mutexes/conditions + pthread_mutex_t buffer_mutex; ///< Mutex used to protect get/release_buffer(). + /** + * This lock is used for ensuring threads run in serial when hwaccel + * is used. + */ + pthread_mutex_t hwaccel_mutex; + pthread_mutex_t async_mutex; + pthread_cond_t async_cond; + int async_lock; + + int next_decoding; ///< The next context to submit a packet to. + int next_finished; ///< The next context to return output from. + + int delaying; /**< + * Set for the first N packets, where N is the number of threads. + * While it is set, ff_thread_en/decode_frame won't return any results. + */ + + /* hwaccel state is temporarily stored here in order to transfer its ownership + * to the next decoding thread without the need for extra synchronization */ + const AVHWAccel *stash_hwaccel; + void *stash_hwaccel_context; + void *stash_hwaccel_priv; +} FrameThreadContext; + +static void async_lock(FrameThreadContext *fctx) +{ + pthread_mutex_lock(&fctx->async_mutex); + while (fctx->async_lock) + pthread_cond_wait(&fctx->async_cond, &fctx->async_mutex); + fctx->async_lock = 1; + pthread_mutex_unlock(&fctx->async_mutex); +} + +static void async_unlock(FrameThreadContext *fctx) +{ + pthread_mutex_lock(&fctx->async_mutex); + av_assert0(fctx->async_lock); + fctx->async_lock = 0; + pthread_cond_broadcast(&fctx->async_cond); + pthread_mutex_unlock(&fctx->async_mutex); +} + +static void thread_set_name(PerThreadContext *p) +{ + AVCodecContext *avctx = p->avctx; + int idx = p - p->parent->threads; + char name[16]; + + snprintf(name, sizeof(name), "av:%.7s:df%d", avctx->codec->name, idx); + + ff_thread_setname(name); +} + +/** + * Codec worker thread. + * + * Automatically calls ff_thread_finish_setup() if the codec does + * not provide an update_thread_context method, or if the codec returns + * before calling it. 
+ */ +static attribute_align_arg void *frame_worker_thread(void *arg) +{ + PerThreadContext *p = arg; + AVCodecContext *avctx = p->avctx; + const FFCodec *codec = ffcodec(avctx->codec); + + thread_set_name(p); + + pthread_mutex_lock(&p->mutex); + while (1) { + while (atomic_load(&p->state) == STATE_INPUT_READY && !p->die) + pthread_cond_wait(&p->input_cond, &p->mutex); + + if (p->die) break; + + if (!codec->update_thread_context) + ff_thread_finish_setup(avctx); + + /* If a decoder supports hwaccel, then it must call ff_get_format(). + * Since that call must happen before ff_thread_finish_setup(), the + * decoder is required to implement update_thread_context() and call + * ff_thread_finish_setup() manually. Therefore the above + * ff_thread_finish_setup() call did not happen and hwaccel_serializing + * cannot be true here. */ + av_assert0(!p->hwaccel_serializing); + + /* if the previous thread uses hwaccel then we take the lock to ensure + * the threads don't run concurrently */ + if (avctx->hwaccel) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } + + av_frame_unref(p->frame); + p->got_frame = 0; + p->result = codec->cb.decode(avctx, p->frame, &p->got_frame, p->avpkt); + + if ((p->result < 0 || !p->got_frame) && p->frame->buf[0]) + ff_thread_release_buffer(avctx, p->frame); + + if (atomic_load(&p->state) == STATE_SETTING_UP) + ff_thread_finish_setup(avctx); + + if (p->hwaccel_serializing) { + /* wipe hwaccel state to avoid stale pointers lying around; + * the state was transferred to FrameThreadContext in + * ff_thread_finish_setup(), so nothing is leaked */ + avctx->hwaccel = NULL; + avctx->hwaccel_context = NULL; + avctx->internal->hwaccel_priv_data = NULL; + + p->hwaccel_serializing = 0; + pthread_mutex_unlock(&p->parent->hwaccel_mutex); + } + av_assert0(!avctx->hwaccel); + + if (p->async_serializing) { + p->async_serializing = 0; + + async_unlock(p->parent); + } + + pthread_mutex_lock(&p->progress_mutex); + + 
atomic_store(&p->state, STATE_INPUT_READY); + + pthread_cond_broadcast(&p->progress_cond); + pthread_cond_signal(&p->output_cond); + pthread_mutex_unlock(&p->progress_mutex); + } + pthread_mutex_unlock(&p->mutex); + + return NULL; +} + +/** + * Update the next thread's AVCodecContext with values from the reference thread's context. + * + * @param dst The destination context. + * @param src The source context. + * @param for_user 0 if the destination is a codec thread, 1 if the destination is the user's thread + * @return 0 on success, negative error code on failure + */ +static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src, int for_user) +{ + const FFCodec *const codec = ffcodec(dst->codec); + int err = 0; + + if (dst != src && (for_user || codec->update_thread_context)) { + dst->time_base = src->time_base; + dst->framerate = src->framerate; + dst->width = src->width; + dst->height = src->height; + dst->pix_fmt = src->pix_fmt; + dst->sw_pix_fmt = src->sw_pix_fmt; + + dst->coded_width = src->coded_width; + dst->coded_height = src->coded_height; + + dst->has_b_frames = src->has_b_frames; + dst->idct_algo = src->idct_algo; + dst->properties = src->properties; + + dst->bits_per_coded_sample = src->bits_per_coded_sample; + dst->sample_aspect_ratio = src->sample_aspect_ratio; + + dst->profile = src->profile; + dst->level = src->level; + + dst->bits_per_raw_sample = src->bits_per_raw_sample; + dst->ticks_per_frame = src->ticks_per_frame; + dst->color_primaries = src->color_primaries; + + dst->color_trc = src->color_trc; + dst->colorspace = src->colorspace; + dst->color_range = src->color_range; + dst->chroma_sample_location = src->chroma_sample_location; + + dst->sample_rate = src->sample_rate; + dst->sample_fmt = src->sample_fmt; +#if FF_API_OLD_CHANNEL_LAYOUT +FF_DISABLE_DEPRECATION_WARNINGS + dst->channels = src->channels; + dst->channel_layout = src->channel_layout; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + err = 
av_channel_layout_copy(&dst->ch_layout, &src->ch_layout); + if (err < 0) + return err; + + if (!!dst->hw_frames_ctx != !!src->hw_frames_ctx || + (dst->hw_frames_ctx && dst->hw_frames_ctx->data != src->hw_frames_ctx->data)) { + av_buffer_unref(&dst->hw_frames_ctx); + + if (src->hw_frames_ctx) { + dst->hw_frames_ctx = av_buffer_ref(src->hw_frames_ctx); + if (!dst->hw_frames_ctx) + return AVERROR(ENOMEM); + } + } + + dst->hwaccel_flags = src->hwaccel_flags; + + err = av_buffer_replace(&dst->internal->pool, src->internal->pool); + if (err < 0) + return err; + } + + if (for_user) { + if (codec->update_thread_context_for_user) + err = codec->update_thread_context_for_user(dst, src); + } else { + if (codec->update_thread_context) + err = codec->update_thread_context(dst, src); + } + + return err; +} + +/** + * Update the next thread's AVCodecContext with values set by the user. + * + * @param dst The destination context. + * @param src The source context. + * @return 0 on success, negative error code on failure + */ +static int update_context_from_user(AVCodecContext *dst, AVCodecContext *src) +{ + int err; + + dst->flags = src->flags; + + dst->draw_horiz_band= src->draw_horiz_band; + dst->get_buffer2 = src->get_buffer2; + + dst->opaque = src->opaque; + dst->debug = src->debug; + + dst->slice_flags = src->slice_flags; + dst->flags2 = src->flags2; + dst->export_side_data = src->export_side_data; + + dst->skip_loop_filter = src->skip_loop_filter; + dst->skip_idct = src->skip_idct; + dst->skip_frame = src->skip_frame; + + dst->frame_num = src->frame_num; +#if FF_API_AVCTX_FRAME_NUMBER +FF_DISABLE_DEPRECATION_WARNINGS + dst->frame_number = src->frame_number; +FF_ENABLE_DEPRECATION_WARNINGS +#endif +#if FF_API_REORDERED_OPAQUE +FF_DISABLE_DEPRECATION_WARNINGS + dst->reordered_opaque = src->reordered_opaque; +FF_ENABLE_DEPRECATION_WARNINGS +#endif + + if (src->slice_count && src->slice_offset) { + if (dst->slice_count < src->slice_count) { + int err = 
av_reallocp_array(&dst->slice_offset, src->slice_count, + sizeof(*dst->slice_offset)); + if (err < 0) + return err; + } + memcpy(dst->slice_offset, src->slice_offset, + src->slice_count * sizeof(*dst->slice_offset)); + } + dst->slice_count = src->slice_count; + + av_packet_unref(dst->internal->last_pkt_props); + err = av_packet_copy_props(dst->internal->last_pkt_props, src->internal->last_pkt_props); + if (err < 0) + return err; + + return 0; +} + +static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx, + AVPacket *avpkt) +{ + FrameThreadContext *fctx = p->parent; + PerThreadContext *prev_thread = fctx->prev_thread; + const AVCodec *codec = p->avctx->codec; + int ret; + + if (!avpkt->size && !(codec->capabilities & AV_CODEC_CAP_DELAY)) + return 0; + + pthread_mutex_lock(&p->mutex); + + ret = update_context_from_user(p->avctx, user_avctx); + if (ret) { + pthread_mutex_unlock(&p->mutex); + return ret; + } + atomic_store_explicit(&p->debug_threads, + (p->avctx->debug & FF_DEBUG_THREADS) != 0, + memory_order_relaxed); + + if (prev_thread) { + int err; + if (atomic_load(&prev_thread->state) == STATE_SETTING_UP) { + pthread_mutex_lock(&prev_thread->progress_mutex); + while (atomic_load(&prev_thread->state) == STATE_SETTING_UP) + pthread_cond_wait(&prev_thread->progress_cond, &prev_thread->progress_mutex); + pthread_mutex_unlock(&prev_thread->progress_mutex); + } + + err = update_context_from_thread(p->avctx, prev_thread->avctx, 0); + if (err) { + pthread_mutex_unlock(&p->mutex); + return err; + } + } + + /* transfer the stashed hwaccel state, if any */ + av_assert0(!p->avctx->hwaccel); + FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel); + FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context); + FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); + + av_packet_unref(p->avpkt); + ret = av_packet_ref(p->avpkt, avpkt); + if (ret < 0) { + pthread_mutex_unlock(&p->mutex); + av_log(p->avctx, 
AV_LOG_ERROR, "av_packet_ref() failed in submit_packet()\n"); + return ret; + } + + atomic_store(&p->state, STATE_SETTING_UP); + pthread_cond_signal(&p->input_cond); + pthread_mutex_unlock(&p->mutex); + + fctx->prev_thread = p; + fctx->next_decoding++; + + return 0; +} + +int ff_thread_decode_frame(AVCodecContext *avctx, + AVFrame *picture, int *got_picture_ptr, + AVPacket *avpkt) +{ + FrameThreadContext *fctx = avctx->internal->thread_ctx; + int finished = fctx->next_finished; + PerThreadContext *p; + int err; + + /* release the async lock, permitting blocked hwaccel threads to + * go forward while we are in this function */ + async_unlock(fctx); + + /* + * Submit a packet to the next decoding thread. + */ + + p = &fctx->threads[fctx->next_decoding]; + err = submit_packet(p, avctx, avpkt); + if (err) + goto finish; + + /* + * If we're still receiving the initial packets, don't return a frame. + */ + + if (fctx->next_decoding > (avctx->thread_count-1-(avctx->codec_id == AV_CODEC_ID_FFV1))) + fctx->delaying = 0; + + if (fctx->delaying) { + *got_picture_ptr=0; + if (avpkt->size) { + err = avpkt->size; + goto finish; + } + } + + /* + * Return the next available frame from the oldest thread. + * If we're at the end of the stream, then we have to skip threads that + * didn't output a frame/error, because we don't want to accidentally signal + * EOF (avpkt->size == 0 && *got_picture_ptr == 0 && err >= 0). 
 + */ + + do { + p = &fctx->threads[finished++]; + + if (atomic_load(&p->state) != STATE_INPUT_READY) { + pthread_mutex_lock(&p->progress_mutex); + while (atomic_load_explicit(&p->state, memory_order_relaxed) != STATE_INPUT_READY) + pthread_cond_wait(&p->output_cond, &p->progress_mutex); + pthread_mutex_unlock(&p->progress_mutex); + } + + av_frame_move_ref(picture, p->frame); + *got_picture_ptr = p->got_frame; + picture->pkt_dts = p->avpkt->dts; + err = p->result; + + /* + * A later call with avpkt->size == 0 may loop over all threads, + * including this one, searching for a frame/error to return before being + * stopped by the "finished != fctx->next_finished" condition. + * Make sure we don't mistakenly return the same frame/error again. + */ + p->got_frame = 0; + p->result = 0; + + if (finished >= avctx->thread_count) finished = 0; + } while (!avpkt->size && !*got_picture_ptr && err >= 0 && finished != fctx->next_finished); + + update_context_from_thread(avctx, p->avctx, 1); + + if (fctx->next_decoding >= avctx->thread_count) fctx->next_decoding = 0; + + fctx->next_finished = finished; + + /* return the size of the consumed packet if no error occurred */ + if (err >= 0) + err = avpkt->size; +finish: + async_lock(fctx); + return err; +} + +void ff_thread_report_progress(ThreadFrame *f, int n, int field) +{ + PerThreadContext *p; + atomic_int *progress = f->progress ? 
(atomic_int*)f->progress->data : NULL; + + if (!progress || + atomic_load_explicit(&progress[field], memory_order_relaxed) >= n) + return; + + p = f->owner[field]->internal->thread_ctx; + + if (atomic_load_explicit(&p->debug_threads, memory_order_relaxed)) + av_log(f->owner[field], AV_LOG_DEBUG, + "%p finished %d field %d\n", progress, n, field); + + pthread_mutex_lock(&p->progress_mutex); + + atomic_store_explicit(&progress[field], n, memory_order_release); + + pthread_cond_broadcast(&p->progress_cond); + pthread_mutex_unlock(&p->progress_mutex); +} + +void ff_thread_await_progress(const ThreadFrame *f, int n, int field) +{ + PerThreadContext *p; + atomic_int *progress = f->progress ? (atomic_int*)f->progress->data : NULL; + + if (!progress || + atomic_load_explicit(&progress[field], memory_order_acquire) >= n) + return; + + p = f->owner[field]->internal->thread_ctx; + + if (atomic_load_explicit(&p->debug_threads, memory_order_relaxed)) + av_log(f->owner[field], AV_LOG_DEBUG, + "thread awaiting %d field %d from %p\n", n, field, progress); + + pthread_mutex_lock(&p->progress_mutex); + while (atomic_load_explicit(&progress[field], memory_order_relaxed) < n) + pthread_cond_wait(&p->progress_cond, &p->progress_mutex); + pthread_mutex_unlock(&p->progress_mutex); +} + +void ff_thread_finish_setup(AVCodecContext *avctx) { + PerThreadContext *p = avctx->internal->thread_ctx; + + if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; + + if (avctx->hwaccel && !p->hwaccel_serializing) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } + + /* this assumes that no hwaccel calls happen before ff_thread_finish_setup() */ + if (avctx->hwaccel && + !(avctx->hwaccel->caps_internal & HWACCEL_CAP_ASYNC_SAFE)) { + p->async_serializing = 1; + + async_lock(p->parent); + } + + /* save hwaccel state for passing to the next thread; + * this is done here so that this worker thread can wipe its own hwaccel + * state after decoding, without requiring 
synchronization */ + av_assert0(!p->parent->stash_hwaccel); + p->parent->stash_hwaccel = avctx->hwaccel; + p->parent->stash_hwaccel_context = avctx->hwaccel_context; + p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data; + + pthread_mutex_lock(&p->progress_mutex); + if(atomic_load(&p->state) == STATE_SETUP_FINISHED){ + av_log(avctx, AV_LOG_WARNING, "Multiple ff_thread_finish_setup() calls\n"); + } + + atomic_store(&p->state, STATE_SETUP_FINISHED); + + pthread_cond_broadcast(&p->progress_cond); + pthread_mutex_unlock(&p->progress_mutex); +} + +/// Waits for all threads to finish. +static void park_frame_worker_threads(FrameThreadContext *fctx, int thread_count) +{ + int i; + + async_unlock(fctx); + + for (i = 0; i < thread_count; i++) { + PerThreadContext *p = &fctx->threads[i]; + + if (atomic_load(&p->state) != STATE_INPUT_READY) { + pthread_mutex_lock(&p->progress_mutex); + while (atomic_load(&p->state) != STATE_INPUT_READY) + pthread_cond_wait(&p->output_cond, &p->progress_mutex); + pthread_mutex_unlock(&p->progress_mutex); + } + p->got_frame = 0; + } + + async_lock(fctx); +} + +#define OFF(member) offsetof(FrameThreadContext, member) +DEFINE_OFFSET_ARRAY(FrameThreadContext, thread_ctx, pthread_init_cnt, + (OFF(buffer_mutex), OFF(hwaccel_mutex), OFF(async_mutex)), + (OFF(async_cond))); +#undef OFF + +#define OFF(member) offsetof(PerThreadContext, member) +DEFINE_OFFSET_ARRAY(PerThreadContext, per_thread, pthread_init_cnt, + (OFF(progress_mutex), OFF(mutex)), + (OFF(input_cond), OFF(progress_cond), OFF(output_cond))); +#undef OFF + +void ff_frame_thread_free(AVCodecContext *avctx, int thread_count) +{ + FrameThreadContext *fctx = avctx->internal->thread_ctx; + const FFCodec *codec = ffcodec(avctx->codec); + int i; + + park_frame_worker_threads(fctx, thread_count); + + for (i = 0; i < thread_count; i++) { + PerThreadContext *p = &fctx->threads[i]; + AVCodecContext *ctx = p->avctx; + + if (ctx->internal) { + if (p->thread_init == INITIALIZED) { + 
pthread_mutex_lock(&p->mutex); + p->die = 1; + pthread_cond_signal(&p->input_cond); + pthread_mutex_unlock(&p->mutex); + + pthread_join(p->thread, NULL); + } + if (codec->close && p->thread_init != UNINITIALIZED) + codec->close(ctx); + + if (ctx->priv_data) { + if (codec->p.priv_class) + av_opt_free(ctx->priv_data); + av_freep(&ctx->priv_data); + } + + av_freep(&ctx->slice_offset); + + av_buffer_unref(&ctx->internal->pool); + av_packet_free(&ctx->internal->last_pkt_props); + av_freep(&ctx->internal); + av_buffer_unref(&ctx->hw_frames_ctx); + } + + av_frame_free(&p->frame); + + ff_pthread_free(p, per_thread_offsets); + av_packet_free(&p->avpkt); + + av_freep(&p->avctx); + } + + av_freep(&fctx->threads); + ff_pthread_free(fctx, thread_ctx_offsets); + + /* if we have stashed hwaccel state, move it to the user-facing context, + * so it will be freed in avcodec_close() */ + av_assert0(!avctx->hwaccel); + FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel); + FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context); + FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv); + + av_freep(&avctx->internal->thread_ctx); +} + +static av_cold int init_thread(PerThreadContext *p, int *threads_to_free, + FrameThreadContext *fctx, AVCodecContext *avctx, + const FFCodec *codec, int first) +{ + AVCodecContext *copy; + int err; + + atomic_init(&p->state, STATE_INPUT_READY); + + copy = av_memdup(avctx, sizeof(*avctx)); + if (!copy) + return AVERROR(ENOMEM); + copy->priv_data = NULL; + + /* From now on, this PerThreadContext will be cleaned up by + * ff_frame_thread_free in case of errors. 
*/ + (*threads_to_free)++; + + p->parent = fctx; + p->avctx = copy; + + copy->internal = av_mallocz(sizeof(*copy->internal)); + if (!copy->internal) + return AVERROR(ENOMEM); + copy->internal->thread_ctx = p; + + copy->delay = avctx->delay; + + if (codec->priv_data_size) { + copy->priv_data = av_mallocz(codec->priv_data_size); + if (!copy->priv_data) + return AVERROR(ENOMEM); + + if (codec->p.priv_class) { + *(const AVClass **)copy->priv_data = codec->p.priv_class; + err = av_opt_copy(copy->priv_data, avctx->priv_data); + if (err < 0) + return err; + } + } + + err = ff_pthread_init(p, per_thread_offsets); + if (err < 0) + return err; + + if (!(p->frame = av_frame_alloc()) || + !(p->avpkt = av_packet_alloc())) + return AVERROR(ENOMEM); + + if (!first) + copy->internal->is_copy = 1; + + copy->internal->last_pkt_props = av_packet_alloc(); + if (!copy->internal->last_pkt_props) + return AVERROR(ENOMEM); + + if (codec->init) { + err = codec->init(copy); + if (err < 0) { + if (codec->caps_internal & FF_CODEC_CAP_INIT_CLEANUP) + p->thread_init = NEEDS_CLOSE; + return err; + } + } + p->thread_init = NEEDS_CLOSE; + + if (first) + update_context_from_thread(avctx, copy, 1); + + atomic_init(&p->debug_threads, (copy->debug & FF_DEBUG_THREADS) != 0); + + err = AVERROR(pthread_create(&p->thread, NULL, frame_worker_thread, p)); + if (err < 0) + return err; + p->thread_init = INITIALIZED; + + return 0; +} + +int ff_frame_thread_init(AVCodecContext *avctx) +{ + int thread_count = avctx->thread_count; + const FFCodec *codec = ffcodec(avctx->codec); + FrameThreadContext *fctx; + int err, i = 0; + + if (!thread_count) { + int nb_cpus = av_cpu_count(); + // use number of cores + 1 as thread count if there is more than one + if (nb_cpus > 1) + thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS); + else + thread_count = avctx->thread_count = 1; + } + + if (thread_count <= 1) { + avctx->active_thread_type = 0; + return 0; + } + + avctx->internal->thread_ctx = fctx = 
av_mallocz(sizeof(FrameThreadContext)); + if (!fctx) + return AVERROR(ENOMEM); + + err = ff_pthread_init(fctx, thread_ctx_offsets); + if (err < 0) { + ff_pthread_free(fctx, thread_ctx_offsets); + av_freep(&avctx->internal->thread_ctx); + return err; + } + + fctx->async_lock = 1; + fctx->delaying = 1; + + if (codec->p.type == AVMEDIA_TYPE_VIDEO) + avctx->delay = avctx->thread_count - 1; + + fctx->threads = av_calloc(thread_count, sizeof(*fctx->threads)); + if (!fctx->threads) { + err = AVERROR(ENOMEM); + goto error; + } + + for (; i < thread_count; ) { + PerThreadContext *p = &fctx->threads[i]; + int first = !i; + + err = init_thread(p, &i, fctx, avctx, codec, first); + if (err < 0) + goto error; + } + + return 0; + +error: + ff_frame_thread_free(avctx, i); + return err; +} + +void ff_thread_flush(AVCodecContext *avctx) +{ + int i; + FrameThreadContext *fctx = avctx->internal->thread_ctx; + + if (!fctx) return; + + park_frame_worker_threads(fctx, avctx->thread_count); + if (fctx->prev_thread) { + if (fctx->prev_thread != &fctx->threads[0]) + update_context_from_thread(fctx->threads[0].avctx, fctx->prev_thread->avctx, 0); + } + + fctx->next_decoding = fctx->next_finished = 0; + fctx->delaying = 1; + fctx->prev_thread = NULL; + for (i = 0; i < avctx->thread_count; i++) { + PerThreadContext *p = &fctx->threads[i]; + // Make sure decode flush calls with size=0 won't return old frames + p->got_frame = 0; + av_frame_unref(p->frame); + p->result = 0; + + if (ffcodec(avctx->codec)->flush) + ffcodec(avctx->codec)->flush(p->avctx); + } +} + +int ff_thread_can_start_frame(AVCodecContext *avctx) +{ + PerThreadContext *p = avctx->internal->thread_ctx; + + if ((avctx->active_thread_type&FF_THREAD_FRAME) && atomic_load(&p->state) != STATE_SETTING_UP && + ffcodec(avctx->codec)->update_thread_context) { + return 0; + } + + return 1; +} + +static int thread_get_buffer_internal(AVCodecContext *avctx, AVFrame *f, int flags) +{ + PerThreadContext *p; + int err; + + if 
(!(avctx->active_thread_type & FF_THREAD_FRAME)) + return ff_get_buffer(avctx, f, flags); + + p = avctx->internal->thread_ctx; +FF_DISABLE_DEPRECATION_WARNINGS + if (atomic_load(&p->state) != STATE_SETTING_UP && + ffcodec(avctx->codec)->update_thread_context) { +FF_ENABLE_DEPRECATION_WARNINGS + av_log(avctx, AV_LOG_ERROR, "get_buffer() cannot be called after ff_thread_finish_setup()\n"); + return -1; + } + + pthread_mutex_lock(&p->parent->buffer_mutex); + err = ff_get_buffer(avctx, f, flags); + + pthread_mutex_unlock(&p->parent->buffer_mutex); + + return err; +} + +int ff_thread_get_buffer(AVCodecContext *avctx, AVFrame *f, int flags) +{ + int ret = thread_get_buffer_internal(avctx, f, flags); + if (ret < 0) + av_log(avctx, AV_LOG_ERROR, "thread_get_buffer() failed\n"); + return ret; +} + +int ff_thread_get_ext_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags) +{ + int ret; + + f->owner[0] = f->owner[1] = avctx; + /* Hint: It is possible for this function to be called with codecs + * that don't support frame threading at all, namely in case + * a frame-threaded decoder shares code with codecs that are not. + * This currently affects non-MPEG-4 mpegvideo codecs and VP7. + * The following check will always be true for them. 
*/ + if (!(avctx->active_thread_type & FF_THREAD_FRAME)) + return ff_get_buffer(avctx, f->f, flags); + + if (ffcodec(avctx->codec)->caps_internal & FF_CODEC_CAP_ALLOCATE_PROGRESS) { + atomic_int *progress; + f->progress = av_buffer_alloc(2 * sizeof(*progress)); + if (!f->progress) { + return AVERROR(ENOMEM); + } + progress = (atomic_int*)f->progress->data; + + atomic_init(&progress[0], -1); + atomic_init(&progress[1], -1); + } + + ret = ff_thread_get_buffer(avctx, f->f, flags); + if (ret) + av_buffer_unref(&f->progress); + return ret; +} + +void ff_thread_release_buffer(AVCodecContext *avctx, AVFrame *f) +{ + if (!f) + return; + + if (avctx->debug & FF_DEBUG_BUFFERS) + av_log(avctx, AV_LOG_DEBUG, "thread_release_buffer called on pic %p\n", f); + + av_frame_unref(f); +} + +void ff_thread_release_ext_buffer(AVCodecContext *avctx, ThreadFrame *f) +{ + av_buffer_unref(&f->progress); + f->owner[0] = f->owner[1] = NULL; + ff_thread_release_buffer(avctx, f->f); +} diff --git a/media/ffvpx/libavcodec/pthread_internal.h b/media/ffvpx/libavcodec/pthread_internal.h new file mode 100644 index 0000000000..d0b6a7a673 --- /dev/null +++ b/media/ffvpx/libavcodec/pthread_internal.h @@ -0,0 +1,66 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PTHREAD_INTERNAL_H +#define AVCODEC_PTHREAD_INTERNAL_H + +#include "avcodec.h" + +/* H.264 slice threading seems to be buggy with more than 16 threads, + * limit the number of threads to 16 for automatic detection */ +#define MAX_AUTO_THREADS 16 + +int ff_slice_thread_init(AVCodecContext *avctx); +void ff_slice_thread_free(AVCodecContext *avctx); + +int ff_frame_thread_init(AVCodecContext *avctx); +void ff_frame_thread_free(AVCodecContext *avctx, int thread_count); + +#define THREAD_SENTINEL 0 // This forbids putting a mutex/condition variable at the front. +/** + * Initialize/destroy a list of mutexes/conditions contained in a structure. + * The positions of these mutexes/conditions in the structure are given by + * their offsets. Because it is undefined behaviour to destroy + * an uninitialized mutex/condition, ff_pthread_init() stores the number + * of successfully initialized mutexes and conditions in the object itself + * and ff_pthread_free() uses this number to destroy exactly the mutexes and + * condition variables that have been successfully initialized. + * + * @param obj The object containing the mutexes/conditions. + * @param[in] offsets An array of offsets. Its first member gives the offset + * of the variable that contains the count of successfully + * initialized mutexes/condition variables; said variable + * must be an unsigned int. Two arrays of offsets, each + * delimited by a THREAD_SENTINEL follow. The first + * contains the offsets of all the mutexes, the second + * contains the offsets of all the condition variables. + */ +int ff_pthread_init(void *obj, const unsigned offsets[]); +void ff_pthread_free(void *obj, const unsigned offsets[]); + +/** + * Macros to help creating the above lists. 
mutexes and conds need + * to be parentheses-enclosed lists of offsets in the containing structure. + */ +#define OFFSET_ARRAY(...) __VA_ARGS__, THREAD_SENTINEL +#define DEFINE_OFFSET_ARRAY(type, name, cnt_variable, mutexes, conds) \ +static const unsigned name ## _offsets[] = { offsetof(type, cnt_variable), \ + OFFSET_ARRAY mutexes, \ + OFFSET_ARRAY conds } + +#endif // AVCODEC_PTHREAD_INTERNAL_H diff --git a/media/ffvpx/libavcodec/pthread_slice.c b/media/ffvpx/libavcodec/pthread_slice.c new file mode 100644 index 0000000000..a4d31c6f4d --- /dev/null +++ b/media/ffvpx/libavcodec/pthread_slice.c @@ -0,0 +1,260 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Slice multithreading support functions + * @see doc/multithreading.txt + */ + +#include "config.h" + +#include "avcodec.h" +#include "codec_internal.h" +#include "internal.h" +#include "pthread_internal.h" +#include "thread.h" + +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/thread.h" +#include "libavutil/slicethread.h" + +typedef int (action_func)(AVCodecContext *c, void *arg); +typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr); +typedef int (main_func)(AVCodecContext *c); + +typedef struct Progress { + pthread_cond_t cond; + pthread_mutex_t mutex; +} Progress; + +typedef struct SliceThreadContext { + AVSliceThread *thread; + action_func *func; + action_func2 *func2; + main_func *mainfunc; + void *args; + int *rets; + int job_size; + + int *entries; + int entries_count; + int thread_count; + Progress *progress; +} SliceThreadContext; + +static void main_function(void *priv) { + AVCodecContext *avctx = priv; + SliceThreadContext *c = avctx->internal->thread_ctx; + c->mainfunc(avctx); +} + +static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int nb_threads) +{ + AVCodecContext *avctx = priv; + SliceThreadContext *c = avctx->internal->thread_ctx; + int ret; + + ret = c->func ? 
c->func(avctx, (char *)c->args + c->job_size * jobnr) + : c->func2(avctx, c->args, jobnr, threadnr); + if (c->rets) + c->rets[jobnr] = ret; +} + +void ff_slice_thread_free(AVCodecContext *avctx) +{ + SliceThreadContext *c = avctx->internal->thread_ctx; + int i; + + avpriv_slicethread_free(&c->thread); + + for (i = 0; i < c->thread_count; i++) { + Progress *const progress = &c->progress[i]; + pthread_mutex_destroy(&progress->mutex); + pthread_cond_destroy(&progress->cond); + } + + av_freep(&c->entries); + av_freep(&c->progress); + av_freep(&avctx->internal->thread_ctx); +} + +static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size) +{ + SliceThreadContext *c = avctx->internal->thread_ctx; + + if (!(avctx->active_thread_type&FF_THREAD_SLICE) || avctx->thread_count <= 1) + return avcodec_default_execute(avctx, func, arg, ret, job_count, job_size); + + if (job_count <= 0) + return 0; + + c->job_size = job_size; + c->args = arg; + c->func = func; + c->rets = ret; + + avpriv_slicethread_execute(c->thread, job_count, !!c->mainfunc ); + return 0; +} + +static int thread_execute2(AVCodecContext *avctx, action_func2* func2, void *arg, int *ret, int job_count) +{ + SliceThreadContext *c = avctx->internal->thread_ctx; + c->func2 = func2; + return thread_execute(avctx, NULL, arg, ret, job_count, 0); +} + +int ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx, action_func2* func2, main_func *mainfunc, void *arg, int *ret, int job_count) +{ + SliceThreadContext *c = avctx->internal->thread_ctx; + c->func2 = func2; + c->mainfunc = mainfunc; + return thread_execute(avctx, NULL, arg, ret, job_count, 0); +} + +int ff_slice_thread_init(AVCodecContext *avctx) +{ + SliceThreadContext *c; + int thread_count = avctx->thread_count; + void (*mainfunc)(void *); + + // We cannot do this in the encoder init as the threads are created before + if (av_codec_is_encoder(avctx->codec) && + avctx->codec_id == 
AV_CODEC_ID_MPEG1VIDEO && + avctx->height > 2800) + thread_count = avctx->thread_count = 1; + + if (!thread_count) { + int nb_cpus = av_cpu_count(); + if (avctx->height) + nb_cpus = FFMIN(nb_cpus, (avctx->height+15)/16); + // use number of cores + 1 as thread count if there is more than one + if (nb_cpus > 1) + thread_count = avctx->thread_count = FFMIN(nb_cpus + 1, MAX_AUTO_THREADS); + else + thread_count = avctx->thread_count = 1; + } + + if (thread_count <= 1) { + avctx->active_thread_type = 0; + return 0; + } + + avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c)); + mainfunc = ffcodec(avctx->codec)->caps_internal & FF_CODEC_CAP_SLICE_THREAD_HAS_MF ? &main_function : NULL; + if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, mainfunc, thread_count)) <= 1) { + if (c) + avpriv_slicethread_free(&c->thread); + av_freep(&avctx->internal->thread_ctx); + avctx->thread_count = 1; + avctx->active_thread_type = 0; + return 0; + } + avctx->thread_count = thread_count; + + avctx->execute = thread_execute; + avctx->execute2 = thread_execute2; + return 0; +} + +int av_cold ff_slice_thread_init_progress(AVCodecContext *avctx) +{ + SliceThreadContext *const p = avctx->internal->thread_ctx; + int err, i = 0, thread_count = avctx->thread_count; + + p->progress = av_calloc(thread_count, sizeof(*p->progress)); + if (!p->progress) { + err = AVERROR(ENOMEM); + goto fail; + } + + for (; i < thread_count; i++) { + Progress *const progress = &p->progress[i]; + err = pthread_mutex_init(&progress->mutex, NULL); + if (err) { + err = AVERROR(err); + goto fail; + } + err = pthread_cond_init (&progress->cond, NULL); + if (err) { + err = AVERROR(err); + pthread_mutex_destroy(&progress->mutex); + goto fail; + } + } + err = 0; +fail: + p->thread_count = i; + return err; +} + +void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n) +{ + SliceThreadContext *p = avctx->internal->thread_ctx; + Progress *const progress = 
&p->progress[thread]; + int *entries = p->entries; + + pthread_mutex_lock(&progress->mutex); + entries[field] +=n; + pthread_cond_signal(&progress->cond); + pthread_mutex_unlock(&progress->mutex); +} + +void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift) +{ + SliceThreadContext *p = avctx->internal->thread_ctx; + Progress *progress; + int *entries = p->entries; + + if (!entries || !field) return; + + thread = thread ? thread - 1 : p->thread_count - 1; + progress = &p->progress[thread]; + + pthread_mutex_lock(&progress->mutex); + while ((entries[field - 1] - entries[field]) < shift){ + pthread_cond_wait(&progress->cond, &progress->mutex); + } + pthread_mutex_unlock(&progress->mutex); +} + +int ff_slice_thread_allocz_entries(AVCodecContext *avctx, int count) +{ + if (avctx->active_thread_type & FF_THREAD_SLICE) { + SliceThreadContext *p = avctx->internal->thread_ctx; + + if (p->entries_count == count) { + memset(p->entries, 0, p->entries_count * sizeof(*p->entries)); + return 0; + } + av_freep(&p->entries); + + p->entries = av_calloc(count, sizeof(*p->entries)); + if (!p->entries) { + p->entries_count = 0; + return AVERROR(ENOMEM); + } + p->entries_count = count; + } + + return 0; +} diff --git a/media/ffvpx/libavcodec/put_bits.h b/media/ffvpx/libavcodec/put_bits.h new file mode 100644 index 0000000000..4561dc131a --- /dev/null +++ b/media/ffvpx/libavcodec/put_bits.h @@ -0,0 +1,428 @@ +/* + * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * bitstream writer API + */ + +#ifndef AVCODEC_PUT_BITS_H +#define AVCODEC_PUT_BITS_H + +#include <stdint.h> +#include <stddef.h> + +#include "config.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/avassert.h" +#include "libavutil/common.h" + +#if ARCH_X86_64 +// TODO: Benchmark and optionally enable on other 64-bit architectures. +typedef uint64_t BitBuf; +#define AV_WBBUF AV_WB64 +#define AV_WLBUF AV_WL64 +#else +typedef uint32_t BitBuf; +#define AV_WBBUF AV_WB32 +#define AV_WLBUF AV_WL32 +#endif + +static const int BUF_BITS = 8 * sizeof(BitBuf); + +typedef struct PutBitContext { + BitBuf bit_buf; + int bit_left; + uint8_t *buf, *buf_ptr, *buf_end; +} PutBitContext; + +/** + * Initialize the PutBitContext s. + * + * @param buffer the buffer where to put bits + * @param buffer_size the size in bytes of buffer + */ +static inline void init_put_bits(PutBitContext *s, uint8_t *buffer, + int buffer_size) +{ + if (buffer_size < 0) { + buffer_size = 0; + buffer = NULL; + } + + s->buf = buffer; + s->buf_end = s->buf + buffer_size; + s->buf_ptr = s->buf; + s->bit_left = BUF_BITS; + s->bit_buf = 0; +} + +/** + * @return the total number of bits written to the bitstream. + */ +static inline int put_bits_count(PutBitContext *s) +{ + return (s->buf_ptr - s->buf) * 8 + BUF_BITS - s->bit_left; +} + +/** + * @return the number of bytes output so far; may only be called + * when the PutBitContext is freshly initialized or flushed. 
+ */
+static inline int put_bytes_output(const PutBitContext *s)
+{
+    av_assert2(s->bit_left == BUF_BITS); /* i.e. no bits are pending in bit_buf */
+    return s->buf_ptr - s->buf;
+}
+
+/**
+ * Count the bytes written so far: the bytes already stored between buf and
+ * buf_ptr plus the bits still pending in bit_buf (BUF_BITS - bit_left),
+ * converted to bytes.
+ *
+ * @param s        the bit writer to query
+ * @param round_up When set, the number of bits written so far will be
+ *                 rounded up to the next byte.
+ * @return the number of bytes output so far.
+ */
+static inline int put_bytes_count(const PutBitContext *s, int round_up)
+{
+    return s->buf_ptr - s->buf + ((BUF_BITS - s->bit_left + (round_up ? 7 : 0)) >> 3);
+}
+
+/**
+ * Rebase the bit writer onto a reallocated buffer.
+ *
+ * The write position is kept at the same byte offset from the start of the
+ * new buffer; bit_buf and bit_left are not touched.
+ *
+ * @param s           the bit writer to rebase
+ * @param buffer      the buffer where to put bits
+ * @param buffer_size the size in bytes of buffer,
+ *                    must be large enough to hold everything written so far
+ *                    (checked with av_assert0)
+ */
+static inline void rebase_put_bits(PutBitContext *s, uint8_t *buffer,
+                                   int buffer_size)
+{
+    av_assert0(8*buffer_size >= put_bits_count(s));
+
+    s->buf_end = buffer + buffer_size;
+    s->buf_ptr = buffer + (s->buf_ptr - s->buf); /* reads the old s->buf, so it must come before the next line */
+    s->buf = buffer;
+}
+
+/**
+ * Computed as the space between buf_ptr and buf_end, in bits, minus the
+ * bits already pending in bit_buf (BUF_BITS - bit_left).
+ *
+ * @param s the bit writer to query
+ * @return the number of bits available in the bitstream.
+ */
+static inline int put_bits_left(PutBitContext* s)
+{
+    return (s->buf_end - s->buf_ptr) * 8 - BUF_BITS + s->bit_left;
+}
+
+/**
+ * Computed as the bytes between buf_ptr and buf_end minus the bytes needed
+ * for the bits still pending in bit_buf.
+ *
+ * @param s        the bit writer to query
+ * @param round_up When set, the number of bits written will be
+ *                 rounded up to the next byte.
+ * @return the number of bytes left.
+ */
+static inline int put_bytes_left(const PutBitContext *s, int round_up)
+{
+    return s->buf_end - s->buf_ptr - ((BUF_BITS - s->bit_left + (round_up ? 7 : 0)) >> 3);
+}
+
+/**
+ * Pad the end of the output stream with zeros.
+ */
+static inline void flush_put_bits(PutBitContext *s)
+{
+#ifndef BITSTREAM_WRITER_LE
+    if (s->bit_left < BUF_BITS)
+        s->bit_buf <<= s->bit_left; /* big-endian writer: move the pending bits to the top of bit_buf */
+#endif
+    while (s->bit_left < BUF_BITS) { /* one byte per pass until no pending bits remain */
+        av_assert0(s->buf_ptr < s->buf_end);
+#ifdef BITSTREAM_WRITER_LE
+        *s->buf_ptr++ = s->bit_buf; /* lowest byte first */
+        s->bit_buf >>= 8;
+#else
+        *s->buf_ptr++ = s->bit_buf >> (BUF_BITS - 8); /* highest byte first */
+        s->bit_buf <<= 8;
+#endif
+        s->bit_left += 8;
+    }
+    s->bit_left = BUF_BITS; /* same empty state that init_put_bits() sets up */
+    s->bit_buf = 0;
+}
+/**
+ * Little-endian flush: store the bits pending in bit_buf one byte at a time,
+ * least-significant byte first, then reset the writer to the empty state
+ * (bit_left = BUF_BITS, bit_buf = 0).
+ *
+ * Unlike flush_put_bits(), the byte order used here does not depend on
+ * BITSTREAM_WRITER_LE. Each stored byte is guarded by
+ * av_assert0(buf_ptr < buf_end).
+ */
+static inline void flush_put_bits_le(PutBitContext *s)
+{
+    while (s->bit_left < BUF_BITS) {
+        av_assert0(s->buf_ptr < s->buf_end);
+        *s->buf_ptr++ = s->bit_buf;
+        s->bit_buf >>= 8;
+        s->bit_left += 8;
+    }
+    s->bit_left = BUF_BITS;
+    s->bit_buf = 0;
+}
+
+#ifdef BITSTREAM_WRITER_LE /* NOTE(review): the two names below are remapped to identifiers not declared in this header, presumably so any use fails to build with the LE writer; confirm */
+#define ff_put_string ff_put_string_unsupported_here
+#define ff_copy_bits ff_copy_bits_unsupported_here
+#else
+
+/**
+ * Put the string string in the bitstream.
+ *
+ * @param terminate_string 0-terminates the written string if value is 1
+ */
+void ff_put_string(PutBitContext *pb, const char *string,
+                   int terminate_string);
+
+/**
+ * Copy the content of src to the bitstream.
+ *
+ * @param length the number of bits of src to copy
+ */
+void ff_copy_bits(PutBitContext *pb, const uint8_t *src, int length);
+#endif
+/**
+ * Append n bits holding value to the writer without checking n or value.
+ *
+ * Nothing here validates the arguments; put_bits() asserts n <= 31 and
+ * value < 2^n before calling this function.
+ *
+ * Bits accumulate in a local copy of bit_buf. Once the accumulator is full,
+ * one whole BitBuf word is stored at buf_ptr (with AV_WLBUF when
+ * BITSTREAM_WRITER_LE is defined, AV_WBBUF otherwise) and buf_ptr advances
+ * by sizeof(BitBuf). If fewer than sizeof(BitBuf) bytes remain before
+ * buf_end, nothing is stored: an error is logged and av_assert2(0) is
+ * evaluated, while bit_buf and bit_left are still updated as if the word
+ * had been written.
+ *
+ * @param s     the bit writer
+ * @param n     number of bits to append
+ * @param value the bits to append
+ */
+static inline void put_bits_no_assert(PutBitContext *s, int n, BitBuf value)
+{
+    BitBuf bit_buf;
+    int bit_left;
+
+    bit_buf = s->bit_buf;
+    bit_left = s->bit_left;
+
+    /* XXX: optimize */
+#ifdef BITSTREAM_WRITER_LE
+    bit_buf |= value << (BUF_BITS - bit_left); /* place value above the bits already pending */
+    if (n >= bit_left) {
+        if (s->buf_end - s->buf_ptr >= sizeof(BitBuf)) {
+            AV_WLBUF(s->buf_ptr, bit_buf);
+            s->buf_ptr += sizeof(BitBuf);
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
+        bit_buf = value >> bit_left; /* the bits of value that did not fit in the stored word */
+        bit_left += BUF_BITS;
+    }
+    bit_left -= n;
+#else
+    if (n < bit_left) {
+        bit_buf = (bit_buf << n) | value; /* enough room: shift value in below the pending bits */
+        bit_left -= n;
+    } else {
+        bit_buf <<= bit_left;
+        bit_buf |= value >> (n - bit_left); /* the top bit_left bits of value complete the word */
+        if (s->buf_end - s->buf_ptr >= sizeof(BitBuf)) {
+            AV_WBBUF(s->buf_ptr, bit_buf);
+            s->buf_ptr += sizeof(BitBuf);
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
+        bit_left += BUF_BITS - n;
+        bit_buf = value; /* only the low bits of value that did not fit remain pending */
+    }
+#endif
+
+    s->bit_buf = bit_buf;
+    s->bit_left = bit_left;
+}
+
+/**
+ * Write up to 31 bits into a bitstream.
+ * Use put_bits32 to write 32 bits.
+ */
+static inline void put_bits(PutBitContext *s, int n, BitBuf value)
+{
+    av_assert2(n <= 31 && value < (1UL << n)); /* value must fit in n bits */
+    put_bits_no_assert(s, n, value);
+}
+/**
+ * Write up to 31 bits into a bitstream using little-endian bit packing.
+ *
+ * Performs the same steps as the BITSTREAM_WRITER_LE branch of
+ * put_bits_no_assert(), but is compiled regardless of whether that macro
+ * is defined. The same n <= 31 and value < 2^n assertion as put_bits()
+ * applies.
+ *
+ * @param s     the bit writer
+ * @param n     number of bits to write
+ * @param value the bits to write
+ */
+static inline void put_bits_le(PutBitContext *s, int n, BitBuf value)
+{
+    BitBuf bit_buf;
+    int bit_left;
+
+    av_assert2(n <= 31 && value < (1UL << n));
+
+    bit_buf = s->bit_buf;
+    bit_left = s->bit_left;
+
+    bit_buf |= value << (BUF_BITS - bit_left); /* place value above the bits already pending */
+    if (n >= bit_left) {
+        if (s->buf_end - s->buf_ptr >= sizeof(BitBuf)) {
+            AV_WLBUF(s->buf_ptr, bit_buf);
+            s->buf_ptr += sizeof(BitBuf);
+        } else {
+            av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+            av_assert2(0);
+        }
+        bit_buf = value >> bit_left; /* the bits of value that did not fit in the stored word */
+        bit_left += BUF_BITS;
+    }
+    bit_left -= n;
+
+    s->bit_buf = bit_buf;
+    s->bit_left = bit_left;
+}
+/**
+ * Write a signed value as an n-bit field; 0 <= n <= 31 is asserted.
+ *
+ * The value is reduced with av_mod_uintp2(value, n) and passed to put_bits().
+ * NOTE(review): av_mod_uintp2() is defined elsewhere; presumably it keeps
+ * the n low bits of value. Confirm against libavutil.
+ */
+static inline void put_sbits(PutBitContext *pb, int n, int32_t value)
+{
+    av_assert2(n >= 0 && n <= 31);
+
+    put_bits(pb, n, av_mod_uintp2(value, n));
+}
+
+/**
+ * Write exactly 32 bits into a bitstream.
+ */
+static void av_unused put_bits32(PutBitContext *s, uint32_t value)
+{
+    BitBuf bit_buf;
+    int bit_left;
+
+    if (BUF_BITS > 32) { /* accumulator wider than 32 bits: hand all 32 bits to the generic path */
+        put_bits_no_assert(s, 32, value);
+        return;
+    }
+
+    bit_buf = s->bit_buf;
+    bit_left = s->bit_left;
+
+#ifdef BITSTREAM_WRITER_LE
+    bit_buf |= (BitBuf)value << (BUF_BITS - bit_left);
+    if (s->buf_end - s->buf_ptr >= sizeof(BitBuf)) {
+        AV_WLBUF(s->buf_ptr, bit_buf);
+        s->buf_ptr += sizeof(BitBuf);
+    } else {
+        av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+        av_assert2(0);
+    }
+    bit_buf = (uint64_t)value >> bit_left; /* NOTE(review): the 64-bit cast presumably keeps this shift defined when bit_left == 32; confirm */
+#else
+    bit_buf = (uint64_t)bit_buf << bit_left; /* NOTE(review): same presumed reason for the 64-bit cast; confirm */
+    bit_buf |= (BitBuf)value >> (BUF_BITS - bit_left);
+    if (s->buf_end - s->buf_ptr >= sizeof(BitBuf)) {
+        AV_WBBUF(s->buf_ptr, bit_buf);
+        s->buf_ptr += sizeof(BitBuf);
+    } else {
+        av_log(NULL, AV_LOG_ERROR, "Internal error, put_bits buffer too small\n");
+        av_assert2(0);
+    }
+    bit_buf = value;
+#endif
+
+    s->bit_buf = bit_buf;
+    s->bit_left = bit_left; /* unchanged on this path: bit_left is never modified above */
+}
+
+/**
+ * Write up to 64 bits into a bitstream.
+ *
+ * n < 32 is forwarded to put_bits() and n == 32 to put_bits32(). Larger n
+ * is split into two 32-bit halves: the high half is written first, or the
+ * low half first when BITSTREAM_WRITER_LE is defined. For n == 64 the value
+ * is unrestricted; otherwise the assertion requires value < 2^n.
+ *
+ * @param s     the bit writer
+ * @param n     number of bits to write
+ * @param value the bits to write
+ */
+static inline void put_bits64(PutBitContext *s, int n, uint64_t value)
+{
+    av_assert2((n == 64) || (n < 64 && value < (UINT64_C(1) << n)));
+
+    if (n < 32)
+        put_bits(s, n, value);
+    else if (n == 32)
+        put_bits32(s, value);
+    else if (n < 64) {
+        uint32_t lo = value & 0xffffffff;
+        uint32_t hi = value >> 32;
+#ifdef BITSTREAM_WRITER_LE
+        put_bits32(s, lo);
+        put_bits(s, n - 32, hi);
+#else
+        put_bits(s, n - 32, hi);
+        put_bits32(s, lo);
+#endif
+    } else {
+        uint32_t lo = value & 0xffffffff;
+        uint32_t hi = value >> 32;
+#ifdef BITSTREAM_WRITER_LE
+        put_bits32(s, lo);
+        put_bits32(s, hi);
+#else
+        put_bits32(s, hi);
+        put_bits32(s, lo);
+#endif
+
+    }
+}
+/**
+ * Write a signed value as an n-bit field; 0 <= n < 64 is asserted.
+ *
+ * The value is masked to its n low bits and passed to put_bits64().
+ */
+static inline void put_sbits63(PutBitContext *pb, int n, int64_t value)
+{
+    av_assert2(n >= 0 && n < 64);
+
+    put_bits64(pb, n, (uint64_t)(value) & (~(UINT64_MAX << n)));
+}
+
+/**
+ * Return the pointer to the byte where the bitstream writer will put
+ * the next bit.
+ */ +static inline uint8_t *put_bits_ptr(PutBitContext *s) +{ + return s->buf_ptr; +} + +/** + * Skip the given number of bytes. + * PutBitContext must be flushed & aligned to a byte boundary before calling this. + */ +static inline void skip_put_bytes(PutBitContext *s, int n) +{ + av_assert2((put_bits_count(s) & 7) == 0); + av_assert2(s->bit_left == BUF_BITS); + av_assert0(n <= s->buf_end - s->buf_ptr); + s->buf_ptr += n; +} + +/** + * Skip the given number of bits. + * Must only be used if the actual values in the bitstream do not matter. + * If n is < 0 the behavior is undefined. + */ +static inline void skip_put_bits(PutBitContext *s, int n) +{ + unsigned bits = BUF_BITS - s->bit_left + n; + s->buf_ptr += sizeof(BitBuf) * (bits / BUF_BITS); + s->bit_left = BUF_BITS - (bits & (BUF_BITS - 1)); +} + +/** + * Change the end of the buffer. + * + * @param size the new size in bytes of the buffer where to put bits + */ +static inline void set_put_bits_buffer_size(PutBitContext *s, int size) +{ + av_assert0(size <= INT_MAX/8 - BUF_BITS); + s->buf_end = s->buf + size; +} + +/** + * Pad the bitstream with zeros up to the next byte boundary. + */ +static inline void align_put_bits(PutBitContext *s) +{ + put_bits(s, s->bit_left & 7, 0); +} + +#undef AV_WBBUF +#undef AV_WLBUF + +#endif /* AVCODEC_PUT_BITS_H */ diff --git a/media/ffvpx/libavcodec/qpeldsp.h b/media/ffvpx/libavcodec/qpeldsp.h new file mode 100644 index 0000000000..91019eda9c --- /dev/null +++ b/media/ffvpx/libavcodec/qpeldsp.h @@ -0,0 +1,83 @@ +/* + * quarterpel DSP functions + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * quarterpel DSP functions + */ + +#ifndef AVCODEC_QPELDSP_H +#define AVCODEC_QPELDSP_H + +#include <stddef.h> +#include <stdint.h> + +void ff_put_pixels8x8_c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +void ff_avg_pixels8x8_c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +void ff_put_pixels16x16_c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +void ff_avg_pixels16x16_c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + +void ff_put_pixels8_l2_8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, + int dst_stride, int src_stride1, int src_stride2, + int h); + +#define DEF_OLD_QPEL(name) \ +void ff_put_ ## name(uint8_t *dst /* align width (8 or 16) */, \ + const uint8_t *src /* align 1 */, \ + ptrdiff_t stride); \ +void ff_put_no_rnd_ ## name(uint8_t *dst /* align width (8 or 16) */, \ + const uint8_t *src /* align 1 */, \ + ptrdiff_t stride); \ +void ff_avg_ ## name(uint8_t *dst /* align width (8 or 16) */, \ + const uint8_t *src /* align 1 */, \ + ptrdiff_t stride); + +DEF_OLD_QPEL(qpel16_mc11_old_c) +DEF_OLD_QPEL(qpel16_mc31_old_c) +DEF_OLD_QPEL(qpel16_mc12_old_c) +DEF_OLD_QPEL(qpel16_mc32_old_c) +DEF_OLD_QPEL(qpel16_mc13_old_c) +DEF_OLD_QPEL(qpel16_mc33_old_c) +DEF_OLD_QPEL(qpel8_mc11_old_c) +DEF_OLD_QPEL(qpel8_mc31_old_c) +DEF_OLD_QPEL(qpel8_mc12_old_c) +DEF_OLD_QPEL(qpel8_mc32_old_c) +DEF_OLD_QPEL(qpel8_mc13_old_c) +DEF_OLD_QPEL(qpel8_mc33_old_c) + +typedef void (*qpel_mc_func)(uint8_t *dst /* align width (8 or 16) */, + const uint8_t *src 
/* align 1 */, + ptrdiff_t stride); + +/** + * quarterpel DSP context + */ +typedef struct QpelDSPContext { + qpel_mc_func put_qpel_pixels_tab[2][16]; + qpel_mc_func avg_qpel_pixels_tab[2][16]; + qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16]; +} QpelDSPContext; + +void ff_qpeldsp_init(QpelDSPContext *c); + +void ff_qpeldsp_init_x86(QpelDSPContext *c); +void ff_qpeldsp_init_mips(QpelDSPContext *c); + +#endif /* AVCODEC_QPELDSP_H */ diff --git a/media/ffvpx/libavcodec/qsv_api.c b/media/ffvpx/libavcodec/qsv_api.c new file mode 100644 index 0000000000..327ff7d813 --- /dev/null +++ b/media/ffvpx/libavcodec/qsv_api.c @@ -0,0 +1,42 @@ +/* + * Intel MediaSDK QSV public API functions + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include <stddef.h> + +#include "libavutil/mem.h" + +#if CONFIG_QSV +#include "qsv.h" + +AVQSVContext *av_qsv_alloc_context(void) +{ + return av_mallocz(sizeof(AVQSVContext)); +} +#else + +struct AVQSVContext *av_qsv_alloc_context(void); + +struct AVQSVContext *av_qsv_alloc_context(void) +{ + return NULL; +} +#endif diff --git a/media/ffvpx/libavcodec/ratecontrol.h b/media/ffvpx/libavcodec/ratecontrol.h new file mode 100644 index 0000000000..4de80fad90 --- /dev/null +++ b/media/ffvpx/libavcodec/ratecontrol.h @@ -0,0 +1,96 @@ +/* + * Ratecontrol + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RATECONTROL_H +#define AVCODEC_RATECONTROL_H + +/** + * @file + * ratecontrol header. 
+ */ + +#include <stdio.h> +#include <stdint.h> +#include "libavutil/eval.h" + +typedef struct Predictor{ + double coeff; + double count; + double decay; +} Predictor; + +typedef struct RateControlEntry{ + int pict_type; + float qscale; + int mv_bits; + int i_tex_bits; + int p_tex_bits; + int misc_bits; + int header_bits; + uint64_t expected_bits; + int new_pict_type; + float new_qscale; + int64_t mc_mb_var_sum; + int64_t mb_var_sum; + int i_count; + int skip_count; + int f_code; + int b_code; +}RateControlEntry; + +/** + * rate control context. + */ +typedef struct RateControlContext{ + int num_entries; ///< number of RateControlEntries + RateControlEntry *entry; + double buffer_index; ///< amount of bits in the video/audio buffer + Predictor pred[5]; + double short_term_qsum; ///< sum of recent qscales + double short_term_qcount; ///< count of recent qscales + double pass1_rc_eq_output_sum;///< sum of the output of the rc equation, this is used for normalization + double pass1_wanted_bits; ///< bits which should have been output by the pass1 code (including complexity init) + double last_qscale; + double last_qscale_for[5]; ///< last qscale for a specific pict type, used for max_diff & ipb factor stuff + int64_t last_mc_mb_var_sum; + int64_t last_mb_var_sum; + uint64_t i_cplx_sum[5]; + uint64_t p_cplx_sum[5]; + uint64_t mv_bits_sum[5]; + uint64_t qscale_sum[5]; + int frame_count[5]; + int last_non_b_pict_type; + + AVExpr * rc_eq_eval; +}RateControlContext; + +struct MpegEncContext; + +/* rate control */ +int ff_rate_control_init(struct MpegEncContext *s); +float ff_rate_estimate_qscale(struct MpegEncContext *s, int dry_run); +void ff_write_pass1_stats(struct MpegEncContext *s); +void ff_rate_control_uninit(struct MpegEncContext *s); +int ff_vbv_update(struct MpegEncContext *s, int frame_size); +void ff_get_2pass_fcode(struct MpegEncContext *s); + +#endif /* AVCODEC_RATECONTROL_H */ diff --git a/media/ffvpx/libavcodec/raw.c b/media/ffvpx/libavcodec/raw.c new file 
mode 100644 index 0000000000..1e5b48d1e0 --- /dev/null +++ b/media/ffvpx/libavcodec/raw.c @@ -0,0 +1,370 @@ +/* + * Raw Video Codec + * Copyright (c) 2001 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Raw Video Codec + */ + +#include "libavutil/macros.h" +#include "avcodec.h" +#include "raw.h" + +static const PixelFormatTag raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV420P, MKTAG('I', '4', '2', '0') }, /* Planar formats */ + { AV_PIX_FMT_YUV420P, MKTAG('I', 'Y', 'U', 'V') }, + { AV_PIX_FMT_YUV420P, MKTAG('y', 'v', '1', '2') }, + { AV_PIX_FMT_YUV420P, MKTAG('Y', 'V', '1', '2') }, + { AV_PIX_FMT_YUV410P, MKTAG('Y', 'U', 'V', '9') }, + { AV_PIX_FMT_YUV410P, MKTAG('Y', 'V', 'U', '9') }, + { AV_PIX_FMT_YUV411P, MKTAG('Y', '4', '1', 'B') }, + { AV_PIX_FMT_YUV422P, MKTAG('Y', '4', '2', 'B') }, + { AV_PIX_FMT_YUV422P, MKTAG('P', '4', '2', '2') }, + { AV_PIX_FMT_YUV422P, MKTAG('Y', 'V', '1', '6') }, + /* yuvjXXX formats are deprecated hacks specific to libav*, + they are identical to yuvXXX */ + { AV_PIX_FMT_YUVJ420P, MKTAG('I', '4', '2', '0') }, /* Planar formats */ + { AV_PIX_FMT_YUVJ420P, MKTAG('I', 'Y', 'U', 'V') }, + { AV_PIX_FMT_YUVJ420P, MKTAG('Y', 'V', '1', '2') }, + { AV_PIX_FMT_YUVJ422P, MKTAG('Y', '4', '2', 'B') 
}, + { AV_PIX_FMT_YUVJ422P, MKTAG('P', '4', '2', '2') }, + { AV_PIX_FMT_GRAY8, MKTAG('Y', '8', '0', '0') }, + { AV_PIX_FMT_GRAY8, MKTAG('Y', '8', ' ', ' ') }, + + { AV_PIX_FMT_YUYV422, MKTAG('Y', 'U', 'Y', '2') }, /* Packed formats */ + { AV_PIX_FMT_YUYV422, MKTAG('Y', '4', '2', '2') }, + { AV_PIX_FMT_YUYV422, MKTAG('V', '4', '2', '2') }, + { AV_PIX_FMT_YUYV422, MKTAG('V', 'Y', 'U', 'Y') }, + { AV_PIX_FMT_YUYV422, MKTAG('Y', 'U', 'N', 'V') }, + { AV_PIX_FMT_YUYV422, MKTAG('Y', 'U', 'Y', 'V') }, + { AV_PIX_FMT_YVYU422, MKTAG('Y', 'V', 'Y', 'U') }, /* Philips */ + { AV_PIX_FMT_UYVY422, MKTAG('U', 'Y', 'V', 'Y') }, + { AV_PIX_FMT_UYVY422, MKTAG('H', 'D', 'Y', 'C') }, + { AV_PIX_FMT_UYVY422, MKTAG('U', 'Y', 'N', 'V') }, + { AV_PIX_FMT_UYVY422, MKTAG('U', 'Y', 'N', 'Y') }, + { AV_PIX_FMT_UYVY422, MKTAG('u', 'y', 'v', '1') }, + { AV_PIX_FMT_UYVY422, MKTAG('2', 'V', 'u', '1') }, + { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'R', 'n') }, /* Avid AVI Codec 1:1 */ + { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', '1', 'x') }, /* Avid 1:1x */ + { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'u', 'p') }, + { AV_PIX_FMT_UYVY422, MKTAG('V', 'D', 'T', 'Z') }, /* SoftLab-NSK VideoTizer */ + { AV_PIX_FMT_UYVY422, MKTAG('a', 'u', 'v', '2') }, + { AV_PIX_FMT_UYVY422, MKTAG('c', 'y', 'u', 'v') }, /* CYUV is also Creative YUV */ + { AV_PIX_FMT_UYYVYY411, MKTAG('Y', '4', '1', '1') }, + { AV_PIX_FMT_GRAY8, MKTAG('G', 'R', 'E', 'Y') }, + { AV_PIX_FMT_NV12, MKTAG('N', 'V', '1', '2') }, + { AV_PIX_FMT_NV21, MKTAG('N', 'V', '2', '1') }, + { AV_PIX_FMT_VUYA, MKTAG('A', 'Y', 'U', 'V') }, /* MS 4:4:4:4 */ + + /* nut */ + { AV_PIX_FMT_RGB555LE, MKTAG('R', 'G', 'B', 15) }, + { AV_PIX_FMT_BGR555LE, MKTAG('B', 'G', 'R', 15) }, + { AV_PIX_FMT_RGB565LE, MKTAG('R', 'G', 'B', 16) }, + { AV_PIX_FMT_BGR565LE, MKTAG('B', 'G', 'R', 16) }, + { AV_PIX_FMT_RGB555BE, MKTAG(15 , 'B', 'G', 'R') }, + { AV_PIX_FMT_BGR555BE, MKTAG(15 , 'R', 'G', 'B') }, + { AV_PIX_FMT_RGB565BE, MKTAG(16 , 'B', 'G', 'R') }, + { AV_PIX_FMT_BGR565BE, MKTAG(16 
, 'R', 'G', 'B') }, + { AV_PIX_FMT_RGB444LE, MKTAG('R', 'G', 'B', 12) }, + { AV_PIX_FMT_BGR444LE, MKTAG('B', 'G', 'R', 12) }, + { AV_PIX_FMT_RGB444BE, MKTAG(12 , 'B', 'G', 'R') }, + { AV_PIX_FMT_BGR444BE, MKTAG(12 , 'R', 'G', 'B') }, + { AV_PIX_FMT_RGBA64LE, MKTAG('R', 'B', 'A', 64 ) }, + { AV_PIX_FMT_BGRA64LE, MKTAG('B', 'R', 'A', 64 ) }, + { AV_PIX_FMT_RGBA64BE, MKTAG(64 , 'R', 'B', 'A') }, + { AV_PIX_FMT_BGRA64BE, MKTAG(64 , 'B', 'R', 'A') }, + { AV_PIX_FMT_RGBA, MKTAG('R', 'G', 'B', 'A') }, + { AV_PIX_FMT_RGB0, MKTAG('R', 'G', 'B', 0 ) }, + { AV_PIX_FMT_BGRA, MKTAG('B', 'G', 'R', 'A') }, + { AV_PIX_FMT_BGR0, MKTAG('B', 'G', 'R', 0 ) }, + { AV_PIX_FMT_ABGR, MKTAG('A', 'B', 'G', 'R') }, + { AV_PIX_FMT_0BGR, MKTAG( 0 , 'B', 'G', 'R') }, + { AV_PIX_FMT_ARGB, MKTAG('A', 'R', 'G', 'B') }, + { AV_PIX_FMT_0RGB, MKTAG( 0 , 'R', 'G', 'B') }, + { AV_PIX_FMT_RGB24, MKTAG('R', 'G', 'B', 24 ) }, + { AV_PIX_FMT_BGR24, MKTAG('B', 'G', 'R', 24 ) }, + { AV_PIX_FMT_YUV411P, MKTAG('4', '1', '1', 'P') }, + { AV_PIX_FMT_YUV422P, MKTAG('4', '2', '2', 'P') }, + { AV_PIX_FMT_YUVJ422P, MKTAG('4', '2', '2', 'P') }, + { AV_PIX_FMT_YUV440P, MKTAG('4', '4', '0', 'P') }, + { AV_PIX_FMT_YUVJ440P, MKTAG('4', '4', '0', 'P') }, + { AV_PIX_FMT_YUV444P, MKTAG('4', '4', '4', 'P') }, + { AV_PIX_FMT_YUVJ444P, MKTAG('4', '4', '4', 'P') }, + { AV_PIX_FMT_MONOWHITE,MKTAG('B', '1', 'W', '0') }, + { AV_PIX_FMT_MONOBLACK,MKTAG('B', '0', 'W', '1') }, + { AV_PIX_FMT_BGR8, MKTAG('B', 'G', 'R', 8 ) }, + { AV_PIX_FMT_RGB8, MKTAG('R', 'G', 'B', 8 ) }, + { AV_PIX_FMT_BGR4, MKTAG('B', 'G', 'R', 4 ) }, + { AV_PIX_FMT_RGB4, MKTAG('R', 'G', 'B', 4 ) }, + { AV_PIX_FMT_RGB4_BYTE,MKTAG('B', '4', 'B', 'Y') }, + { AV_PIX_FMT_BGR4_BYTE,MKTAG('R', '4', 'B', 'Y') }, + { AV_PIX_FMT_RGB48LE, MKTAG('R', 'G', 'B', 48 ) }, + { AV_PIX_FMT_RGB48BE, MKTAG( 48, 'R', 'G', 'B') }, + { AV_PIX_FMT_BGR48LE, MKTAG('B', 'G', 'R', 48 ) }, + { AV_PIX_FMT_BGR48BE, MKTAG( 48, 'B', 'G', 'R') }, + { AV_PIX_FMT_GRAY9LE, MKTAG('Y', '1', 0 , 9 ) }, 
+ { AV_PIX_FMT_GRAY9BE, MKTAG( 9 , 0 , '1', 'Y') }, + { AV_PIX_FMT_GRAY10LE, MKTAG('Y', '1', 0 , 10 ) }, + { AV_PIX_FMT_GRAY10BE, MKTAG(10 , 0 , '1', 'Y') }, + { AV_PIX_FMT_GRAY12LE, MKTAG('Y', '1', 0 , 12 ) }, + { AV_PIX_FMT_GRAY12BE, MKTAG(12 , 0 , '1', 'Y') }, + { AV_PIX_FMT_GRAY14LE, MKTAG('Y', '1', 0 , 14 ) }, + { AV_PIX_FMT_GRAY14BE, MKTAG(14 , 0 , '1', 'Y') }, + { AV_PIX_FMT_GRAY16LE, MKTAG('Y', '1', 0 , 16 ) }, + { AV_PIX_FMT_GRAY16BE, MKTAG(16 , 0 , '1', 'Y') }, + { AV_PIX_FMT_YUV420P9LE, MKTAG('Y', '3', 11 , 9 ) }, + { AV_PIX_FMT_YUV420P9BE, MKTAG( 9 , 11 , '3', 'Y') }, + { AV_PIX_FMT_YUV422P9LE, MKTAG('Y', '3', 10 , 9 ) }, + { AV_PIX_FMT_YUV422P9BE, MKTAG( 9 , 10 , '3', 'Y') }, + { AV_PIX_FMT_YUV444P9LE, MKTAG('Y', '3', 0 , 9 ) }, + { AV_PIX_FMT_YUV444P9BE, MKTAG( 9 , 0 , '3', 'Y') }, + { AV_PIX_FMT_YUV420P10LE, MKTAG('Y', '3', 11 , 10 ) }, + { AV_PIX_FMT_YUV420P10BE, MKTAG(10 , 11 , '3', 'Y') }, + { AV_PIX_FMT_YUV422P10LE, MKTAG('Y', '3', 10 , 10 ) }, + { AV_PIX_FMT_YUV422P10BE, MKTAG(10 , 10 , '3', 'Y') }, + { AV_PIX_FMT_YUV444P10LE, MKTAG('Y', '3', 0 , 10 ) }, + { AV_PIX_FMT_YUV444P10BE, MKTAG(10 , 0 , '3', 'Y') }, + { AV_PIX_FMT_YUV420P12LE, MKTAG('Y', '3', 11 , 12 ) }, + { AV_PIX_FMT_YUV420P12BE, MKTAG(12 , 11 , '3', 'Y') }, + { AV_PIX_FMT_YUV422P12LE, MKTAG('Y', '3', 10 , 12 ) }, + { AV_PIX_FMT_YUV422P12BE, MKTAG(12 , 10 , '3', 'Y') }, + { AV_PIX_FMT_YUV444P12LE, MKTAG('Y', '3', 0 , 12 ) }, + { AV_PIX_FMT_YUV444P12BE, MKTAG(12 , 0 , '3', 'Y') }, + { AV_PIX_FMT_YUV420P14LE, MKTAG('Y', '3', 11 , 14 ) }, + { AV_PIX_FMT_YUV420P14BE, MKTAG(14 , 11 , '3', 'Y') }, + { AV_PIX_FMT_YUV422P14LE, MKTAG('Y', '3', 10 , 14 ) }, + { AV_PIX_FMT_YUV422P14BE, MKTAG(14 , 10 , '3', 'Y') }, + { AV_PIX_FMT_YUV444P14LE, MKTAG('Y', '3', 0 , 14 ) }, + { AV_PIX_FMT_YUV444P14BE, MKTAG(14 , 0 , '3', 'Y') }, + { AV_PIX_FMT_YUV420P16LE, MKTAG('Y', '3', 11 , 16 ) }, + { AV_PIX_FMT_YUV420P16BE, MKTAG(16 , 11 , '3', 'Y') }, + { AV_PIX_FMT_YUV422P16LE, MKTAG('Y', '3', 10 , 16 ) }, + 
{ AV_PIX_FMT_YUV422P16BE, MKTAG(16 , 10 , '3', 'Y') }, + { AV_PIX_FMT_YUV444P16LE, MKTAG('Y', '3', 0 , 16 ) }, + { AV_PIX_FMT_YUV444P16BE, MKTAG(16 , 0 , '3', 'Y') }, + { AV_PIX_FMT_YUVA420P, MKTAG('Y', '4', 11 , 8 ) }, + { AV_PIX_FMT_YUVA422P, MKTAG('Y', '4', 10 , 8 ) }, + { AV_PIX_FMT_YUVA444P, MKTAG('Y', '4', 0 , 8 ) }, + { AV_PIX_FMT_YA8, MKTAG('Y', '2', 0 , 8 ) }, + { AV_PIX_FMT_PAL8, MKTAG('P', 'A', 'L', 8 ) }, + + { AV_PIX_FMT_YUVA420P9LE, MKTAG('Y', '4', 11 , 9 ) }, + { AV_PIX_FMT_YUVA420P9BE, MKTAG( 9 , 11 , '4', 'Y') }, + { AV_PIX_FMT_YUVA422P9LE, MKTAG('Y', '4', 10 , 9 ) }, + { AV_PIX_FMT_YUVA422P9BE, MKTAG( 9 , 10 , '4', 'Y') }, + { AV_PIX_FMT_YUVA444P9LE, MKTAG('Y', '4', 0 , 9 ) }, + { AV_PIX_FMT_YUVA444P9BE, MKTAG( 9 , 0 , '4', 'Y') }, + { AV_PIX_FMT_YUVA420P10LE, MKTAG('Y', '4', 11 , 10 ) }, + { AV_PIX_FMT_YUVA420P10BE, MKTAG(10 , 11 , '4', 'Y') }, + { AV_PIX_FMT_YUVA422P10LE, MKTAG('Y', '4', 10 , 10 ) }, + { AV_PIX_FMT_YUVA422P10BE, MKTAG(10 , 10 , '4', 'Y') }, + { AV_PIX_FMT_YUVA444P10LE, MKTAG('Y', '4', 0 , 10 ) }, + { AV_PIX_FMT_YUVA444P10BE, MKTAG(10 , 0 , '4', 'Y') }, + { AV_PIX_FMT_YUVA422P12LE, MKTAG('Y', '4', 10 , 12 ) }, + { AV_PIX_FMT_YUVA422P12BE, MKTAG(12 , 10 , '4', 'Y') }, + { AV_PIX_FMT_YUVA444P12LE, MKTAG('Y', '4', 0 , 12 ) }, + { AV_PIX_FMT_YUVA444P12BE, MKTAG(12 , 0 , '4', 'Y') }, + { AV_PIX_FMT_YUVA420P16LE, MKTAG('Y', '4', 11 , 16 ) }, + { AV_PIX_FMT_YUVA420P16BE, MKTAG(16 , 11 , '4', 'Y') }, + { AV_PIX_FMT_YUVA422P16LE, MKTAG('Y', '4', 10 , 16 ) }, + { AV_PIX_FMT_YUVA422P16BE, MKTAG(16 , 10 , '4', 'Y') }, + { AV_PIX_FMT_YUVA444P16LE, MKTAG('Y', '4', 0 , 16 ) }, + { AV_PIX_FMT_YUVA444P16BE, MKTAG(16 , 0 , '4', 'Y') }, + + { AV_PIX_FMT_GBRP, MKTAG('G', '3', 00 , 8 ) }, + { AV_PIX_FMT_GBRP9LE, MKTAG('G', '3', 00 , 9 ) }, + { AV_PIX_FMT_GBRP9BE, MKTAG( 9 , 00 , '3', 'G') }, + { AV_PIX_FMT_GBRP10LE, MKTAG('G', '3', 00 , 10 ) }, + { AV_PIX_FMT_GBRP10BE, MKTAG(10 , 00 , '3', 'G') }, + { AV_PIX_FMT_GBRP12LE, MKTAG('G', '3', 00 , 12 ) }, 
+ { AV_PIX_FMT_GBRP12BE, MKTAG(12 , 00 , '3', 'G') }, + { AV_PIX_FMT_GBRP14LE, MKTAG('G', '3', 00 , 14 ) }, + { AV_PIX_FMT_GBRP14BE, MKTAG(14 , 00 , '3', 'G') }, + { AV_PIX_FMT_GBRP16LE, MKTAG('G', '3', 00 , 16 ) }, + { AV_PIX_FMT_GBRP16BE, MKTAG(16 , 00 , '3', 'G') }, + + { AV_PIX_FMT_GBRAP, MKTAG('G', '4', 00 , 8 ) }, + { AV_PIX_FMT_GBRAP10LE, MKTAG('G', '4', 00 , 10 ) }, + { AV_PIX_FMT_GBRAP10BE, MKTAG(10 , 00 , '4', 'G') }, + { AV_PIX_FMT_GBRAP12LE, MKTAG('G', '4', 00 , 12 ) }, + { AV_PIX_FMT_GBRAP12BE, MKTAG(12 , 00 , '4', 'G') }, + { AV_PIX_FMT_GBRAP16LE, MKTAG('G', '4', 00 , 16 ) }, + { AV_PIX_FMT_GBRAP16BE, MKTAG(16 , 00 , '4', 'G') }, + + { AV_PIX_FMT_XYZ12LE, MKTAG('X', 'Y', 'Z' , 36 ) }, + { AV_PIX_FMT_XYZ12BE, MKTAG(36 , 'Z' , 'Y', 'X') }, + + { AV_PIX_FMT_BAYER_BGGR8, MKTAG(0xBA, 'B', 'G', 8 ) }, + { AV_PIX_FMT_BAYER_BGGR16LE, MKTAG(0xBA, 'B', 'G', 16 ) }, + { AV_PIX_FMT_BAYER_BGGR16BE, MKTAG(16 , 'G', 'B', 0xBA) }, + { AV_PIX_FMT_BAYER_RGGB8, MKTAG(0xBA, 'R', 'G', 8 ) }, + { AV_PIX_FMT_BAYER_RGGB16LE, MKTAG(0xBA, 'R', 'G', 16 ) }, + { AV_PIX_FMT_BAYER_RGGB16BE, MKTAG(16 , 'G', 'R', 0xBA) }, + { AV_PIX_FMT_BAYER_GBRG8, MKTAG(0xBA, 'G', 'B', 8 ) }, + { AV_PIX_FMT_BAYER_GBRG16LE, MKTAG(0xBA, 'G', 'B', 16 ) }, + { AV_PIX_FMT_BAYER_GBRG16BE, MKTAG(16, 'B', 'G', 0xBA) }, + { AV_PIX_FMT_BAYER_GRBG8, MKTAG(0xBA, 'G', 'R', 8 ) }, + { AV_PIX_FMT_BAYER_GRBG16LE, MKTAG(0xBA, 'G', 'R', 16 ) }, + { AV_PIX_FMT_BAYER_GRBG16BE, MKTAG(16, 'R', 'G', 0xBA) }, + + /* quicktime */ + { AV_PIX_FMT_YUV420P, MKTAG('R', '4', '2', '0') }, /* Radius DV YUV PAL */ + { AV_PIX_FMT_YUV411P, MKTAG('R', '4', '1', '1') }, /* Radius DV YUV NTSC */ + { AV_PIX_FMT_UYVY422, MKTAG('2', 'v', 'u', 'y') }, + { AV_PIX_FMT_UYVY422, MKTAG('2', 'V', 'u', 'y') }, + { AV_PIX_FMT_UYVY422, MKTAG('A', 'V', 'U', 'I') }, /* FIXME merge both fields */ + { AV_PIX_FMT_UYVY422, MKTAG('b', 'x', 'y', 'v') }, + { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', '2') }, + { AV_PIX_FMT_YUYV422, MKTAG('y', 'u', 'v', 's') 
}, + { AV_PIX_FMT_YUYV422, MKTAG('D', 'V', 'O', 'O') }, /* Digital Voodoo SD 8 Bit */ + { AV_PIX_FMT_RGB555LE,MKTAG('L', '5', '5', '5') }, + { AV_PIX_FMT_RGB565LE,MKTAG('L', '5', '6', '5') }, + { AV_PIX_FMT_RGB565BE,MKTAG('B', '5', '6', '5') }, + { AV_PIX_FMT_BGR24, MKTAG('2', '4', 'B', 'G') }, + { AV_PIX_FMT_BGR24, MKTAG('b', 'x', 'b', 'g') }, + { AV_PIX_FMT_BGRA, MKTAG('B', 'G', 'R', 'A') }, + { AV_PIX_FMT_RGBA, MKTAG('R', 'G', 'B', 'A') }, + { AV_PIX_FMT_RGB24, MKTAG('b', 'x', 'r', 'g') }, + { AV_PIX_FMT_ABGR, MKTAG('A', 'B', 'G', 'R') }, + { AV_PIX_FMT_GRAY16BE,MKTAG('b', '1', '6', 'g') }, + { AV_PIX_FMT_RGB48BE, MKTAG('b', '4', '8', 'r') }, + { AV_PIX_FMT_RGBA64BE,MKTAG('b', '6', '4', 'a') }, + { AV_PIX_FMT_BAYER_RGGB16BE, MKTAG('B', 'G', 'G', 'R') }, + + /* vlc */ + { AV_PIX_FMT_YUV410P, MKTAG('I', '4', '1', '0') }, + { AV_PIX_FMT_YUV411P, MKTAG('I', '4', '1', '1') }, + { AV_PIX_FMT_YUV422P, MKTAG('I', '4', '2', '2') }, + { AV_PIX_FMT_YUV440P, MKTAG('I', '4', '4', '0') }, + { AV_PIX_FMT_YUV444P, MKTAG('I', '4', '4', '4') }, + { AV_PIX_FMT_YUVJ420P, MKTAG('J', '4', '2', '0') }, + { AV_PIX_FMT_YUVJ422P, MKTAG('J', '4', '2', '2') }, + { AV_PIX_FMT_YUVJ440P, MKTAG('J', '4', '4', '0') }, + { AV_PIX_FMT_YUVJ444P, MKTAG('J', '4', '4', '4') }, + { AV_PIX_FMT_YUVA444P, MKTAG('Y', 'U', 'V', 'A') }, + { AV_PIX_FMT_YUVA420P, MKTAG('I', '4', '0', 'A') }, + { AV_PIX_FMT_YUVA422P, MKTAG('I', '4', '2', 'A') }, + { AV_PIX_FMT_RGB8, MKTAG('R', 'G', 'B', '2') }, + { AV_PIX_FMT_RGB555LE, MKTAG('R', 'V', '1', '5') }, + { AV_PIX_FMT_RGB565LE, MKTAG('R', 'V', '1', '6') }, + { AV_PIX_FMT_BGR24, MKTAG('R', 'V', '2', '4') }, + { AV_PIX_FMT_BGR0, MKTAG('R', 'V', '3', '2') }, + { AV_PIX_FMT_RGBA, MKTAG('A', 'V', '3', '2') }, + { AV_PIX_FMT_YUV420P9LE, MKTAG('I', '0', '9', 'L') }, + { AV_PIX_FMT_YUV420P9BE, MKTAG('I', '0', '9', 'B') }, + { AV_PIX_FMT_YUV422P9LE, MKTAG('I', '2', '9', 'L') }, + { AV_PIX_FMT_YUV422P9BE, MKTAG('I', '2', '9', 'B') }, + { AV_PIX_FMT_YUV444P9LE, MKTAG('I', '4', 
'9', 'L') }, + { AV_PIX_FMT_YUV444P9BE, MKTAG('I', '4', '9', 'B') }, + { AV_PIX_FMT_YUV420P10LE, MKTAG('I', '0', 'A', 'L') }, + { AV_PIX_FMT_YUV420P10BE, MKTAG('I', '0', 'A', 'B') }, + { AV_PIX_FMT_YUV422P10LE, MKTAG('I', '2', 'A', 'L') }, + { AV_PIX_FMT_YUV422P10BE, MKTAG('I', '2', 'A', 'B') }, + { AV_PIX_FMT_YUV444P10LE, MKTAG('I', '4', 'A', 'L') }, + { AV_PIX_FMT_YUV444P10BE, MKTAG('I', '4', 'A', 'B') }, + { AV_PIX_FMT_YUV420P12LE, MKTAG('I', '0', 'C', 'L') }, + { AV_PIX_FMT_YUV420P12BE, MKTAG('I', '0', 'C', 'B') }, + { AV_PIX_FMT_YUV422P12LE, MKTAG('I', '2', 'C', 'L') }, + { AV_PIX_FMT_YUV422P12BE, MKTAG('I', '2', 'C', 'B') }, + { AV_PIX_FMT_YUV444P12LE, MKTAG('I', '4', 'C', 'L') }, + { AV_PIX_FMT_YUV444P12BE, MKTAG('I', '4', 'C', 'B') }, + { AV_PIX_FMT_YUV420P16LE, MKTAG('I', '0', 'F', 'L') }, + { AV_PIX_FMT_YUV420P16BE, MKTAG('I', '0', 'F', 'B') }, + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ + + { AV_PIX_FMT_NONE, 0 }, +}; + +const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void) +{ + return raw_pix_fmt_tags; +} + +unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat fmt) +{ + const PixelFormatTag *tags = raw_pix_fmt_tags; + while (tags->pix_fmt >= 0) { + if (tags->pix_fmt == fmt) + return tags->fourcc; + tags++; + } + return 0; +} + +static const PixelFormatTag pix_fmt_bps_avi[] = { + { AV_PIX_FMT_PAL8, 1 }, + { AV_PIX_FMT_PAL8, 2 }, + { AV_PIX_FMT_PAL8, 4 }, + { AV_PIX_FMT_PAL8, 8 }, + { AV_PIX_FMT_RGB444LE, 12 }, + { AV_PIX_FMT_RGB555LE, 15 }, + { AV_PIX_FMT_RGB555LE, 16 }, + { AV_PIX_FMT_BGR24, 24 }, + { AV_PIX_FMT_BGRA, 32 }, + { AV_PIX_FMT_NONE, 0 }, +}; + +static const PixelFormatTag pix_fmt_bps_mov[] = { + { AV_PIX_FMT_PAL8, 1 }, + { AV_PIX_FMT_PAL8, 2 }, + { AV_PIX_FMT_PAL8, 4 }, + { 
AV_PIX_FMT_PAL8, 8 }, + { AV_PIX_FMT_RGB555BE, 16 }, + { AV_PIX_FMT_RGB24, 24 }, + { AV_PIX_FMT_ARGB, 32 }, + { AV_PIX_FMT_PAL8, 33 }, + { AV_PIX_FMT_NONE, 0 }, +}; + +static enum AVPixelFormat find_pix_fmt(const PixelFormatTag *tags, + unsigned int fourcc) +{ + while (tags->pix_fmt != AV_PIX_FMT_NONE) { + if (tags->fourcc == fourcc) + return tags->pix_fmt; + tags++; + } + return AV_PIX_FMT_NONE; +} + +enum AVPixelFormat avpriv_pix_fmt_find(enum PixelFormatTagLists list, + unsigned fourcc) +{ + const PixelFormatTag *tags; + + switch (list) { + case PIX_FMT_LIST_RAW: + tags = raw_pix_fmt_tags; + break; + case PIX_FMT_LIST_AVI: + tags = pix_fmt_bps_avi; + break; + case PIX_FMT_LIST_MOV: + tags = pix_fmt_bps_mov; + break; + } + return find_pix_fmt(tags, fourcc); +} diff --git a/media/ffvpx/libavcodec/raw.h b/media/ffvpx/libavcodec/raw.h new file mode 100644 index 0000000000..9a4ddef8fc --- /dev/null +++ b/media/ffvpx/libavcodec/raw.h @@ -0,0 +1,48 @@ +/* + * Raw Video Codec + * Copyright (c) 2001 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Raw Video Codec + */ + +#ifndef AVCODEC_RAW_H +#define AVCODEC_RAW_H + +#include "libavutil/pixfmt.h" + +typedef struct PixelFormatTag { + enum AVPixelFormat pix_fmt; + unsigned int fourcc; +} PixelFormatTag; + +const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void); + +enum PixelFormatTagLists { + PIX_FMT_LIST_RAW, + PIX_FMT_LIST_AVI, + PIX_FMT_LIST_MOV, +}; + +enum AVPixelFormat avpriv_pix_fmt_find(enum PixelFormatTagLists list, + unsigned fourcc); + +#endif /* AVCODEC_RAW_H */ diff --git a/media/ffvpx/libavcodec/rdft.c b/media/ffvpx/libavcodec/rdft.c new file mode 100644 index 0000000000..ac6f5d6781 --- /dev/null +++ b/media/ffvpx/libavcodec/rdft.c @@ -0,0 +1,120 @@ +/* + * (I)RDFT transforms + * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdlib.h> +#include <math.h> +#include "libavutil/error.h" +#include "libavutil/mathematics.h" +#include "rdft.h" + +/** + * @file + * (Inverse) Real Discrete Fourier Transforms. + */ + +/** Map one real FFT into two parallel real even and odd FFTs. Then interleave + * the two real FFTs into one complex FFT. Unmangle the results. + * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM + */ +static void rdft_calc_c(RDFTContext *s, FFTSample *data) +{ + int i, i1, i2; + FFTComplex ev, od, odsum; + const int n = 1 << s->nbits; + const float k1 = 0.5; + const float k2 = 0.5 - s->inverse; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + + if (!s->inverse) { + s->fft.fft_permute(&s->fft, (FFTComplex*)data); + s->fft.fft_calc(&s->fft, (FFTComplex*)data); + } + /* i=0 is a special case because of packing, the DC term is real, so we + are going to throw the N/2 term (also real) in with it. 
*/ + ev.re = data[0]; + data[0] = ev.re+data[1]; + data[1] = ev.re-data[1]; + +#define RDFT_UNMANGLE(sign0, sign1) \ + for (i = 1; i < (n>>2); i++) { \ + i1 = 2*i; \ + i2 = n-i1; \ + /* Separate even and odd FFTs */ \ + ev.re = k1*(data[i1 ]+data[i2 ]); \ + od.im = k2*(data[i2 ]-data[i1 ]); \ + ev.im = k1*(data[i1+1]-data[i2+1]); \ + od.re = k2*(data[i1+1]+data[i2+1]); \ + /* Apply twiddle factors to the odd FFT and add to the even FFT */ \ + odsum.re = od.re*tcos[i] sign0 od.im*tsin[i]; \ + odsum.im = od.im*tcos[i] sign1 od.re*tsin[i]; \ + data[i1 ] = ev.re + odsum.re; \ + data[i1+1] = ev.im + odsum.im; \ + data[i2 ] = ev.re - odsum.re; \ + data[i2+1] = odsum.im - ev.im; \ + } + + if (s->negative_sin) { + RDFT_UNMANGLE(+,-) + } else { + RDFT_UNMANGLE(-,+) + } + + data[2*i+1]=s->sign_convention*data[2*i+1]; + if (s->inverse) { + data[0] *= k1; + data[1] *= k1; + s->fft.fft_permute(&s->fft, (FFTComplex*)data); + s->fft.fft_calc(&s->fft, (FFTComplex*)data); + } +} + +av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans) +{ + int n = 1 << nbits; + int ret; + + s->nbits = nbits; + s->inverse = trans == IDFT_C2R || trans == DFT_C2R; + s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 
1 : -1; + s->negative_sin = trans == DFT_C2R || trans == DFT_R2C; + + if (nbits < 4 || nbits > 16) + return AVERROR(EINVAL); + + if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0) + return ret; + + ff_init_ff_cos_tabs(nbits); + s->tcos = ff_cos_tabs[nbits]; + s->tsin = ff_cos_tabs[nbits] + (n >> 2); + s->rdft_calc = rdft_calc_c; + +#if ARCH_ARM + ff_rdft_init_arm(s); +#endif + + return 0; +} + +av_cold void ff_rdft_end(RDFTContext *s) +{ + ff_fft_end(&s->fft); +} diff --git a/media/ffvpx/libavcodec/rdft.h b/media/ffvpx/libavcodec/rdft.h new file mode 100644 index 0000000000..ffafca7f24 --- /dev/null +++ b/media/ffvpx/libavcodec/rdft.h @@ -0,0 +1,52 @@ +/* + * (I)RDFT transforms + * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT) +#define AVCODEC_RDFT_H + +#include "config.h" +#include "fft.h" + +struct RDFTContext { + int nbits; + int inverse; + int sign_convention; + + /* pre/post rotation tables */ + const FFTSample *tcos; + const FFTSample *tsin; + int negative_sin; + FFTContext fft; + void (*rdft_calc)(struct RDFTContext *s, FFTSample *z); +}; + +/** + * Set up a real FFT. + * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans); +void ff_rdft_end(RDFTContext *s); + +void ff_rdft_init_arm(RDFTContext *s); + + +#endif /* AVCODEC_RDFT_H */ diff --git a/media/ffvpx/libavcodec/rectangle.h b/media/ffvpx/libavcodec/rectangle.h new file mode 100644 index 0000000000..df7c18a4e2 --- /dev/null +++ b/media/ffvpx/libavcodec/rectangle.h @@ -0,0 +1,124 @@ +/* + * rectangle filling function + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * useful rectangle filling function + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#ifndef AVCODEC_RECTANGLE_H +#define AVCODEC_RECTANGLE_H + +#include "config.h" +#include "libavutil/common.h" +#include "libavutil/avassert.h" + +/** + * fill a rectangle. + * @param h height of the rectangle, should be a constant + * @param w width of the rectangle, should be a constant + * @param size the size of val (1, 2 or 4), should be a constant + */ +static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ + uint8_t *p= (uint8_t*)vp; + av_assert2(size==1 || size==2 || size==4); + av_assert2(w<=4); + + w *= size; + stride *= size; + + av_assert2((((long)vp)&(FFMIN(w, 8<<(HAVE_NEON|ARCH_PPC|HAVE_MMX))-1)) == 0); + av_assert2((stride&(w-1))==0); + if(w==2){ + const uint16_t v= size==4 ? val : val*0x0101; + *(uint16_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint16_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint16_t*)(p + 2*stride)= v; + *(uint16_t*)(p + 3*stride)= v; + }else if(w==4){ + const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101; + *(uint32_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint32_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint32_t*)(p + 2*stride)= v; + *(uint32_t*)(p + 3*stride)= v; + }else if(w==8){ + // gcc cannot optimize 64-bit math on x86_32 +#if HAVE_FAST_64BIT + const uint64_t v= size==2 ? 
val*0x0001000100010001ULL : val*0x0100000001ULL; + *(uint64_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint64_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint64_t*)(p + 2*stride)= v; + *(uint64_t*)(p + 3*stride)= v; + }else if(w==16){ + const uint64_t v= val*0x0100000001ULL; + *(uint64_t*)(p + 0+0*stride)= v; + *(uint64_t*)(p + 8+0*stride)= v; + *(uint64_t*)(p + 0+1*stride)= v; + *(uint64_t*)(p + 8+1*stride)= v; + if(h==2) return; + *(uint64_t*)(p + 0+2*stride)= v; + *(uint64_t*)(p + 8+2*stride)= v; + *(uint64_t*)(p + 0+3*stride)= v; + *(uint64_t*)(p + 8+3*stride)= v; +#else + const uint32_t v= size==2 ? val*0x00010001 : val; + *(uint32_t*)(p + 0+0*stride)= v; + *(uint32_t*)(p + 4+0*stride)= v; + if(h==1) return; + *(uint32_t*)(p + 0+1*stride)= v; + *(uint32_t*)(p + 4+1*stride)= v; + if(h==2) return; + *(uint32_t*)(p + 0+2*stride)= v; + *(uint32_t*)(p + 4+2*stride)= v; + *(uint32_t*)(p + 0+3*stride)= v; + *(uint32_t*)(p + 4+3*stride)= v; + }else if(w==16){ + *(uint32_t*)(p + 0+0*stride)= val; + *(uint32_t*)(p + 4+0*stride)= val; + *(uint32_t*)(p + 8+0*stride)= val; + *(uint32_t*)(p +12+0*stride)= val; + *(uint32_t*)(p + 0+1*stride)= val; + *(uint32_t*)(p + 4+1*stride)= val; + *(uint32_t*)(p + 8+1*stride)= val; + *(uint32_t*)(p +12+1*stride)= val; + if(h==2) return; + *(uint32_t*)(p + 0+2*stride)= val; + *(uint32_t*)(p + 4+2*stride)= val; + *(uint32_t*)(p + 8+2*stride)= val; + *(uint32_t*)(p +12+2*stride)= val; + *(uint32_t*)(p + 0+3*stride)= val; + *(uint32_t*)(p + 4+3*stride)= val; + *(uint32_t*)(p + 8+3*stride)= val; + *(uint32_t*)(p +12+3*stride)= val; +#endif + }else + av_assert2(0); + av_assert2(h==4); +} + +#endif /* AVCODEC_RECTANGLE_H */ diff --git a/media/ffvpx/libavcodec/reverse.c b/media/ffvpx/libavcodec/reverse.c new file mode 100644 index 0000000000..440badaf34 --- /dev/null +++ b/media/ffvpx/libavcodec/reverse.c @@ -0,0 +1 @@ +#include "libavutil/reverse.c" diff --git a/media/ffvpx/libavcodec/rl.h b/media/ffvpx/libavcodec/rl.h new file mode 100644 
index 0000000000..4380fda272 --- /dev/null +++ b/media/ffvpx/libavcodec/rl.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2000-2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * rl header. + */ + +#ifndef AVCODEC_RL_H +#define AVCODEC_RL_H + +#include <stdint.h> + +#include "vlc.h" + +/* run length table */ +#define MAX_RUN 64 +#define MAX_LEVEL 64 + +/** RLTable. */ +typedef struct RLTable { + int n; ///< number of entries of table_vlc minus 1 + int last; ///< number of values for last = 0 + const uint16_t (*table_vlc)[2]; + const int8_t *table_run; + const int8_t *table_level; + uint8_t *index_run[2]; ///< encoding only + int8_t *max_level[2]; ///< encoding & decoding + int8_t *max_run[2]; ///< encoding & decoding + RL_VLC_ELEM *rl_vlc[32]; ///< decoding only +} RLTable; + +/** + * Initialize max_level and index_run from table_run and table_level; + * this is equivalent to initializing RLTable.max_level[0] and + * RLTable.index_run[0] with ff_rl_init(). 
+ */ +void ff_rl_init_level_run(uint8_t max_level[MAX_LEVEL + 1], + uint8_t index_run[MAX_RUN + 1], + const uint8_t table_run[/* n */], + const uint8_t table_level[/* n*/], int n); + +/** + * Initialize index_run, max_level and max_run from n, last, table_vlc, + * table_run and table_level. + * @param static_store static uint8_t array[2][2*MAX_RUN + MAX_LEVEL + 3] + * to hold the level and run tables. + * @note This function does not touch rl_vlc at all, hence there is no need + * to synchronize calls to ff_rl_init() and ff_rl_init_vlc() using the + * same RLTable. + */ +void ff_rl_init(RLTable *rl, uint8_t static_store[2][2*MAX_RUN + MAX_LEVEL + 3]); + +/** + * Initialize rl_vlc from n, last, table_vlc, table_run and table_level. + * All rl_vlc pointers to be initialized must already point to a static + * buffer of `static_size` RL_VLC_ELEM elements; if a pointer is NULL, + * initializing further VLCs stops. + * @note This function does not touch what ff_rl_init() initializes at all, + * hence there is no need to synchronize calls to ff_rl_init() and + * ff_rl_init_vlc() using the same RLTable. 
+ */ +void ff_rl_init_vlc(RLTable *rl, unsigned static_size); + +#define INIT_VLC_RL(rl, static_size)\ +{\ + static RL_VLC_ELEM rl_vlc_table[32][static_size];\ +\ + for (int q = 0; q < 32; q++) \ + rl.rl_vlc[q] = rl_vlc_table[q]; \ +\ + ff_rl_init_vlc(&rl, static_size); \ +} + +#define INIT_FIRST_VLC_RL(rl, static_size) \ +do { \ + static RL_VLC_ELEM rl_vlc_table[static_size]; \ + \ + rl.rl_vlc[0] = rl_vlc_table; \ + ff_rl_init_vlc(&rl, static_size); \ +} while (0) + +static inline int get_rl_index(const RLTable *rl, int last, int run, int level) +{ + int index; + index = rl->index_run[last][run]; + if (index >= rl->n) + return rl->n; + if (level > rl->max_level[last][run]) + return rl->n; + return index + level - 1; +} + +#endif /* AVCODEC_RL_H */ diff --git a/media/ffvpx/libavcodec/rnd_avg.h b/media/ffvpx/libavcodec/rnd_avg.h new file mode 100644 index 0000000000..344775e31f --- /dev/null +++ b/media/ffvpx/libavcodec/rnd_avg.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp> + * Copyright (c) 2011 Oskar Arvidsson + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/*
 * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
 * Copyright (c) 2011 Oskar Arvidsson
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_RND_AVG_H
#define AVCODEC_RND_AVG_H

#include <stddef.h>
#include <stdint.h>

/*
 * Lane-replication helpers. The constants are spelled with the <stdint.h>
 * constant macros so that their width does not depend on how wide
 * `unsigned long` happens to be on the target ABI.
 */

/** Replicate c into each of the four 8-bit lanes of a 32-bit word. */
#define BYTE_VEC32(c) ((c) * UINT32_C(0x01010101))
/** Replicate c into each of the four 16-bit lanes of a 64-bit word. */
#define BYTE_VEC64(c) ((c) * UINT64_C(0x0001000100010001))

/**
 * Average the four 8-bit lanes of a and b independently, rounding halves up.
 *
 * Uses (a | b) - ((a ^ b) >> 1) per lane; the lowest bit of every lane is
 * masked out of the XOR before the shift so that no bit leaks from one lane
 * into its neighbour.
 */
static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

/**
 * Average the four 8-bit lanes of a and b independently, rounding halves
 * down: (a & b) + ((a ^ b) >> 1) per lane, with the same lane mask as
 * rnd_avg32().
 */
static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

/**
 * Average the four 16-bit lanes of a and b independently, rounding halves up.
 */
static inline uint64_t rnd_avg64(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC64(0x01)) >> 1);
}

/**
 * Average the four 16-bit lanes of a and b independently, rounding halves
 * down.
 */
static inline uint64_t no_rnd_avg64(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC64(0x01)) >> 1);
}

#endif /* AVCODEC_RND_AVG_H */
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * simpleidct in C. + */ + +#include "libavutil/intreadwrite.h" +#include "mathops.h" +#include "simple_idct.h" + +#define IN_IDCT_DEPTH 16 + +#define BIT_DEPTH 8 +#include "simple_idct_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 10 +#include "simple_idct_template.c" + +#define EXTRA_SHIFT 2 +#include "simple_idct_template.c" + +#undef EXTRA_SHIFT +#undef BIT_DEPTH + +#define BIT_DEPTH 12 +#include "simple_idct_template.c" +#undef BIT_DEPTH +#undef IN_IDCT_DEPTH + +#define IN_IDCT_DEPTH 32 +#define BIT_DEPTH 10 +#include "simple_idct_template.c" +#undef BIT_DEPTH +#undef IN_IDCT_DEPTH + +/* 2x4x8 idct */ + +#define CN_SHIFT 12 +#define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5)) +#define C1 C_FIX(0.6532814824) +#define C2 C_FIX(0.2705980501) + +/* row idct is multiple by 16 * sqrt(2.0), col idct4 is normalized, + and the butterfly must be multiplied by 0.5 * sqrt(2.0) */ +#define C_SHIFT (4+1+12) + +static inline void idct4col_put(uint8_t *dest, ptrdiff_t line_size, const int16_t *col) +{ + int c0, c1, c2, c3, a0, a1, a2, a3; + + a0 = col[8*0]; + a1 = col[8*2]; + a2 = col[8*4]; + a3 = col[8*6]; + c0 = ((a0 + a2) * (1 << CN_SHIFT - 1)) + (1 << (C_SHIFT - 1)); + c2 = ((a0 - a2) * (1 << CN_SHIFT - 1)) + (1 << (C_SHIFT - 1)); + c1 = a1 * C1 + a3 * C2; + c3 = a1 * C2 - a3 * C1; + dest[0] = av_clip_uint8((c0 + c1) >> C_SHIFT); + dest += line_size; + dest[0] = av_clip_uint8((c2 + c3) >> C_SHIFT); + dest += line_size; + dest[0] = av_clip_uint8((c2 - c3) >> C_SHIFT); + dest += line_size; + dest[0] = av_clip_uint8((c0 - c1) >> C_SHIFT); +} + +#define BF(k) \ +{\ + int a0, a1;\ + a0 = ptr[k];\ + a1 = ptr[8 + k];\ + ptr[k] = a0 + a1;\ + ptr[8 + k] = a0 - a1;\ +} + +/* only used by DV codec. The input must be interlaced. 
128 is added + to the pixels before clamping to avoid systematic error + (1024*sqrt(2)) offset would be needed otherwise. */ +/* XXX: I think a 1.0/sqrt(2) normalization should be needed to + compensate the extra butterfly stage - I don't have the full DV + specification */ +void ff_simple_idct248_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + int i; + int16_t *ptr; + + /* butterfly */ + ptr = block; + for(i=0;i<4;i++) { + BF(0); + BF(1); + BF(2); + BF(3); + BF(4); + BF(5); + BF(6); + BF(7); + ptr += 2 * 8; + } + + /* IDCT8 on each line */ + for(i=0; i<8; i++) { + idctRowCondDC_int16_8bit(block + i*8, 0); + } + + /* IDCT4 and store */ + for(i=0;i<8;i++) { + idct4col_put(dest + i, 2 * line_size, block + i); + idct4col_put(dest + line_size + i, 2 * line_size, block + 8 + i); + } +} + +/* 8x4 & 4x8 WMV2 IDCT */ +#undef CN_SHIFT +#undef C_SHIFT +#undef C_FIX +#undef C1 +#undef C2 +#define CN_SHIFT 12 +#define C_FIX(x) ((int)((x) * M_SQRT2 * (1 << CN_SHIFT) + 0.5)) +#define C1 C_FIX(0.6532814824) +#define C2 C_FIX(0.2705980501) +#define C3 C_FIX(0.5) +#define C_SHIFT (4+1+12) +static inline void idct4col_add(uint8_t *dest, ptrdiff_t line_size, const int16_t *col) +{ + int c0, c1, c2, c3, a0, a1, a2, a3; + + a0 = col[8*0]; + a1 = col[8*1]; + a2 = col[8*2]; + a3 = col[8*3]; + c0 = (a0 + a2)*C3 + (1 << (C_SHIFT - 1)); + c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1)); + c1 = a1 * C1 + a3 * C2; + c3 = a1 * C2 - a3 * C1; + dest[0] = av_clip_uint8(dest[0] + ((c0 + c1) >> C_SHIFT)); + dest += line_size; + dest[0] = av_clip_uint8(dest[0] + ((c2 + c3) >> C_SHIFT)); + dest += line_size; + dest[0] = av_clip_uint8(dest[0] + ((c2 - c3) >> C_SHIFT)); + dest += line_size; + dest[0] = av_clip_uint8(dest[0] + ((c0 - c1) >> C_SHIFT)); +} + +#define RN_SHIFT 15 +#define R_FIX(x) ((int)((x) * M_SQRT2 * (1 << RN_SHIFT) + 0.5)) +#define R1 R_FIX(0.6532814824) +#define R2 R_FIX(0.2705980501) +#define R3 R_FIX(0.5) +#define R_SHIFT 11 +static inline void idct4row(int16_t *row) +{ + 
unsigned c0, c1, c2, c3; + int a0, a1, a2, a3; + + a0 = row[0]; + a1 = row[1]; + a2 = row[2]; + a3 = row[3]; + c0 = (a0 + a2)*R3 + (1 << (R_SHIFT - 1)); + c2 = (a0 - a2)*R3 + (1 << (R_SHIFT - 1)); + c1 = a1 * R1 + a3 * R2; + c3 = a1 * R2 - a3 * R1; + row[0]= (c0 + c1) >> R_SHIFT; + row[1]= (c2 + c3) >> R_SHIFT; + row[2]= (c2 - c3) >> R_SHIFT; + row[3]= (c0 - c1) >> R_SHIFT; +} + +void ff_simple_idct84_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + int i; + + /* IDCT8 on each line */ + for(i=0; i<4; i++) { + idctRowCondDC_int16_8bit(block + i*8, 0); + } + + /* IDCT4 and store */ + for(i=0;i<8;i++) { + idct4col_add(dest + i, line_size, block + i); + } +} + +void ff_simple_idct48_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + int i; + + /* IDCT4 on each line */ + for(i=0; i<8; i++) { + idct4row(block + i*8); + } + + /* IDCT8 and store */ + for(i=0; i<4; i++){ + idctSparseColAdd_int16_8bit(dest + i, line_size, block + i); + } +} + +void ff_simple_idct44_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block) +{ + int i; + + /* IDCT4 on each line */ + for(i=0; i<4; i++) { + idct4row(block + i*8); + } + + /* IDCT4 and store */ + for(i=0; i<4; i++){ + idct4col_add(dest + i, line_size, block + i); + } +} + +void ff_prores_idct_10(int16_t *block, const int16_t *qmat) +{ + int i; + + for (i = 0; i < 64; i++) + block[i] *= qmat[i]; + + for (i = 0; i < 8; i++) + idctRowCondDC_extrashift_10(block + i*8, 2); + + for (i = 0; i < 8; i++) { + block[i] += 8192; + idctSparseCol_extrashift_10(block + i); + } +} + +void ff_prores_idct_12(int16_t *block, const int16_t *qmat) +{ + int i; + + for (i = 0; i < 64; i++) + block[i] *= qmat[i]; + + for (i = 0; i < 8; i++) + idctRowCondDC_int16_12bit(block + i*8, 0); + + for (i = 0; i < 8; i++) { + block[i] += 8192; + idctSparseCol_int16_12bit(block + i); + } +} diff --git a/media/ffvpx/libavcodec/simple_idct.h b/media/ffvpx/libavcodec/simple_idct.h new file mode 100644 index 0000000000..20578b3347 --- /dev/null +++ 
b/media/ffvpx/libavcodec/simple_idct.h @@ -0,0 +1,64 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * simple idct header. + */ + +#ifndef AVCODEC_SIMPLE_IDCT_H +#define AVCODEC_SIMPLE_IDCT_H + +#include <stddef.h> +#include <stdint.h> + +void ff_simple_idct_put_int16_8bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_add_int16_8bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_int16_8bit(int16_t *block); + +void ff_simple_idct_put_int16_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_add_int16_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_int16_10bit(int16_t *block); + +void ff_simple_idct_put_int32_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_add_int32_10bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_int32_10bit(int16_t *block); + +void ff_simple_idct_put_int16_12bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_add_int16_12bit(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_int16_12bit(int16_t *block); 
+ +/** + * Special version of ff_simple_idct_int16_10bit() which does dequantization + * and scales by a factor of 2 more between the two IDCTs to account + * for larger scale of input coefficients. + */ +void ff_prores_idct_10(int16_t *block, const int16_t *qmat); +void ff_prores_idct_12(int16_t *block, const int16_t *qmat); + +void ff_simple_idct248_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct84_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct48_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct44_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +#endif /* AVCODEC_SIMPLE_IDCT_H */ diff --git a/media/ffvpx/libavcodec/simple_idct_template.c b/media/ffvpx/libavcodec/simple_idct_template.c new file mode 100644 index 0000000000..5ddd0b45a2 --- /dev/null +++ b/media/ffvpx/libavcodec/simple_idct_template.c @@ -0,0 +1,371 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * simpleidct in C. + */ + +/* Based upon some commented-out C code from mpeg2dec (idct_mmx.c + * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>). 
/* Based upon some commented-out C code from mpeg2dec (idct_mmx.c
 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>). */

/*
 * Template body: simple_idct.c includes this file several times with
 * different BIT_DEPTH / IN_IDCT_DEPTH / EXTRA_SHIFT settings, which is why
 * every parameter macro is #undef'd before it is defined again below.
 */

#include "simple_idct.h"

#include "bit_depth_template.c"

#undef W1
#undef W2
#undef W3
#undef W4
#undef W5
#undef W6
#undef W7
#undef ROW_SHIFT
#undef COL_SHIFT
#undef DC_SHIFT
#undef MUL
#undef MAC

#if BIT_DEPTH == 8

#define W1 22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W2 21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W3 19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W4 16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W5 12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W6 8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W7 4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

#define ROW_SHIFT 11
#define COL_SHIFT 20
#define DC_SHIFT 3

/* 8-bit build: defer to the MUL16/MAC16 helpers (defined outside this file) */
#define MUL(a, b)    MUL16(a, b)
#define MAC(a, b, c) MAC16(a, b, c)

#elif BIT_DEPTH == 10 || BIT_DEPTH == 12

# if BIT_DEPTH == 10
#define W1 22725 // 90901
#define W2 21407 // 85627
#define W3 19265 // 77062
#define W4 16384 // 65535
#define W5 12873 // 51491
#define W6 8867  // 35468
#define W7 4520  // 18081

/* the shift set depends on which 10-bit variant is being instantiated */
#   ifdef EXTRA_SHIFT
#define ROW_SHIFT 13
#define COL_SHIFT 18
#define DC_SHIFT 1
#   elif IN_IDCT_DEPTH == 32
#define ROW_SHIFT 13
#define COL_SHIFT 21
#define DC_SHIFT 2
#   else
#define ROW_SHIFT 12
#define COL_SHIFT 19
#define DC_SHIFT 2
#   endif

# else
#define W1 45451
#define W2 42813
#define W3 38531
#define W4 32767
#define W5 25746
#define W6 17734
#define W7 9041

#define ROW_SHIFT 16
#define COL_SHIFT 17
#define DC_SHIFT -1
# endif

/* high bit depth build: multiply through SUINT (declared outside this file) */
#define MUL(a, b)    ((int)((SUINT)(a) * (b)))
#define MAC(a, b, c) ((a) += (SUINT)(b) * (c))

#else

#error "Unsupported bitdepth"

#endif

/*
 * In-place 8-point IDCT of one row.
 *
 * The result of every output is shifted right by ROW_SHIFT + extra_shift.
 * For 16-bit input there is a shortcut: when the words read from the row
 * show no non-zero coefficient other than row[0], all eight outputs are set
 * to the DC value scaled by DC_SHIFT - extra_shift and the function returns
 * early.
 *
 * With EXTRA_SHIFT defined only the _extrashift name is emitted; both
 * signatures share the body below.
 */
#ifdef EXTRA_SHIFT
static inline void FUNC(idctRowCondDC_extrashift)(int16_t *row, int extra_shift)
#else
static inline void FUNC6(idctRowCondDC)(idctin *row, int extra_shift)
#endif
{
    SUINT a0, a1, a2, a3, b0, b1, b2, b3;

// TODO: Add DC-only support for int32_t input
#if IN_IDCT_DEPTH == 16
#if HAVE_FAST_64BIT
/* selects the 16 bits holding row[0] inside the first 64-bit word;
 * the shift is 48 on big-endian builds and 0 otherwise */
#define ROW0_MASK (0xffffULL << 48 * HAVE_BIGENDIAN)
    if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) {
        uint64_t temp;
        if (DC_SHIFT - extra_shift >= 0) {
            temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
        } else {
            /* negative net shift: round, then shift right instead */
            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
        }
        /* replicate the 16-bit value into all four lanes of the word */
        temp += temp * (1 << 16);
        temp += temp * ((uint64_t) 1 << 32);
        AV_WN64A(row, temp);
        AV_WN64A(row + 4, temp);
        return;
    }
#else
    if (!(AV_RN32A(row+2) |
          AV_RN32A(row+4) |
          AV_RN32A(row+6) |
          row[1])) {
        uint32_t temp;
        if (DC_SHIFT - extra_shift >= 0) {
            temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff;
        } else {
            temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff;
        }
        /* replicate the 16-bit value into both halves of the word */
        temp += temp * (1 << 16);
        AV_WN32A(row, temp);
        AV_WN32A(row+2, temp);
        AV_WN32A(row+4, temp);
        AV_WN32A(row+6, temp);
        return;
    }
#endif
#endif

    /* even part: DC term plus rounding constant, then row[2] */
    a0 = ((SUINT)W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1));
    a1 = a0;
    a2 = a0;
    a3 = a0;

    a0 += (SUINT)W2 * row[2];
    a1 += (SUINT)W6 * row[2];
    a2 -= (SUINT)W6 * row[2];
    a3 -= (SUINT)W2 * row[2];

    /* odd part from row[1] and row[3] */
    b0 = MUL(W1, row[1]);
    MAC(b0, W3, row[3]);
    b1 = MUL(W3, row[1]);
    MAC(b1, -W7, row[3]);
    b2 = MUL(W5, row[1]);
    MAC(b2, -W1, row[3]);
    b3 = MUL(W7, row[1]);
    MAC(b3, -W5, row[3]);

    /* the row[4..7] contributions are skipped when the test below reads
     * zero (presumably AV_RN64A is an aligned 64-bit load - confirm) */
#if IN_IDCT_DEPTH == 32
    if (AV_RN64A(row + 4) | AV_RN64A(row + 6)) {
#else
    if (AV_RN64A(row + 4)) {
#endif
        a0 +=   (SUINT)  W4*row[4] + (SUINT)W6*row[6];
        a1 +=   (SUINT)- W4*row[4] - (SUINT)W2*row[6];
        a2 +=   (SUINT)- W4*row[4] + (SUINT)W2*row[6];
        a3 +=   (SUINT)  W4*row[4] - (SUINT)W6*row[6];

        MAC(b0,  W5, row[5]);
        MAC(b0,  W7, row[7]);

        MAC(b1, -W1, row[5]);
        MAC(b1, -W5, row[7]);

        MAC(b2,  W7, row[5]);
        MAC(b2,  W3, row[7]);

        MAC(b3,  W3, row[5]);
        MAC(b3, -W1, row[7]);
    }

    /* outputs k and 7-k are the sum and difference of the same a/b pair */
    row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift);
    row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift);
    row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift);
    row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift);
    row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift);
    row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift);
    row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift);
    row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift);
}

/*
 * Shared column pass: computes a0..a3 (even part, including the rounding
 * term) and b0..b3 (odd part) from col[8*0] .. col[8*7]. The terms for
 * col[8*4] .. col[8*7] are only accumulated when that coefficient is
 * non-zero. Expects a0..a3, b0..b3 and col in the enclosing scope.
 */
#define IDCT_COLS do {                                  \
        a0 = (SUINT)W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); \
        a1 = a0;                                        \
        a2 = a0;                                        \
        a3 = a0;                                        \
                                                        \
        a0 += (SUINT) W2*col[8*2];                      \
        a1 += (SUINT) W6*col[8*2];                      \
        a2 += (SUINT)-W6*col[8*2];                      \
        a3 += (SUINT)-W2*col[8*2];                      \
                                                        \
        b0 = MUL(W1, col[8*1]);                         \
        b1 = MUL(W3, col[8*1]);                         \
        b2 = MUL(W5, col[8*1]);                         \
        b3 = MUL(W7, col[8*1]);                         \
                                                        \
        MAC(b0,  W3, col[8*3]);                         \
        MAC(b1, -W7, col[8*3]);                         \
        MAC(b2, -W1, col[8*3]);                         \
        MAC(b3, -W5, col[8*3]);                         \
                                                        \
        if (col[8*4]) {                                 \
            a0 += (SUINT) W4*col[8*4];                  \
            a1 += (SUINT)-W4*col[8*4];                  \
            a2 += (SUINT)-W4*col[8*4];                  \
            a3 += (SUINT) W4*col[8*4];                  \
        }                                               \
                                                        \
        if (col[8*5]) {                                 \
            MAC(b0,  W5, col[8*5]);                     \
            MAC(b1, -W1, col[8*5]);                     \
            MAC(b2,  W7, col[8*5]);                     \
            MAC(b3,  W3, col[8*5]);                     \
        }                                               \
                                                        \
        if (col[8*6]) {                                 \
            a0 += (SUINT) W6*col[8*6];                  \
            a1 += (SUINT)-W2*col[8*6];                  \
            a2 += (SUINT) W2*col[8*6];                  \
            a3 += (SUINT)-W6*col[8*6];                  \
        }                                               \
                                                        \
        if (col[8*7]) {                                 \
            MAC(b0,  W7, col[8*7]);                     \
            MAC(b1, -W5, col[8*7]);                     \
            MAC(b2,  W3, col[8*7]);                     \
            MAC(b3, -W1, col[8*7]);                     \
        }                                               \
    } while (0)

/*
 * With EXTRA_SHIFT defined, only the signature on the next line is compiled
 * and it takes the body that follows the matching #endif further down; the
 * Put and Add variants are skipped entirely.
 */
#ifdef EXTRA_SHIFT
static inline void FUNC(idctSparseCol_extrashift)(int16_t *col)
#else
/* Column IDCT; stores eight clipped pixels spaced line_size apart. */
static inline void FUNC6(idctSparseColPut)(pixel *dest, ptrdiff_t line_size,
                                          idctin *col)
{
    SUINT a0, a1, a2, a3, b0, b1, b2, b3;

    IDCT_COLS;

    dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT);
    dest += line_size;
    dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT);
    dest += line_size;
    dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT);
    dest += line_size;
    dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT);
    dest += line_size;
    dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT);
    dest += line_size;
    dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT);
    dest += line_size;
    dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT);
    dest += line_size;
    dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT);
}

/* Column IDCT; adds each result to the pixel already in dest, then clips. */
static inline void FUNC6(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size,
                                          idctin *col)
{
    unsigned a0, a1, a2, a3, b0, b1, b2, b3;

    IDCT_COLS;

    dest[0] = av_clip_pixel(dest[0] + ((int)(a0 + b0) >> COL_SHIFT));
    dest += line_size;
    dest[0] = av_clip_pixel(dest[0] + ((int)(a1 + b1) >> COL_SHIFT));
    dest += line_size;
    dest[0] = av_clip_pixel(dest[0] + ((int)(a2 + b2) >> COL_SHIFT));
    dest += line_size;
    dest[0] = av_clip_pixel(dest[0] + ((int)(a3 + b3) >> COL_SHIFT));
    dest += line_size;
    dest[0] = av_clip_pixel(dest[0] + ((int)(a3 - b3) >> COL_SHIFT));
    dest += line_size;
    dest[0] = av_clip_pixel(dest[0] + ((int)(a2 - b2) >> COL_SHIFT));
    dest += line_size;
    dest[0] = av_clip_pixel(dest[0] + ((int)(a1 - b1) >> COL_SHIFT));
    dest += line_size;
    dest[0] = av_clip_pixel(dest[0] + ((int)(a0 - b0) >> COL_SHIFT));
}

/* Column IDCT; writes the eight results back into the column, unclipped. */
static inline void FUNC6(idctSparseCol)(idctin *col)
#endif
{
    unsigned a0, a1, a2, a3, b0, b1, b2, b3;

    IDCT_COLS;

    col[0 ] = ((int)(a0 + b0) >> COL_SHIFT);
    col[8 ] = ((int)(a1 + b1) >> COL_SHIFT);
    col[16] = ((int)(a2 + b2) >> COL_SHIFT);
    col[24] = ((int)(a3 + b3) >> COL_SHIFT);
    col[32] = ((int)(a3 - b3) >> COL_SHIFT);
    col[40] = ((int)(a2 - b2) >> COL_SHIFT);
    col[48] = ((int)(a1 - b1) >> COL_SHIFT);
    col[56] = ((int)(a0 - b0) >> COL_SHIFT);
}

/* The public entry points are not generated for the EXTRA_SHIFT pass. */
#ifndef EXTRA_SHIFT
/* 8x8 IDCT of block_ (rows first, then columns), stored as pixels in dest_. */
void FUNC6(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block_)
{
    idctin *block = (idctin *)block_;
    pixel *dest = (pixel *)dest_;
    int i;

    /* from here on line_size counts pixels rather than bytes */
    line_size /= sizeof(pixel);

    for (i = 0; i < 8; i++)
        FUNC6(idctRowCondDC)(block + i*8, 0);

    for (i = 0; i < 8; i++)
        FUNC6(idctSparseColPut)(dest + i, line_size, block + i);
}

/* The add and in-place variants exist for 16-bit input only. */
#if IN_IDCT_DEPTH == 16
/* 8x8 IDCT of block, added to the pixels already present in dest_. */
void FUNC6(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block)
{
    pixel *dest = (pixel *)dest_;
    int i;

    /* from here on line_size counts pixels rather than bytes */
    line_size /= sizeof(pixel);

    for (i = 0; i < 8; i++)
        FUNC6(idctRowCondDC)(block + i*8, 0);

    for (i = 0; i < 8; i++)
        FUNC6(idctSparseColAdd)(dest + i, line_size, block + i);
}

/* In-place 8x8 IDCT: the result overwrites block. */
void FUNC6(ff_simple_idct)(int16_t *block)
{
    int i;

    for (i = 0; i < 8; i++)
        FUNC6(idctRowCondDC)(block + i*8, 0);

    for (i = 0; i < 8; i++)
        FUNC6(idctSparseCol)(block + i);
}
#endif
#endif
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Accelerated start code search function for start codes common to + * MPEG-1/2/4 video, VC-1, H.264/5 + */ + +#ifndef AVCODEC_STARTCODE_H +#define AVCODEC_STARTCODE_H + +#include <stdint.h> + +const uint8_t *avpriv_find_start_code(const uint8_t *p, + const uint8_t *end, + uint32_t *state); + +int ff_startcode_find_candidate_c(const uint8_t *buf, int size); + +#endif /* AVCODEC_STARTCODE_H */ diff --git a/media/ffvpx/libavcodec/thread.h b/media/ffvpx/libavcodec/thread.h new file mode 100644 index 0000000000..88a14cfeb1 --- /dev/null +++ b/media/ffvpx/libavcodec/thread.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2008 Alexander Strange <astrange@ithinksw.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Multithreading support functions + * @author Alexander Strange <astrange@ithinksw.com> + */ + +#ifndef AVCODEC_THREAD_H +#define AVCODEC_THREAD_H + +#include "libavutil/buffer.h" + +#include "avcodec.h" + +/** + * Wait for decoding threads to finish and reset internal state. 
+ * Called by avcodec_flush_buffers(). + * + * @param avctx The context. + */ +void ff_thread_flush(AVCodecContext *avctx); + +/** + * Submit a new frame to a decoding thread. + * Returns the next available frame in picture. *got_picture_ptr + * will be 0 if none is available. + * The return value on success is the size of the consumed packet for + * compatibility with FFCodec.decode. This means the decoder + * has to consume the full packet. + * + * Parameters are the same as FFCodec.decode. + */ +int ff_thread_decode_frame(AVCodecContext *avctx, AVFrame *picture, + int *got_picture_ptr, AVPacket *avpkt); + +/** + * If the codec defines update_thread_context(), call this + * when they are ready for the next thread to start decoding + * the next frame. After calling it, do not change any variables + * read by the update_thread_context() method, or call ff_thread_get_buffer(). + * + * @param avctx The context. + */ +void ff_thread_finish_setup(AVCodecContext *avctx); + +#define ff_thread_get_format ff_get_format + +/** + * Wrapper around get_buffer() for frame-multithreaded codecs. + * Call this function instead of ff_get_buffer(f). + * Cannot be called after the codec has called ff_thread_finish_setup(). + * + * @param avctx The current context. + * @param f The frame to write into. + */ +int ff_thread_get_buffer(AVCodecContext *avctx, AVFrame *f, int flags); + +/** + * Wrapper around release_buffer() for frame-multithreaded codecs. + * Call this function instead of avctx->release_buffer(f). + * The AVFrame will be copied and the actual release_buffer() call + * will be performed later. The contents of data pointed to by the + * AVFrame should not be changed until ff_thread_get_buffer() is called + * on it. + * + * @param avctx The current context. + * @param f The picture being released. 
+ */ +void ff_thread_release_buffer(AVCodecContext *avctx, AVFrame *f); + +int ff_thread_init(AVCodecContext *s); +int ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx, + int (*action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr), + int (*main_func)(AVCodecContext *c), void *arg, int *ret, int job_count); +void ff_thread_free(AVCodecContext *s); +int ff_slice_thread_allocz_entries(AVCodecContext *avctx, int count); +int ff_slice_thread_init_progress(AVCodecContext *avctx); +void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n); +void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift); + +#endif /* AVCODEC_THREAD_H */ diff --git a/media/ffvpx/libavcodec/threadframe.h b/media/ffvpx/libavcodec/threadframe.h new file mode 100644 index 0000000000..d2f93c5cd0 --- /dev/null +++ b/media/ffvpx/libavcodec/threadframe.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2022 Andreas Rheinhardt <andreas.rheinhardt@outlook.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/*
 * Copyright (c) 2022 Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_THREADFRAME_H
#define AVCODEC_THREADFRAME_H

#include "libavutil/frame.h"
#include "avcodec.h"

/**
 * An AVFrame bundled with the per-field progress counters that the
 * report/await functions below operate on.
 */
typedef struct ThreadFrame {
    AVFrame *f;
    /* NOTE(review): two owner contexts, presumably one per field like
     * progress below - confirm against the implementation */
    AVCodecContext *owner[2];
    // progress->data is an array of 2 ints holding progress for top/bottom
    // fields
    AVBufferRef *progress;
} ThreadFrame;

/**
 * Notify later decoding threads when part of their reference picture is ready.
 * Call this when some part of the picture is finished decoding.
 * Later calls with lower values of progress have no effect.
 *
 * @param f The picture being decoded.
 * @param progress Value, in arbitrary units, of how much of the picture has decoded.
 * @param field The field being decoded, for field-picture codecs.
 * 0 for top field or frame pictures, 1 for bottom field.
 */
void ff_thread_report_progress(ThreadFrame *f, int progress, int field);

/**
 * Wait for earlier decoding threads to finish reference pictures.
 * Call this before accessing some part of a picture, with a given
 * value for progress, and it will return after the responsible decoding
 * thread calls ff_thread_report_progress() with the same or
 * higher value for progress.
 *
 * @param f The picture being referenced.
 * @param progress Value, in arbitrary units, to wait for.
 * @param field The field being referenced, for field-picture codecs.
 * 0 for top field or frame pictures, 1 for bottom field.
 */
void ff_thread_await_progress(const ThreadFrame *f, int progress, int field);

/**
 * Wrapper around ff_get_buffer() for frame-multithreaded codecs.
 * Call this function instead of ff_get_buffer() if you might need
 * to wait for progress on this frame.
 * Cannot be called after the codec has called ff_thread_finish_setup().
 *
 * @param avctx The current context.
 * @param f The frame to write into.
 * @note: It is fine to call this with codecs that do not support
 *        frame threading.
 */
int ff_thread_get_ext_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags);

/**
 * Unref a ThreadFrame.
 *
 * This is basically a wrapper around av_frame_unref() and should
 * be called instead of it.
 *
 * @param avctx The current context.
 * @param f The picture being released.
 */
void ff_thread_release_ext_buffer(AVCodecContext *avctx, ThreadFrame *f);

/* Reference src from dst (no further contract is visible in this header). */
int ff_thread_ref_frame(ThreadFrame *dst, const ThreadFrame *src);

int ff_thread_can_start_frame(AVCodecContext *avctx);

#endif
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_UNARY_H +#define AVCODEC_UNARY_H + +#include "get_bits.h" + +/** + * Get unary code of limited length + * @param gb GetBitContext + * @param[in] stop The bitstop value (unary code of 1's or 0's) + * @param[in] len Maximum length + * @return unary 0 based code index. This is also the length in bits of the + * code excluding the stop bit. + * (in case len=1) + * 1 0 + * 0 1 + * (in case len=2) + * 1 0 + * 01 1 + * 00 2 + * (in case len=3) + * 1 0 + * 01 1 + * 001 2 + * 000 3 + */ +static inline int get_unary(GetBitContext *gb, int stop, int len) +{ + int i; + + for(i = 0; i < len && get_bits1(gb) != stop; i++); + return i; +} + +/** + * Get unary code terminated by a 0 with a maximum length of 33 + * @param gb GetBitContext + * @return Unary length/index + */ +static inline int get_unary_0_33(GetBitContext *gb) +{ + return get_unary(gb, 0, 33); +} + +static inline int get_unary_0_9(GetBitContext *gb) +{ + return get_unary(gb, 0, 9); +} + +#endif /* AVCODEC_UNARY_H */ diff --git a/media/ffvpx/libavcodec/utils.c b/media/ffvpx/libavcodec/utils.c new file mode 100644 index 0000000000..599da21dba --- /dev/null +++ b/media/ffvpx/libavcodec/utils.c @@ -0,0 +1,1170 @@ +/* + * utils for libavcodec + * Copyright (c) 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * utils. + */ + +#include "config.h" +#include "libavutil/avassert.h" +#include "libavutil/channel_layout.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem.h" +#include "libavutil/pixdesc.h" +#include "libavutil/imgutils.h" +#include "libavutil/pixfmt.h" +#include "avcodec.h" +#include "codec.h" +#include "codec_internal.h" +#include "decode.h" +#include "hwconfig.h" +#include "thread.h" +#include "threadframe.h" +#include "internal.h" +#include "put_bits.h" +#include "startcode.h" +#include <stdlib.h> +#include <limits.h> + +void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size) +{ + uint8_t **p = ptr; + if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) { + av_freep(p); + *size = 0; + return; + } + av_fast_mallocz(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (*p) + memset(*p + min_size, 0, AV_INPUT_BUFFER_PADDING_SIZE); +} + +void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size) +{ + uint8_t **p = ptr; + if (min_size > SIZE_MAX - AV_INPUT_BUFFER_PADDING_SIZE) { + av_freep(p); + *size = 0; + return; + } + av_fast_malloc(p, size, min_size + AV_INPUT_BUFFER_PADDING_SIZE); + if (*p) + memset(*p, 0, min_size + AV_INPUT_BUFFER_PADDING_SIZE); +} + +int av_codec_is_encoder(const AVCodec *avcodec) +{ + const FFCodec *const codec = ffcodec(avcodec); + return codec && (codec->cb_type == FF_CODEC_CB_TYPE_ENCODE || + codec->cb_type == FF_CODEC_CB_TYPE_ENCODE_SUB || + codec->cb_type == 
FF_CODEC_CB_TYPE_RECEIVE_PACKET); +} + +int av_codec_is_decoder(const AVCodec *avcodec) +{ + const FFCodec *const codec = ffcodec(avcodec); + return codec && (codec->cb_type == FF_CODEC_CB_TYPE_DECODE || + codec->cb_type == FF_CODEC_CB_TYPE_DECODE_SUB || + codec->cb_type == FF_CODEC_CB_TYPE_RECEIVE_FRAME); +} + +int ff_set_dimensions(AVCodecContext *s, int width, int height) +{ + int ret = av_image_check_size2(width, height, s->max_pixels, AV_PIX_FMT_NONE, 0, s); + + if (ret < 0) + width = height = 0; + + s->coded_width = width; + s->coded_height = height; + s->width = AV_CEIL_RSHIFT(width, s->lowres); + s->height = AV_CEIL_RSHIFT(height, s->lowres); + + return ret; +} + +int ff_set_sar(AVCodecContext *avctx, AVRational sar) +{ + int ret = av_image_check_sar(avctx->width, avctx->height, sar); + + if (ret < 0) { + av_log(avctx, AV_LOG_WARNING, "ignoring invalid SAR: %d/%d\n", + sar.num, sar.den); + avctx->sample_aspect_ratio = (AVRational){ 0, 1 }; + return ret; + } else { + avctx->sample_aspect_ratio = sar; + } + return 0; +} + +int ff_side_data_update_matrix_encoding(AVFrame *frame, + enum AVMatrixEncoding matrix_encoding) +{ + AVFrameSideData *side_data; + enum AVMatrixEncoding *data; + + side_data = av_frame_get_side_data(frame, AV_FRAME_DATA_MATRIXENCODING); + if (!side_data) + side_data = av_frame_new_side_data(frame, AV_FRAME_DATA_MATRIXENCODING, + sizeof(enum AVMatrixEncoding)); + + if (!side_data) + return AVERROR(ENOMEM); + + data = (enum AVMatrixEncoding*)side_data->data; + *data = matrix_encoding; + + return 0; +} + +void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, + int linesize_align[AV_NUM_DATA_POINTERS]) +{ + int i; + int w_align = 1; + int h_align = 1; + AVPixFmtDescriptor const *desc = av_pix_fmt_desc_get(s->pix_fmt); + + if (desc) { + w_align = 1 << desc->log2_chroma_w; + h_align = 1 << desc->log2_chroma_h; + } + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUYV422: + case AV_PIX_FMT_YVYU422: + 
case AV_PIX_FMT_UYVY422: + case AV_PIX_FMT_YUV422P: + case AV_PIX_FMT_YUV440P: + case AV_PIX_FMT_YUV444P: + case AV_PIX_FMT_GBRP: + case AV_PIX_FMT_GBRAP: + case AV_PIX_FMT_GRAY8: + case AV_PIX_FMT_GRAY16BE: + case AV_PIX_FMT_GRAY16LE: + case AV_PIX_FMT_YUVJ420P: + case AV_PIX_FMT_YUVJ422P: + case AV_PIX_FMT_YUVJ440P: + case AV_PIX_FMT_YUVJ444P: + case AV_PIX_FMT_YUVA420P: + case AV_PIX_FMT_YUVA422P: + case AV_PIX_FMT_YUVA444P: + case AV_PIX_FMT_YUV420P9LE: + case AV_PIX_FMT_YUV420P9BE: + case AV_PIX_FMT_YUV420P10LE: + case AV_PIX_FMT_YUV420P10BE: + case AV_PIX_FMT_YUV420P12LE: + case AV_PIX_FMT_YUV420P12BE: + case AV_PIX_FMT_YUV420P14LE: + case AV_PIX_FMT_YUV420P14BE: + case AV_PIX_FMT_YUV420P16LE: + case AV_PIX_FMT_YUV420P16BE: + case AV_PIX_FMT_YUVA420P9LE: + case AV_PIX_FMT_YUVA420P9BE: + case AV_PIX_FMT_YUVA420P10LE: + case AV_PIX_FMT_YUVA420P10BE: + case AV_PIX_FMT_YUVA420P16LE: + case AV_PIX_FMT_YUVA420P16BE: + case AV_PIX_FMT_YUV422P9LE: + case AV_PIX_FMT_YUV422P9BE: + case AV_PIX_FMT_YUV422P10LE: + case AV_PIX_FMT_YUV422P10BE: + case AV_PIX_FMT_YUV422P12LE: + case AV_PIX_FMT_YUV422P12BE: + case AV_PIX_FMT_YUV422P14LE: + case AV_PIX_FMT_YUV422P14BE: + case AV_PIX_FMT_YUV422P16LE: + case AV_PIX_FMT_YUV422P16BE: + case AV_PIX_FMT_YUVA422P9LE: + case AV_PIX_FMT_YUVA422P9BE: + case AV_PIX_FMT_YUVA422P10LE: + case AV_PIX_FMT_YUVA422P10BE: + case AV_PIX_FMT_YUVA422P12LE: + case AV_PIX_FMT_YUVA422P12BE: + case AV_PIX_FMT_YUVA422P16LE: + case AV_PIX_FMT_YUVA422P16BE: + case AV_PIX_FMT_YUV440P10LE: + case AV_PIX_FMT_YUV440P10BE: + case AV_PIX_FMT_YUV440P12LE: + case AV_PIX_FMT_YUV440P12BE: + case AV_PIX_FMT_YUV444P9LE: + case AV_PIX_FMT_YUV444P9BE: + case AV_PIX_FMT_YUV444P10LE: + case AV_PIX_FMT_YUV444P10BE: + case AV_PIX_FMT_YUV444P12LE: + case AV_PIX_FMT_YUV444P12BE: + case AV_PIX_FMT_YUV444P14LE: + case AV_PIX_FMT_YUV444P14BE: + case AV_PIX_FMT_YUV444P16LE: + case AV_PIX_FMT_YUV444P16BE: + case AV_PIX_FMT_YUVA444P9LE: + case AV_PIX_FMT_YUVA444P9BE: + case 
AV_PIX_FMT_YUVA444P10LE: + case AV_PIX_FMT_YUVA444P10BE: + case AV_PIX_FMT_YUVA444P12LE: + case AV_PIX_FMT_YUVA444P12BE: + case AV_PIX_FMT_YUVA444P16LE: + case AV_PIX_FMT_YUVA444P16BE: + case AV_PIX_FMT_GBRP9LE: + case AV_PIX_FMT_GBRP9BE: + case AV_PIX_FMT_GBRP10LE: + case AV_PIX_FMT_GBRP10BE: + case AV_PIX_FMT_GBRP12LE: + case AV_PIX_FMT_GBRP12BE: + case AV_PIX_FMT_GBRP14LE: + case AV_PIX_FMT_GBRP14BE: + case AV_PIX_FMT_GBRP16LE: + case AV_PIX_FMT_GBRP16BE: + case AV_PIX_FMT_GBRAP12LE: + case AV_PIX_FMT_GBRAP12BE: + case AV_PIX_FMT_GBRAP16LE: + case AV_PIX_FMT_GBRAP16BE: + w_align = 16; //FIXME assume 16 pixel per macroblock + h_align = 16 * 2; // interlaced needs 2 macroblocks height + if (s->codec_id == AV_CODEC_ID_BINKVIDEO) + w_align = 16*2; + break; + case AV_PIX_FMT_YUV411P: + case AV_PIX_FMT_YUVJ411P: + case AV_PIX_FMT_UYYVYY411: + w_align = 32; + h_align = 16 * 2; + break; + case AV_PIX_FMT_YUV410P: + if (s->codec_id == AV_CODEC_ID_SVQ1) { + w_align = 64; + h_align = 64; + } + break; + case AV_PIX_FMT_RGB555: + if (s->codec_id == AV_CODEC_ID_RPZA) { + w_align = 4; + h_align = 4; + } + if (s->codec_id == AV_CODEC_ID_INTERPLAY_VIDEO) { + w_align = 8; + h_align = 8; + } + break; + case AV_PIX_FMT_PAL8: + case AV_PIX_FMT_BGR8: + case AV_PIX_FMT_RGB8: + if (s->codec_id == AV_CODEC_ID_SMC || + s->codec_id == AV_CODEC_ID_CINEPAK) { + w_align = 4; + h_align = 4; + } + if (s->codec_id == AV_CODEC_ID_JV || + s->codec_id == AV_CODEC_ID_ARGO || + s->codec_id == AV_CODEC_ID_INTERPLAY_VIDEO) { + w_align = 8; + h_align = 8; + } + if (s->codec_id == AV_CODEC_ID_MJPEG || + s->codec_id == AV_CODEC_ID_MJPEGB || + s->codec_id == AV_CODEC_ID_LJPEG || + s->codec_id == AV_CODEC_ID_SMVJPEG || + s->codec_id == AV_CODEC_ID_AMV || + s->codec_id == AV_CODEC_ID_SP5X || + s->codec_id == AV_CODEC_ID_JPEGLS) { + w_align = 8; + h_align = 2*8; + } + break; + case AV_PIX_FMT_BGR24: + if ((s->codec_id == AV_CODEC_ID_MSZH) || + (s->codec_id == AV_CODEC_ID_ZLIB)) { + w_align = 4; + h_align = 
4; + } + break; + case AV_PIX_FMT_RGB24: + if (s->codec_id == AV_CODEC_ID_CINEPAK) { + w_align = 4; + h_align = 4; + } + break; + case AV_PIX_FMT_BGR0: + if (s->codec_id == AV_CODEC_ID_ARGO) { + w_align = 8; + h_align = 8; + } + break; + default: + break; + } + + if (s->codec_id == AV_CODEC_ID_IFF_ILBM) { + w_align = FFMAX(w_align, 8); + } + + *width = FFALIGN(*width, w_align); + *height = FFALIGN(*height, h_align); + if (s->codec_id == AV_CODEC_ID_H264 || s->lowres || + s->codec_id == AV_CODEC_ID_VC1 || s->codec_id == AV_CODEC_ID_WMV3 || + s->codec_id == AV_CODEC_ID_VP5 || s->codec_id == AV_CODEC_ID_VP6 || + s->codec_id == AV_CODEC_ID_VP6F || s->codec_id == AV_CODEC_ID_VP6A + ) { + // some of the optimized chroma MC reads one line too much + // which is also done in mpeg decoders with lowres > 0 + *height += 2; + + // H.264 uses edge emulation for out of frame motion vectors, for this + // it requires a temporary area large enough to hold a 21x21 block, + // increasing witdth ensure that the temporary area is large enough, + // the next rounded up width is 32 + *width = FFMAX(*width, 32); + } + if (s->codec_id == AV_CODEC_ID_SVQ3) { + *width = FFMAX(*width, 32); + } + + for (i = 0; i < 4; i++) + linesize_align[i] = STRIDE_ALIGN; +} + +void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(s->pix_fmt); + int chroma_shift = desc->log2_chroma_w; + int linesize_align[AV_NUM_DATA_POINTERS]; + int align; + + avcodec_align_dimensions2(s, width, height, linesize_align); + align = FFMAX(linesize_align[0], linesize_align[3]); + linesize_align[1] <<= chroma_shift; + linesize_align[2] <<= chroma_shift; + align = FFMAX3(align, linesize_align[1], linesize_align[2]); + *width = FFALIGN(*width, align); +} +#if FF_API_AVCODEC_CHROMA_POS +int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos) +{ + return av_chroma_location_enum_to_pos(xpos, ypos, pos); +} + +enum AVChromaLocation 
avcodec_chroma_pos_to_enum(int xpos, int ypos) +{ + return av_chroma_location_pos_to_enum(xpos, ypos); +} +#endif + +int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, + enum AVSampleFormat sample_fmt, const uint8_t *buf, + int buf_size, int align) +{ + int ch, planar, needed_size, ret = 0; + + needed_size = av_samples_get_buffer_size(NULL, nb_channels, + frame->nb_samples, sample_fmt, + align); + if (buf_size < needed_size) + return AVERROR(EINVAL); + + planar = av_sample_fmt_is_planar(sample_fmt); + if (planar && nb_channels > AV_NUM_DATA_POINTERS) { + if (!FF_ALLOCZ_TYPED_ARRAY(frame->extended_data, nb_channels)) + return AVERROR(ENOMEM); + } else { + frame->extended_data = frame->data; + } + + if ((ret = av_samples_fill_arrays(frame->extended_data, &frame->linesize[0], + (uint8_t *)(intptr_t)buf, nb_channels, frame->nb_samples, + sample_fmt, align)) < 0) { + if (frame->extended_data != frame->data) + av_freep(&frame->extended_data); + return ret; + } + if (frame->extended_data != frame->data) { + for (ch = 0; ch < AV_NUM_DATA_POINTERS; ch++) + frame->data[ch] = frame->extended_data[ch]; + } + + return ret; +} + +void ff_color_frame(AVFrame *frame, const int c[4]) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + int p, y; + + av_assert0(desc->flags & AV_PIX_FMT_FLAG_PLANAR); + + for (p = 0; p<desc->nb_components; p++) { + uint8_t *dst = frame->data[p]; + int is_chroma = p == 1 || p == 2; + int bytes = is_chroma ? AV_CEIL_RSHIFT(frame->width, desc->log2_chroma_w) : frame->width; + int height = is_chroma ? 
AV_CEIL_RSHIFT(frame->height, desc->log2_chroma_h) : frame->height; + if (desc->comp[0].depth >= 9) { + ((uint16_t*)dst)[0] = c[p]; + av_memcpy_backptr(dst + 2, 2, bytes - 2); + dst += frame->linesize[p]; + for (y = 1; y < height; y++) { + memcpy(dst, frame->data[p], 2*bytes); + dst += frame->linesize[p]; + } + } else { + for (y = 0; y < height; y++) { + memset(dst, c[p], bytes); + dst += frame->linesize[p]; + } + } + } +} + +int avpriv_codec_get_cap_skip_frame_fill_param(const AVCodec *codec){ + return !!(ffcodec(codec)->caps_internal & FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM); +} + +const char *avcodec_get_name(enum AVCodecID id) +{ + const AVCodecDescriptor *cd; + const AVCodec *codec; + + if (id == AV_CODEC_ID_NONE) + return "none"; + cd = avcodec_descriptor_get(id); + if (cd) + return cd->name; + av_log(NULL, AV_LOG_WARNING, "Codec 0x%x is not in the full list.\n", id); + codec = avcodec_find_decoder(id); + if (codec) + return codec->name; + codec = avcodec_find_encoder(id); + if (codec) + return codec->name; + return "unknown_codec"; +} + +const char *av_get_profile_name(const AVCodec *codec, int profile) +{ + const AVProfile *p; + if (profile == FF_PROFILE_UNKNOWN || !codec->profiles) + return NULL; + + for (p = codec->profiles; p->profile != FF_PROFILE_UNKNOWN; p++) + if (p->profile == profile) + return p->name; + + return NULL; +} + +const char *avcodec_profile_name(enum AVCodecID codec_id, int profile) +{ + const AVCodecDescriptor *desc = avcodec_descriptor_get(codec_id); + const AVProfile *p; + + if (profile == FF_PROFILE_UNKNOWN || !desc || !desc->profiles) + return NULL; + + for (p = desc->profiles; p->profile != FF_PROFILE_UNKNOWN; p++) + if (p->profile == profile) + return p->name; + + return NULL; +} + +int av_get_exact_bits_per_sample(enum AVCodecID codec_id) +{ + switch (codec_id) { + case AV_CODEC_ID_DFPWM: + return 1; + case AV_CODEC_ID_8SVX_EXP: + case AV_CODEC_ID_8SVX_FIB: + case AV_CODEC_ID_ADPCM_ARGO: + case AV_CODEC_ID_ADPCM_CT: + case 
AV_CODEC_ID_ADPCM_IMA_ALP: + case AV_CODEC_ID_ADPCM_IMA_AMV: + case AV_CODEC_ID_ADPCM_IMA_APC: + case AV_CODEC_ID_ADPCM_IMA_APM: + case AV_CODEC_ID_ADPCM_IMA_EA_SEAD: + case AV_CODEC_ID_ADPCM_IMA_OKI: + case AV_CODEC_ID_ADPCM_IMA_WS: + case AV_CODEC_ID_ADPCM_IMA_SSI: + case AV_CODEC_ID_ADPCM_G722: + case AV_CODEC_ID_ADPCM_YAMAHA: + case AV_CODEC_ID_ADPCM_AICA: + return 4; + case AV_CODEC_ID_DSD_LSBF: + case AV_CODEC_ID_DSD_MSBF: + case AV_CODEC_ID_DSD_LSBF_PLANAR: + case AV_CODEC_ID_DSD_MSBF_PLANAR: + case AV_CODEC_ID_PCM_ALAW: + case AV_CODEC_ID_PCM_MULAW: + case AV_CODEC_ID_PCM_VIDC: + case AV_CODEC_ID_PCM_S8: + case AV_CODEC_ID_PCM_S8_PLANAR: + case AV_CODEC_ID_PCM_SGA: + case AV_CODEC_ID_PCM_U8: + case AV_CODEC_ID_SDX2_DPCM: + case AV_CODEC_ID_CBD2_DPCM: + case AV_CODEC_ID_DERF_DPCM: + case AV_CODEC_ID_WADY_DPCM: + return 8; + case AV_CODEC_ID_PCM_S16BE: + case AV_CODEC_ID_PCM_S16BE_PLANAR: + case AV_CODEC_ID_PCM_S16LE: + case AV_CODEC_ID_PCM_S16LE_PLANAR: + case AV_CODEC_ID_PCM_U16BE: + case AV_CODEC_ID_PCM_U16LE: + return 16; + case AV_CODEC_ID_PCM_S24DAUD: + case AV_CODEC_ID_PCM_S24BE: + case AV_CODEC_ID_PCM_S24LE: + case AV_CODEC_ID_PCM_S24LE_PLANAR: + case AV_CODEC_ID_PCM_U24BE: + case AV_CODEC_ID_PCM_U24LE: + return 24; + case AV_CODEC_ID_PCM_S32BE: + case AV_CODEC_ID_PCM_S32LE: + case AV_CODEC_ID_PCM_S32LE_PLANAR: + case AV_CODEC_ID_PCM_U32BE: + case AV_CODEC_ID_PCM_U32LE: + case AV_CODEC_ID_PCM_F32BE: + case AV_CODEC_ID_PCM_F32LE: + case AV_CODEC_ID_PCM_F24LE: + case AV_CODEC_ID_PCM_F16LE: + return 32; + case AV_CODEC_ID_PCM_F64BE: + case AV_CODEC_ID_PCM_F64LE: + case AV_CODEC_ID_PCM_S64BE: + case AV_CODEC_ID_PCM_S64LE: + return 64; + default: + return 0; + } +} + +enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be) +{ + static const enum AVCodecID map[][2] = { + [AV_SAMPLE_FMT_U8 ] = { AV_CODEC_ID_PCM_U8, AV_CODEC_ID_PCM_U8 }, + [AV_SAMPLE_FMT_S16 ] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE }, + [AV_SAMPLE_FMT_S32 ] = { 
AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE }, + [AV_SAMPLE_FMT_FLT ] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE }, + [AV_SAMPLE_FMT_DBL ] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE }, + [AV_SAMPLE_FMT_U8P ] = { AV_CODEC_ID_PCM_U8, AV_CODEC_ID_PCM_U8 }, + [AV_SAMPLE_FMT_S16P] = { AV_CODEC_ID_PCM_S16LE, AV_CODEC_ID_PCM_S16BE }, + [AV_SAMPLE_FMT_S32P] = { AV_CODEC_ID_PCM_S32LE, AV_CODEC_ID_PCM_S32BE }, + [AV_SAMPLE_FMT_S64P] = { AV_CODEC_ID_PCM_S64LE, AV_CODEC_ID_PCM_S64BE }, + [AV_SAMPLE_FMT_FLTP] = { AV_CODEC_ID_PCM_F32LE, AV_CODEC_ID_PCM_F32BE }, + [AV_SAMPLE_FMT_DBLP] = { AV_CODEC_ID_PCM_F64LE, AV_CODEC_ID_PCM_F64BE }, + }; + if (fmt < 0 || fmt >= FF_ARRAY_ELEMS(map)) + return AV_CODEC_ID_NONE; + if (be < 0 || be > 1) + be = AV_NE(1, 0); + return map[fmt][be]; +} + +int av_get_bits_per_sample(enum AVCodecID codec_id) +{ + switch (codec_id) { + case AV_CODEC_ID_DFPWM: + return 1; + case AV_CODEC_ID_ADPCM_SBPRO_2: + return 2; + case AV_CODEC_ID_ADPCM_SBPRO_3: + return 3; + case AV_CODEC_ID_ADPCM_SBPRO_4: + case AV_CODEC_ID_ADPCM_IMA_WAV: + case AV_CODEC_ID_ADPCM_IMA_QT: + case AV_CODEC_ID_ADPCM_SWF: + case AV_CODEC_ID_ADPCM_MS: + return 4; + default: + return av_get_exact_bits_per_sample(codec_id); + } +} + +static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba, + uint32_t tag, int bits_per_coded_sample, int64_t bitrate, + uint8_t * extradata, int frame_size, int frame_bytes) +{ + int bps = av_get_exact_bits_per_sample(id); + int framecount = (ba > 0 && frame_bytes / ba > 0) ? 
frame_bytes / ba : 1; + + /* codecs with an exact constant bits per sample */ + if (bps > 0 && ch > 0 && frame_bytes > 0 && ch < 32768 && bps < 32768) + return (frame_bytes * 8LL) / (bps * ch); + bps = bits_per_coded_sample; + + /* codecs with a fixed packet duration */ + switch (id) { + case AV_CODEC_ID_ADPCM_ADX: return 32; + case AV_CODEC_ID_ADPCM_IMA_QT: return 64; + case AV_CODEC_ID_ADPCM_EA_XAS: return 128; + case AV_CODEC_ID_AMR_NB: + case AV_CODEC_ID_EVRC: + case AV_CODEC_ID_GSM: + case AV_CODEC_ID_QCELP: + case AV_CODEC_ID_RA_288: return 160; + case AV_CODEC_ID_AMR_WB: + case AV_CODEC_ID_GSM_MS: return 320; + case AV_CODEC_ID_MP1: return 384; + case AV_CODEC_ID_ATRAC1: return 512; + case AV_CODEC_ID_ATRAC9: + case AV_CODEC_ID_ATRAC3: + if (framecount > INT_MAX/1024) + return 0; + return 1024 * framecount; + case AV_CODEC_ID_ATRAC3P: return 2048; + case AV_CODEC_ID_MP2: + case AV_CODEC_ID_MUSEPACK7: return 1152; + case AV_CODEC_ID_AC3: return 1536; + case AV_CODEC_ID_FTR: return 1024; + } + + if (sr > 0) { + /* calc from sample rate */ + if (id == AV_CODEC_ID_TTA) + return 256 * sr / 245; + else if (id == AV_CODEC_ID_DST) + return 588 * sr / 44100; + else if (id == AV_CODEC_ID_BINKAUDIO_DCT) { + if (sr / 22050 > 22) + return 0; + return (480 << (sr / 22050)); + } + + if (id == AV_CODEC_ID_MP3) + return sr <= 24000 ? 
576 : 1152; + } + + if (ba > 0) { + /* calc from block_align */ + if (id == AV_CODEC_ID_SIPR) { + switch (ba) { + case 20: return 160; + case 19: return 144; + case 29: return 288; + case 37: return 480; + } + } else if (id == AV_CODEC_ID_ILBC) { + switch (ba) { + case 38: return 160; + case 50: return 240; + } + } + } + + if (frame_bytes > 0) { + /* calc from frame_bytes only */ + if (id == AV_CODEC_ID_TRUESPEECH) + return 240 * (frame_bytes / 32); + if (id == AV_CODEC_ID_NELLYMOSER) + return 256 * (frame_bytes / 64); + if (id == AV_CODEC_ID_RA_144) + return 160 * (frame_bytes / 20); + if (id == AV_CODEC_ID_APTX) + return 4 * (frame_bytes / 4); + if (id == AV_CODEC_ID_APTX_HD) + return 4 * (frame_bytes / 6); + + if (bps > 0) { + /* calc from frame_bytes and bits_per_coded_sample */ + if (id == AV_CODEC_ID_ADPCM_G726 || id == AV_CODEC_ID_ADPCM_G726LE) + return frame_bytes * 8 / bps; + } + + if (ch > 0 && ch < INT_MAX/16) { + /* calc from frame_bytes and channels */ + switch (id) { + case AV_CODEC_ID_FASTAUDIO: + return frame_bytes / (40 * ch) * 256; + case AV_CODEC_ID_ADPCM_IMA_MOFLEX: + return (frame_bytes - 4 * ch) / (128 * ch) * 256; + case AV_CODEC_ID_ADPCM_AFC: + return frame_bytes / (9 * ch) * 16; + case AV_CODEC_ID_ADPCM_PSX: + case AV_CODEC_ID_ADPCM_DTK: + frame_bytes /= 16 * ch; + if (frame_bytes > INT_MAX / 28) + return 0; + return frame_bytes * 28; + case AV_CODEC_ID_ADPCM_4XM: + case AV_CODEC_ID_ADPCM_IMA_ACORN: + case AV_CODEC_ID_ADPCM_IMA_DAT4: + case AV_CODEC_ID_ADPCM_IMA_ISS: + return (frame_bytes - 4 * ch) * 2 / ch; + case AV_CODEC_ID_ADPCM_IMA_SMJPEG: + return (frame_bytes - 4) * 2 / ch; + case AV_CODEC_ID_ADPCM_IMA_AMV: + return (frame_bytes - 8) * 2; + case AV_CODEC_ID_ADPCM_THP: + case AV_CODEC_ID_ADPCM_THP_LE: + if (extradata) + return frame_bytes * 14LL / (8 * ch); + break; + case AV_CODEC_ID_ADPCM_XA: + return (frame_bytes / 128) * 224 / ch; + case AV_CODEC_ID_INTERPLAY_DPCM: + return (frame_bytes - 6 - ch) / ch; + case AV_CODEC_ID_ROQ_DPCM: 
+ return (frame_bytes - 8) / ch; + case AV_CODEC_ID_XAN_DPCM: + return (frame_bytes - 2 * ch) / ch; + case AV_CODEC_ID_MACE3: + return 3 * frame_bytes / ch; + case AV_CODEC_ID_MACE6: + return 6 * frame_bytes / ch; + case AV_CODEC_ID_PCM_LXF: + return 2 * (frame_bytes / (5 * ch)); + case AV_CODEC_ID_IAC: + case AV_CODEC_ID_IMC: + return 4 * frame_bytes / ch; + } + + if (tag) { + /* calc from frame_bytes, channels, and codec_tag */ + if (id == AV_CODEC_ID_SOL_DPCM) { + if (tag == 3) + return frame_bytes / ch; + else + return frame_bytes * 2 / ch; + } + } + + if (ba > 0) { + /* calc from frame_bytes, channels, and block_align */ + int blocks = frame_bytes / ba; + int64_t tmp = 0; + switch (id) { + case AV_CODEC_ID_ADPCM_IMA_WAV: + if (bps < 2 || bps > 5) + return 0; + tmp = blocks * (1LL + (ba - 4 * ch) / (bps * ch) * 8); + break; + case AV_CODEC_ID_ADPCM_IMA_DK3: + tmp = blocks * (((ba - 16LL) * 2 / 3 * 4) / ch); + break; + case AV_CODEC_ID_ADPCM_IMA_DK4: + tmp = blocks * (1 + (ba - 4LL * ch) * 2 / ch); + break; + case AV_CODEC_ID_ADPCM_IMA_RAD: + tmp = blocks * ((ba - 4LL * ch) * 2 / ch); + break; + case AV_CODEC_ID_ADPCM_MS: + tmp = blocks * (2 + (ba - 7LL * ch) * 2LL / ch); + break; + case AV_CODEC_ID_ADPCM_MTAF: + tmp = blocks * (ba - 16LL) * 2 / ch; + break; + case AV_CODEC_ID_ADPCM_XMD: + tmp = blocks * 32; + break; + } + if (tmp) { + if (tmp != (int)tmp) + return 0; + return tmp; + } + } + + if (bps > 0) { + /* calc from frame_bytes, channels, and bits_per_coded_sample */ + switch (id) { + case AV_CODEC_ID_PCM_DVD: + if(bps<4 || frame_bytes<3) + return 0; + return 2 * ((frame_bytes - 3) / ((bps * 2 / 8) * ch)); + case AV_CODEC_ID_PCM_BLURAY: + if(bps<4 || frame_bytes<4) + return 0; + return (frame_bytes - 4) / ((FFALIGN(ch, 2) * bps) / 8); + case AV_CODEC_ID_S302M: + return 2 * (frame_bytes / ((bps + 4) / 4)) / ch; + } + } + } + } + + /* Fall back on using frame_size */ + if (frame_size > 1 && frame_bytes) + return frame_size; + + //For WMA we currently have 
/**
 * Write v to s as a run of 0xff bytes followed by one final byte below
 * 0xff, such that the written bytes sum to v.
 *
 * @param s destination buffer; must have room for v / 255 + 1 bytes
 * @param v value to encode
 * @return number of bytes written to s
 */
unsigned int av_xiphlacing(unsigned char *s, unsigned int v)
{
    unsigned int written;

    /* One 0xff byte for every full 255 contained in v. */
    for (written = 0; v >= 0xff; v -= 0xff)
        s[written++] = 0xff;

    /* The remainder (possibly 0) terminates the sequence. */
    s[written++] = v;

    return written;
}
index < 0) + return NULL; + for (i = 0; i <= index; i++) + if (!codec->hw_configs[i]) + return NULL; + return &codec->hw_configs[index]->public; +} + +int ff_thread_ref_frame(ThreadFrame *dst, const ThreadFrame *src) +{ + int ret; + + dst->owner[0] = src->owner[0]; + dst->owner[1] = src->owner[1]; + + ret = av_frame_ref(dst->f, src->f); + if (ret < 0) + return ret; + + av_assert0(!dst->progress); + + if (src->progress && + !(dst->progress = av_buffer_ref(src->progress))) { + ff_thread_release_ext_buffer(dst->owner[0], dst); + return AVERROR(ENOMEM); + } + + return 0; +} + +#if !HAVE_THREADS + +int ff_thread_get_buffer(AVCodecContext *avctx, AVFrame *f, int flags) +{ + return ff_get_buffer(avctx, f, flags); +} + +int ff_thread_get_ext_buffer(AVCodecContext *avctx, ThreadFrame *f, int flags) +{ + f->owner[0] = f->owner[1] = avctx; + return ff_get_buffer(avctx, f->f, flags); +} + +void ff_thread_release_buffer(AVCodecContext *avctx, AVFrame *f) +{ + if (f) + av_frame_unref(f); +} + +void ff_thread_release_ext_buffer(AVCodecContext *avctx, ThreadFrame *f) +{ + f->owner[0] = f->owner[1] = NULL; + if (f->f) + av_frame_unref(f->f); +} + +void ff_thread_finish_setup(AVCodecContext *avctx) +{ +} + +void ff_thread_report_progress(ThreadFrame *f, int progress, int field) +{ +} + +void ff_thread_await_progress(const ThreadFrame *f, int progress, int field) +{ +} + +int ff_thread_can_start_frame(AVCodecContext *avctx) +{ + return 1; +} + +int ff_slice_thread_init_progress(AVCodecContext *avctx) +{ + return 0; +} + +int ff_slice_thread_allocz_entries(AVCodecContext *avctx, int count) +{ + return 0; +} + +void ff_thread_await_progress2(AVCodecContext *avctx, int field, int thread, int shift) +{ +} + +void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n) +{ +} + +#endif + +const uint8_t *avpriv_find_start_code(const uint8_t *av_restrict p, + const uint8_t *end, + uint32_t *av_restrict state) +{ + int i; + + av_assert0(p <= end); + if (p >= end) + 
/**
 * Convert one packed BCD byte (high nibble = tens, low nibble = ones) to
 * its numeric value.
 *
 * @param bcd packed two-digit BCD value
 * @return the decoded value 0..99, or 0 when either nibble exceeds 9
 */
static unsigned bcd2uint(uint8_t bcd)
{
    const unsigned tens = bcd >> 4;
    const unsigned ones = bcd & 0xf;

    return (tens <= 9 && ones <= 9) ? tens * 10 + ones : 0;
}
= (uint32_t*)sd->data; + m = tc[0] & 3; + + *sei_size = sizeof(uint32_t) * 4; + *data = av_mallocz(*sei_size + prefix_len); + if (!*data) + return AVERROR(ENOMEM); + sei_data = (uint8_t*)*data + prefix_len; + + init_put_bits(&pb, sei_data, *sei_size); + put_bits(&pb, 2, m); // num_clock_ts + + for (int j = 1; j <= m; j++) { + uint32_t tcsmpte = tc[j]; + unsigned hh = bcd2uint(tcsmpte & 0x3f); // 6-bit hours + unsigned mm = bcd2uint(tcsmpte>>8 & 0x7f); // 7-bit minutes + unsigned ss = bcd2uint(tcsmpte>>16 & 0x7f); // 7-bit seconds + unsigned ff = bcd2uint(tcsmpte>>24 & 0x3f); // 6-bit frames + unsigned drop = tcsmpte & 1<<30 && !0; // 1-bit drop if not arbitrary bit + + /* Calculate frame number of HEVC by SMPTE ST 12-1:2014 Sec 12.2 if rate > 30FPS */ + if (av_cmp_q(rate, (AVRational) {30, 1}) == 1) { + unsigned pc; + ff *= 2; + if (av_cmp_q(rate, (AVRational) {50, 1}) == 0) + pc = !!(tcsmpte & 1 << 7); + else + pc = !!(tcsmpte & 1 << 23); + ff = (ff + pc) & 0x7f; + } + + put_bits(&pb, 1, 1); // clock_timestamp_flag + put_bits(&pb, 1, 1); // units_field_based_flag + put_bits(&pb, 5, 0); // counting_type + put_bits(&pb, 1, 1); // full_timestamp_flag + put_bits(&pb, 1, 0); // discontinuity_flag + put_bits(&pb, 1, drop); + put_bits(&pb, 9, ff); + put_bits(&pb, 6, ss); + put_bits(&pb, 6, mm); + put_bits(&pb, 5, hh); + put_bits(&pb, 5, 0); + } + flush_put_bits(&pb); + + return 0; +} + +int64_t ff_guess_coded_bitrate(AVCodecContext *avctx) +{ + AVRational framerate = avctx->framerate; + int bits_per_coded_sample = avctx->bits_per_coded_sample; + int64_t bitrate; + + if (!(framerate.num && framerate.den)) + framerate = av_inv_q(avctx->time_base); + if (!(framerate.num && framerate.den)) + return 0; + + if (!bits_per_coded_sample) { + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt); + bits_per_coded_sample = av_get_bits_per_pixel(desc); + } + bitrate = (int64_t)bits_per_coded_sample * avctx->width * avctx->height * + framerate.num / framerate.den; + + 
return bitrate; +} + +int ff_int_from_list_or_default(void *ctx, const char * val_name, int val, + const int * array_valid_values, int default_value) +{ + int i = 0, ref_val; + + while (1) { + ref_val = array_valid_values[i]; + if (ref_val == INT_MAX) + break; + if (val == ref_val) + return val; + i++; + } + /* val is not a valid value */ + av_log(ctx, AV_LOG_DEBUG, + "%s %d are not supported. Set to default value : %d\n", val_name, val, default_value); + return default_value; +} diff --git a/media/ffvpx/libavcodec/vaapi.h b/media/ffvpx/libavcodec/vaapi.h new file mode 100644 index 0000000000..2cf7da5889 --- /dev/null +++ b/media/ffvpx/libavcodec/vaapi.h @@ -0,0 +1,86 @@ +/* + * Video Acceleration API (shared data between FFmpeg and the video player) + * HW decode acceleration for MPEG-2, MPEG-4, H.264 and VC-1 + * + * Copyright (C) 2008-2009 Splitted-Desktop Systems + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VAAPI_H +#define AVCODEC_VAAPI_H + +/** + * @file + * @ingroup lavc_codec_hwaccel_vaapi + * Public libavcodec VA API header. 
/**
 * This structure is used to share data between the FFmpeg library and
 * the client video application.
 * This shall be zero-allocated and available as
 * AVCodecContext.hwaccel_context. All user members can be set once
 * during initialization or through each AVCodecContext.get_buffer()
 * function call. In any case, they must be valid prior to calling
 * decoding functions.
 *
 * The declaration is only compiled while FF_API_STRUCT_VAAPI_CONTEXT is
 * enabled and is tagged attribute_deprecated.
 *
 * Deprecated: use AVCodecContext.hw_frames_ctx instead.
 */
struct attribute_deprecated vaapi_context {
    /**
     * Window system dependent data
     *
     * - encoding: unused
     * - decoding: Set by user
     */
    void *display;

    /**
     * Configuration ID
     *
     * - encoding: unused
     * - decoding: Set by user
     */
    uint32_t config_id;

    /**
     * Context ID (video decode pipeline)
     *
     * - encoding: unused
     * - decoding: Set by user
     */
    uint32_t context_id;
};
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/frame.h" +#include "libavutil/pixdesc.h" +#include "hwconfig.h" +#include "vaapi_decode.h" +#include "internal.h" +#include "av1dec.h" +#include "thread.h" + +typedef struct VAAPIAV1FrameRef { + AVFrame *frame; + int valid; +} VAAPIAV1FrameRef; + +typedef struct VAAPIAV1DecContext { + VAAPIDecodeContext base; + + /** + * For film grain case, VAAPI generate 2 output for each frame, + * current_frame will not apply film grain, and will be used for + * references for next frames. Maintain the reference list without + * applying film grain here. And current_display_picture will be + * used to apply film grain and push to downstream. + */ + VAAPIAV1FrameRef ref_tab[AV1_NUM_REF_FRAMES]; + AVFrame *tmp_frame; +} VAAPIAV1DecContext; + +static VASurfaceID vaapi_av1_surface_id(AV1Frame *vf) +{ + if (vf) + return ff_vaapi_get_surface_id(vf->f); + else + return VA_INVALID_SURFACE; +} + +static int8_t vaapi_av1_get_bit_depth_idx(AVCodecContext *avctx) +{ + AV1DecContext *s = avctx->priv_data; + const AV1RawSequenceHeader *seq = s->raw_seq; + int8_t bit_depth = 8; + + if (seq->seq_profile == 2 && seq->color_config.high_bitdepth) + bit_depth = seq->color_config.twelve_bit ? 12 : 10; + else if (seq->seq_profile <= 2) + bit_depth = seq->color_config.high_bitdepth ? 10 : 8; + else { + av_log(avctx, AV_LOG_ERROR, + "Couldn't get bit depth from profile:%d.\n", seq->seq_profile); + return -1; + } + return bit_depth == 8 ? 0 : bit_depth == 10 ? 
1 : 2; +} + +static int vaapi_av1_decode_init(AVCodecContext *avctx) +{ + VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; + + ctx->tmp_frame = av_frame_alloc(); + if (!ctx->tmp_frame) { + av_log(avctx, AV_LOG_ERROR, + "Failed to allocate frame.\n"); + return AVERROR(ENOMEM); + } + + for (int i = 0; i < FF_ARRAY_ELEMS(ctx->ref_tab); i++) { + ctx->ref_tab[i].frame = av_frame_alloc(); + if (!ctx->ref_tab[i].frame) { + av_log(avctx, AV_LOG_ERROR, + "Failed to allocate reference table frame %d.\n", i); + return AVERROR(ENOMEM); + } + ctx->ref_tab[i].valid = 0; + } + + return ff_vaapi_decode_init(avctx); +} + +static int vaapi_av1_decode_uninit(AVCodecContext *avctx) +{ + VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; + + if (ctx->tmp_frame->buf[0]) + ff_thread_release_buffer(avctx, ctx->tmp_frame); + av_frame_free(&ctx->tmp_frame); + + for (int i = 0; i < FF_ARRAY_ELEMS(ctx->ref_tab); i++) { + if (ctx->ref_tab[i].frame->buf[0]) + ff_thread_release_buffer(avctx, ctx->ref_tab[i].frame); + av_frame_free(&ctx->ref_tab[i].frame); + } + + return ff_vaapi_decode_uninit(avctx); +} + + +static int vaapi_av1_start_frame(AVCodecContext *avctx, + av_unused const uint8_t *buffer, + av_unused uint32_t size) +{ + AV1DecContext *s = avctx->priv_data; + const AV1RawSequenceHeader *seq = s->raw_seq; + const AV1RawFrameHeader *frame_header = s->raw_frame_header; + const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain; + VAAPIDecodePicture *pic = s->cur_frame.hwaccel_picture_private; + VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; + VADecPictureParameterBufferAV1 pic_param; + int8_t bit_depth_idx; + int err = 0; + int apply_grain = !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && film_grain->apply_grain; + uint8_t remap_lr_type[4] = {AV1_RESTORE_NONE, AV1_RESTORE_SWITCHABLE, AV1_RESTORE_WIENER, AV1_RESTORE_SGRPROJ}; + uint8_t segmentation_feature_signed[AV1_SEG_LVL_MAX] = {1, 1, 1, 1, 1, 0, 0, 0}; + uint8_t 
segmentation_feature_max[AV1_SEG_LVL_MAX] = {255, AV1_MAX_LOOP_FILTER, + AV1_MAX_LOOP_FILTER, AV1_MAX_LOOP_FILTER, AV1_MAX_LOOP_FILTER, 7 , 0 , 0 }; + + bit_depth_idx = vaapi_av1_get_bit_depth_idx(avctx); + if (bit_depth_idx < 0) + goto fail; + + if (apply_grain) { + if (ctx->tmp_frame->buf[0]) + ff_thread_release_buffer(avctx, ctx->tmp_frame); + err = ff_thread_get_buffer(avctx, ctx->tmp_frame, AV_GET_BUFFER_FLAG_REF); + if (err < 0) + goto fail; + pic->output_surface = ff_vaapi_get_surface_id(ctx->tmp_frame); + } else { + pic->output_surface = vaapi_av1_surface_id(&s->cur_frame); + } + + memset(&pic_param, 0, sizeof(VADecPictureParameterBufferAV1)); + pic_param = (VADecPictureParameterBufferAV1) { + .profile = seq->seq_profile, + .order_hint_bits_minus_1 = seq->order_hint_bits_minus_1, + .bit_depth_idx = bit_depth_idx, + .matrix_coefficients = seq->color_config.matrix_coefficients, + .current_frame = pic->output_surface, + .current_display_picture = vaapi_av1_surface_id(&s->cur_frame), + .frame_width_minus1 = frame_header->frame_width_minus_1, + .frame_height_minus1 = frame_header->frame_height_minus_1, + .primary_ref_frame = frame_header->primary_ref_frame, + .order_hint = frame_header->order_hint, + .tile_cols = frame_header->tile_cols, + .tile_rows = frame_header->tile_rows, + .context_update_tile_id = frame_header->context_update_tile_id, + .superres_scale_denominator = frame_header->use_superres ? 
+ frame_header->coded_denom + AV1_SUPERRES_DENOM_MIN : + AV1_SUPERRES_NUM, + .interp_filter = frame_header->interpolation_filter, + .filter_level[0] = frame_header->loop_filter_level[0], + .filter_level[1] = frame_header->loop_filter_level[1], + .filter_level_u = frame_header->loop_filter_level[2], + .filter_level_v = frame_header->loop_filter_level[3], + .base_qindex = frame_header->base_q_idx, + .y_dc_delta_q = frame_header->delta_q_y_dc, + .u_dc_delta_q = frame_header->delta_q_u_dc, + .u_ac_delta_q = frame_header->delta_q_u_ac, + .v_dc_delta_q = frame_header->delta_q_v_dc, + .v_ac_delta_q = frame_header->delta_q_v_ac, + .cdef_damping_minus_3 = frame_header->cdef_damping_minus_3, + .cdef_bits = frame_header->cdef_bits, + .seq_info_fields.fields = { + .still_picture = seq->still_picture, + .use_128x128_superblock = seq->use_128x128_superblock, + .enable_filter_intra = seq->enable_filter_intra, + .enable_intra_edge_filter = seq->enable_intra_edge_filter, + .enable_interintra_compound = seq->enable_interintra_compound, + .enable_masked_compound = seq->enable_masked_compound, + .enable_dual_filter = seq->enable_dual_filter, + .enable_order_hint = seq->enable_order_hint, + .enable_jnt_comp = seq->enable_jnt_comp, + .enable_cdef = seq->enable_cdef, + .mono_chrome = seq->color_config.mono_chrome, + .color_range = seq->color_config.color_range, + .subsampling_x = seq->color_config.subsampling_x, + .subsampling_y = seq->color_config.subsampling_y, + .chroma_sample_position = seq->color_config.chroma_sample_position, + .film_grain_params_present = seq->film_grain_params_present && + !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN), + }, + .seg_info.segment_info_fields.bits = { + .enabled = frame_header->segmentation_enabled, + .update_map = frame_header->segmentation_update_map, + .temporal_update = frame_header->segmentation_temporal_update, + .update_data = frame_header->segmentation_update_data, + }, + .film_grain_info = { + .film_grain_info_fields.bits = { 
+ .apply_grain = apply_grain, + .chroma_scaling_from_luma = film_grain->chroma_scaling_from_luma, + .grain_scaling_minus_8 = film_grain->grain_scaling_minus_8, + .ar_coeff_lag = film_grain->ar_coeff_lag, + .ar_coeff_shift_minus_6 = film_grain->ar_coeff_shift_minus_6, + .grain_scale_shift = film_grain->grain_scale_shift, + .overlap_flag = film_grain->overlap_flag, + .clip_to_restricted_range = film_grain->clip_to_restricted_range, + }, + .grain_seed = film_grain->grain_seed, + .num_y_points = film_grain->num_y_points, + .num_cb_points = film_grain->num_cb_points, + .num_cr_points = film_grain->num_cr_points, + .cb_mult = film_grain->cb_mult, + .cb_luma_mult = film_grain->cb_luma_mult, + .cb_offset = film_grain->cb_offset, + .cr_mult = film_grain->cr_mult, + .cr_luma_mult = film_grain->cr_luma_mult, + .cr_offset = film_grain->cr_offset, + }, + .pic_info_fields.bits = { + .frame_type = frame_header->frame_type, + .show_frame = frame_header->show_frame, + .showable_frame = frame_header->showable_frame, + .error_resilient_mode = frame_header->error_resilient_mode, + .disable_cdf_update = frame_header->disable_cdf_update, + .allow_screen_content_tools = frame_header->allow_screen_content_tools, + .force_integer_mv = frame_header->force_integer_mv, + .allow_intrabc = frame_header->allow_intrabc, + .use_superres = frame_header->use_superres, + .allow_high_precision_mv = frame_header->allow_high_precision_mv, + .is_motion_mode_switchable = frame_header->is_motion_mode_switchable, + .use_ref_frame_mvs = frame_header->use_ref_frame_mvs, + .disable_frame_end_update_cdf = frame_header->disable_frame_end_update_cdf, + .uniform_tile_spacing_flag = frame_header->uniform_tile_spacing_flag, + .allow_warped_motion = frame_header->allow_warped_motion, + }, + .loop_filter_info_fields.bits = { + .sharpness_level = frame_header->loop_filter_sharpness, + .mode_ref_delta_enabled = frame_header->loop_filter_delta_enabled, + .mode_ref_delta_update = frame_header->loop_filter_delta_update, + 
}, + .mode_control_fields.bits = { + .delta_q_present_flag = frame_header->delta_q_present, + .log2_delta_q_res = frame_header->delta_q_res, + .delta_lf_present_flag = frame_header->delta_lf_present, + .log2_delta_lf_res = frame_header->delta_lf_res, + .delta_lf_multi = frame_header->delta_lf_multi, + .tx_mode = frame_header->tx_mode, + .reference_select = frame_header->reference_select, + .reduced_tx_set_used = frame_header->reduced_tx_set, + .skip_mode_present = frame_header->skip_mode_present, + }, + .loop_restoration_fields.bits = { + .yframe_restoration_type = remap_lr_type[frame_header->lr_type[0]], + .cbframe_restoration_type = remap_lr_type[frame_header->lr_type[1]], + .crframe_restoration_type = remap_lr_type[frame_header->lr_type[2]], + .lr_unit_shift = frame_header->lr_unit_shift, + .lr_uv_shift = frame_header->lr_uv_shift, + }, + .qmatrix_fields.bits = { + .using_qmatrix = frame_header->using_qmatrix, + .qm_y = frame_header->qm_y, + .qm_u = frame_header->qm_u, + .qm_v = frame_header->qm_v, + } + }; + + for (int i = 0; i < AV1_NUM_REF_FRAMES; i++) { + if (pic_param.pic_info_fields.bits.frame_type == AV1_FRAME_KEY && frame_header->show_frame) + pic_param.ref_frame_map[i] = VA_INVALID_ID; + else + pic_param.ref_frame_map[i] = ctx->ref_tab[i].valid ? 
+ ff_vaapi_get_surface_id(ctx->ref_tab[i].frame) : + vaapi_av1_surface_id(&s->ref[i]); + } + for (int i = 0; i < AV1_REFS_PER_FRAME; i++) { + pic_param.ref_frame_idx[i] = frame_header->ref_frame_idx[i]; + } + for (int i = 0; i < AV1_TOTAL_REFS_PER_FRAME; i++) { + pic_param.ref_deltas[i] = frame_header->loop_filter_ref_deltas[i]; + } + for (int i = 0; i < 2; i++) { + pic_param.mode_deltas[i] = frame_header->loop_filter_mode_deltas[i]; + } + for (int i = 0; i < (1 << frame_header->cdef_bits); i++) { + pic_param.cdef_y_strengths[i] = + (frame_header->cdef_y_pri_strength[i] << 2) + + frame_header->cdef_y_sec_strength[i]; + pic_param.cdef_uv_strengths[i] = + (frame_header->cdef_uv_pri_strength[i] << 2) + + frame_header->cdef_uv_sec_strength[i]; + } + for (int i = 0; i < frame_header->tile_cols; i++) { + pic_param.width_in_sbs_minus_1[i] = + frame_header->width_in_sbs_minus_1[i]; + } + for (int i = 0; i < frame_header->tile_rows; i++) { + pic_param.height_in_sbs_minus_1[i] = + frame_header->height_in_sbs_minus_1[i]; + } + for (int i = AV1_REF_FRAME_LAST; i <= AV1_REF_FRAME_ALTREF; i++) { + pic_param.wm[i - 1].invalid = s->cur_frame.gm_invalid[i]; + pic_param.wm[i - 1].wmtype = s->cur_frame.gm_type[i]; + for (int j = 0; j < 6; j++) + pic_param.wm[i - 1].wmmat[j] = s->cur_frame.gm_params[i][j]; + } + for (int i = 0; i < AV1_MAX_SEGMENTS; i++) { + for (int j = 0; j < AV1_SEG_LVL_MAX; j++) { + pic_param.seg_info.feature_mask[i] |= (frame_header->feature_enabled[i][j] << j); + if (segmentation_feature_signed[j]) + pic_param.seg_info.feature_data[i][j] = av_clip(frame_header->feature_value[i][j], + -segmentation_feature_max[j], segmentation_feature_max[j]); + else + pic_param.seg_info.feature_data[i][j] = av_clip(frame_header->feature_value[i][j], + 0, segmentation_feature_max[j]); + } + } + if (apply_grain) { + for (int i = 0; i < film_grain->num_y_points; i++) { + pic_param.film_grain_info.point_y_value[i] = + film_grain->point_y_value[i]; + 
pic_param.film_grain_info.point_y_scaling[i] = + film_grain->point_y_scaling[i]; + } + for (int i = 0; i < film_grain->num_cb_points; i++) { + pic_param.film_grain_info.point_cb_value[i] = + film_grain->point_cb_value[i]; + pic_param.film_grain_info.point_cb_scaling[i] = + film_grain->point_cb_scaling[i]; + } + for (int i = 0; i < film_grain->num_cr_points; i++) { + pic_param.film_grain_info.point_cr_value[i] = + film_grain->point_cr_value[i]; + pic_param.film_grain_info.point_cr_scaling[i] = + film_grain->point_cr_scaling[i]; + } + for (int i = 0; i < 24; i++) { + pic_param.film_grain_info.ar_coeffs_y[i] = + film_grain->ar_coeffs_y_plus_128[i] - 128; + } + for (int i = 0; i < 25; i++) { + pic_param.film_grain_info.ar_coeffs_cb[i] = + film_grain->ar_coeffs_cb_plus_128[i] - 128; + pic_param.film_grain_info.ar_coeffs_cr[i] = + film_grain->ar_coeffs_cr_plus_128[i] - 128; + } + } + err = ff_vaapi_decode_make_param_buffer(avctx, pic, + VAPictureParameterBufferType, + &pic_param, sizeof(pic_param)); + if (err < 0) + goto fail; + + return 0; + +fail: + ff_vaapi_decode_cancel(avctx, pic); + return err; +} + +static int vaapi_av1_end_frame(AVCodecContext *avctx) +{ + const AV1DecContext *s = avctx->priv_data; + const AV1RawFrameHeader *header = s->raw_frame_header; + const AV1RawFilmGrainParams *film_grain = &s->cur_frame.film_grain; + VAAPIDecodePicture *pic = s->cur_frame.hwaccel_picture_private; + VAAPIAV1DecContext *ctx = avctx->internal->hwaccel_priv_data; + + int apply_grain = !(avctx->export_side_data & AV_CODEC_EXPORT_DATA_FILM_GRAIN) && film_grain->apply_grain; + int ret; + ret = ff_vaapi_decode_issue(avctx, pic); + if (ret < 0) + return ret; + + for (int i = 0; i < AV1_NUM_REF_FRAMES; i++) { + if (header->refresh_frame_flags & (1 << i)) { + if (ctx->ref_tab[i].frame->buf[0]) + ff_thread_release_buffer(avctx, ctx->ref_tab[i].frame); + + if (apply_grain) { + ret = av_frame_ref(ctx->ref_tab[i].frame, ctx->tmp_frame); + if (ret < 0) + return ret; + 
ctx->ref_tab[i].valid = 1; + } else { + ctx->ref_tab[i].valid = 0; + } + } + } + + return 0; +} + +static int vaapi_av1_decode_slice(AVCodecContext *avctx, + const uint8_t *buffer, + uint32_t size) +{ + const AV1DecContext *s = avctx->priv_data; + VAAPIDecodePicture *pic = s->cur_frame.hwaccel_picture_private; + VASliceParameterBufferAV1 slice_param; + int err = 0; + + for (int i = s->tg_start; i <= s->tg_end; i++) { + memset(&slice_param, 0, sizeof(VASliceParameterBufferAV1)); + + slice_param = (VASliceParameterBufferAV1) { + .slice_data_size = s->tile_group_info[i].tile_size, + .slice_data_offset = s->tile_group_info[i].tile_offset, + .slice_data_flag = VA_SLICE_DATA_FLAG_ALL, + .tile_row = s->tile_group_info[i].tile_row, + .tile_column = s->tile_group_info[i].tile_column, + .tg_start = s->tg_start, + .tg_end = s->tg_end, + }; + + err = ff_vaapi_decode_make_slice_buffer(avctx, pic, &slice_param, + sizeof(VASliceParameterBufferAV1), + buffer, + size); + if (err) { + ff_vaapi_decode_cancel(avctx, pic); + return err; + } + } + + return 0; +} + +const AVHWAccel ff_av1_vaapi_hwaccel = { + .name = "av1_vaapi", + .type = AVMEDIA_TYPE_VIDEO, + .id = AV_CODEC_ID_AV1, + .pix_fmt = AV_PIX_FMT_VAAPI, + .start_frame = vaapi_av1_start_frame, + .end_frame = vaapi_av1_end_frame, + .decode_slice = vaapi_av1_decode_slice, + .frame_priv_data_size = sizeof(VAAPIDecodePicture), + .init = vaapi_av1_decode_init, + .uninit = vaapi_av1_decode_uninit, + .frame_params = ff_vaapi_common_frame_params, + .priv_data_size = sizeof(VAAPIAV1DecContext), + .caps_internal = HWACCEL_CAP_ASYNC_SAFE, +}; diff --git a/media/ffvpx/libavcodec/vaapi_decode.c b/media/ffvpx/libavcodec/vaapi_decode.c new file mode 100644 index 0000000000..ab8c12e364 --- /dev/null +++ b/media/ffvpx/libavcodec/vaapi_decode.c @@ -0,0 +1,726 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/pixdesc.h" + +#include "avcodec.h" +#include "decode.h" +#include "internal.h" +#include "vaapi_decode.h" +#include "vaapi_hevc.h" + + +int ff_vaapi_decode_make_param_buffer(AVCodecContext *avctx, + VAAPIDecodePicture *pic, + int type, + const void *data, + size_t size) +{ + VAAPIDecodeContext *ctx = avctx->internal->hwaccel_priv_data; + VAStatus vas; + VABufferID buffer; + + av_assert0(pic->nb_param_buffers + 1 <= MAX_PARAM_BUFFERS); + + vas = vaCreateBuffer(ctx->hwctx->display, ctx->va_context, + type, size, 1, (void*)data, &buffer); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to create parameter " + "buffer (type %d): %d (%s).\n", + type, vas, vaErrorStr(vas)); + return AVERROR(EIO); + } + + pic->param_buffers[pic->nb_param_buffers++] = buffer; + + av_log(avctx, AV_LOG_DEBUG, "Param buffer (type %d, %zu bytes) " + "is %#x.\n", type, size, buffer); + return 0; +} + + +int ff_vaapi_decode_make_slice_buffer(AVCodecContext *avctx, + VAAPIDecodePicture *pic, + const void *params_data, + size_t params_size, + const void *slice_data, + size_t slice_size) +{ + VAAPIDecodeContext *ctx = 
avctx->internal->hwaccel_priv_data; + VAStatus vas; + int index; + + av_assert0(pic->nb_slices <= pic->slices_allocated); + if (pic->nb_slices == pic->slices_allocated) { + if (pic->slices_allocated > 0) + pic->slices_allocated *= 2; + else + pic->slices_allocated = 64; + + pic->slice_buffers = + av_realloc_array(pic->slice_buffers, + pic->slices_allocated, + 2 * sizeof(*pic->slice_buffers)); + if (!pic->slice_buffers) + return AVERROR(ENOMEM); + } + av_assert0(pic->nb_slices + 1 <= pic->slices_allocated); + + index = 2 * pic->nb_slices; + + vas = vaCreateBuffer(ctx->hwctx->display, ctx->va_context, + VASliceParameterBufferType, + params_size, 1, (void*)params_data, + &pic->slice_buffers[index]); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to create slice " + "parameter buffer: %d (%s).\n", vas, vaErrorStr(vas)); + return AVERROR(EIO); + } + + av_log(avctx, AV_LOG_DEBUG, "Slice %d param buffer (%zu bytes) " + "is %#x.\n", pic->nb_slices, params_size, + pic->slice_buffers[index]); + + vas = vaCreateBuffer(ctx->hwctx->display, ctx->va_context, + VASliceDataBufferType, + slice_size, 1, (void*)slice_data, + &pic->slice_buffers[index + 1]); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to create slice " + "data buffer (size %zu): %d (%s).\n", + slice_size, vas, vaErrorStr(vas)); + vaDestroyBuffer(ctx->hwctx->display, + pic->slice_buffers[index]); + return AVERROR(EIO); + } + + av_log(avctx, AV_LOG_DEBUG, "Slice %d data buffer (%zu bytes) " + "is %#x.\n", pic->nb_slices, slice_size, + pic->slice_buffers[index + 1]); + + ++pic->nb_slices; + return 0; +} + +static void ff_vaapi_decode_destroy_buffers(AVCodecContext *avctx, + VAAPIDecodePicture *pic) +{ + VAAPIDecodeContext *ctx = avctx->internal->hwaccel_priv_data; + VAStatus vas; + int i; + + for (i = 0; i < pic->nb_param_buffers; i++) { + vas = vaDestroyBuffer(ctx->hwctx->display, + pic->param_buffers[i]); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, 
AV_LOG_ERROR, "Failed to destroy " + "parameter buffer %#x: %d (%s).\n", + pic->param_buffers[i], vas, vaErrorStr(vas)); + } + } + + for (i = 0; i < 2 * pic->nb_slices; i++) { + vas = vaDestroyBuffer(ctx->hwctx->display, + pic->slice_buffers[i]); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to destroy slice " + "slice buffer %#x: %d (%s).\n", + pic->slice_buffers[i], vas, vaErrorStr(vas)); + } + } +} + +int ff_vaapi_decode_issue(AVCodecContext *avctx, + VAAPIDecodePicture *pic) +{ + VAAPIDecodeContext *ctx = avctx->internal->hwaccel_priv_data; + VAStatus vas; + int err; + + av_log(avctx, AV_LOG_DEBUG, "Decode to surface %#x.\n", + pic->output_surface); + + vas = vaBeginPicture(ctx->hwctx->display, ctx->va_context, + pic->output_surface); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to begin picture decode " + "issue: %d (%s).\n", vas, vaErrorStr(vas)); + err = AVERROR(EIO); + goto fail_with_picture; + } + + vas = vaRenderPicture(ctx->hwctx->display, ctx->va_context, + pic->param_buffers, pic->nb_param_buffers); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to upload decode " + "parameters: %d (%s).\n", vas, vaErrorStr(vas)); + err = AVERROR(EIO); + goto fail_with_picture; + } + + vas = vaRenderPicture(ctx->hwctx->display, ctx->va_context, + pic->slice_buffers, 2 * pic->nb_slices); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to upload slices: " + "%d (%s).\n", vas, vaErrorStr(vas)); + err = AVERROR(EIO); + goto fail_with_picture; + } + + vas = vaEndPicture(ctx->hwctx->display, ctx->va_context); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to end picture decode " + "issue: %d (%s).\n", vas, vaErrorStr(vas)); + err = AVERROR(EIO); + if (CONFIG_VAAPI_1 || ctx->hwctx->driver_quirks & + AV_VAAPI_DRIVER_QUIRK_RENDER_PARAM_BUFFERS) + goto fail; + else + goto fail_at_end; + } + + if (CONFIG_VAAPI_1 || ctx->hwctx->driver_quirks & + 
AV_VAAPI_DRIVER_QUIRK_RENDER_PARAM_BUFFERS) + ff_vaapi_decode_destroy_buffers(avctx, pic); + + err = 0; + goto exit; + +fail_with_picture: + vas = vaEndPicture(ctx->hwctx->display, ctx->va_context); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to end picture decode " + "after error: %d (%s).\n", vas, vaErrorStr(vas)); + } +fail: + ff_vaapi_decode_destroy_buffers(avctx, pic); +fail_at_end: +exit: + pic->nb_param_buffers = 0; + pic->nb_slices = 0; + pic->slices_allocated = 0; + av_freep(&pic->slice_buffers); + + return err; +} + +int ff_vaapi_decode_cancel(AVCodecContext *avctx, + VAAPIDecodePicture *pic) +{ + ff_vaapi_decode_destroy_buffers(avctx, pic); + + pic->nb_param_buffers = 0; + pic->nb_slices = 0; + pic->slices_allocated = 0; + av_freep(&pic->slice_buffers); + + return 0; +} + +static const struct { + uint32_t fourcc; + enum AVPixelFormat pix_fmt; +} vaapi_format_map[] = { +#define MAP(va, av) { VA_FOURCC_ ## va, AV_PIX_FMT_ ## av } + // 4:0:0 + MAP(Y800, GRAY8), + // 4:2:0 + MAP(NV12, NV12), + MAP(YV12, YUV420P), + MAP(IYUV, YUV420P), +#ifdef VA_FOURCC_I420 + MAP(I420, YUV420P), +#endif + MAP(IMC3, YUV420P), + // 4:1:1 + MAP(411P, YUV411P), + // 4:2:2 + MAP(422H, YUV422P), +#ifdef VA_FOURCC_YV16 + MAP(YV16, YUV422P), +#endif + MAP(YUY2, YUYV422), +#ifdef VA_FOURCC_Y210 + MAP(Y210, Y210), +#endif +#ifdef VA_FOURCC_Y212 + MAP(Y212, Y212), +#endif + // 4:4:0 + MAP(422V, YUV440P), + // 4:4:4 + MAP(444P, YUV444P), +#ifdef VA_FOURCC_XYUV + MAP(XYUV, VUYX), +#endif +#ifdef VA_FOURCC_Y410 + MAP(Y410, XV30), +#endif +#ifdef VA_FOURCC_Y412 + MAP(Y412, XV36), +#endif + // 4:2:0 10-bit +#ifdef VA_FOURCC_P010 + MAP(P010, P010), +#endif +#ifdef VA_FOURCC_P012 + MAP(P012, P012), +#endif +#ifdef VA_FOURCC_I010 + MAP(I010, YUV420P10), +#endif +#undef MAP +}; + +static int vaapi_decode_find_best_format(AVCodecContext *avctx, + AVHWDeviceContext *device, + VAConfigID config_id, + AVHWFramesContext *frames) +{ + AVVAAPIDeviceContext *hwctx = 
device->hwctx; + VAStatus vas; + VASurfaceAttrib *attr; + enum AVPixelFormat source_format, best_format, format; + uint32_t best_fourcc, fourcc; + int i, j, nb_attr; + + source_format = avctx->sw_pix_fmt; + av_assert0(source_format != AV_PIX_FMT_NONE); + + vas = vaQuerySurfaceAttributes(hwctx->display, config_id, + NULL, &nb_attr); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to query surface attributes: " + "%d (%s).\n", vas, vaErrorStr(vas)); + return AVERROR(ENOSYS); + } + + attr = av_malloc_array(nb_attr, sizeof(*attr)); + if (!attr) + return AVERROR(ENOMEM); + + vas = vaQuerySurfaceAttributes(hwctx->display, config_id, + attr, &nb_attr); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to query surface attributes: " + "%d (%s).\n", vas, vaErrorStr(vas)); + av_freep(&attr); + return AVERROR(ENOSYS); + } + + best_format = AV_PIX_FMT_NONE; + + for (i = 0; i < nb_attr; i++) { + if (attr[i].type != VASurfaceAttribPixelFormat) + continue; + + fourcc = attr[i].value.value.i; + for (j = 0; j < FF_ARRAY_ELEMS(vaapi_format_map); j++) { + if (fourcc == vaapi_format_map[j].fourcc) + break; + } + if (j >= FF_ARRAY_ELEMS(vaapi_format_map)) { + av_log(avctx, AV_LOG_DEBUG, "Ignoring unknown format %#x.\n", + fourcc); + continue; + } + format = vaapi_format_map[j].pix_fmt; + av_log(avctx, AV_LOG_DEBUG, "Considering format %#x -> %s.\n", + fourcc, av_get_pix_fmt_name(format)); + + best_format = av_find_best_pix_fmt_of_2(format, best_format, + source_format, 0, NULL); + if (format == best_format) + best_fourcc = fourcc; + } + + av_freep(&attr); + + if (best_format == AV_PIX_FMT_NONE) { + av_log(avctx, AV_LOG_ERROR, "No usable formats for decoding!\n"); + return AVERROR(EINVAL); + } + + av_log(avctx, AV_LOG_DEBUG, "Picked %s (%#x) as best match for %s.\n", + av_get_pix_fmt_name(best_format), best_fourcc, + av_get_pix_fmt_name(source_format)); + + frames->sw_format = best_format; + if (avctx->internal->hwaccel_priv_data) { + 
VAAPIDecodeContext *ctx = avctx->internal->hwaccel_priv_data; + AVVAAPIFramesContext *avfc = frames->hwctx; + + ctx->pixel_format_attribute = (VASurfaceAttrib) { + .type = VASurfaceAttribPixelFormat, + .flags = VA_SURFACE_ATTRIB_SETTABLE, + .value.type = VAGenericValueTypeInteger, + .value.value.i = best_fourcc, + }; + + avfc->attributes = &ctx->pixel_format_attribute; + avfc->nb_attributes = 1; + } + + return 0; +} + +static const struct { + enum AVCodecID codec_id; + int codec_profile; + VAProfile va_profile; + VAProfile (*profile_parser)(AVCodecContext *avctx); +} vaapi_profile_map[] = { +#define MAP(c, p, v, ...) { AV_CODEC_ID_ ## c, FF_PROFILE_ ## p, VAProfile ## v, __VA_ARGS__ } + MAP(MPEG2VIDEO, MPEG2_SIMPLE, MPEG2Simple ), + MAP(MPEG2VIDEO, MPEG2_MAIN, MPEG2Main ), + MAP(H263, UNKNOWN, H263Baseline), + MAP(MPEG4, MPEG4_SIMPLE, MPEG4Simple ), + MAP(MPEG4, MPEG4_ADVANCED_SIMPLE, + MPEG4AdvancedSimple), + MAP(MPEG4, MPEG4_MAIN, MPEG4Main ), + MAP(H264, H264_CONSTRAINED_BASELINE, + H264ConstrainedBaseline), + MAP(H264, H264_MAIN, H264Main ), + MAP(H264, H264_HIGH, H264High ), +#if VA_CHECK_VERSION(0, 37, 0) + MAP(HEVC, HEVC_MAIN, HEVCMain ), + MAP(HEVC, HEVC_MAIN_10, HEVCMain10 ), + MAP(HEVC, HEVC_MAIN_STILL_PICTURE, + HEVCMain ), +#endif +#if VA_CHECK_VERSION(1, 2, 0) && CONFIG_HEVC_VAAPI_HWACCEL + MAP(HEVC, HEVC_REXT, None, + ff_vaapi_parse_hevc_rext_scc_profile ), + MAP(HEVC, HEVC_SCC, None, + ff_vaapi_parse_hevc_rext_scc_profile ), +#endif + MAP(MJPEG, MJPEG_HUFFMAN_BASELINE_DCT, + JPEGBaseline), + MAP(WMV3, VC1_SIMPLE, VC1Simple ), + MAP(WMV3, VC1_MAIN, VC1Main ), + MAP(WMV3, VC1_COMPLEX, VC1Advanced ), + MAP(WMV3, VC1_ADVANCED, VC1Advanced ), + MAP(VC1, VC1_SIMPLE, VC1Simple ), + MAP(VC1, VC1_MAIN, VC1Main ), + MAP(VC1, VC1_COMPLEX, VC1Advanced ), + MAP(VC1, VC1_ADVANCED, VC1Advanced ), + MAP(VP8, UNKNOWN, VP8Version0_3 ), +#if VA_CHECK_VERSION(0, 38, 0) + MAP(VP9, VP9_0, VP9Profile0 ), +#endif +#if VA_CHECK_VERSION(0, 39, 0) + MAP(VP9, VP9_1, VP9Profile1 
/*
 * Set *va_config and the frames_ref fields from the current codec parameters
 * in avctx.
 *
 * Steps: pick a VA profile for avctx->codec_id/profile from
 * vaapi_profile_map that the driver also reports, create a VLD decode
 * configuration for it, check the coded size against the hardware frame
 * constraints and, when frames_ref is given, fill in its format, size,
 * software format and initial pool size.
 *
 * frames_ref may be NULL, in which case only the configuration is created.
 * Returns 0 on success or a negative AVERROR code. On failure any
 * configuration stored in *va_config is destroyed and *va_config is reset to
 * VA_INVALID_ID; the cleanup compares *va_config against VA_INVALID_ID, so
 * it presumably expects the caller to have initialised it that way
 * (NOTE(review): confirm at the call sites).
 */
static int vaapi_decode_make_config(AVCodecContext *avctx,
                                    AVBufferRef *device_ref,
                                    VAConfigID *va_config,
                                    AVBufferRef *frames_ref)
{
    AVVAAPIHWConfig *hwconfig = NULL;
    AVHWFramesConstraints *constraints = NULL;
    VAStatus vas;
    int err, i, j;
    const AVCodecDescriptor *codec_desc;
    VAProfile *profile_list = NULL, matched_va_profile, va_profile;
    int profile_count, exact_match, matched_ff_profile, codec_profile;

    AVHWDeviceContext *device = (AVHWDeviceContext*)device_ref->data;
    AVVAAPIDeviceContext *hwctx = device->hwctx;

    codec_desc = avcodec_descriptor_get(avctx->codec_id);
    if (!codec_desc) {
        err = AVERROR(EINVAL);
        goto fail;
    }

    /* Ask the driver which profiles it supports. */
    profile_count = vaMaxNumProfiles(hwctx->display);
    profile_list = av_malloc_array(profile_count,
                                   sizeof(VAProfile));
    if (!profile_list) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    vas = vaQueryConfigProfiles(hwctx->display,
                                profile_list, &profile_count);
    if (vas != VA_STATUS_SUCCESS) {
        av_log(avctx, AV_LOG_ERROR, "Failed to query profiles: "
               "%d (%s).\n", vas, vaErrorStr(vas));
        err = AVERROR(ENOSYS);
        goto fail;
    }

    matched_va_profile = VAProfileNone;
    exact_match = 0;

    /* Walk the map entries for this codec. The search stops at the first
     * driver-supported entry whose codec profile matches avctx->profile (or
     * is FF_PROFILE_UNKNOWN); otherwise the last driver-supported entry of
     * the codec is kept as an inexact match. */
    for (i = 0; i < FF_ARRAY_ELEMS(vaapi_profile_map); i++) {
        int profile_match = 0;
        if (avctx->codec_id != vaapi_profile_map[i].codec_id)
            continue;
        if (avctx->profile == vaapi_profile_map[i].codec_profile ||
            vaapi_profile_map[i].codec_profile == FF_PROFILE_UNKNOWN)
            profile_match = 1;

        /* Entries with a parser compute their VA profile from avctx. */
        va_profile = vaapi_profile_map[i].profile_parser ?
                     vaapi_profile_map[i].profile_parser(avctx) :
                     vaapi_profile_map[i].va_profile;
        codec_profile = vaapi_profile_map[i].codec_profile;

        for (j = 0; j < profile_count; j++) {
            if (va_profile == profile_list[j]) {
                exact_match = profile_match;
                break;
            }
        }
        /* j < profile_count means the driver lists this VA profile. */
        if (j < profile_count) {
            matched_va_profile = va_profile;
            matched_ff_profile = codec_profile;
            if (exact_match)
                break;
        }
    }
    av_freep(&profile_list);

    if (matched_va_profile == VAProfileNone) {
        av_log(avctx, AV_LOG_ERROR, "No support for codec %s "
               "profile %d.\n", codec_desc->name, avctx->profile);
        err = AVERROR(ENOSYS);
        goto fail;
    }
    /* An inexact match is only accepted when the user opted in. */
    if (!exact_match) {
        if (avctx->hwaccel_flags &
            AV_HWACCEL_FLAG_ALLOW_PROFILE_MISMATCH) {
            av_log(avctx, AV_LOG_VERBOSE, "Codec %s profile %d not "
                   "supported for hardware decode.\n",
                   codec_desc->name, avctx->profile);
            av_log(avctx, AV_LOG_WARNING, "Using possibly-"
                   "incompatible profile %d instead.\n",
                   matched_ff_profile);
        } else {
            av_log(avctx, AV_LOG_VERBOSE, "Codec %s profile %d not "
                   "supported for hardware decode.\n",
                   codec_desc->name, avctx->profile);
            err = AVERROR(EINVAL);
            goto fail;
        }
    }

    vas = vaCreateConfig(hwctx->display, matched_va_profile,
                         VAEntrypointVLD, NULL, 0,
                         va_config);
    if (vas != VA_STATUS_SUCCESS) {
        av_log(avctx, AV_LOG_ERROR, "Failed to create decode "
               "configuration: %d (%s).\n", vas, vaErrorStr(vas));
        err = AVERROR(EIO);
        goto fail;
    }

    /* Fetch the frame constraints that apply to this configuration. */
    hwconfig = av_hwdevice_hwconfig_alloc(device_ref);
    if (!hwconfig) {
        err = AVERROR(ENOMEM);
        goto fail;
    }
    hwconfig->config_id = *va_config;

    constraints =
        av_hwdevice_get_hwframe_constraints(device_ref, hwconfig);
    if (!constraints) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    if (avctx->coded_width < constraints->min_width ||
        avctx->coded_height < constraints->min_height ||
        avctx->coded_width > constraints->max_width ||
        avctx->coded_height > constraints->max_height) {
        av_log(avctx, AV_LOG_ERROR, "Hardware does not support image "
               "size %dx%d (constraints: width %d-%d height %d-%d).\n",
               avctx->coded_width, avctx->coded_height,
               constraints->min_width, constraints->max_width,
               constraints->min_height, constraints->max_height);
        err = AVERROR(EINVAL);
        goto fail;
    }
    if (!constraints->valid_sw_formats ||
        constraints->valid_sw_formats[0] == AV_PIX_FMT_NONE) {
        av_log(avctx, AV_LOG_ERROR, "Hardware does not offer any "
               "usable surface formats.\n");
        err = AVERROR(EINVAL);
        goto fail;
    }

    if (frames_ref) {
        AVHWFramesContext *frames = (AVHWFramesContext *)frames_ref->data;

        frames->format = AV_PIX_FMT_VAAPI;
        frames->width = avctx->coded_width;
        frames->height = avctx->coded_height;

        err = vaapi_decode_find_best_format(avctx, device,
                                            *va_config, frames);
        if (err < 0)
            goto fail;

        frames->initial_pool_size = 1;
        // Add per-codec number of surfaces used for storing reference frames.
        switch (avctx->codec_id) {
        case AV_CODEC_ID_H264:
        case AV_CODEC_ID_HEVC:
        case AV_CODEC_ID_AV1:
            frames->initial_pool_size += 16;
            break;
        case AV_CODEC_ID_VP9:
            frames->initial_pool_size += 8;
            break;
        case AV_CODEC_ID_VP8:
            frames->initial_pool_size += 3;
            break;
        default:
            frames->initial_pool_size += 2;
        }
    }

    av_hwframe_constraints_free(&constraints);
    av_freep(&hwconfig);

    return 0;

fail:
    /* Shared cleanup: every pointer is either NULL or owned by this call. */
    av_hwframe_constraints_free(&constraints);
    av_freep(&hwconfig);
    if (*va_config != VA_INVALID_ID) {
        vaDestroyConfig(hwctx->display, *va_config);
        *va_config = VA_INVALID_ID;
    }
    av_freep(&profile_list);
    return err;
}
vaapi_decode_make_config(avctx, hw_frames->device_ref, &va_config, + hw_frames_ctx); + if (err) + return err; + + if (va_config != VA_INVALID_ID) + vaDestroyConfig(hwctx->display, va_config); + + return 0; +} + +int ff_vaapi_decode_init(AVCodecContext *avctx) +{ + VAAPIDecodeContext *ctx = avctx->internal->hwaccel_priv_data; + VAStatus vas; + int err; + + ctx->va_config = VA_INVALID_ID; + ctx->va_context = VA_INVALID_ID; + + err = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_VAAPI); + if (err < 0) + goto fail; + + ctx->frames = (AVHWFramesContext*)avctx->hw_frames_ctx->data; + ctx->hwfc = ctx->frames->hwctx; + ctx->device = ctx->frames->device_ctx; + ctx->hwctx = ctx->device->hwctx; + + err = vaapi_decode_make_config(avctx, ctx->frames->device_ref, + &ctx->va_config, NULL); + if (err) + goto fail; + + vas = vaCreateContext(ctx->hwctx->display, ctx->va_config, + avctx->coded_width, avctx->coded_height, + VA_PROGRESSIVE, + ctx->hwfc->surface_ids, + ctx->hwfc->nb_surfaces, + &ctx->va_context); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to create decode " + "context: %d (%s).\n", vas, vaErrorStr(vas)); + err = AVERROR(EIO); + goto fail; + } + + av_log(avctx, AV_LOG_DEBUG, "Decode context initialised: " + "%#x/%#x.\n", ctx->va_config, ctx->va_context); + + return 0; + +fail: + ff_vaapi_decode_uninit(avctx); + return err; +} + +int ff_vaapi_decode_uninit(AVCodecContext *avctx) +{ + VAAPIDecodeContext *ctx = avctx->internal->hwaccel_priv_data; + VAStatus vas; + + if (ctx->va_context != VA_INVALID_ID) { + vas = vaDestroyContext(ctx->hwctx->display, ctx->va_context); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to destroy decode " + "context %#x: %d (%s).\n", + ctx->va_context, vas, vaErrorStr(vas)); + } + } + if (ctx->va_config != VA_INVALID_ID) { + vas = vaDestroyConfig(ctx->hwctx->display, ctx->va_config); + if (vas != VA_STATUS_SUCCESS) { + av_log(avctx, AV_LOG_ERROR, "Failed to destroy decode " + 
"configuration %#x: %d (%s).\n", + ctx->va_config, vas, vaErrorStr(vas)); + } + } + + return 0; +} diff --git a/media/ffvpx/libavcodec/vaapi_decode.h b/media/ffvpx/libavcodec/vaapi_decode.h new file mode 100644 index 0000000000..6beda14e52 --- /dev/null +++ b/media/ffvpx/libavcodec/vaapi_decode.h @@ -0,0 +1,91 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VAAPI_DECODE_H +#define AVCODEC_VAAPI_DECODE_H + +#include <va/va.h> + +#include "libavutil/frame.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_vaapi.h" + +#include "avcodec.h" + +static inline VASurfaceID ff_vaapi_get_surface_id(AVFrame *pic) +{ + return (uintptr_t)pic->data[3]; +} + +enum { + MAX_PARAM_BUFFERS = 16, +}; + +typedef struct VAAPIDecodePicture { + VASurfaceID output_surface; + + int nb_param_buffers; + VABufferID param_buffers[MAX_PARAM_BUFFERS]; + + int nb_slices; + VABufferID *slice_buffers; + int slices_allocated; +} VAAPIDecodePicture; + +typedef struct VAAPIDecodeContext { + VAConfigID va_config; + VAContextID va_context; + + AVHWDeviceContext *device; + AVVAAPIDeviceContext *hwctx; + + AVHWFramesContext *frames; + AVVAAPIFramesContext *hwfc; + + enum AVPixelFormat surface_format; + int surface_count; + + VASurfaceAttrib 
pixel_format_attribute; +} VAAPIDecodeContext; + + +int ff_vaapi_decode_make_param_buffer(AVCodecContext *avctx, + VAAPIDecodePicture *pic, + int type, + const void *data, + size_t size); + +int ff_vaapi_decode_make_slice_buffer(AVCodecContext *avctx, + VAAPIDecodePicture *pic, + const void *params_data, + size_t params_size, + const void *slice_data, + size_t slice_size); + +int ff_vaapi_decode_issue(AVCodecContext *avctx, + VAAPIDecodePicture *pic); +int ff_vaapi_decode_cancel(AVCodecContext *avctx, + VAAPIDecodePicture *pic); + +int ff_vaapi_decode_init(AVCodecContext *avctx); +int ff_vaapi_decode_uninit(AVCodecContext *avctx); + +int ff_vaapi_common_frame_params(AVCodecContext *avctx, + AVBufferRef *hw_frames_ctx); + +#endif /* AVCODEC_VAAPI_DECODE_H */ diff --git a/media/ffvpx/libavcodec/vaapi_hevc.h b/media/ffvpx/libavcodec/vaapi_hevc.h new file mode 100644 index 0000000000..449635d0d7 --- /dev/null +++ b/media/ffvpx/libavcodec/vaapi_hevc.h @@ -0,0 +1,27 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
#ifndef AVCODEC_VAAPI_HEVC_H
#define AVCODEC_VAAPI_HEVC_H

#include <va/va.h>
#include "avcodec.h"

/**
 * Determine the VA-API profile to use for the HEVC stream described by avctx.
 *
 * Takes the codec context and returns a VAProfile value.
 * NOTE(review): the selection logic is defined elsewhere; presumably it
 * covers the HEVC range-extension and screen-content profiles, as the name
 * suggests -- confirm against the implementation.
 */
VAProfile ff_vaapi_parse_hevc_rext_scc_profile(AVCodecContext *avctx);

#endif /* AVCODEC_VAAPI_HEVC_H */
#include <va/va.h>
#include <va/va_dec_vp8.h>

#include "hwconfig.h"
#include "vaapi_decode.h"
#include "vp8.h"

/**
 * Return the VA surface backing a VP8 frame, or VA_INVALID_SURFACE when the
 * frame pointer is NULL (e.g. a reference slot that is not populated).
 */
static VASurfaceID vaapi_vp8_surface_id(VP8Frame *vf)
{
    if (vf)
        return ff_vaapi_get_surface_id(vf->tf.f);
    else
        return VA_INVALID_SURFACE;
}

/**
 * Build and queue the per-frame parameter buffers for the current VP8 frame.
 *
 * Three buffers are created, in this order: picture parameters, coefficient
 * probabilities and quantisation indices.  The buffer/size arguments are
 * unused; everything is read from the parsed VP8Context.
 *
 * @return 0 on success; on failure the picture is cancelled with
 *         ff_vaapi_decode_cancel() and the error code is returned.
 */
static int vaapi_vp8_start_frame(AVCodecContext          *avctx,
                                 av_unused const uint8_t *buffer,
                                 av_unused uint32_t       size)
{
    const VP8Context *s = avctx->priv_data;
    VAAPIDecodePicture *pic = s->framep[VP8_FRAME_CURRENT]->hwaccel_picture_private;
    VAPictureParameterBufferVP8 pp;
    VAProbabilityDataBufferVP8 prob;
    VAIQMatrixBufferVP8 quant;
    int err, i, j, k;

    pic->output_surface = vaapi_vp8_surface_id(s->framep[VP8_FRAME_CURRENT]);

    pp = (VAPictureParameterBufferVP8) {
        .frame_width                     = avctx->width,
        .frame_height                    = avctx->height,

        .last_ref_frame                  = vaapi_vp8_surface_id(s->framep[VP8_FRAME_PREVIOUS]),
        .golden_ref_frame                = vaapi_vp8_surface_id(s->framep[VP8_FRAME_GOLDEN]),
        .alt_ref_frame                   = vaapi_vp8_surface_id(s->framep[VP8_FRAME_ALTREF]),
        .out_of_loop_frame               = VA_INVALID_SURFACE,

        .pic_fields.bits = {
            /* Deliberately inverted relative to s->keyframe.
             * NOTE(review): presumably VA-API encodes a key frame as 0 here;
             * confirm against va_dec_vp8.h. */
            .key_frame                   = !s->keyframe,
            .version                     = s->profile,

            .segmentation_enabled        = s->segmentation.enabled,
            .update_mb_segmentation_map  = s->segmentation.update_map,
            .update_segment_feature_data = s->segmentation.update_feature_data,

            .filter_type                 = s->filter.simple,
            .sharpness_level             = s->filter.sharpness,

            .loop_filter_adj_enable      = s->lf_delta.enabled,
            .mode_ref_lf_delta_update    = s->lf_delta.update,

            .sign_bias_golden            = s->sign_bias[VP8_FRAME_GOLDEN],
            .sign_bias_alternate         = s->sign_bias[VP8_FRAME_ALTREF],

            .mb_no_coeff_skip            = s->mbskip_enabled,
            .loop_filter_disable         = s->filter.level == 0,
        },

        .prob_skip_false                 = s->prob->mbskip,
        .prob_intra                      = s->prob->intra,
        .prob_last                       = s->prob->last,
        .prob_gf                         = s->prob->golden,
    };

    for (i = 0; i < 3; i++)
        pp.mb_segment_tree_probs[i] = s->prob->segmentid[i];

    /* Resolve the loop-filter level of each of the four segments: with
     * segmentation the per-segment value is used (added to the frame level
     * unless the values are absolute), otherwise the frame level.  The
     * result is clipped to an unsigned 6-bit range. */
    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            pp.loop_filter_level[i] = s->segmentation.filter_level[i];
            if (!s->segmentation.absolute_vals)
                pp.loop_filter_level[i] += s->filter.level;
        } else {
            pp.loop_filter_level[i] = s->filter.level;
        }
        pp.loop_filter_level[i] = av_clip_uintp2(pp.loop_filter_level[i], 6);
    }

    for (i = 0; i < 4; i++) {
        pp.loop_filter_deltas_ref_frame[i] = s->lf_delta.ref[i];
        /* NOTE(review): the mode deltas are read starting at index 4 of
         * s->lf_delta.mode; presumably the first four entries are not part
         * of what VA-API expects -- confirm against vp8.h. */
        pp.loop_filter_deltas_mode[i] = s->lf_delta.mode[i + 4];
    }

    if (s->keyframe) {
        /* Key frames use these fixed mode probabilities instead of the
         * ones carried in the decoder state. */
        static const uint8_t keyframe_y_mode_probs[4] = {
            145, 156, 163, 128
        };
        static const uint8_t keyframe_uv_mode_probs[3] = {
            142, 114, 183
        };
        memcpy(pp.y_mode_probs,  keyframe_y_mode_probs,  4);
        memcpy(pp.uv_mode_probs, keyframe_uv_mode_probs, 3);
    } else {
        for (i = 0; i < 4; i++)
            pp.y_mode_probs[i]  = s->prob->pred16x16[i];
        for (i = 0; i < 3; i++)
            pp.uv_mode_probs[i] = s->prob->pred8x8c[i];
    }
    for (i = 0; i < 2; i++)
        for (j = 0; j < 19; j++)
            pp.mv_probs[i][j] = s->prob->mvc[i][j];

    /* Hand over the bool-coder state captured at the end of the header. */
    pp.bool_coder_ctx.range = s->coder_state_at_header_end.range;
    pp.bool_coder_ctx.value = s->coder_state_at_header_end.value;
    pp.bool_coder_ctx.count = s->coder_state_at_header_end.bit_count;

    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
                                            VAPictureParameterBufferType,
                                            &pp, sizeof(pp));
    if (err < 0)
        goto fail;

    for (i = 0; i < 4; i++) {
        for (j = 0; j < 8; j++) {
            /* Index into the second dimension of s->prob->token used for
             * VA-API band j.
             * NOTE(review): presumably one representative coefficient
             * position per band -- confirm against the VP8 decoder. */
            static const int coeff_bands_inverse[8] = {
                0, 1, 2, 3, 5, 6, 4, 15
            };
            int coeff_pos = coeff_bands_inverse[j];

            for (k = 0; k < 3; k++) {
                memcpy(prob.dct_coeff_probs[i][j][k],
                       s->prob->token[i][coeff_pos][k], 11);
            }
        }
    }

    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
                                            VAProbabilityBufferType,
                                            &prob, sizeof(prob));
    if (err < 0)
        goto fail;

    /* Six quantiser indices per segment: the base index followed by the base
     * plus each delta, every one clipped to an unsigned 7-bit range.  Unlike
     * the loop-filter levels above, segmentation.enabled is not consulted. */
    for (i = 0; i < 4; i++) {
        int base_qi = s->segmentation.base_quant[i];
        if (!s->segmentation.absolute_vals)
            base_qi += s->quant.yac_qi;

        quant.quantization_index[i][0] = av_clip_uintp2(base_qi,                       7);
        quant.quantization_index[i][1] = av_clip_uintp2(base_qi + s->quant.ydc_delta,  7);
        quant.quantization_index[i][2] = av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7);
        quant.quantization_index[i][3] = av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7);
        quant.quantization_index[i][4] = av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7);
        quant.quantization_index[i][5] = av_clip_uintp2(base_qi + s->quant.uvac_delta, 7);
    }

    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
                                            VAIQMatrixBufferType,
                                            &quant, sizeof(quant));
    if (err < 0)
        goto fail;

    return 0;

fail:
    ff_vaapi_decode_cancel(avctx, pic);
    return err;
}

/**
 * Submit the buffers queued for the current frame via ff_vaapi_decode_issue()
 * and return its result.
 */
static int vaapi_vp8_end_frame(AVCodecContext *avctx)
{
    const VP8Context *s = avctx->priv_data;
    VAAPIDecodePicture *pic = s->framep[VP8_FRAME_CURRENT]->hwaccel_picture_private;

    return ff_vaapi_decode_issue(avctx, pic);
}

/**
 * Queue the slice parameter and slice data buffers for the frame in buffer.
 *
 * The data handed to the hardware starts after the frame header, which is
 * 3 bytes long plus 7 more bytes on key frames.  No check that size is at
 * least that long is made here.
 *
 * @return 0 on success; on failure the picture is cancelled with
 *         ff_vaapi_decode_cancel() and the error code is returned.
 */
static int vaapi_vp8_decode_slice(AVCodecContext *avctx,
                                  const uint8_t  *buffer,
                                  uint32_t        size)
{
    const VP8Context *s = avctx->priv_data;
    VAAPIDecodePicture *pic = s->framep[VP8_FRAME_CURRENT]->hwaccel_picture_private;
    VASliceParameterBufferVP8 sp;
    int err, i;

    unsigned int header_size = 3 + 7 * s->keyframe;
    const uint8_t *data = buffer + header_size;
    unsigned int data_size = size - header_size;

    sp = (VASliceParameterBufferVP8) {
        .slice_data_size   = data_size,
        .slice_data_offset = 0,
        .slice_data_flag   = VA_SLICE_DATA_FLAG_ALL,

        /* Offset in bits from the start of data, derived from where the
         * bool coder stood at the end of the header. */
        .macroblock_offset = (8 * (s->coder_state_at_header_end.input - data) -
                              s->coder_state_at_header_end.bit_count - 8),
        .num_of_partitions = s->num_coeff_partitions + 1,
    };

    /* The first partition is what is left of the header partition after
     * macroblock_offset, rounded up to whole bytes. */
    sp.partition_size[0] = s->header_partition_size - ((sp.macroblock_offset + 7) / 8);
    /* All eight coefficient partition sizes are copied, whatever
     * num_coeff_partitions is. */
    for (i = 0; i < 8; i++)
        sp.partition_size[i+1] = s->coeff_partition_size[i];

    err = ff_vaapi_decode_make_slice_buffer(avctx, pic, &sp, sizeof(sp), data, data_size);
    if (err)
        goto fail;

    return 0;

fail:
    ff_vaapi_decode_cancel(avctx, pic);
    return err;
}

/** Hardware accelerator descriptor wiring the VP8 callbacks above to the
 *  shared VAAPI init/uninit/frame_params helpers. */
const AVHWAccel ff_vp8_vaapi_hwaccel = {
    .name                 = "vp8_vaapi",
    .type                 = AVMEDIA_TYPE_VIDEO,
    .id                   = AV_CODEC_ID_VP8,
    .pix_fmt              = AV_PIX_FMT_VAAPI,
    .start_frame          = &vaapi_vp8_start_frame,
    .end_frame            = &vaapi_vp8_end_frame,
    .decode_slice         = &vaapi_vp8_decode_slice,
    .frame_priv_data_size = sizeof(VAAPIDecodePicture),
    .init                 = &ff_vaapi_decode_init,
    .uninit               = &ff_vaapi_decode_uninit,
    .frame_params         = &ff_vaapi_common_frame_params,
    .priv_data_size       = sizeof(VAAPIDecodeContext),
    .caps_internal        = HWACCEL_CAP_ASYNC_SAFE,
};
#include "libavutil/pixdesc.h"

#include "hwconfig.h"
#include "vaapi_decode.h"
#include "vp9shared.h"

/**
 * Return the VA surface backing a VP9 frame, or VA_INVALID_SURFACE when the
 * frame pointer is NULL.
 */
static VASurfaceID vaapi_vp9_surface_id(const VP9Frame *vf)
{
    if (vf)
        return ff_vaapi_get_surface_id(vf->tf.f);
    else
        return VA_INVALID_SURFACE;
}

/**
 * Build and queue the picture parameter buffer for the current VP9 frame.
 *
 * The buffer/size arguments are unused; everything is read from the shared
 * VP9 context and from the pixel format descriptor of avctx->sw_pix_fmt.
 * The descriptor is dereferenced without a NULL check.
 *
 * @return 0 on success; on failure the picture is cancelled with
 *         ff_vaapi_decode_cancel() and the error code is returned.
 */
static int vaapi_vp9_start_frame(AVCodecContext          *avctx,
                                 av_unused const uint8_t *buffer,
                                 av_unused uint32_t       size)
{
    const VP9SharedContext *h = avctx->priv_data;
    VAAPIDecodePicture *pic = h->frames[CUR_FRAME].hwaccel_picture_private;
    VADecPictureParameterBufferVP9 pic_param;
    const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(avctx->sw_pix_fmt);
    int err, i;

    pic->output_surface = vaapi_vp9_surface_id(&h->frames[CUR_FRAME]);

    pic_param = (VADecPictureParameterBufferVP9) {
        .frame_width                      = avctx->width,
        .frame_height                     = avctx->height,

        .pic_fields.bits = {
            .subsampling_x                = pixdesc->log2_chroma_w,
            .subsampling_y                = pixdesc->log2_chroma_h,
            /* Deliberately inverted relative to h->h.keyframe.
             * NOTE(review): presumably VA-API uses 0 for key frames here;
             * confirm against va_dec_vp9.h. */
            .frame_type                   = !h->h.keyframe,
            .show_frame                   = !h->h.invisible,
            .error_resilient_mode         =  h->h.errorres,
            .intra_only                   =  h->h.intraonly,
            .allow_high_precision_mv      =  h->h.keyframe ? 0 : h->h.highprecisionmvs,
            /* Swaps the values 0 and 1 and leaves every other value as is.
             * NOTE(review): presumably translates between the decoder's and
             * VA-API's filter numbering -- confirm. */
            .mcomp_filter_type            =  h->h.filtermode ^ (h->h.filtermode <= 1),
            .frame_parallel_decoding_mode =  h->h.parallelmode,
            .reset_frame_context          =  h->h.resetctx,
            .refresh_frame_context        =  h->h.refreshctx,
            .frame_context_idx            =  h->h.framectxid,

            .segmentation_enabled         =  h->h.segmentation.enabled,
            .segmentation_temporal_update =  h->h.segmentation.temporal,
            .segmentation_update_map      =  h->h.segmentation.update_map,

            .last_ref_frame               =  h->h.refidx[0],
            .last_ref_frame_sign_bias     =  h->h.signbias[0],
            .golden_ref_frame             =  h->h.refidx[1],
            .golden_ref_frame_sign_bias   =  h->h.signbias[1],
            .alt_ref_frame                =  h->h.refidx[2],
            .alt_ref_frame_sign_bias      =  h->h.signbias[2],
            .lossless_flag                =  h->h.lossless,
        },

        .filter_level                     = h->h.filter.level,
        .sharpness_level                  = h->h.filter.sharpness,
        .log2_tile_rows                   = h->h.tiling.log2_tile_rows,
        .log2_tile_columns                = h->h.tiling.log2_tile_cols,

        .frame_header_length_in_bytes     = h->h.uncompressed_header_size,
        .first_partition_size             = h->h.compressed_header_size,

        .profile                          = h->h.profile,
        .bit_depth                        = h->h.bpp,
    };

    for (i = 0; i < 7; i++)
        pic_param.mb_segment_tree_probs[i] = h->h.segmentation.prob[i];

    /* Without temporal segmentation prediction every byte of the prediction
     * probabilities is set to 255. */
    if (h->h.segmentation.temporal) {
        for (i = 0; i < 3; i++)
            pic_param.segment_pred_probs[i] = h->h.segmentation.pred_prob[i];
    } else {
        memset(pic_param.segment_pred_probs, 255, sizeof(pic_param.segment_pred_probs));
    }

    /* Reference slots that hold no buffer are reported as VA_INVALID_ID. */
    for (i = 0; i < 8; i++) {
        if (h->refs[i].f->buf[0])
            pic_param.reference_frames[i] = ff_vaapi_get_surface_id(h->refs[i].f);
        else
            pic_param.reference_frames[i] = VA_INVALID_ID;
    }

    err = ff_vaapi_decode_make_param_buffer(avctx, pic,
                                            VAPictureParameterBufferType,
                                            &pic_param, sizeof(pic_param));
    if (err < 0) {
        ff_vaapi_decode_cancel(avctx, pic);
        return err;
    }

    return 0;
}

/**
 * Submit the buffers queued for the current frame via ff_vaapi_decode_issue()
 * and return its result.
 */
static int vaapi_vp9_end_frame(AVCodecContext *avctx)
{
    const VP9SharedContext *h = avctx->priv_data;
    VAAPIDecodePicture *pic = h->frames[CUR_FRAME].hwaccel_picture_private;

    return ff_vaapi_decode_issue(avctx, pic);
}

/**
 * Queue the slice parameter and slice data buffers for one frame.
 *
 * The whole of buffer (size bytes, offset 0) is passed as slice data, and the
 * per-segment parameters of all eight segments are filled in from the header.
 *
 * @return 0 on success; on failure the picture is cancelled with
 *         ff_vaapi_decode_cancel() and the error code is returned.
 */
static int vaapi_vp9_decode_slice(AVCodecContext *avctx,
                                  const uint8_t  *buffer,
                                  uint32_t        size)
{
    const VP9SharedContext *h = avctx->priv_data;
    VAAPIDecodePicture *pic = h->frames[CUR_FRAME].hwaccel_picture_private;
    VASliceParameterBufferVP9 slice_param;
    int err, i;

    slice_param = (VASliceParameterBufferVP9) {
        .slice_data_size   = size,
        .slice_data_offset = 0,
        .slice_data_flag   = VA_SLICE_DATA_FLAG_ALL,
    };

    for (i = 0; i < 8; i++) {
        slice_param.seg_param[i] = (VASegmentParameterVP9) {
            .segment_flags.fields = {
                .segment_reference_enabled = h->h.segmentation.feat[i].ref_enabled,
                .segment_reference         = h->h.segmentation.feat[i].ref_val,
                .segment_reference_skipped = h->h.segmentation.feat[i].skip_enabled,
            },

            .luma_dc_quant_scale           = h->h.segmentation.feat[i].qmul[0][0],
            .luma_ac_quant_scale           = h->h.segmentation.feat[i].qmul[0][1],
            .chroma_dc_quant_scale         = h->h.segmentation.feat[i].qmul[1][0],
            .chroma_ac_quant_scale         = h->h.segmentation.feat[i].qmul[1][1],
        };

        /* The copy length is taken from the destination array. */
        memcpy(slice_param.seg_param[i].filter_level, h->h.segmentation.feat[i].lflvl, sizeof(slice_param.seg_param[i].filter_level));
    }

    err = ff_vaapi_decode_make_slice_buffer(avctx, pic,
                                            &slice_param, sizeof(slice_param),
                                            buffer, size);
    if (err) {
        ff_vaapi_decode_cancel(avctx, pic);
        return err;
    }

    return 0;
}

/** Hardware accelerator descriptor wiring the VP9 callbacks above to the
 *  shared VAAPI init/uninit/frame_params helpers. */
const AVHWAccel ff_vp9_vaapi_hwaccel = {
    .name                 = "vp9_vaapi",
    .type                 = AVMEDIA_TYPE_VIDEO,
    .id                   = AV_CODEC_ID_VP9,
    .pix_fmt              = AV_PIX_FMT_VAAPI,
    .start_frame          = vaapi_vp9_start_frame,
    .end_frame            = vaapi_vp9_end_frame,
    .decode_slice         = vaapi_vp9_decode_slice,
    .frame_priv_data_size = sizeof(VAAPIDecodePicture),
    .init                 = ff_vaapi_decode_init,
    .uninit               = ff_vaapi_decode_uninit,
    .frame_params         = ff_vaapi_common_frame_params,
    .priv_data_size       = sizeof(VAAPIDecodeContext),
    .caps_internal        = HWACCEL_CAP_ASYNC_SAFE,
};
a/media/ffvpx/libavcodec/version.c b/media/ffvpx/libavcodec/version.c new file mode 100644 index 0000000000..d7966b2015 --- /dev/null +++ b/media/ffvpx/libavcodec/version.c @@ -0,0 +1,50 @@ +/* + * Version functions. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/avassert.h" +#include "avcodec.h" +#include "codec_id.h" +#include "version.h" + +#include "libavutil/ffversion.h" +const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; + +unsigned avcodec_version(void) +{ + av_assert0(AV_CODEC_ID_PCM_S8_PLANAR==65563); + av_assert0(AV_CODEC_ID_ADPCM_G722==69660); + av_assert0(AV_CODEC_ID_SRT==94216); + av_assert0(LIBAVCODEC_VERSION_MICRO >= 100); + + return LIBAVCODEC_VERSION_INT; +} + +const char *avcodec_configuration(void) +{ + return FFMPEG_CONFIGURATION; +} + +const char *avcodec_license(void) +{ +#define LICENSE_PREFIX "libavcodec license: " + return &LICENSE_PREFIX FFMPEG_LICENSE[sizeof(LICENSE_PREFIX) - 1]; +} diff --git a/media/ffvpx/libavcodec/version.h b/media/ffvpx/libavcodec/version.h new file mode 100644 index 0000000000..7aa95fc3f1 --- /dev/null +++ b/media/ffvpx/libavcodec/version.h @@ -0,0 +1,45 @@ +/* + * This file is part of FFmpeg. 
#ifndef AVCODEC_VERSION_H
#define AVCODEC_VERSION_H

/**
 * @file
 * @ingroup libavc
 * Libavcodec version macros.
 *
 * The major number comes from version_major.h; the minor and micro numbers
 * are defined here.
 */

#include "libavutil/version.h"

#include "version_major.h"

/* Minor and micro components of the library version. */
#define LIBAVCODEC_VERSION_MINOR   5
#define LIBAVCODEC_VERSION_MICRO 100

/* major.minor.micro combined by AV_VERSION_INT into a single integer. */
#define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                               LIBAVCODEC_VERSION_MINOR, \
                                               LIBAVCODEC_VERSION_MICRO)
/* major.minor.micro combined by AV_VERSION; stringified for the ident below. */
#define LIBAVCODEC_VERSION      AV_VERSION(LIBAVCODEC_VERSION_MAJOR,    \
                                           LIBAVCODEC_VERSION_MINOR,    \
                                           LIBAVCODEC_VERSION_MICRO)
/* Alias of LIBAVCODEC_VERSION_INT. */
#define LIBAVCODEC_BUILD        LIBAVCODEC_VERSION_INT

/* Identification string: "Lavc" followed by the stringified version. */
#define LIBAVCODEC_IDENT        "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)

#endif /* AVCODEC_VERSION_H */
#ifndef AVCODEC_VERSION_MAJOR_H
#define AVCODEC_VERSION_MAJOR_H

/**
 * @file
 * @ingroup libavc
 * Libavcodec version macros.
 */

/* Major component of the library version. */
#define LIBAVCODEC_VERSION_MAJOR  60

/**
 * FF_API_* defines may be placed below to indicate public API that will be
 * dropped at a future version bump. The defines themselves are not part of
 * the public API and may change, break or disappear at any time.
 *
 * @note When bumping the major version it is recommended to manually
 * disable each FF_API_* in its own commit instead of disabling them all
 * at once through the bump. This improves the git bisect-ability of the change.
 */

/* Every macro below evaluates to 1 while LIBAVCODEC_VERSION_MAJOR is below 61
 * (it is 60 above) and to 0 from major version 61 on. */
#define FF_API_INIT_PACKET         (LIBAVCODEC_VERSION_MAJOR < 61)
#define FF_API_IDCT_NONE           (LIBAVCODEC_VERSION_MAJOR < 61)
#define FF_API_SVTAV1_OPTS         (LIBAVCODEC_VERSION_MAJOR < 61)
#define FF_API_AYUV_CODECID        (LIBAVCODEC_VERSION_MAJOR < 61)
#define FF_API_VT_OUTPUT_CALLBACK  (LIBAVCODEC_VERSION_MAJOR < 61)
#define FF_API_AVCODEC_CHROMA_POS  (LIBAVCODEC_VERSION_MAJOR < 61)
#define FF_API_VT_HWACCEL_CONTEXT  (LIBAVCODEC_VERSION_MAJOR < 61)
#define FF_API_AVCTX_FRAME_NUMBER  (LIBAVCODEC_VERSION_MAJOR < 61)

// reminder to remove CrystalHD decoders on next major bump
#define FF_CODEC_CRYSTAL_HD        (LIBAVCODEC_VERSION_MAJOR < 61)

#endif /* AVCODEC_VERSION_MAJOR_H */
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/macros.h" +#include "videodsp.h" + +#define BIT_DEPTH 8 +#include "videodsp_template.c" +#undef BIT_DEPTH + +#define BIT_DEPTH 16 +#include "videodsp_template.c" +#undef BIT_DEPTH + +static void just_return(const uint8_t *buf, ptrdiff_t stride, int h) +{ +} + +av_cold void ff_videodsp_init(VideoDSPContext *ctx, int bpc) +{ + ctx->prefetch = just_return; + if (bpc <= 8) { + ctx->emulated_edge_mc = ff_emulated_edge_mc_8; + } else { + ctx->emulated_edge_mc = ff_emulated_edge_mc_16; + } + +#if ARCH_AARCH64 + ff_videodsp_init_aarch64(ctx, bpc); +#elif ARCH_ARM + ff_videodsp_init_arm(ctx, bpc); +#elif ARCH_PPC + ff_videodsp_init_ppc(ctx, bpc); +#elif ARCH_X86 + ff_videodsp_init_x86(ctx, bpc); +#elif ARCH_MIPS + ff_videodsp_init_mips(ctx, bpc); +#elif ARCH_LOONGARCH64 + ff_videodsp_init_loongarch(ctx, bpc); +#endif +} diff --git a/media/ffvpx/libavcodec/videodsp.h b/media/ffvpx/libavcodec/videodsp.h new file mode 100644 index 0000000000..e8960b609d --- /dev/null +++ b/media/ffvpx/libavcodec/videodsp.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2012 Ronald S. Bultje + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
#ifndef AVCODEC_VIDEODSP_H
#define AVCODEC_VIDEODSP_H

#include <stddef.h>
#include <stdint.h>

/*
 * Declare ff_emulated_edge_mc_<depth>() with the same parameter list as
 * VideoDSPContext.emulated_edge_mc. Only the 8-bit variant is declared here.
 */
#define EMULATED_EDGE(depth) \
void ff_emulated_edge_mc_ ## depth(uint8_t *dst, const uint8_t *src, \
                                   ptrdiff_t dst_stride, ptrdiff_t src_stride, \
                                   int block_w, int block_h,\
                                   int src_x, int src_y, int w, int h);

EMULATED_EDGE(8)

/**
 * Table of video DSP helper function pointers, filled in by
 * ff_videodsp_init().
 */
typedef struct VideoDSPContext {
    /**
     * Copy a rectangular area of samples to a temporary buffer and replicate
     * the border samples.
     *
     * @param dst destination buffer
     * @param src source buffer
     * @param dst_linesize number of bytes between 2 vertically adjacent
     *                     samples in the destination buffer
     * @param src_linesize number of bytes between 2 vertically adjacent
     *                     samples in the source buffer
     * @param block_w width of block
     * @param block_h height of block
     * @param src_x x coordinate of the top left sample of the block in the
     *                source buffer
     * @param src_y y coordinate of the top left sample of the block in the
     *                source buffer
     * @param w width of the source buffer
     * @param h height of the source buffer
     */
    void (*emulated_edge_mc)(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t dst_linesize,
                             ptrdiff_t src_linesize,
                             int block_w, int block_h,
                             int src_x, int src_y, int w, int h);

    /**
     * Prefetch memory into cache (if supported by hardware).
     *
     * @param buf    pointer to buffer to prefetch memory from
     * @param stride distance between two lines of buf (in bytes)
     * @param h      number of lines to prefetch
     */
    void (*prefetch)(const uint8_t *buf, ptrdiff_t stride, int h);
} VideoDSPContext;

/**
 * Initialize a VideoDSPContext.
 *
 * @param ctx context to fill in
 * @param bpc selects the sample size the helpers operate on
 *            (<= 8 picks the 8-bit routines, larger values the 16-bit ones)
 */
void ff_videodsp_init(VideoDSPContext *ctx, int bpc);

/* for internal use only (i.e. called by ff_videodsp_init()) */
void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc);
void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc);
void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc);
void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc);
void ff_videodsp_init_mips(VideoDSPContext *ctx, int bpc);
void ff_videodsp_init_loongarch(VideoDSPContext *ctx, int bpc);

#endif /* AVCODEC_VIDEODSP_H */
static
#endif
/*
 * Copy a block_w x block_h block whose top-left sample is at (src_x, src_y)
 * of a w x h source picture into buf, replicating the nearest picture
 * samples for every block position that lies outside the picture.
 *
 * src points at the sample at (src_x, src_y), which may lie outside the
 * picture. Line sizes are in bytes. Does nothing when w or h is 0.
 *
 * NOTE(review): FUNC() and the `pixel` type presumably come from
 * bit_depth_template.c according to BIT_DEPTH; they are not defined in
 * this file.
 */
void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src,
                               ptrdiff_t buf_linesize,
                               ptrdiff_t src_linesize,
                               int block_w, int block_h,
                               int src_x, int src_y, int w, int h)
{
    int x, y;
    int start_y, start_x, end_y, end_x;

    if (!w || !h)
        return;

    av_assert2(block_w * sizeof(pixel) <= FFABS(buf_linesize));

    /* Clamp the block position vertically so that at least one block row
     * overlaps the picture; src is moved along with src_y. */
    if (src_y >= h) {
        src -= src_y * src_linesize;
        src += (h - 1) * src_linesize;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src -= src_y * src_linesize;
        src += (1 - block_h) * src_linesize;
        src_y = 1 - block_h;
    }
    /* Same clamping horizontally. */
    if (src_x >= w) {
        // The subtracted expression has an unsigned type and must thus not be negative
        src -= (1 + src_x - w) * sizeof(pixel);
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src += (1 - block_w - src_x) * sizeof(pixel);
        src_x = 1 - block_w;
    }

    /* [start, end) is the part of the block that lies inside the picture. */
    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y = FFMIN(block_h, h-src_y);
    end_x = FFMIN(block_w, w-src_x);
    av_assert2(start_y < end_y && block_h);
    av_assert2(start_x < end_x && block_w);

    /* From here on w is the number of samples copied per row. */
    w = end_x - start_x;
    src += start_y * src_linesize + start_x * (ptrdiff_t)sizeof(pixel);
    buf += start_x * sizeof(pixel);

    // top
    /* src is not advanced: the first valid row is repeated. */
    for (y = 0; y < start_y; y++) {
        memcpy(buf, src, w * sizeof(pixel));
        buf += buf_linesize;
    }

    // copy existing part
    for (; y < end_y; y++) {
        memcpy(buf, src, w * sizeof(pixel));
        src += src_linesize;
        buf += buf_linesize;
    }

    // bottom
    /* Step back to the last valid row and repeat it. */
    src -= src_linesize;
    for (; y < block_h; y++) {
        memcpy(buf, src, w * sizeof(pixel));
        buf += buf_linesize;
    }

    /* Rewind buf to the top-left of the block, then fill the left and right
     * margins of every row from the nearest copied sample. */
    buf -= block_h * buf_linesize + start_x * (ptrdiff_t)sizeof(pixel);
    while (block_h--) {
        pixel *bufp = (pixel *) buf;

        // left
        for(x = 0; x < start_x; x++) {
            bufp[x] = bufp[start_x];
        }

        // right
        for (x = end_x; x < block_w; x++) {
            bufp[x] = bufp[end_x - 1];
        }
        buf += buf_linesize;
    }
}
0000000000..96f2b28c7e --- /dev/null +++ b/media/ffvpx/libavcodec/vlc.c @@ -0,0 +1,378 @@ +/* + * API for creating VLC trees + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2010 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/error.h" +#include "libavutil/internal.h" +#include "libavutil/log.h" +#include "libavutil/macros.h" +#include "libavutil/mem.h" +#include "libavutil/qsort.h" +#include "libavutil/reverse.h" +#include "vlc.h" + +#define GET_DATA(v, table, i, wrap, size) \ +{ \ + const uint8_t *ptr = (const uint8_t *)table + i * wrap; \ + switch(size) { \ + case 1: \ + v = *(const uint8_t *)ptr; \ + break; \ + case 2: \ + v = *(const uint16_t *)ptr; \ + break; \ + case 4: \ + default: \ + av_assert1(size == 4); \ + v = *(const uint32_t *)ptr; \ + break; \ + } \ +} + + +static int alloc_table(VLC *vlc, int size, int use_static) +{ + int index = vlc->table_size; + + vlc->table_size += size; + if (vlc->table_size > vlc->table_allocated) { + if (use_static) + abort(); // cannot 
do anything, init_vlc() is used with too little memory + vlc->table_allocated += (1 << vlc->bits); + vlc->table = av_realloc_f(vlc->table, vlc->table_allocated, sizeof(*vlc->table)); + if (!vlc->table) { + vlc->table_allocated = 0; + vlc->table_size = 0; + return AVERROR(ENOMEM); + } + memset(vlc->table + vlc->table_allocated - (1 << vlc->bits), 0, sizeof(*vlc->table) << vlc->bits); + } + return index; +} + +#define LOCALBUF_ELEMS 1500 // the maximum currently needed is 1296 by rv34 + +static av_always_inline uint32_t bitswap_32(uint32_t x) +{ + return (uint32_t)ff_reverse[ x & 0xFF] << 24 | + (uint32_t)ff_reverse[(x >> 8) & 0xFF] << 16 | + (uint32_t)ff_reverse[(x >> 16) & 0xFF] << 8 | + (uint32_t)ff_reverse[ x >> 24]; +} + +typedef struct VLCcode { + uint8_t bits; + VLCBaseType symbol; + /** codeword, with the first bit-to-be-read in the msb + * (even if intended for a little-endian bitstream reader) */ + uint32_t code; +} VLCcode; + +static int vlc_common_init(VLC *vlc, int nb_bits, int nb_codes, + VLCcode **buf, int flags) +{ + vlc->bits = nb_bits; + vlc->table_size = 0; + if (flags & INIT_VLC_USE_NEW_STATIC) { + av_assert0(nb_codes <= LOCALBUF_ELEMS); + } else { + vlc->table = NULL; + vlc->table_allocated = 0; + } + if (nb_codes > LOCALBUF_ELEMS) { + *buf = av_malloc_array(nb_codes, sizeof(VLCcode)); + if (!*buf) + return AVERROR(ENOMEM); + } + + return 0; +} + +static int compare_vlcspec(const void *a, const void *b) +{ + const VLCcode *sa = a, *sb = b; + return (sa->code >> 1) - (sb->code >> 1); +} + +/** + * Build VLC decoding tables suitable for use with get_vlc(). + * + * @param vlc the context to be initialized + * + * @param table_nb_bits max length of vlc codes to store directly in this table + * (Longer codes are delegated to subtables.) + * + * @param nb_codes number of elements in codes[] + * + * @param codes descriptions of the vlc codes + * These must be ordered such that codes going into the same subtable are contiguous. 
/**
 * Build VLC decoding tables suitable for use with get_vlc().
 *
 * @param vlc the context to be initialized
 *
 * @param table_nb_bits max length of vlc codes to store directly in this table
 * (Longer codes are delegated to subtables.)
 *
 * @param nb_codes number of elements in codes[]
 *
 * @param codes descriptions of the vlc codes
 * These must be ordered such that codes going into the same subtable are contiguous.
 * Sorting by VLCcode.code is sufficient, though not necessary.
 * The bits/code fields of entries that go into subtables are
 * modified in place.
 *
 * @param flags INIT_VLC_* flags; INIT_VLC_USE_NEW_STATIC and
 * INIT_VLC_OUTPUT_LE are evaluated here
 *
 * @return index of the new table inside vlc->table, or a negative error code
 */
static int build_table(VLC *vlc, int table_nb_bits, int nb_codes,
                       VLCcode *codes, int flags)
{
    int table_size, table_index;
    VLCElem *table;

    if (table_nb_bits > 30)
        return AVERROR(EINVAL);
    table_size = 1 << table_nb_bits;
    table_index = alloc_table(vlc, table_size, flags & INIT_VLC_USE_NEW_STATIC);
    ff_dlog(NULL, "new table index=%d size=%d\n", table_index, table_size);
    if (table_index < 0)
        return table_index;
    table = &vlc->table[table_index];

    /* first pass: map codes and compute auxiliary table sizes */
    for (int i = 0; i < nb_codes; i++) {
        int n = codes[i].bits;
        uint32_t code = codes[i].code;
        int symbol = codes[i].symbol;
        ff_dlog(NULL, "i=%d n=%d code=0x%"PRIx32"\n", i, n, code);
        if (n <= table_nb_bits) {
            /* no need to add another table */
            /* the code fills every slot whose leading n bits match it */
            int j = code >> (32 - table_nb_bits);
            int nb = 1 << (table_nb_bits - n);
            int inc = 1;

            if (flags & INIT_VLC_OUTPUT_LE) {
                /* LE reader: index by the bit-reversed code, matching
                 * slots are 1 << n apart */
                j = bitswap_32(code);
                inc = 1 << n;
            }
            for (int k = 0; k < nb; k++) {
                int bits = table[j].len;
                int oldsym = table[j].sym;
                ff_dlog(NULL, "%4x: code=%d n=%d\n", j, i, n);
                /* a slot may only be written twice with identical content */
                if ((bits || oldsym) && (bits != n || oldsym != symbol)) {
                    av_log(NULL, AV_LOG_ERROR, "incorrect codes\n");
                    return AVERROR_INVALIDDATA;
                }
                table[j].len = n;
                table[j].sym = symbol;
                j += inc;
            }
        } else {
            /* fill auxiliary table recursively */
            uint32_t code_prefix;
            int index, subtable_bits, j, k;

            /* strip the prefix handled by this table from codes[i] and from
             * all following codes that share it */
            n -= table_nb_bits;
            code_prefix = code >> (32 - table_nb_bits);
            subtable_bits = n;
            codes[i].bits = n;
            codes[i].code = code << table_nb_bits;
            for (k = i + 1; k < nb_codes; k++) {
                n = codes[k].bits - table_nb_bits;
                if (n <= 0)
                    break;
                code = codes[k].code;
                if (code >> (32 - table_nb_bits) != code_prefix)
                    break;
                codes[k].bits = n;
                codes[k].code = code << table_nb_bits;
                subtable_bits = FFMAX(subtable_bits, n);
            }
            subtable_bits = FFMIN(subtable_bits, table_nb_bits);
            j = (flags & INIT_VLC_OUTPUT_LE) ? bitswap_32(code_prefix) >> (32 - table_nb_bits) : code_prefix;
            /* a negative len marks a link: -len is the subtable's bit count,
             * sym (set below) its index in vlc->table */
            table[j].len = -subtable_bits;
            ff_dlog(NULL, "%4x: n=%d (subtable)\n",
                    j, codes[i].bits + table_nb_bits);
            index = build_table(vlc, subtable_bits, k-i, codes+i, flags);
            if (index < 0)
                return index;
            /* note: realloc has been done, so reload tables */
            table = &vlc->table[table_index];
            table[j].sym = index;
            /* fails when index does not survive the store into the sym field */
            if (table[j].sym != index) {
                avpriv_request_sample(NULL, "strange codes");
                return AVERROR_PATCHWELCOME;
            }
            i = k-1;
        }
    }

    /* slots no code mapped to keep len == 0 and get sym = -1 */
    for (int i = 0; i < table_size; i++) {
        if (table[i].len == 0)
            table[i].sym = -1;
    }

    return table_index;
}

/*
 * Shared epilogue of the ff_init_vlc_*() functions: build the table from the
 * collected codes and release temporary storage.
 *
 * Static tables: building must succeed (asserted); a size mismatch between
 * the used and the provided table is logged unless flag bit 1 (the part of
 * INIT_VLC_STATIC_OVERLONG besides INIT_VLC_USE_NEW_STATIC) is set.
 * Dynamic tables: codes is freed unless it is localbuf, and vlc->table is
 * freed on failure.
 *
 * Returns 0 on success, a negative error code on failure.
 */
static int vlc_common_end(VLC *vlc, int nb_bits, int nb_codes, VLCcode *codes,
                          int flags, VLCcode localbuf[LOCALBUF_ELEMS])
{
    int ret = build_table(vlc, nb_bits, nb_codes, codes, flags);

    if (flags & INIT_VLC_USE_NEW_STATIC) {
        if (vlc->table_size != vlc->table_allocated &&
            !(flags & (INIT_VLC_STATIC_OVERLONG & ~INIT_VLC_USE_NEW_STATIC)))
            av_log(NULL, AV_LOG_ERROR, "needed %d had %d\n", vlc->table_size, vlc->table_allocated);
        av_assert0(ret >= 0);
    } else {
        if (codes != localbuf)
            av_free(codes);
        if (ret < 0) {
            av_freep(&vlc->table);
            return ret;
        }
    }
    return 0;
}
/* Build VLC decoding tables suitable for use with get_vlc().

   'nb_bits' sets the decoding table size (2^nb_bits) entries. The
   bigger it is, the faster is the decoding. But it should not be too
   big to save memory and L1 cache. '9' is a good compromise.

   'nb_codes' : number of vlc codes

   'bits' : table which gives the size (in bits) of each vlc code.
   Entries with a size of 0 are skipped.

   'codes' : table which gives the bit pattern of each vlc code.

   'symbols' : table which gives the values to be returned from get_vlc().
   May be NULL, in which case the index of the code is used.

   'xxx_wrap' : gives the number of bytes between each entry of the
   'bits' or 'codes' tables.

   'xxx_size' : gives the number of bytes of each entry of the 'bits'
   or 'codes' tables. Currently 1, 2 and 4 are supported
   ('symbols_size' must be at most 2).

   'wrap' and 'size' make it possible to use any memory configuration and types
   (byte/word/long) to store the 'bits', 'codes', and 'symbols' tables.

   Returns 0 on success and a negative AVERROR code on failure
   (AVERROR(EINVAL) for an overlong or out-of-range code).
*/
int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
                       const void *bits, int bits_wrap, int bits_size,
                       const void *codes, int codes_wrap, int codes_size,
                       const void *symbols, int symbols_wrap, int symbols_size,
                       int flags)
{
    VLCcode localbuf[LOCALBUF_ELEMS], *buf = localbuf;
    int j, ret;

    ret = vlc_common_init(vlc, nb_bits, nb_codes, &buf, flags);
    if (ret < 0)
        return ret;

    av_assert0(symbols_size <= 2 || !symbols);
    j = 0;
/* Append to buf[] every input code whose length `len` satisfies `condition`.
 * Codes are stored left-aligned in 32 bits (bit-reversed instead when the
 * input is little-endian). Returns from the enclosing function on invalid
 * input, freeing buf if it was heap-allocated. */
#define COPY(condition)\
    for (int i = 0; i < nb_codes; i++) { \
        unsigned len; \
        GET_DATA(len, bits, i, bits_wrap, bits_size); \
        if (!(condition)) \
            continue; \
        if (len > 3*nb_bits || len > 32) { \
            av_log(NULL, AV_LOG_ERROR, "Too long VLC (%u) in init_vlc\n", len);\
            if (buf != localbuf) \
                av_free(buf); \
            return AVERROR(EINVAL); \
        } \
        buf[j].bits = len; \
        GET_DATA(buf[j].code, codes, i, codes_wrap, codes_size); \
        if (buf[j].code >= (1LL<<buf[j].bits)) { \
            av_log(NULL, AV_LOG_ERROR, "Invalid code %"PRIx32" for %d in " \
                   "init_vlc\n", buf[j].code, i); \
            if (buf != localbuf) \
                av_free(buf); \
            return AVERROR(EINVAL); \
        } \
        if (flags & INIT_VLC_INPUT_LE) \
            buf[j].code = bitswap_32(buf[j].code); \
        else \
            buf[j].code <<= 32 - buf[j].bits; \
        if (symbols) \
            GET_DATA(buf[j].symbol, symbols, i, symbols_wrap, symbols_size) \
        else \
            buf[j].symbol = i; \
        j++; \
    }
    /* Codes longer than nb_bits go first and are sorted; the short codes
     * are appended afterwards in input order. */
    COPY(len > nb_bits);
    // qsort is the slowest part of init_vlc, and could probably be improved or avoided
    AV_QSORT(buf, j, struct VLCcode, compare_vlcspec);
    COPY(len && len <= nb_bits);
    nb_codes = j;

    return vlc_common_end(vlc, nb_bits, nb_codes, buf,
                          flags, localbuf);
}

/*
 * Build VLC decoding tables from code lengths alone; the codes are assigned
 * consecutively in input order.
 *
 * A length > 0 emits a code, a length < 0 skips code space of length -len
 * without emitting a symbol, and a length of 0 is ignored. Lengths above
 * min(32, 3 * nb_bits), misaligned codes and a code space that overflows
 * 32 bits are rejected.
 *
 * Returns 0 on success, AVERROR_INVALIDDATA for invalid lengths, or another
 * negative AVERROR code from allocation/table building.
 */
int ff_init_vlc_from_lengths(VLC *vlc, int nb_bits, int nb_codes,
                             const int8_t *lens, int lens_wrap,
                             const void *symbols, int symbols_wrap, int symbols_size,
                             int offset, int flags, void *logctx)
{
    VLCcode localbuf[LOCALBUF_ELEMS], *buf = localbuf;
    /* next free code, left-aligned in 32 bits; 64 bits wide so that a
     * completely used code space (1 << 32) is representable */
    uint64_t code;
    int ret, j, len_max = FFMIN(32, 3 * nb_bits);

    ret = vlc_common_init(vlc, nb_bits, nb_codes, &buf, flags);
    if (ret < 0)
        return ret;

    j = code = 0;
    for (int i = 0; i < nb_codes; i++, lens += lens_wrap) {
        int len = *lens;
        if (len > 0) {
            unsigned sym;

            buf[j].bits = len;
            if (symbols)
                GET_DATA(sym, symbols, i, symbols_wrap, symbols_size)
            else
                sym = i;
            buf[j].symbol = sym + offset;
            buf[j++].code = code;
        } else if (len < 0) {
            len = -len;
        } else
            continue;
        /* the current code must be aligned to a multiple of its own size */
        if (len > len_max || code & ((1U << (32 - len)) - 1)) {
            av_log(logctx, AV_LOG_ERROR, "Invalid VLC (length %u)\n", len);
            goto fail;
        }
        code += 1U << (32 - len);
        if (code > UINT32_MAX + 1ULL) {
            av_log(logctx, AV_LOG_ERROR, "Overdetermined VLC tree\n");
            goto fail;
        }
    }
    return vlc_common_end(vlc, nb_bits, j, buf, flags, localbuf);
fail:
    if (buf != localbuf)
        av_free(buf);
    return AVERROR_INVALIDDATA;
}

/* Free the decoding table of a VLC and reset its table pointer via av_freep(). */
void ff_free_vlc(VLC *vlc)
{
    av_freep(&vlc->table);
}
#ifndef AVCODEC_VLC_H
#define AVCODEC_VLC_H

#include <stdint.h>

// When changing this, be sure to also update tableprint_vlc.h accordingly.
typedef int16_t VLCBaseType;

/**
 * One entry of a VLC decoding table, as written by build_table() in vlc.c:
 * - len > 0: a code of len bits decoding to symbol sym;
 * - len < 0: link to a subtable of -len bits starting at table index sym;
 * - len == 0: unused entry (sym is -1).
 */
typedef struct VLCElem {
    VLCBaseType sym, len;
} VLCElem;

typedef struct VLC {
    int bits;                        ///< number of bits indexing the top-level table
    VLCElem *table;                  ///< decoding table, subtables included
    int table_size, table_allocated; ///< used / available number of entries
} VLC;

typedef struct RL_VLC_ELEM {
    int16_t level;
    int8_t len;
    uint8_t run;
} RL_VLC_ELEM;

/* ff_init_vlc_sparse() without a symbols table: code i decodes to symbol i. */
#define init_vlc(vlc, nb_bits, nb_codes, \
                 bits, bits_wrap, bits_size, \
                 codes, codes_wrap, codes_size, \
                 flags) \
    ff_init_vlc_sparse(vlc, nb_bits, nb_codes, \
                       bits, bits_wrap, bits_size, \
                       codes, codes_wrap, codes_size, \
                       NULL, 0, 0, flags)

/**
 * Build VLC decoding tables from explicit code lengths, code words and
 * (optionally) symbols; see the comment at the definition in vlc.c for the
 * meaning of the individual parameters.
 *
 * @return 0 on success, a negative AVERROR code on failure
 */
int ff_init_vlc_sparse(VLC *vlc, int nb_bits, int nb_codes,
                       const void *bits, int bits_wrap, int bits_size,
                       const void *codes, int codes_wrap, int codes_size,
                       const void *symbols, int symbols_wrap, int symbols_size,
                       int flags);

/**
 * Build VLC decoding tables suitable for use with get_vlc2()
 *
 * This function takes lengths and symbols and calculates the codes from them.
 * For this the input lengths and symbols have to be sorted according to "left
 * nodes in the corresponding tree first".
 *
 * @param[in,out] vlc The VLC to be initialized; table and table_allocated
 * must have been set when initializing a static VLC,
 * otherwise this will be treated as uninitialized.
 * @param[in] nb_bits The number of bits to use for the VLC table;
 * higher values take up more memory and cache, but
 * allow codes to be read with fewer reads.
 * @param[in] nb_codes The number of provided length and (if supplied) symbol
 * entries.
 * @param[in] lens The lengths of the codes. Entries > 0 correspond to
 * valid codes; entries == 0 will be skipped and entries
 * with len < 0 indicate that the tree is incomplete and
 * has an open end of length -len at this position.
 * @param[in] lens_wrap Stride (in bytes) of the lengths.
 * @param[in] symbols The symbols, i.e. what is returned from get_vlc2()
 * when the corresponding code is encountered.
 * May be NULL, then 0, 1, 2, 3, 4,... will be used.
 * @param[in] symbols_wrap Stride (in bytes) of the symbols.
 * @param[in] symbols_size Size of the symbols. 1 and 2 are supported.
 * @param[in] offset An offset to apply to all the valid symbols.
 * @param[in] flags A combination of the INIT_VLC_* flags; notice that
 * INIT_VLC_INPUT_LE is pointless and ignored.
 * @param[in] logctx Context passed to av_log() for error messages;
 * may be NULL.
 *
 * @return 0 on success, a negative AVERROR code on failure
 */
int ff_init_vlc_from_lengths(VLC *vlc, int nb_bits, int nb_codes,
                             const int8_t *lens, int lens_wrap,
                             const void *symbols, int symbols_wrap, int symbols_size,
                             int offset, int flags, void *logctx);

/** Free the table of a VLC. */
void ff_free_vlc(VLC *vlc);

/* If INIT_VLC_INPUT_LE is set, the LSB bit of the codes used to
 * initialize the VLC table is the first bit to be read. */
#define INIT_VLC_INPUT_LE 2
/* If set the VLC is intended for a little endian bitstream reader. */
#define INIT_VLC_OUTPUT_LE 8
#define INIT_VLC_LE (INIT_VLC_INPUT_LE | INIT_VLC_OUTPUT_LE)
/* The caller provides vlc->table and vlc->table_allocated; the table is
 * never reallocated, and running out of room aborts. */
#define INIT_VLC_USE_NEW_STATIC 4
/* Static table that may be larger than needed: the extra bit suppresses the
 * "needed %d had %d" message logged on a size mismatch. */
#define INIT_VLC_STATIC_OVERLONG (1 | INIT_VLC_USE_NEW_STATIC)

/* Initialize vlc with ff_init_vlc_sparse(), backed by a function-local
 * static table of static_size entries. */
#define INIT_CUSTOM_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, \
                                      h, i, j, flags, static_size) \
    do { \
        static VLCElem table[static_size]; \
        (vlc)->table = table; \
        (vlc)->table_allocated = static_size; \
        ff_init_vlc_sparse(vlc, bits, a, b, c, d, e, f, g, h, i, j, \
                           flags | INIT_VLC_USE_NEW_STATIC); \
    } while (0)

#define INIT_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, h, i, j, static_size) \
    INIT_CUSTOM_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, \
                                  h, i, j, 0, static_size)

#define INIT_LE_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, h, i, j, static_size) \
    INIT_CUSTOM_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, \
                                  h, i, j, INIT_VLC_LE, static_size)

#define INIT_CUSTOM_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, flags, static_size) \
    INIT_CUSTOM_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, \
                                  NULL, 0, 0, flags, static_size)

#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size) \
    INIT_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, NULL, 0, 0, static_size)

#define INIT_LE_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size) \
    INIT_LE_VLC_SPARSE_STATIC(vlc, bits, a, b, c, d, e, f, g, NULL, 0, 0, static_size)

/* Same as INIT_CUSTOM_VLC_SPARSE_STATIC, but using
 * ff_init_vlc_from_lengths() with a NULL log context. */
#define INIT_VLC_STATIC_FROM_LENGTHS(vlc, bits, nb_codes, lens, len_wrap, \
                                     symbols, symbols_wrap, symbols_size, \
                                     offset, flags, static_size) \
    do { \
        static VLCElem table[static_size]; \
        (vlc)->table = table; \
        (vlc)->table_allocated = static_size; \
        ff_init_vlc_from_lengths(vlc, bits, nb_codes, lens, len_wrap, \
                                 symbols, symbols_wrap, symbols_size, \
                                 offset, flags | INIT_VLC_USE_NEW_STATIC, \
                                 NULL); \
    } while (0)

#endif /* AVCODEC_VLC_H */
mode 100644 index 0000000000..a7d15d4ce9 --- /dev/null +++ b/media/ffvpx/libavcodec/vorbis_parser.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2012 Justin Ruggles + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Vorbis audio parser + * + * Determines the duration for each packet. 
/* AVClass stored in every AVVorbisParseContext (see vorbis_parse_init()). */
static const AVClass vorbis_parser_class = {
    .class_name = "Vorbis parser",
    .item_name = av_default_item_name,
    .version = LIBAVUTIL_VERSION_INT,
};

/*
 * Validate the Id header and derive the two block sizes from it.
 *
 * The low and high nibble of byte 28 are the log2 of blocksize[0] and
 * blocksize[1] respectively.
 *
 * Returns 0 on success, AVERROR_INVALIDDATA if the header is too short, has
 * the wrong packet type or signature, or lacks the framing bit.
 */
static int parse_id_header(AVVorbisParseContext *s,
                           const uint8_t *buf, int buf_size)
{
    /* Id header should be 30 bytes */
    if (buf_size < 30) {
        av_log(s, AV_LOG_ERROR, "Id header is too short\n");
        return AVERROR_INVALIDDATA;
    }

    /* make sure this is the Id header */
    if (buf[0] != 1) {
        av_log(s, AV_LOG_ERROR, "Wrong packet type in Id header\n");
        return AVERROR_INVALIDDATA;
    }

    /* check for header signature */
    if (memcmp(&buf[1], "vorbis", 6)) {
        av_log(s, AV_LOG_ERROR, "Invalid packet signature in Id header\n");
        return AVERROR_INVALIDDATA;
    }

    if (!(buf[29] & 0x1)) {
        av_log(s, AV_LOG_ERROR, "Invalid framing bit in Id header\n");
        return AVERROR_INVALIDDATA;
    }

    s->blocksize[0] = 1 << (buf[28] & 0xF);
    s->blocksize[1] = 1 << (buf[28] >> 4);

    return 0;
}

/*
 * Extract the mode table from the Setup header without parsing the header
 * in full: the header is byte-reversed and scanned from its end.
 *
 * On success fills s->mode_count, s->mode_mask, s->prev_mask and
 * s->mode_blocksize[] and returns 0; otherwise returns AVERROR_INVALIDDATA
 * or AVERROR(ENOMEM).
 */
static int parse_setup_header(AVVorbisParseContext *s,
                              const uint8_t *buf, int buf_size)
{
    GetBitContext gb, gb0;
    uint8_t *rev_buf;
    int i, ret = 0;
    int got_framing_bit, mode_count, got_mode_header, last_mode_count = 0;

    /* avoid overread */
    if (buf_size < 7) {
        av_log(s, AV_LOG_ERROR, "Setup header is too short\n");
        return AVERROR_INVALIDDATA;
    }

    /* make sure this is the Setup header */
    if (buf[0] != 5) {
        av_log(s, AV_LOG_ERROR, "Wrong packet type in Setup header\n");
        return AVERROR_INVALIDDATA;
    }

    /* check for header signature */
    if (memcmp(&buf[1], "vorbis", 6)) {
        av_log(s, AV_LOG_ERROR, "Invalid packet signature in Setup header\n");
        return AVERROR_INVALIDDATA;
    }

    /* reverse bytes so we can easily read backwards with get_bits() */
    if (!(rev_buf = av_malloc(buf_size))) {
        av_log(s, AV_LOG_ERROR, "Out of memory\n");
        return AVERROR(ENOMEM);
    }
    for (i = 0; i < buf_size; i++)
        rev_buf[i] = buf[buf_size - 1 - i];
    init_get_bits(&gb, rev_buf, buf_size * 8);

    /* The first set bit in the reversed stream is taken as the framing bit;
     * its position (bits consumed so far) is remembered for the second pass. */
    got_framing_bit = 0;
    while (get_bits_left(&gb) > 97) {
        if (get_bits1(&gb)) {
            got_framing_bit = get_bits_count(&gb);
            break;
        }
    }
    if (!got_framing_bit) {
        av_log(s, AV_LOG_ERROR, "Invalid Setup header\n");
        ret = AVERROR_INVALIDDATA;
        goto bad_header;
    }

    /* Now we search backwards to find possible valid mode counts. This is not
     * fool-proof because we could have false positive matches and read too
     * far, but there isn't really any way to be sure without parsing through
     * all the many variable-sized fields before the modes. This approach seems
     * to work well in testing, and it is similar to how it is handled in
     * liboggz. */
    mode_count = 0;
    got_mode_header = 0;
    while (get_bits_left(&gb) >= 97) {
        /* One candidate mode entry: an 8-bit field that must not exceed 63
         * and two 16-bit fields that must be zero, followed by one bit that
         * is skipped here and read in the second pass. */
        if (get_bits(&gb, 8) > 63 || get_bits(&gb, 16) || get_bits(&gb, 16))
            break;
        skip_bits(&gb, 1);
        mode_count++;
        if (mode_count > 64)
            break;
        /* Peek, on a copy of the reader, at the 6 bits that would hold
         * (mode count - 1) if the mode list ended here. */
        gb0 = gb;
        if (get_bits(&gb0, 6) + 1 == mode_count) {
            got_mode_header = 1;
            last_mode_count = mode_count;
        }
    }
    if (!got_mode_header) {
        av_log(s, AV_LOG_ERROR, "Invalid Setup header\n");
        ret = AVERROR_INVALIDDATA;
        goto bad_header;
    }
    /* All samples I've seen use <= 2 modes, so ask for a sample if we find
     * more than that, as it is most likely a false positive. If we get any
     * we may need to approach this the long way and parse the whole Setup
     * header, but I hope very much that it never comes to that. */
    if (last_mode_count > 2) {
        avpriv_request_sample(s,
                              "%d modes (either a false positive or a "
                              "sample from an unknown encoder)",
                              last_mode_count);
    }
    /* We're limiting the mode count to 63 so that we know that the previous
     * block flag will be in the first packet byte. */
    if (last_mode_count > 63) {
        av_log(s, AV_LOG_ERROR, "Unsupported mode count: %d\n",
               last_mode_count);
        ret = AVERROR_INVALIDDATA;
        goto bad_header;
    }
    s->mode_count = mode_count = last_mode_count;
    /* Determine the number of bits required to code the mode and turn that
     * into a bitmask to directly access the mode from the first frame byte. */
    s->mode_mask = ((1 << (av_log2(mode_count - 1) + 1)) - 1) << 1;
    /* The previous window flag is the next bit after the mode */
    s->prev_mask = (s->mode_mask | 0x1) + 1;

    /* Second pass: restart after the framing bit and read, for each mode
     * (last mode first, since the stream is reversed), the bit that follows
     * the 40 bits checked above. */
    init_get_bits(&gb, rev_buf, buf_size * 8);
    skip_bits_long(&gb, got_framing_bit);
    for (i = mode_count - 1; i >= 0; i--) {
        skip_bits_long(&gb, 40);
        s->mode_blocksize[i] = get_bits1(&gb);
    }

bad_header:
    /* reached on success as well; ret is 0 in that case */
    av_free(rev_buf);
    return ret;
}

/*
 * Initialize s from Xiph-style extradata: split it into its three headers,
 * then parse the first (Id) and third (Setup) one.
 *
 * extradata_parsed is set on every attempt, valid_extradata only on success.
 * Returns 0 on success, a negative error code otherwise.
 */
static int vorbis_parse_init(AVVorbisParseContext *s,
                             const uint8_t *extradata, int extradata_size)
{
    const uint8_t *header_start[3];
    int header_len[3];
    int ret;

    s->class = &vorbis_parser_class;
    s->extradata_parsed = 1;

    if ((ret = avpriv_split_xiph_headers(extradata,
                                         extradata_size, 30,
                                         header_start, header_len)) < 0) {
        av_log(s, AV_LOG_ERROR, "Extradata corrupt.\n");
        return ret;
    }

    if ((ret = parse_id_header(s, header_start[0], header_len[0])) < 0)
        return ret;

    if ((ret = parse_setup_header(s, header_start[2], header_len[2])) < 0)
        return ret;

    s->valid_extradata = 1;
    /* start out with the block size of mode 0 as "previous" block size */
    s->previous_blocksize = s->blocksize[s->mode_blocksize[0]];

    return 0;
}
*/ + if (buf[0] == 1) + *flags |= VORBIS_FLAG_HEADER; + else if (buf[0] == 3) + *flags |= VORBIS_FLAG_COMMENT; + else if (buf[0] == 5) + *flags |= VORBIS_FLAG_SETUP; + else + goto bad_packet; + + /* Special packets have no duration. */ + return 0; + +bad_packet: + av_log(s, AV_LOG_ERROR, "Invalid packet\n"); + return AVERROR_INVALIDDATA; + } + if (s->mode_count == 1) + mode = 0; + else + mode = (buf[0] & s->mode_mask) >> 1; + if (mode >= s->mode_count) { + av_log(s, AV_LOG_ERROR, "Invalid mode in packet\n"); + return AVERROR_INVALIDDATA; + } + if(s->mode_blocksize[mode]){ + int flag = !!(buf[0] & s->prev_mask); + previous_blocksize = s->blocksize[flag]; + } + current_blocksize = s->blocksize[s->mode_blocksize[mode]]; + duration = (previous_blocksize + current_blocksize) >> 2; + s->previous_blocksize = current_blocksize; + } + + return duration; +} + +int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf, + int buf_size) +{ + return av_vorbis_parse_frame_flags(s, buf, buf_size, NULL); +} + +void av_vorbis_parse_reset(AVVorbisParseContext *s) +{ + if (s->valid_extradata) + s->previous_blocksize = s->blocksize[0]; +} + +void av_vorbis_parse_free(AVVorbisParseContext **s) +{ + av_freep(s); +} + +AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata, + int extradata_size) +{ + AVVorbisParseContext *s = av_mallocz(sizeof(*s)); + int ret; + + if (!s) + return NULL; + + ret = vorbis_parse_init(s, extradata, extradata_size); + if (ret < 0) { + av_vorbis_parse_free(&s); + return NULL; + } + + return s; +} + +#if CONFIG_VORBIS_PARSER + +typedef struct VorbisParseContext { + AVVorbisParseContext *vp; +} VorbisParseContext; + +static int vorbis_parse(AVCodecParserContext *s1, AVCodecContext *avctx, + const uint8_t **poutbuf, int *poutbuf_size, + const uint8_t *buf, int buf_size) +{ + VorbisParseContext *s = s1->priv_data; + int duration; + + if (!s->vp && avctx->extradata && avctx->extradata_size) { + s->vp = av_vorbis_parse_init(avctx->extradata, 
avctx->extradata_size); + } + if (!s->vp) + goto end; + + if ((duration = av_vorbis_parse_frame(s->vp, buf, buf_size)) >= 0) + s1->duration = duration; + +end: + /* always return the full packet. this parser isn't doing any splitting or + combining, only packet analysis */ + *poutbuf = buf; + *poutbuf_size = buf_size; + return buf_size; +} + +static void vorbis_parser_close(AVCodecParserContext *ctx) +{ + VorbisParseContext *s = ctx->priv_data; + av_vorbis_parse_free(&s->vp); +} + +const AVCodecParser ff_vorbis_parser = { + .codec_ids = { AV_CODEC_ID_VORBIS }, + .priv_data_size = sizeof(VorbisParseContext), + .parser_parse = vorbis_parse, + .parser_close = vorbis_parser_close, +}; +#endif /* CONFIG_VORBIS_PARSER */ diff --git a/media/ffvpx/libavcodec/vorbis_parser.h b/media/ffvpx/libavcodec/vorbis_parser.h new file mode 100644 index 0000000000..789932ac49 --- /dev/null +++ b/media/ffvpx/libavcodec/vorbis_parser.h @@ -0,0 +1,74 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * A public API for Vorbis parsing + * + * Determines the duration for each packet. 
+ */ + +#ifndef AVCODEC_VORBIS_PARSER_H +#define AVCODEC_VORBIS_PARSER_H + +#include <stdint.h> + +typedef struct AVVorbisParseContext AVVorbisParseContext; + +/** + * Allocate and initialize the Vorbis parser using headers in the extradata. + * + * @param extradata buffer holding the Vorbis headers + * @param extradata_size size of @p extradata + * @return the new parser context, or NULL if allocation or initialization + * from the extradata fails + */ +AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata, + int extradata_size); + +/** + * Free the parser and everything associated with it. + * + * @param s address of the context pointer; it is released with av_freep() + */ +void av_vorbis_parse_free(AVVorbisParseContext **s); + +#define VORBIS_FLAG_HEADER 0x00000001 +#define VORBIS_FLAG_COMMENT 0x00000002 +#define VORBIS_FLAG_SETUP 0x00000004 + +/** + * Get the duration for a Vorbis packet. + * + * If @p flags is @c NULL, + * special frames are considered invalid. + * + * @param s Vorbis parser context + * @param buf buffer containing a Vorbis frame + * @param buf_size size of the buffer + * @param flags flags for special frames + * @return the packet duration, 0 for a header, comment or setup packet + * reported through @p flags, or a negative AVERROR code + * (e.g. AVERROR_INVALIDDATA) for an invalid packet + */ +int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf, + int buf_size, int *flags); + +/** + * Get the duration for a Vorbis packet. + * + * Same as av_vorbis_parse_frame_flags() called with @p flags set to @c NULL. + * + * @param s Vorbis parser context + * @param buf buffer containing a Vorbis frame + * @param buf_size size of the buffer + * @return the packet duration, or a negative AVERROR code on error + */ +int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf, + int buf_size); + +/** + * Reset the remembered previous block size to the first entry of the + * parser's block-size table. Does nothing if the context holds no valid + * extradata. + * + * @param s Vorbis parser context + */ +void av_vorbis_parse_reset(AVVorbisParseContext *s); + +#endif /* AVCODEC_VORBIS_PARSER_H */ diff --git a/media/ffvpx/libavcodec/vorbis_parser_internal.h b/media/ffvpx/libavcodec/vorbis_parser_internal.h new file mode 100644 index 0000000000..691a842385 --- /dev/null +++ b/media/ffvpx/libavcodec/vorbis_parser_internal.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2012 Justin Ruggles + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Vorbis audio parser + * + * Determines the duration for each packet. + */ + +#ifndef AVCODEC_VORBIS_PARSER_INTERNAL_H +#define AVCODEC_VORBIS_PARSER_INTERNAL_H + +#include "avcodec.h" +#include "vorbis_parser.h" + +struct AVVorbisParseContext { + const AVClass *class; + int extradata_parsed; ///< we have attempted to parse extradata + int valid_extradata; ///< extradata is valid, so we can calculate duration + int blocksize[2]; ///< short and long window sizes + int previous_blocksize; ///< previous window size + int mode_blocksize[64]; ///< window size mapping for each mode + int mode_count; ///< number of modes + int mode_mask; ///< bitmask used to get the mode in each packet + int prev_mask; ///< bitmask used to get the previous mode flag in each packet +}; + +#endif /* AVCODEC_VORBIS_PARSER_INTERNAL_H */ diff --git a/media/ffvpx/libavcodec/vp3dsp.h b/media/ffvpx/libavcodec/vp3dsp.h new file mode 100644 index 0000000000..3b849ec05d --- /dev/null +++ b/media/ffvpx/libavcodec/vp3dsp.h @@ -0,0 +1,64 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VP3DSP_H +#define AVCODEC_VP3DSP_H + +#include <stddef.h> +#include <stdint.h> + +typedef struct VP3DSPContext { + /** + * Copy 8xH pixels from source to destination buffer using a bilinear + * filter with no rounding (i.e. *dst = (*a + *b) >> 1). + * + * @param dst destination buffer, aligned by 8 + * @param a first source buffer, no alignment + * @param b second source buffer, no alignment + * @param stride distance between two lines in source/dest buffers + * @param h height + */ + void (*put_no_rnd_pixels_l2)(uint8_t *dst, + const uint8_t *a, + const uint8_t *b, + ptrdiff_t stride, int h); + + void (*idct_put)(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void (*idct_add)(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void (*idct_dc_add)(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void (*v_loop_filter)(uint8_t *src, ptrdiff_t stride, int *bounding_values); + void (*h_loop_filter)(uint8_t *src, ptrdiff_t stride, int *bounding_values); + void (*v_loop_filter_unaligned)(uint8_t *src, ptrdiff_t stride, int *bounding_values); + void (*h_loop_filter_unaligned)(uint8_t *src, ptrdiff_t stride, int *bounding_values); +} VP3DSPContext; + +void ff_vp3dsp_v_loop_filter_12(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values); +void ff_vp3dsp_h_loop_filter_12(uint8_t *first_pixel, ptrdiff_t stride, int *bounding_values); + +void ff_vp3dsp_idct10_put(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_vp3dsp_idct10_add(uint8_t *dest, ptrdiff_t stride, 
int16_t *block); + +void ff_vp3dsp_init(VP3DSPContext *c, int flags); +void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags); +void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags); +void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags); +void ff_vp3dsp_init_mips(VP3DSPContext *c, int flags); + +void ff_vp3dsp_set_bounding_values(int * bound_values_array, int filter_limit); + +#endif /* AVCODEC_VP3DSP_H */ diff --git a/media/ffvpx/libavcodec/vp56.h b/media/ffvpx/libavcodec/vp56.h new file mode 100644 index 0000000000..9dc0b9c7ad --- /dev/null +++ b/media/ffvpx/libavcodec/vp56.h @@ -0,0 +1,256 @@ +/* + * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * VP5 and VP6 compatible video decoder (common features) + */ + +#ifndef AVCODEC_VP56_H +#define AVCODEC_VP56_H + +#include "libavutil/mem_internal.h" + +#include "avcodec.h" +#include "get_bits.h" +#include "hpeldsp.h" +#include "h264chroma.h" +#include "videodsp.h" +#include "vp3dsp.h" +#include "vp56dsp.h" +#include "vpx_rac.h" + +typedef struct vp56_context VP56Context; + +typedef enum { + VP56_FRAME_NONE =-1, + VP56_FRAME_CURRENT = 0, + VP56_FRAME_PREVIOUS = 1, + VP56_FRAME_GOLDEN = 2, +} VP56Frame; + +typedef enum { + VP56_MB_INTER_NOVEC_PF = 0, /**< Inter MB, no vector, from previous frame */ + VP56_MB_INTRA = 1, /**< Intra MB */ + VP56_MB_INTER_DELTA_PF = 2, /**< Inter MB, above/left vector + delta, from previous frame */ + VP56_MB_INTER_V1_PF = 3, /**< Inter MB, first vector, from previous frame */ + VP56_MB_INTER_V2_PF = 4, /**< Inter MB, second vector, from previous frame */ + VP56_MB_INTER_NOVEC_GF = 5, /**< Inter MB, no vector, from golden frame */ + VP56_MB_INTER_DELTA_GF = 6, /**< Inter MB, above/left vector + delta, from golden frame */ + VP56_MB_INTER_4V = 7, /**< Inter MB, 4 vectors, from previous frame */ + VP56_MB_INTER_V1_GF = 8, /**< Inter MB, first vector, from golden frame */ + VP56_MB_INTER_V2_GF = 9, /**< Inter MB, second vector, from golden frame */ +} VP56mb; + +typedef struct VP56Tree { + int8_t val; + int8_t prob_idx; +} VP56Tree; + +typedef struct VP56mv { + DECLARE_ALIGNED(4, int16_t, x); + int16_t y; +} VP56mv; + +#define VP56_SIZE_CHANGE 1 + +typedef void (*VP56ParseVectorAdjustment)(VP56Context *s, + VP56mv *vect); +typedef void (*VP56Filter)(VP56Context *s, uint8_t *dst, uint8_t *src, + int offset1, int offset2, ptrdiff_t stride, + VP56mv mv, int mask, int select, int luma); +typedef int 
(*VP56ParseCoeff)(VP56Context *s); +typedef void (*VP56DefaultModelsInit)(VP56Context *s); +typedef void (*VP56ParseVectorModels)(VP56Context *s); +typedef int (*VP56ParseCoeffModels)(VP56Context *s); +typedef int (*VP56ParseHeader)(VP56Context *s, const uint8_t *buf, + int buf_size); + +typedef struct VP56RefDc { + uint8_t not_null_dc; + VP56Frame ref_frame; + int16_t dc_coeff; +} VP56RefDc; + +typedef struct VP56Macroblock { + uint8_t type; + VP56mv mv; +} VP56Macroblock; + +typedef struct VP56Model { + uint8_t coeff_reorder[64]; /* used in vp6 only */ + uint8_t coeff_index_to_pos[64]; /* used in vp6 only */ + uint8_t coeff_index_to_idct_selector[64]; /* used in vp6 only */ + uint8_t vector_sig[2]; /* delta sign */ + uint8_t vector_dct[2]; /* delta coding types */ + uint8_t vector_pdi[2][2]; /* predefined delta init */ + uint8_t vector_pdv[2][7]; /* predefined delta values */ + uint8_t vector_fdv[2][8]; /* 8 bit delta value definition */ + uint8_t coeff_dccv[2][11]; /* DC coeff value */ + uint8_t coeff_ract[2][3][6][11]; /* Run/AC coding type and AC coeff value */ + uint8_t coeff_acct[2][3][3][6][5];/* vp5 only AC coding type for coding group < 3 */ + uint8_t coeff_dcct[2][36][5]; /* DC coeff coding type */ + uint8_t coeff_runv[2][14]; /* run value (vp6 only) */ + uint8_t mb_type[3][10][10]; /* model for decoding MB type */ + uint8_t mb_types_stats[3][10][2];/* contextual, next MB type stats */ +} VP56Model; + +struct vp56_context { + AVCodecContext *avctx; + H264ChromaContext h264chroma; + HpelDSPContext hdsp; + VideoDSPContext vdsp; + VP3DSPContext vp3dsp; + VP56DSPContext vp56dsp; + uint8_t idct_scantable[64]; + AVFrame *frames[4]; + uint8_t *edge_emu_buffer_alloc; + uint8_t *edge_emu_buffer; + VPXRangeCoder c; + VPXRangeCoder cc; + VPXRangeCoder *ccp; + int sub_version; + + /* frame info */ + int golden_frame; + int plane_width[4]; + int plane_height[4]; + int mb_width; /* number of horizontal MB */ + int mb_height; /* number of vertical MB */ + int 
block_offset[6]; + + int quantizer; + uint16_t dequant_dc; + uint16_t dequant_ac; + + /* DC predictors management */ + VP56RefDc *above_blocks; + VP56RefDc left_block[4]; + int above_block_idx[6]; + int16_t prev_dc[3][3]; /* [plane][ref_frame] */ + + /* blocks / macroblock */ + VP56mb mb_type; + VP56Macroblock *macroblocks; + DECLARE_ALIGNED(16, int16_t, block_coeff)[6][64]; + int idct_selector[6]; + + /* motion vectors */ + VP56mv mv[6]; /* vectors for each block in MB */ + VP56mv vector_candidate[2]; + int vector_candidate_pos; + + /* filtering hints */ + int filter_header; /* used in vp6 only */ + int deblock_filtering; + int filter_selection; + int filter_mode; + int max_vector_length; + int sample_variance_threshold; + DECLARE_ALIGNED(8, int, bounding_values_array)[256]; + + uint8_t coeff_ctx[4][64]; /* used in vp5 only */ + uint8_t coeff_ctx_last[4]; /* used in vp5 only */ + + int has_alpha; + + /* upside-down flipping hints */ + int flip; /* are we flipping ? */ + int frbi; /* first row block index in MB */ + int srbi; /* second row block index in MB */ + ptrdiff_t stride[4]; /* stride for each plane */ + + const uint8_t *vp56_coord_div; + VP56ParseVectorAdjustment parse_vector_adjustment; + VP56Filter filter; + VP56ParseCoeff parse_coeff; + VP56DefaultModelsInit default_models_init; + VP56ParseVectorModels parse_vector_models; + VP56ParseCoeffModels parse_coeff_models; + VP56ParseHeader parse_header; + + /* for "slice" parallelism between YUV and A */ + VP56Context *alpha_context; + + VP56Model *modelp; + VP56Model model; + + /* huffman decoding */ + int use_huffman; + GetBitContext gb; + VLC dccv_vlc[2]; + VLC runv_vlc[2]; + VLC ract_vlc[2][3][6]; + unsigned int nb_null[2][2]; /* number of consecutive NULL DC/AC */ + + int have_undamaged_frame; + int discard_frame; +}; + + +/** + * Initialize a VP56Context. Expects its caller to clean up + * in case of error. 
+ */ +int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s, + int flip, int has_alpha); +int ff_vp56_free_context(VP56Context *s); +void ff_vp56_init_dequant(VP56Context *s, int quantizer); +int ff_vp56_decode_frame(AVCodecContext *avctx, AVFrame *frame, + int *got_frame, AVPacket *avpkt); + + +/** + * vp56 specific range coder implementation + */ + +/** + * Read @p bits single bits with vpx_rac_get() and assemble them into an + * integer, first bit read ending up most significant. + */ +static int vp56_rac_gets(VPXRangeCoder *c, int bits) +{ + int value = 0; + + while (bits--) { + value = (value << 1) | vpx_rac_get(c); + } + + return value; +} + +/** + * Read @p bits bits, double the value and turn a zero result into 1, so + * the returned value is never zero. + * + * Original note on this function: P(7). NOTE(review): presumably every + * caller passes bits = 7 -- confirm against the callers. + */ +static av_unused int vp56_rac_gets_nn(VPXRangeCoder *c, int bits) +{ + /* honor the requested bit count instead of a hard-coded 7 */ + int v = vp56_rac_gets(c, bits) << 1; + return v + !v; +} + +/** + * Walk @p tree until a node with a non-positive val is reached. At each + * node one bit is read with probability probs[tree->prob_idx]; a set bit + * skips ahead by tree->val nodes, a clear bit moves to the next node. + * + * @return the negated val of the node the walk stopped on + */ +static av_always_inline +int vp56_rac_get_tree(VPXRangeCoder *c, + const VP56Tree *tree, + const uint8_t *probs) +{ + while (tree->val > 0) { + if (vpx_rac_get_prob_branchy(c, probs[tree->prob_idx])) + tree += tree->val; + else + tree++; + } + return -tree->val; +} + +#endif /* AVCODEC_VP56_H */ diff --git a/media/ffvpx/libavcodec/vp56dsp.h b/media/ffvpx/libavcodec/vp56dsp.h new file mode 100644 index 0000000000..e35e232ea3 --- /dev/null +++ b/media/ffvpx/libavcodec/vp56dsp.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VP56DSP_H +#define AVCODEC_VP56DSP_H + +#include <stddef.h> +#include <stdint.h> + +typedef struct VP56DSPContext { + void (*edge_filter_hor)(uint8_t *yuv, ptrdiff_t stride, int t); + void (*edge_filter_ver)(uint8_t *yuv, ptrdiff_t stride, int t); + + void (*vp6_filter_diag4)(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + const int16_t *h_weights,const int16_t *v_weights); +} VP56DSPContext; + +void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + const int16_t *h_weights, const int16_t *v_weights); + +void ff_vp5dsp_init(VP56DSPContext *s); +void ff_vp6dsp_init(VP56DSPContext *s); + +void ff_vp6dsp_init_arm(VP56DSPContext *s); +void ff_vp6dsp_init_x86(VP56DSPContext *s); + +#endif /* AVCODEC_VP56DSP_H */ diff --git a/media/ffvpx/libavcodec/vp8.c b/media/ffvpx/libavcodec/vp8.c new file mode 100644 index 0000000000..db2419deaf --- /dev/null +++ b/media/ffvpx/libavcodec/vp8.c @@ -0,0 +1,3014 @@ +/* + * VP7/VP8 compatible video decoder + * + * Copyright (C) 2010 David Conrad + * Copyright (C) 2010 Ronald S. Bultje + * Copyright (C) 2010 Fiona Glaser + * Copyright (C) 2012 Daniel Kang + * Copyright (C) 2014 Peter Ross + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "libavutil/mem_internal.h" + +#include "avcodec.h" +#include "codec_internal.h" +#include "decode.h" +#include "hwconfig.h" +#include "mathops.h" +#include "thread.h" +#include "threadframe.h" +#include "vp8.h" +#include "vp89_rac.h" +#include "vp8data.h" +#include "vpx_rac.h" + +#if ARCH_ARM +# include "arm/vp8.h" +#endif + +// fixme: add 1 bit to all the calls to this? +static int vp8_rac_get_sint(VPXRangeCoder *c, int bits) +{ + int v; + + if (!vp89_rac_get(c)) + return 0; + + v = vp89_rac_get_uint(c, bits); + + if (vp89_rac_get(c)) + v = -v; + + return v; +} + +static int vp8_rac_get_nn(VPXRangeCoder *c) +{ + int v = vp89_rac_get_uint(c, 7) << 1; + return v + !v; +} + +// DCTextra +static int vp8_rac_get_coeff(VPXRangeCoder *c, const uint8_t *prob) +{ + int v = 0; + + do { + v = (v<<1) + vpx_rac_get_prob(c, *prob++); + } while (*prob); + + return v; +} + +static void free_buffers(VP8Context *s) +{ + int i; + if (s->thread_data) + for (i = 0; i < MAX_THREADS; i++) { +#if HAVE_THREADS + pthread_cond_destroy(&s->thread_data[i].cond); + pthread_mutex_destroy(&s->thread_data[i].lock); +#endif + av_freep(&s->thread_data[i].filter_strength); + } + av_freep(&s->thread_data); + av_freep(&s->macroblocks_base); + av_freep(&s->intra4x4_pred_mode_top); + av_freep(&s->top_nnz); + av_freep(&s->top_border); + + s->macroblocks = NULL; +} + +static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref) +{ + int ret; + if ((ret = ff_thread_get_ext_buffer(s->avctx, &f->tf, + ref ? 
AV_GET_BUFFER_FLAG_REF : 0)) < 0) + return ret; + if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) + goto fail; + if (s->avctx->hwaccel) { + const AVHWAccel *hwaccel = s->avctx->hwaccel; + if (hwaccel->frame_priv_data_size) { + f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size); + if (!f->hwaccel_priv_buf) + goto fail; + f->hwaccel_picture_private = f->hwaccel_priv_buf->data; + } + } + return 0; + +fail: + av_buffer_unref(&f->seg_map); + ff_thread_release_ext_buffer(s->avctx, &f->tf); + return AVERROR(ENOMEM); +} + +/** + * Drop everything a VP8Frame holds: the segmentation map, the hwaccel + * private buffer (and the pointer into it) and the frame buffer itself. + */ +static void vp8_release_frame(VP8Context *s, VP8Frame *f) +{ + av_buffer_unref(&f->seg_map); + av_buffer_unref(&f->hwaccel_priv_buf); + f->hwaccel_picture_private = NULL; + ff_thread_release_ext_buffer(s->avctx, &f->tf); +} + +#if CONFIG_VP8_DECODER +/** + * Make @p dst reference the same frame, segmentation map and hwaccel + * private data as @p src. Whatever @p dst held before is released first. + * + * @return 0 on success, a negative AVERROR code on failure; when taking a + * buffer reference fails, @p dst is released again before + * returning AVERROR(ENOMEM) + */ +static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, const VP8Frame *src) +{ + int ret; + + vp8_release_frame(s, dst); + + if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) + return ret; + if (src->seg_map && + !(dst->seg_map = av_buffer_ref(src->seg_map))) { + vp8_release_frame(s, dst); + return AVERROR(ENOMEM); + } + if (src->hwaccel_picture_private) { + dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf); + if (!dst->hwaccel_priv_buf) { + /* drop the references taken above, as the seg_map path does */ + vp8_release_frame(s, dst); + return AVERROR(ENOMEM); + } + dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data; + } + + return 0; +} +#endif /* CONFIG_VP8_DECODER */ + +/** + * Release every frame in s->frames and clear the s->framep pointers; + * additionally free the per-stream buffers when @p free_mem is non-zero. + */ +static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem) +{ + VP8Context *s = avctx->priv_data; + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) + vp8_release_frame(s, &s->frames[i]); + memset(s->framep, 0, sizeof(s->framep)); + + if (free_mem) + free_buffers(s); +} + +static void vp8_decode_flush(AVCodecContext *avctx) +{ + vp8_decode_flush_impl(avctx, 0); +} + +static VP8Frame *vp8_find_free_buffer(VP8Context *s) +{ + VP8Frame *frame = NULL; + int i; + + // find a free buffer + for (i = 0; i < 5; i++) + if (&s->frames[i] != 
s->framep[VP8_FRAME_CURRENT] && + &s->frames[i] != s->framep[VP8_FRAME_PREVIOUS] && + &s->frames[i] != s->framep[VP8_FRAME_GOLDEN] && + &s->frames[i] != s->framep[VP8_FRAME_ALTREF]) { + frame = &s->frames[i]; + break; + } + if (i == 5) { + av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n"); + abort(); + } + if (frame->tf.f->buf[0]) + vp8_release_frame(s, frame); + + return frame; +} + +static enum AVPixelFormat get_pixel_format(VP8Context *s) +{ + enum AVPixelFormat pix_fmts[] = { +#if CONFIG_VP8_VAAPI_HWACCEL + AV_PIX_FMT_VAAPI, +#endif +#if CONFIG_VP8_NVDEC_HWACCEL + AV_PIX_FMT_CUDA, +#endif + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE, + }; + + return ff_get_format(s->avctx, pix_fmts); +} + +static av_always_inline +int update_dimensions(VP8Context *s, int width, int height, int is_vp7) +{ + AVCodecContext *avctx = s->avctx; + int i, ret, dim_reset = 0; + + if (width != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base || + height != s->avctx->height) { + vp8_decode_flush_impl(s->avctx, 1); + + ret = ff_set_dimensions(s->avctx, width, height); + if (ret < 0) + return ret; + + dim_reset = (s->macroblocks_base != NULL); + } + + if ((s->pix_fmt == AV_PIX_FMT_NONE || dim_reset) && + !s->actually_webp && !is_vp7) { + s->pix_fmt = get_pixel_format(s); + if (s->pix_fmt < 0) + return AVERROR(EINVAL); + avctx->pix_fmt = s->pix_fmt; + } + + s->mb_width = (s->avctx->coded_width + 15) / 16; + s->mb_height = (s->avctx->coded_height + 15) / 16; + + s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE && + avctx->thread_count > 1; + if (!s->mb_layout) { // Frame threading and one thread + s->macroblocks_base = av_mallocz((s->mb_width + s->mb_height * 2 + 1) * + sizeof(*s->macroblocks)); + s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4); + } else // Sliced threading + s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) * + sizeof(*s->macroblocks)); + s->top_nnz = 
av_mallocz(s->mb_width * sizeof(*s->top_nnz)); + s->top_border = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border)); + s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData)); + + if (!s->macroblocks_base || !s->top_nnz || !s->top_border || + !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) { + free_buffers(s); + return AVERROR(ENOMEM); + } + + for (i = 0; i < MAX_THREADS; i++) { + s->thread_data[i].filter_strength = + av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength)); + if (!s->thread_data[i].filter_strength) { + free_buffers(s); + return AVERROR(ENOMEM); + } +#if HAVE_THREADS + pthread_mutex_init(&s->thread_data[i].lock, NULL); + pthread_cond_init(&s->thread_data[i].cond, NULL); +#endif + } + + s->macroblocks = s->macroblocks_base + 1; + + return 0; +} + +static int vp7_update_dimensions(VP8Context *s, int width, int height) +{ + return update_dimensions(s, width, height, IS_VP7); +} + +static int vp8_update_dimensions(VP8Context *s, int width, int height) +{ + return update_dimensions(s, width, height, IS_VP8); +} + + +static void parse_segment_info(VP8Context *s) +{ + VPXRangeCoder *c = &s->c; + int i; + + s->segmentation.update_map = vp89_rac_get(c); + s->segmentation.update_feature_data = vp89_rac_get(c); + + if (s->segmentation.update_feature_data) { + s->segmentation.absolute_vals = vp89_rac_get(c); + + for (i = 0; i < 4; i++) + s->segmentation.base_quant[i] = vp8_rac_get_sint(c, 7); + + for (i = 0; i < 4; i++) + s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6); + } + if (s->segmentation.update_map) + for (i = 0; i < 3; i++) + s->prob->segmentid[i] = vp89_rac_get(c) ? 
vp89_rac_get_uint(c, 8) : 255; +} + +/** + * Read the optional loop-filter delta updates: one value per reference + * (4 entries) and one per mode from MODE_I4x4 to VP8_MVMODE_SPLIT. Each + * entry is preceded by an update flag and, when updated, is a 6-bit + * magnitude followed by a sign bit. + */ +static void update_lf_deltas(VP8Context *s) +{ + VPXRangeCoder *c = &s->c; + int i; + + for (i = 0; i < 4; i++) { + if (vp89_rac_get(c)) { + s->lf_delta.ref[i] = vp89_rac_get_uint(c, 6); + + if (vp89_rac_get(c)) + s->lf_delta.ref[i] = -s->lf_delta.ref[i]; + } + } + + for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) { + if (vp89_rac_get(c)) { + s->lf_delta.mode[i] = vp89_rac_get_uint(c, 6); + + if (vp89_rac_get(c)) + s->lf_delta.mode[i] = -s->lf_delta.mode[i]; + } + } +} + +/** + * Split @p buf into coefficient partitions and set up one range decoder + * for each of them. + * + * The partition count is 1 << (2-bit value read from s->c). @p buf starts + * with one 3-byte size (read with AV_RL24) for every partition except the + * last; the last partition takes whatever bytes remain. + * + * @param s decoder context + * @param buf start of the partition size table + * @param buf_size number of bytes available at @p buf + * @return 0 on success, a negative AVERROR code if the size table or a + * partition does not fit into @p buf_size or a range decoder + * cannot be initialised + */ +static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size) +{ + const uint8_t *sizes = buf; + int i; + int ret; + + s->num_coeff_partitions = 1 << vp89_rac_get_uint(&s->c, 2); + + /* check the size table fits before moving buf past it */ + if (buf_size < 3 * (s->num_coeff_partitions - 1)) + return AVERROR_INVALIDDATA; + buf += 3 * (s->num_coeff_partitions - 1); + buf_size -= 3 * (s->num_coeff_partitions - 1); + + for (i = 0; i < s->num_coeff_partitions - 1; i++) { + int size = AV_RL24(sizes + 3 * i); + if (buf_size - size < 0) + return AVERROR_INVALIDDATA; + s->coeff_partition_size[i] = size; + + ret = ff_vpx_init_range_decoder(&s->coeff_partition[i], buf, size); + if (ret < 0) + return ret; + buf += size; + buf_size -= size; + } + + /* NOTE(review): the result for the last partition is not checked, unlike in the loop above -- confirm this is intentional */ + s->coeff_partition_size[i] = buf_size; + ff_vpx_init_range_decoder(&s->coeff_partition[i], buf, buf_size); + + return 0; +} + +static void vp7_get_quants(VP8Context *s) +{ + VPXRangeCoder *c = &s->c; + + int yac_qi = vp89_rac_get_uint(c, 7); + int ydc_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi; + int y2dc_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi; + int y2ac_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi; + int uvdc_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi; + int uvac_qi = vp89_rac_get(c) ? 
vp89_rac_get_uint(c, 7) : yac_qi; + + s->qmat[0].luma_qmul[0] = vp7_ydc_qlookup[ydc_qi]; + s->qmat[0].luma_qmul[1] = vp7_yac_qlookup[yac_qi]; + s->qmat[0].luma_dc_qmul[0] = vp7_y2dc_qlookup[y2dc_qi]; + s->qmat[0].luma_dc_qmul[1] = vp7_y2ac_qlookup[y2ac_qi]; + s->qmat[0].chroma_qmul[0] = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132); + s->qmat[0].chroma_qmul[1] = vp7_yac_qlookup[uvac_qi]; +} + +static void vp8_get_quants(VP8Context *s) +{ + VPXRangeCoder *c = &s->c; + int i, base_qi; + + s->quant.yac_qi = vp89_rac_get_uint(c, 7); + s->quant.ydc_delta = vp8_rac_get_sint(c, 4); + s->quant.y2dc_delta = vp8_rac_get_sint(c, 4); + s->quant.y2ac_delta = vp8_rac_get_sint(c, 4); + s->quant.uvdc_delta = vp8_rac_get_sint(c, 4); + s->quant.uvac_delta = vp8_rac_get_sint(c, 4); + + for (i = 0; i < 4; i++) { + if (s->segmentation.enabled) { + base_qi = s->segmentation.base_quant[i]; + if (!s->segmentation.absolute_vals) + base_qi += s->quant.yac_qi; + } else + base_qi = s->quant.yac_qi; + + s->qmat[i].luma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta, 7)]; + s->qmat[i].luma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi, 7)]; + s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2; + /* 101581>>16 is equivalent to 155/100 */ + s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16; + s->qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)]; + s->qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)]; + + s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8); + s->qmat[i].chroma_qmul[0] = FFMIN(s->qmat[i].chroma_qmul[0], 132); + } +} + +/** + * Determine which buffers golden and altref should be updated with after this frame. 
+ * The spec isn't clear here, so I'm going by my understanding of what libvpx does + * + * Intra frames update all 3 references + * Inter frames update VP8_FRAME_PREVIOUS if the update_last flag is set + * If the update (golden|altref) flag is set, it's updated with the current frame + * if update_last is set, and VP8_FRAME_PREVIOUS otherwise. + * If the flag is not set, the number read means: + * 0: no update + * 1: VP8_FRAME_PREVIOUS + * 2: update golden with altref, or update altref with golden + */ +static VP8FrameType ref_to_update(VP8Context *s, int update, VP8FrameType ref) +{ + VPXRangeCoder *c = &s->c; + + if (update) + return VP8_FRAME_CURRENT; + + switch (vp89_rac_get_uint(c, 2)) { + case 1: + return VP8_FRAME_PREVIOUS; + case 2: + return (ref == VP8_FRAME_GOLDEN) ? VP8_FRAME_ALTREF : VP8_FRAME_GOLDEN; + } + return VP8_FRAME_NONE; +} + +static void vp78_reset_probability_tables(VP8Context *s) +{ + int i, j; + for (i = 0; i < 4; i++) + for (j = 0; j < 16; j++) + memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]], + sizeof(s->prob->token[i][j])); +} + +static void vp78_update_probability_tables(VP8Context *s) +{ + VPXRangeCoder *c = &s->c; + int i, j, k, l, m; + + for (i = 0; i < 4; i++) + for (j = 0; j < 8; j++) + for (k = 0; k < 3; k++) + for (l = 0; l < NUM_DCT_TOKENS-1; l++) + if (vpx_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) { + int prob = vp89_rac_get_uint(c, 8); + for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++) + s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob; + } +} + +#define VP7_MVC_SIZE 17 +#define VP8_MVC_SIZE 19 + +static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s, + int mvc_size) +{ + VPXRangeCoder *c = &s->c; + int i, j; + + if (vp89_rac_get(c)) + for (i = 0; i < 4; i++) + s->prob->pred16x16[i] = vp89_rac_get_uint(c, 8); + if (vp89_rac_get(c)) + for (i = 0; i < 3; i++) + s->prob->pred8x8c[i] = vp89_rac_get_uint(c, 8); + + // 17.2 MV probability update + for 
(i = 0; i < 2; i++) + for (j = 0; j < mvc_size; j++) + if (vpx_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j])) + s->prob->mvc[i][j] = vp8_rac_get_nn(c); +} + +/** + * Read the golden and altref update flags and store, via ref_to_update(), + * which frame each of the two references is to be updated from. + */ +static void update_refs(VP8Context *s) +{ + VPXRangeCoder *c = &s->c; + + int update_golden = vp89_rac_get(c); + int update_altref = vp89_rac_get(c); + + s->update_golden = ref_to_update(s, update_golden, VP8_FRAME_GOLDEN); + s->update_altref = ref_to_update(s, update_altref, VP8_FRAME_ALTREF); +} + +/** + * Copy planes 1 and 2 of @p src to @p dst row by row: height / 2 rows of + * width / 2 bytes each, honouring the linesize of both frames. + */ +static void copy_chroma(AVFrame *dst, const AVFrame *src, int width, int height) +{ + int i, j; + + for (j = 1; j < 3; j++) { + for (i = 0; i < height / 2; i++) + memcpy(dst->data[j] + i * dst->linesize[j], + src->data[j] + i * src->linesize[j], width / 2); + } +} + +/** + * Write a faded copy of a width x height byte plane: every output sample + * is clip_uint8(y + ((y * beta) >> 8) + alpha) for the input sample y. + * @p dst and @p src may be the same plane (vp7_fade_frame() can pass the + * same frame for both). + */ +static void fade(uint8_t *dst, ptrdiff_t dst_linesize, + const uint8_t *src, ptrdiff_t src_linesize, + int width, int height, + int alpha, int beta) +{ + int i, j; + for (j = 0; j < height; j++) { + const uint8_t *src2 = src + j * src_linesize; + uint8_t *dst2 = dst + j * dst_linesize; + for (i = 0; i < width; i++) { + uint8_t y = src2[i]; + dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha); + } + } +} + +static int vp7_fade_frame(VP8Context *s, int alpha, int beta) +{ + int ret; + + if (!s->keyframe && (alpha || beta)) { + int width = s->mb_width * 16; + int height = s->mb_height * 16; + const AVFrame *src; + AVFrame *dst; + + if (!s->framep[VP8_FRAME_PREVIOUS] || + !s->framep[VP8_FRAME_GOLDEN]) { + av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n"); + return AVERROR_INVALIDDATA; + } + + src = + dst = s->framep[VP8_FRAME_PREVIOUS]->tf.f; + + /* preserve the golden frame, write a new previous frame */ + if (s->framep[VP8_FRAME_GOLDEN] == s->framep[VP8_FRAME_PREVIOUS]) { + s->framep[VP8_FRAME_PREVIOUS] = vp8_find_free_buffer(s); + if ((ret = vp8_alloc_frame(s, s->framep[VP8_FRAME_PREVIOUS], 1)) < 0) + return ret; + + dst = s->framep[VP8_FRAME_PREVIOUS]->tf.f; + + copy_chroma(dst, src, width, height); + } + + 
fade(dst->data[0], dst->linesize[0], + src->data[0], src->linesize[0], + width, height, alpha, beta); + } + + return 0; +} + +static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) +{ + VPXRangeCoder *c = &s->c; + int part1_size, hscale, vscale, i, j, ret; + int width = s->avctx->width; + int height = s->avctx->height; + int alpha = 0; + int beta = 0; + int fade_present = 1; + + if (buf_size < 4) { + return AVERROR_INVALIDDATA; + } + + s->profile = (buf[0] >> 1) & 7; + if (s->profile > 1) { + avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile); + return AVERROR_INVALIDDATA; + } + + s->keyframe = !(buf[0] & 1); + s->invisible = 0; + part1_size = AV_RL24(buf) >> 4; + + if (buf_size < 4 - s->profile + part1_size) { + av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size); + return AVERROR_INVALIDDATA; + } + + buf += 4 - s->profile; + buf_size -= 4 - s->profile; + + memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab)); + + ret = ff_vpx_init_range_decoder(c, buf, part1_size); + if (ret < 0) + return ret; + buf += part1_size; + buf_size -= part1_size; + + /* A. 
Dimension information (keyframes only) */ + if (s->keyframe) { + width = vp89_rac_get_uint(c, 12); + height = vp89_rac_get_uint(c, 12); + hscale = vp89_rac_get_uint(c, 2); + vscale = vp89_rac_get_uint(c, 2); + if (hscale || vscale) + avpriv_request_sample(s->avctx, "Upscaling"); + + s->update_golden = s->update_altref = VP8_FRAME_CURRENT; + vp78_reset_probability_tables(s); + memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, + sizeof(s->prob->pred16x16)); + memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter, + sizeof(s->prob->pred8x8c)); + for (i = 0; i < 2; i++) + memcpy(s->prob->mvc[i], vp7_mv_default_prob[i], + sizeof(vp7_mv_default_prob[i])); + memset(&s->segmentation, 0, sizeof(s->segmentation)); + memset(&s->lf_delta, 0, sizeof(s->lf_delta)); + memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan)); + } + + if (s->keyframe || s->profile > 0) + memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred)); + + /* B. Decoding information for all four macroblock-level features */ + for (i = 0; i < 4; i++) { + s->feature_enabled[i] = vp89_rac_get(c); + if (s->feature_enabled[i]) { + s->feature_present_prob[i] = vp89_rac_get_uint(c, 8); + + for (j = 0; j < 3; j++) + s->feature_index_prob[i][j] = + vp89_rac_get(c) ? vp89_rac_get_uint(c, 8) : 255; + + if (vp7_feature_value_size[s->profile][i]) + for (j = 0; j < 4; j++) + s->feature_value[i][j] = + vp89_rac_get(c) ? vp89_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0; + } + } + + s->segmentation.enabled = 0; + s->segmentation.update_map = 0; + s->lf_delta.enabled = 0; + + s->num_coeff_partitions = 1; + ret = ff_vpx_init_range_decoder(&s->coeff_partition[0], buf, buf_size); + if (ret < 0) + return ret; + + if (!s->macroblocks_base || /* first frame */ + width != s->avctx->width || height != s->avctx->height || + (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) { + if ((ret = vp7_update_dimensions(s, width, height)) < 0) + return ret; + } + + /* C. 
Dequantization indices */ + vp7_get_quants(s); + + /* D. Golden frame update flag (a Flag) for interframes only */ + if (!s->keyframe) { + s->update_golden = vp89_rac_get(c) ? VP8_FRAME_CURRENT : VP8_FRAME_NONE; + s->sign_bias[VP8_FRAME_GOLDEN] = 0; + } + + s->update_last = 1; + s->update_probabilities = 1; + + if (s->profile > 0) { + s->update_probabilities = vp89_rac_get(c); + if (!s->update_probabilities) + s->prob[1] = s->prob[0]; + + if (!s->keyframe) + fade_present = vp89_rac_get(c); + } + + if (vpx_rac_is_end(c)) + return AVERROR_INVALIDDATA; + /* E. Fading information for previous frame */ + if (fade_present && vp89_rac_get(c)) { + alpha = (int8_t) vp89_rac_get_uint(c, 8); + beta = (int8_t) vp89_rac_get_uint(c, 8); + } + + /* F. Loop filter type */ + if (!s->profile) + s->filter.simple = vp89_rac_get(c); + + /* G. DCT coefficient ordering specification */ + if (vp89_rac_get(c)) + for (i = 1; i < 16; i++) + s->prob[0].scan[i] = ff_zigzag_scan[vp89_rac_get_uint(c, 4)]; + + /* H. Loop filter levels */ + if (s->profile > 0) + s->filter.simple = vp89_rac_get(c); + s->filter.level = vp89_rac_get_uint(c, 6); + s->filter.sharpness = vp89_rac_get_uint(c, 3); + + /* I. DCT coefficient probability update; 13.3 Token Probability Updates */ + vp78_update_probability_tables(s); + + s->mbskip_enabled = 0; + + /* J. 
The remaining frame header data occurs ONLY FOR INTERFRAMES */ + if (!s->keyframe) { + s->prob->intra = vp89_rac_get_uint(c, 8); + s->prob->last = vp89_rac_get_uint(c, 8); + vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE); + } + + if (vpx_rac_is_end(c)) + return AVERROR_INVALIDDATA; + + if ((ret = vp7_fade_frame(s, alpha, beta)) < 0) + return ret; + + return 0; +} + +static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size) +{ + VPXRangeCoder *c = &s->c; + int header_size, hscale, vscale, ret; + int width = s->avctx->width; + int height = s->avctx->height; + + if (buf_size < 3) { + av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size); + return AVERROR_INVALIDDATA; + } + + s->keyframe = !(buf[0] & 1); + s->profile = (buf[0]>>1) & 7; + s->invisible = !(buf[0] & 0x10); + header_size = AV_RL24(buf) >> 5; + buf += 3; + buf_size -= 3; + + s->header_partition_size = header_size; + + if (s->profile > 3) + av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile); + + if (!s->profile) + memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, + sizeof(s->put_pixels_tab)); + else // profile 1-3 use bilinear, 4+ aren't defined so whatever + memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, + sizeof(s->put_pixels_tab)); + + if (header_size > buf_size - 7 * s->keyframe) { + av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n"); + return AVERROR_INVALIDDATA; + } + + if (s->keyframe) { + if (AV_RL24(buf) != 0x2a019d) { + av_log(s->avctx, AV_LOG_ERROR, + "Invalid start code 0x%x\n", AV_RL24(buf)); + return AVERROR_INVALIDDATA; + } + width = AV_RL16(buf + 3) & 0x3fff; + height = AV_RL16(buf + 5) & 0x3fff; + hscale = buf[4] >> 6; + vscale = buf[6] >> 6; + buf += 7; + buf_size -= 7; + + if (hscale || vscale) + avpriv_request_sample(s->avctx, "Upscaling"); + + s->update_golden = s->update_altref = VP8_FRAME_CURRENT; + vp78_reset_probability_tables(s); + 
memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, + sizeof(s->prob->pred16x16)); + memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter, + sizeof(s->prob->pred8x8c)); + memcpy(s->prob->mvc, vp8_mv_default_prob, + sizeof(s->prob->mvc)); + memset(&s->segmentation, 0, sizeof(s->segmentation)); + memset(&s->lf_delta, 0, sizeof(s->lf_delta)); + } + + ret = ff_vpx_init_range_decoder(c, buf, header_size); + if (ret < 0) + return ret; + buf += header_size; + buf_size -= header_size; + + if (s->keyframe) { + s->colorspace = vp89_rac_get(c); + if (s->colorspace) + av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n"); + s->fullrange = vp89_rac_get(c); + } + + if ((s->segmentation.enabled = vp89_rac_get(c))) + parse_segment_info(s); + else + s->segmentation.update_map = 0; // FIXME: move this to some init function? + + s->filter.simple = vp89_rac_get(c); + s->filter.level = vp89_rac_get_uint(c, 6); + s->filter.sharpness = vp89_rac_get_uint(c, 3); + + if ((s->lf_delta.enabled = vp89_rac_get(c))) { + s->lf_delta.update = vp89_rac_get(c); + if (s->lf_delta.update) + update_lf_deltas(s); + } + + if (setup_partitions(s, buf, buf_size)) { + av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n"); + return AVERROR_INVALIDDATA; + } + + if (!s->macroblocks_base || /* first frame */ + width != s->avctx->width || height != s->avctx->height || + (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) + if ((ret = vp8_update_dimensions(s, width, height)) < 0) + return ret; + + vp8_get_quants(s); + + if (!s->keyframe) { + update_refs(s); + s->sign_bias[VP8_FRAME_GOLDEN] = vp89_rac_get(c); + s->sign_bias[VP8_FRAME_ALTREF] = vp89_rac_get(c); + } + + // if we aren't saving this frame's probabilities for future frames, + // make a copy of the current probabilities + if (!(s->update_probabilities = vp89_rac_get(c))) + s->prob[1] = s->prob[0]; + + s->update_last = s->keyframe || vp89_rac_get(c); + + vp78_update_probability_tables(s); + + if ((s->mbskip_enabled = vp89_rac_get(c))) + 
s->prob->mbskip = vp89_rac_get_uint(c, 8); + + if (!s->keyframe) { + s->prob->intra = vp89_rac_get_uint(c, 8); + s->prob->last = vp89_rac_get_uint(c, 8); + s->prob->golden = vp89_rac_get_uint(c, 8); + vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE); + } + + // Record the entropy coder state here so that hwaccels can use it. + s->c.code_word = vpx_rac_renorm(&s->c); + s->coder_state_at_header_end.input = s->c.buffer - (-s->c.bits / 8); + s->coder_state_at_header_end.range = s->c.high; + s->coder_state_at_header_end.value = s->c.code_word >> 16; + s->coder_state_at_header_end.bit_count = -s->c.bits % 8; + + return 0; +} + +static av_always_inline +void clamp_mv(const VP8mvbounds *s, VP8mv *dst, const VP8mv *src) +{ + dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX), + av_clip(s->mv_max.x, INT16_MIN, INT16_MAX)); + dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX), + av_clip(s->mv_max.y, INT16_MIN, INT16_MAX)); +} + +/** + * Motion vector coding, 17.1. + */ +static av_always_inline int read_mv_component(VPXRangeCoder *c, const uint8_t *p, int vp7) +{ + int bit, x = 0; + + if (vpx_rac_get_prob_branchy(c, p[0])) { + int i; + + for (i = 0; i < 3; i++) + x += vpx_rac_get_prob(c, p[9 + i]) << i; + for (i = (vp7 ? 7 : 9); i > 3; i--) + x += vpx_rac_get_prob(c, p[9 + i]) << i; + if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vpx_rac_get_prob(c, p[12])) + x += 8; + } else { + // small_mvtree + const uint8_t *ps = p + 2; + bit = vpx_rac_get_prob(c, *ps); + ps += 1 + 3 * bit; + x += 4 * bit; + bit = vpx_rac_get_prob(c, *ps); + ps += 1 + bit; + x += 2 * bit; + x += vpx_rac_get_prob(c, *ps); + } + + return (x && vpx_rac_get_prob(c, p[1])) ? 
-x : x; +} + +static int vp7_read_mv_component(VPXRangeCoder *c, const uint8_t *p) +{ + return read_mv_component(c, p, 1); +} + +static int vp8_read_mv_component(VPXRangeCoder *c, const uint8_t *p) +{ + return read_mv_component(c, p, 0); +} + +static av_always_inline +const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7) +{ + if (is_vp7) + return vp7_submv_prob; + + if (left == top) + return vp8_submv_prob[4 - !!left]; + if (!top) + return vp8_submv_prob[2]; + return vp8_submv_prob[1 - !!left]; +} + +/** + * Split motion vector prediction, 16.4. + * @returns the number of motion vectors parsed (2, 4 or 16) + */ +static av_always_inline +int decode_splitmvs(const VP8Context *s, VPXRangeCoder *c, VP8Macroblock *mb, + int layout, int is_vp7) +{ + int part_idx; + int n, num; + const VP8Macroblock *top_mb; + const VP8Macroblock *left_mb = &mb[-1]; + const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning]; + const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx; + const VP8mv *top_mv; + const VP8mv *left_mv = left_mb->bmv; + const VP8mv *cur_mv = mb->bmv; + + if (!layout) // layout is inlined, s->mb_layout is not + top_mb = &mb[2]; + else + top_mb = &mb[-s->mb_width - 1]; + mbsplits_top = vp8_mbsplits[top_mb->partitioning]; + top_mv = top_mb->bmv; + + if (vpx_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) { + if (vpx_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) + part_idx = VP8_SPLITMVMODE_16x8 + vpx_rac_get_prob(c, vp8_mbsplit_prob[2]); + else + part_idx = VP8_SPLITMVMODE_8x8; + } else { + part_idx = VP8_SPLITMVMODE_4x4; + } + + num = vp8_mbsplit_count[part_idx]; + mbsplits_cur = vp8_mbsplits[part_idx], + firstidx = vp8_mbfirstidx[part_idx]; + mb->partitioning = part_idx; + + for (n = 0; n < num; n++) { + int k = firstidx[n]; + uint32_t left, above; + const uint8_t *submv_prob; + + if (!(k & 3)) + left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]); + else + left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]); + if (k <= 3) + above = 
AV_RN32A(&top_mv[mbsplits_top[k + 12]]); + else + above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]); + + submv_prob = get_submv_prob(left, above, is_vp7); + + if (vpx_rac_get_prob_branchy(c, submv_prob[0])) { + if (vpx_rac_get_prob_branchy(c, submv_prob[1])) { + if (vpx_rac_get_prob_branchy(c, submv_prob[2])) { + mb->bmv[n].y = mb->mv.y + + read_mv_component(c, s->prob->mvc[0], is_vp7); + mb->bmv[n].x = mb->mv.x + + read_mv_component(c, s->prob->mvc[1], is_vp7); + } else { + AV_ZERO32(&mb->bmv[n]); + } + } else { + AV_WN32A(&mb->bmv[n], above); + } + } else { + AV_WN32A(&mb->bmv[n], left); + } + } + + return num; +} + +/** + * The vp7 reference decoder uses a padding macroblock column (added to right + * edge of the frame) to guard against illegal macroblock offsets. The + * algorithm has bugs that permit offsets to straddle the padding column. + * This function replicates those bugs. + * + * @param[out] edge_x macroblock x address + * @param[out] edge_y macroblock y address + * + * @return macroblock offset legal (boolean) + */ +static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width, + int xoffset, int yoffset, int boundary, + int *edge_x, int *edge_y) +{ + int vwidth = mb_width + 1; + int new = (mb_y + yoffset) * vwidth + mb_x + xoffset; + if (new < boundary || new % vwidth == vwidth - 1) + return 0; + *edge_y = new / vwidth; + *edge_x = new % vwidth; + return 1; +} + +static const VP8mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock) +{ + return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? 
vp8_mbsplits[mb->partitioning][subblock] : 0]; +} + +static av_always_inline +void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb, + int mb_x, int mb_y, int layout) +{ + enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR }; + enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT }; + int idx = CNT_ZERO; + VP8mv near_mv[3]; + uint8_t cnt[3] = { 0 }; + VPXRangeCoder *c = &s->c; + int i; + + AV_ZERO32(&near_mv[0]); + AV_ZERO32(&near_mv[1]); + AV_ZERO32(&near_mv[2]); + + for (i = 0; i < VP7_MV_PRED_COUNT; i++) { + const VP7MVPred * pred = &vp7_mv_pred[i]; + int edge_x, edge_y; + + if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset, + pred->yoffset, !s->profile, &edge_x, &edge_y)) { + const VP8Macroblock *edge = (s->mb_layout == 1) + ? s->macroblocks_base + 1 + edge_x + + (s->mb_width + 1) * (edge_y + 1) + : s->macroblocks + edge_x + + (s->mb_height - edge_y - 1) * 2; + uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock)); + if (mv) { + if (AV_RN32A(&near_mv[CNT_NEAREST])) { + if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) { + idx = CNT_NEAREST; + } else if (AV_RN32A(&near_mv[CNT_NEAR])) { + if (mv != AV_RN32A(&near_mv[CNT_NEAR])) + continue; + idx = CNT_NEAR; + } else { + AV_WN32A(&near_mv[CNT_NEAR], mv); + idx = CNT_NEAR; + } + } else { + AV_WN32A(&near_mv[CNT_NEAREST], mv); + idx = CNT_NEAREST; + } + } else { + idx = CNT_ZERO; + } + } else { + idx = CNT_ZERO; + } + cnt[idx] += vp7_mv_pred[i].score; + } + + mb->partitioning = VP8_SPLITMVMODE_NONE; + + if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) { + mb->mode = VP8_MVMODE_MV; + + if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) { + + if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) { + + if (cnt[CNT_NEAREST] > cnt[CNT_NEAR]) + AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST])); + else + AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR] ? 
0 : AV_RN32A(&near_mv[CNT_NEAR])); + + if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) { + mb->mode = VP8_MVMODE_SPLIT; + mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1]; + } else { + mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]); + mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]); + mb->bmv[0] = mb->mv; + } + } else { + mb->mv = near_mv[CNT_NEAR]; + mb->bmv[0] = mb->mv; + } + } else { + mb->mv = near_mv[CNT_NEAREST]; + mb->bmv[0] = mb->mv; + } + } else { + mb->mode = VP8_MVMODE_ZERO; + AV_ZERO32(&mb->mv); + mb->bmv[0] = mb->mv; + } +} + +static av_always_inline +void vp8_decode_mvs(VP8Context *s, const VP8mvbounds *mv_bounds, VP8Macroblock *mb, + int mb_x, int mb_y, int layout) +{ + VP8Macroblock *mb_edge[3] = { 0 /* top */, + mb - 1 /* left */, + 0 /* top-left */ }; + enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; + enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT }; + int idx = CNT_ZERO; + int cur_sign_bias = s->sign_bias[mb->ref_frame]; + const int8_t *sign_bias = s->sign_bias; + VP8mv near_mv[4]; + uint8_t cnt[4] = { 0 }; + VPXRangeCoder *c = &s->c; + + if (!layout) { // layout is inlined (s->mb_layout is not) + mb_edge[0] = mb + 2; + mb_edge[2] = mb + 1; + } else { + mb_edge[0] = mb - s->mb_width - 1; + mb_edge[2] = mb - s->mb_width - 2; + } + + AV_ZERO32(&near_mv[0]); + AV_ZERO32(&near_mv[1]); + AV_ZERO32(&near_mv[2]); + + /* Process MB on top, left and top-left */ +#define MV_EDGE_CHECK(n) \ + { \ + const VP8Macroblock *edge = mb_edge[n]; \ + int edge_ref = edge->ref_frame; \ + if (edge_ref != VP8_FRAME_CURRENT) { \ + uint32_t mv = AV_RN32A(&edge->mv); \ + if (mv) { \ + if (cur_sign_bias != sign_bias[edge_ref]) { \ + /* SWAR negate of the values in mv. 
*/ \ + mv = ~mv; \ + mv = ((mv & 0x7fff7fff) + \ + 0x00010001) ^ (mv & 0x80008000); \ + } \ + if (!n || mv != AV_RN32A(&near_mv[idx])) \ + AV_WN32A(&near_mv[++idx], mv); \ + cnt[idx] += 1 + (n != 2); \ + } else \ + cnt[CNT_ZERO] += 1 + (n != 2); \ + } \ + } + + MV_EDGE_CHECK(0) + MV_EDGE_CHECK(1) + MV_EDGE_CHECK(2) + + mb->partitioning = VP8_SPLITMVMODE_NONE; + if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) { + mb->mode = VP8_MVMODE_MV; + + /* If we have three distinct MVs, merge first and last if they're the same */ + if (cnt[CNT_SPLITMV] && + AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT])) + cnt[CNT_NEAREST] += 1; + + /* Swap near and nearest if necessary */ + if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) { + FFSWAP(uint8_t, cnt[CNT_NEAREST], cnt[CNT_NEAR]); + FFSWAP(VP8mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]); + } + + if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) { + if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) { + /* Choose the best mv out of 0,0 and the nearest mv */ + clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]); + cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode == VP8_MVMODE_SPLIT) + + (mb_edge[VP8_EDGE_TOP]->mode == VP8_MVMODE_SPLIT)) * 2 + + (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT); + + if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) { + mb->mode = VP8_MVMODE_SPLIT; + mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1]; + } else { + mb->mv.y += vp8_read_mv_component(c, s->prob->mvc[0]); + mb->mv.x += vp8_read_mv_component(c, s->prob->mvc[1]); + mb->bmv[0] = mb->mv; + } + } else { + clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]); + mb->bmv[0] = mb->mv; + } + } else { + clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]); + mb->bmv[0] = mb->mv; + } + } else { + mb->mode = VP8_MVMODE_ZERO; + AV_ZERO32(&mb->mv); + mb->bmv[0] = mb->mv; + } +} + +static 
av_always_inline +void decode_intra4x4_modes(VP8Context *s, VPXRangeCoder *c, VP8Macroblock *mb, + int mb_x, int keyframe, int layout) +{ + uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb; + + if (layout) { + VP8Macroblock *mb_top = mb - s->mb_width - 1; + memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4); + } + if (keyframe) { + int x, y; + uint8_t *top; + uint8_t *const left = s->intra4x4_pred_mode_left; + if (layout) + top = mb->intra4x4_pred_mode_top; + else + top = s->intra4x4_pred_mode_top + 4 * mb_x; + for (y = 0; y < 4; y++) { + for (x = 0; x < 4; x++) { + const uint8_t *ctx; + ctx = vp8_pred4x4_prob_intra[top[x]][left[y]]; + *intra4x4 = vp89_rac_get_tree(c, vp8_pred4x4_tree, ctx); + left[y] = top[x] = *intra4x4; + intra4x4++; + } + } + } else { + int i; + for (i = 0; i < 16; i++) + intra4x4[i] = vp89_rac_get_tree(c, vp8_pred4x4_tree, + vp8_pred4x4_prob_inter); + } +} + +static av_always_inline +void decode_mb_mode(VP8Context *s, const VP8mvbounds *mv_bounds, + VP8Macroblock *mb, int mb_x, int mb_y, + uint8_t *segment, const uint8_t *ref, int layout, int is_vp7) +{ + VPXRangeCoder *c = &s->c; + static const char * const vp7_feature_name[] = { "q-index", + "lf-delta", + "partial-golden-update", + "blit-pitch" }; + if (is_vp7) { + int i; + *segment = 0; + for (i = 0; i < 4; i++) { + if (s->feature_enabled[i]) { + if (vpx_rac_get_prob_branchy(c, s->feature_present_prob[i])) { + int index = vp89_rac_get_tree(c, vp7_feature_index_tree, + s->feature_index_prob[i]); + av_log(s->avctx, AV_LOG_WARNING, + "Feature %s present in macroblock (value 0x%x)\n", + vp7_feature_name[i], s->feature_value[i][index]); + } + } + } + } else if (s->segmentation.update_map) { + int bit = vpx_rac_get_prob(c, s->prob->segmentid[0]); + *segment = vpx_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit; + } else if (s->segmentation.enabled) + *segment = ref ? *ref : *segment; + mb->segment = *segment; + + mb->skip = s->mbskip_enabled ? 
vpx_rac_get_prob(c, s->prob->mbskip) : 0; + + if (s->keyframe) { + mb->mode = vp89_rac_get_tree(c, vp8_pred16x16_tree_intra, + vp8_pred16x16_prob_intra); + + if (mb->mode == MODE_I4x4) { + decode_intra4x4_modes(s, c, mb, mb_x, 1, layout); + } else { + const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode + : vp8_pred4x4_mode)[mb->mode] * 0x01010101u; + if (s->mb_layout) + AV_WN32A(mb->intra4x4_pred_mode_top, modes); + else + AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes); + AV_WN32A(s->intra4x4_pred_mode_left, modes); + } + + mb->chroma_pred_mode = vp89_rac_get_tree(c, vp8_pred8x8c_tree, + vp8_pred8x8c_prob_intra); + mb->ref_frame = VP8_FRAME_CURRENT; + } else if (vpx_rac_get_prob_branchy(c, s->prob->intra)) { + // inter MB, 16.2 + if (vpx_rac_get_prob_branchy(c, s->prob->last)) + mb->ref_frame = + (!is_vp7 && vpx_rac_get_prob(c, s->prob->golden)) ? VP8_FRAME_ALTREF + : VP8_FRAME_GOLDEN; + else + mb->ref_frame = VP8_FRAME_PREVIOUS; + s->ref_count[mb->ref_frame - 1]++; + + // motion vectors, 16.3 + if (is_vp7) + vp7_decode_mvs(s, mb, mb_x, mb_y, layout); + else + vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout); + } else { + // intra MB, 16.1 + mb->mode = vp89_rac_get_tree(c, vp8_pred16x16_tree_inter, + s->prob->pred16x16); + + if (mb->mode == MODE_I4x4) + decode_intra4x4_modes(s, c, mb, mb_x, 0, layout); + + mb->chroma_pred_mode = vp89_rac_get_tree(c, vp8_pred8x8c_tree, + s->prob->pred8x8c); + mb->ref_frame = VP8_FRAME_CURRENT; + mb->partitioning = VP8_SPLITMVMODE_NONE; + AV_ZERO32(&mb->bmv[0]); + } +} + +/** + * @param r arithmetic bitstream reader context + * @param block destination for block coefficients + * @param probs probabilities to use when reading trees from the bitstream + * @param i initial coeff index, 0 unless a separate DC block is coded + * @param qmul array holding the dc/ac dequant factor at position 0/1 + * + * @return 0 if no coeffs were decoded + * otherwise, the index of the last coeff decoded plus one + */ +static av_always_inline 
+int decode_block_coeffs_internal(VPXRangeCoder *r, int16_t block[16], + uint8_t probs[16][3][NUM_DCT_TOKENS - 1], + int i, const uint8_t *token_prob, const int16_t qmul[2], + const uint8_t scan[16], int vp7) +{ + VPXRangeCoder c = *r; + goto skip_eob; + do { + int coeff; +restart: + if (!vpx_rac_get_prob_branchy(&c, token_prob[0])) // DCT_EOB + break; + +skip_eob: + if (!vpx_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0 + if (++i == 16) + break; // invalid input; blocks should end with EOB + token_prob = probs[i][0]; + if (vp7) + goto restart; + goto skip_eob; + } + + if (!vpx_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1 + coeff = 1; + token_prob = probs[i + 1][1]; + } else { + if (!vpx_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4 + coeff = vpx_rac_get_prob_branchy(&c, token_prob[4]); + if (coeff) + coeff += vpx_rac_get_prob(&c, token_prob[5]); + coeff += 2; + } else { + // DCT_CAT* + if (!vpx_rac_get_prob_branchy(&c, token_prob[6])) { + if (!vpx_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1 + coeff = 5 + vpx_rac_get_prob(&c, vp8_dct_cat1_prob[0]); + } else { // DCT_CAT2 + coeff = 7; + coeff += vpx_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1; + coeff += vpx_rac_get_prob(&c, vp8_dct_cat2_prob[1]); + } + } else { // DCT_CAT3 and up + int a = vpx_rac_get_prob(&c, token_prob[8]); + int b = vpx_rac_get_prob(&c, token_prob[9 + a]); + int cat = (a << 1) + b; + coeff = 3 + (8 << cat); + coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]); + } + } + token_prob = probs[i + 1][2]; + } + block[scan[i]] = (vp89_rac_get(&c) ? 
-coeff : coeff) * qmul[!!i]; + } while (++i < 16); + + *r = c; + return i; +} + +static av_always_inline +int inter_predict_dc(int16_t block[16], int16_t pred[2]) +{ + int16_t dc = block[0]; + int ret = 0; + + if (pred[1] > 3) { + dc += pred[0]; + ret = 1; + } + + if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) { + block[0] = pred[0] = dc; + pred[1] = 0; + } else { + if (pred[0] == dc) + pred[1]++; + block[0] = pred[0] = dc; + } + + return ret; +} + +static int vp7_decode_block_coeffs_internal(VPXRangeCoder *r, + int16_t block[16], + uint8_t probs[16][3][NUM_DCT_TOKENS - 1], + int i, const uint8_t *token_prob, + const int16_t qmul[2], + const uint8_t scan[16]) +{ + return decode_block_coeffs_internal(r, block, probs, i, + token_prob, qmul, scan, IS_VP7); +} + +#ifndef vp8_decode_block_coeffs_internal +static int vp8_decode_block_coeffs_internal(VPXRangeCoder *r, + int16_t block[16], + uint8_t probs[16][3][NUM_DCT_TOKENS - 1], + int i, const uint8_t *token_prob, + const int16_t qmul[2]) +{ + return decode_block_coeffs_internal(r, block, probs, i, + token_prob, qmul, ff_zigzag_scan, IS_VP8); +} +#endif + +/** + * @param c arithmetic bitstream reader context + * @param block destination for block coefficients + * @param probs probabilities to use when reading trees from the bitstream + * @param i initial coeff index, 0 unless a separate DC block is coded + * @param zero_nhood the initial prediction context for number of surrounding + * all-zero blocks (only left/top, so 0-2) + * @param qmul array holding the dc/ac dequant factor at position 0/1 + * @param scan scan pattern (VP7 only) + * + * @return 0 if no coeffs were decoded + * otherwise, the index of the last coeff decoded plus one + */ +static av_always_inline +int decode_block_coeffs(VPXRangeCoder *c, int16_t block[16], + uint8_t probs[16][3][NUM_DCT_TOKENS - 1], + int i, int zero_nhood, const int16_t qmul[2], + const uint8_t scan[16], int vp7) +{ + const uint8_t *token_prob = probs[i][zero_nhood]; 
+ if (!vpx_rac_get_prob_branchy(c, token_prob[0])) // DCT_EOB + return 0; + return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i, + token_prob, qmul, scan) + : vp8_decode_block_coeffs_internal(c, block, probs, i, + token_prob, qmul); +} + +static av_always_inline +void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VPXRangeCoder *c, + VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9], + int is_vp7) +{ + int i, x, y, luma_start = 0, luma_ctx = 3; + int nnz_pred, nnz, nnz_total = 0; + int segment = mb->segment; + int block_dc = 0; + + if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) { + nnz_pred = t_nnz[8] + l_nnz[8]; + + // decode DC values and do hadamard + nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, + nnz_pred, s->qmat[segment].luma_dc_qmul, + ff_zigzag_scan, is_vp7); + l_nnz[8] = t_nnz[8] = !!nnz; + + if (is_vp7 && mb->mode > MODE_I4x4) { + nnz |= inter_predict_dc(td->block_dc, + s->inter_dc_pred[mb->ref_frame - 1]); + } + + if (nnz) { + nnz_total += nnz; + block_dc = 1; + if (nnz == 1) + s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc); + else + s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc); + } + luma_start = 1; + luma_ctx = 0; + } + + // luma blocks + for (y = 0; y < 4; y++) + for (x = 0; x < 4; x++) { + nnz_pred = l_nnz[y] + t_nnz[x]; + nnz = decode_block_coeffs(c, td->block[y][x], + s->prob->token[luma_ctx], + luma_start, nnz_pred, + s->qmat[segment].luma_qmul, + s->prob[0].scan, is_vp7); + /* nnz+block_dc may be one more than the actual last index, + * but we don't care */ + td->non_zero_count_cache[y][x] = nnz + block_dc; + t_nnz[x] = l_nnz[y] = !!nnz; + nnz_total += nnz; + } + + // chroma blocks + // TODO: what to do about dimensions? 
2nd dim for luma is x, + // but for chroma it's (y<<1)|x + for (i = 4; i < 6; i++) + for (y = 0; y < 2; y++) + for (x = 0; x < 2; x++) { + nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x]; + nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x], + s->prob->token[2], 0, nnz_pred, + s->qmat[segment].chroma_qmul, + s->prob[0].scan, is_vp7); + td->non_zero_count_cache[i][(y << 1) + x] = nnz; + t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz; + nnz_total += nnz; + } + + // if there were no coded coeffs despite the macroblock not being marked skip, + // we MUST not do the inner loop filter and should not do IDCT + // Since skip isn't used for bitstream prediction, just manually set it. + if (!nnz_total) + mb->skip = 1; +} + +static av_always_inline +void backup_mb_border(uint8_t *top_border, const uint8_t *src_y, + const uint8_t *src_cb, const uint8_t *src_cr, + ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple) +{ + AV_COPY128(top_border, src_y + 15 * linesize); + if (!simple) { + AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize); + AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize); + } +} + +static av_always_inline +void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, + uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x, + int mb_y, int mb_width, int simple, int xchg) +{ + uint8_t *top_border_m1 = top_border - 32; // for TL prediction + src_y -= linesize; + src_cb -= uvlinesize; + src_cr -= uvlinesize; + +#define XCHG(a, b, xchg) \ + do { \ + if (xchg) \ + AV_SWAP64(b, a); \ + else \ + AV_COPY64(b, a); \ + } while (0) + + XCHG(top_border_m1 + 8, src_y - 8, xchg); + XCHG(top_border, src_y, xchg); + XCHG(top_border + 8, src_y + 8, 1); + if (mb_x < mb_width - 1) + XCHG(top_border + 32, src_y + 16, 1); + + // only copy chroma for normal loop filter + // or to initialize the top row to 127 + if (!simple || !mb_y) { + XCHG(top_border_m1 + 16, src_cb - 8, xchg); + XCHG(top_border_m1 + 24, src_cr - 8, xchg); + XCHG(top_border + 16, 
src_cb, 1); + XCHG(top_border + 24, src_cr, 1); + } +} + +static av_always_inline +int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y) +{ + if (!mb_x) + return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8; + else + return mb_y ? mode : LEFT_DC_PRED8x8; +} + +static av_always_inline +int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7) +{ + if (!mb_x) + return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8); + else + return mb_y ? mode : HOR_PRED8x8; +} + +static av_always_inline +int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7) +{ + switch (mode) { + case DC_PRED8x8: + return check_dc_pred8x8_mode(mode, mb_x, mb_y); + case VERT_PRED8x8: + return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode; + case HOR_PRED8x8: + return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode; + case PLANE_PRED8x8: /* TM */ + return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7); + } + return mode; +} + +static av_always_inline +int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7) +{ + if (!mb_x) { + return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED); + } else { + return mb_y ? mode : HOR_VP8_PRED; + } +} + +static av_always_inline +int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, + int *copy_buf, int vp7) +{ + switch (mode) { + case VERT_PRED: + if (!mb_x && mb_y) { + *copy_buf = 1; + return mode; + } + /* fall-through */ + case DIAG_DOWN_LEFT_PRED: + case VERT_LEFT_PRED: + return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode; + case HOR_PRED: + if (!mb_y) { + *copy_buf = 1; + return mode; + } + /* fall-through */ + case HOR_UP_PRED: + return !mb_x ? (vp7 ? 
DC_128_PRED : DC_129_PRED) : mode; + case TM_VP8_PRED: + return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7); + case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions + * as 16x16/8x8 DC */ + case DIAG_DOWN_RIGHT_PRED: + case VERT_RIGHT_PRED: + case HOR_DOWN_PRED: + if (!mb_y || !mb_x) + *copy_buf = 1; + return mode; + } + return mode; +} + +static av_always_inline +void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3], + VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7) +{ + int x, y, mode, nnz; + uint32_t tr; + + /* for the first row, we need to run xchg_mb_border to init the top edge + * to 127 otherwise, skip it if we aren't going to deblock */ + if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0) + xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2], + s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width, + s->filter.simple, 1); + + if (mb->mode < MODE_I4x4) { + mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7); + s->hpc.pred16x16[mode](dst[0], s->linesize); + } else { + uint8_t *ptr = dst[0]; + const uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb; + const uint8_t lo = is_vp7 ? 128 : 127; + const uint8_t hi = is_vp7 ? 
128 : 129; + const uint8_t tr_top[4] = { lo, lo, lo, lo }; + + // all blocks on the right edge of the macroblock use bottom edge + // the top macroblock for their topright edge + const uint8_t *tr_right = ptr - s->linesize + 16; + + // if we're on the right edge of the frame, said edge is extended + // from the top macroblock + if (mb_y && mb_x == s->mb_width - 1) { + tr = tr_right[-1] * 0x01010101u; + tr_right = (uint8_t *) &tr; + } + + if (mb->skip) + AV_ZERO128(td->non_zero_count_cache); + + for (y = 0; y < 4; y++) { + const uint8_t *topright = ptr + 4 - s->linesize; + for (x = 0; x < 4; x++) { + int copy = 0; + ptrdiff_t linesize = s->linesize; + uint8_t *dst = ptr + 4 * x; + LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]); + + if ((y == 0 || x == 3) && mb_y == 0) { + topright = tr_top; + } else if (x == 3) + topright = tr_right; + + mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, + mb_y + y, &copy, is_vp7); + if (copy) { + dst = copy_dst + 12; + linesize = 8; + if (!(mb_y + y)) { + copy_dst[3] = lo; + AV_WN32A(copy_dst + 4, lo * 0x01010101U); + } else { + AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize); + if (!(mb_x + x)) { + copy_dst[3] = hi; + } else { + copy_dst[3] = ptr[4 * x - s->linesize - 1]; + } + } + if (!(mb_x + x)) { + copy_dst[11] = + copy_dst[19] = + copy_dst[27] = + copy_dst[35] = hi; + } else { + copy_dst[11] = ptr[4 * x - 1]; + copy_dst[19] = ptr[4 * x + s->linesize - 1]; + copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1]; + copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1]; + } + } + s->hpc.pred4x4[mode](dst, topright, linesize); + if (copy) { + AV_COPY32(ptr + 4 * x, copy_dst + 12); + AV_COPY32(ptr + 4 * x + s->linesize, copy_dst + 20); + AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28); + AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36); + } + + nnz = td->non_zero_count_cache[y][x]; + if (nnz) { + if (nnz == 1) + s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x, + td->block[y][x], s->linesize); + else + s->vp8dsp.vp8_idct_add(ptr
+ 4 * x, + td->block[y][x], s->linesize); + } + topright += 4; + } + + ptr += 4 * s->linesize; + intra4x4 += 4; + } + } + + mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, + mb_x, mb_y, is_vp7); + s->hpc.pred8x8[mode](dst[1], s->uvlinesize); + s->hpc.pred8x8[mode](dst[2], s->uvlinesize); + + if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0) + xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2], + s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width, + s->filter.simple, 0); +} + +static const uint8_t subpel_idx[3][8] = { + { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels, + // also function pointer index + { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required + { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels +}; + +/** + * luma MC function + * + * @param s VP8 decoding context + * @param dst target buffer for block data at block position + * @param ref reference picture buffer at origin (0, 0) + * @param mv motion vector (relative to block position) to get pixel data from + * @param x_off horizontal position of block from origin (0, 0) + * @param y_off vertical position of block from origin (0, 0) + * @param block_w width of block (16, 8 or 4) + * @param block_h height of block (always same as block_w) + * @param width width of src/dst plane data + * @param height height of src/dst plane data + * @param linesize size of a single line of plane data, including padding + * @param mc_func motion compensation function pointers (bilinear or sixtap MC) + */ +static av_always_inline +void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst, + const ThreadFrame *ref, const VP8mv *mv, + int x_off, int y_off, int block_w, int block_h, + int width, int height, ptrdiff_t linesize, + vp8_mc_func mc_func[3][3]) +{ + const uint8_t *src = ref->f->data[0]; + + if (AV_RN32A(mv)) { + ptrdiff_t src_linesize = linesize; + + int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx]; + int my = (mv->y * 2) & 7, my_idx = 
subpel_idx[0][my]; + + x_off += mv->x >> 2; + y_off += mv->y >> 2; + + // edge emulation + ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0); + src += y_off * linesize + x_off; + if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] || + y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) { + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + src - my_idx * linesize - mx_idx, + EDGE_EMU_LINESIZE, linesize, + block_w + subpel_idx[1][mx], + block_h + subpel_idx[1][my], + x_off - mx_idx, y_off - my_idx, + width, height); + src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx; + src_linesize = EDGE_EMU_LINESIZE; + } + mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my); + } else { + ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0); + mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, + linesize, block_h, 0, 0); + } +} + +/** + * chroma MC function + * + * @param s VP8 decoding context + * @param dst1 target buffer for block data at block position (U plane) + * @param dst2 target buffer for block data at block position (V plane) + * @param ref reference picture buffer at origin (0, 0) + * @param mv motion vector (relative to block position) to get pixel data from + * @param x_off horizontal position of block from origin (0, 0) + * @param y_off vertical position of block from origin (0, 0) + * @param block_w width of block (16, 8 or 4) + * @param block_h height of block (always same as block_w) + * @param width width of src/dst plane data + * @param height height of src/dst plane data + * @param linesize size of a single line of plane data, including padding + * @param mc_func motion compensation function pointers (bilinear or sixtap MC) + */ +static av_always_inline +void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, + uint8_t *dst2, const ThreadFrame *ref, const VP8mv *mv, + int x_off, int y_off, int block_w, int block_h, + int width, int 
height, ptrdiff_t linesize, + vp8_mc_func mc_func[3][3]) +{ + const uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2]; + + if (AV_RN32A(mv)) { + int mx = mv->x & 7, mx_idx = subpel_idx[0][mx]; + int my = mv->y & 7, my_idx = subpel_idx[0][my]; + + x_off += mv->x >> 3; + y_off += mv->y >> 3; + + // edge emulation + src1 += y_off * linesize + x_off; + src2 += y_off * linesize + x_off; + ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0); + if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] || + y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) { + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + src1 - my_idx * linesize - mx_idx, + EDGE_EMU_LINESIZE, linesize, + block_w + subpel_idx[1][mx], + block_h + subpel_idx[1][my], + x_off - mx_idx, y_off - my_idx, width, height); + src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx; + mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my); + + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + src2 - my_idx * linesize - mx_idx, + EDGE_EMU_LINESIZE, linesize, + block_w + subpel_idx[1][mx], + block_h + subpel_idx[1][my], + x_off - mx_idx, y_off - my_idx, width, height); + src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx; + mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my); + } else { + mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my); + mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my); + } + } else { + ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0); + mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0); + mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0); + } +} + +static av_always_inline +void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3], + const ThreadFrame *ref_frame, int x_off, int y_off, + int bx_off, int by_off, int 
block_w, int block_h, + int width, int height, const VP8mv *mv) +{ + VP8mv uvmv = *mv; + + /* Y */ + vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off, + ref_frame, mv, x_off + bx_off, y_off + by_off, + block_w, block_h, width, height, s->linesize, + s->put_pixels_tab[block_w == 8]); + + /* U/V */ + if (s->profile == 3) { + /* this block only applies VP8; it is safe to check + * only the profile, as VP7 profile <= 1 */ + uvmv.x &= ~7; + uvmv.y &= ~7; + } + x_off >>= 1; + y_off >>= 1; + bx_off >>= 1; + by_off >>= 1; + width >>= 1; + height >>= 1; + block_w >>= 1; + block_h >>= 1; + vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off, + dst[2] + by_off * s->uvlinesize + bx_off, ref_frame, + &uvmv, x_off + bx_off, y_off + by_off, + block_w, block_h, width, height, s->uvlinesize, + s->put_pixels_tab[1 + (block_w == 4)]); +} + +/* Fetch pixels for estimated mv 4 macroblocks ahead. + * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */ +static av_always_inline +void prefetch_motion(const VP8Context *s, const VP8Macroblock *mb, + int mb_x, int mb_y, int mb_xy, int ref) +{ + /* Don't prefetch refs that haven't been used very often this frame. */ + if (s->ref_count[ref - 1] > (mb_xy >> 5)) { + int x_off = mb_x << 4, y_off = mb_y << 4; + int mx = (mb->mv.x >> 2) + x_off + 8; + int my = (mb->mv.y >> 2) + y_off; + uint8_t **src = s->framep[ref]->tf.f->data; + int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64; + /* For threading, a ff_thread_await_progress here might be useful, but + * it actually slows down the decoder. Since a bad prefetch doesn't + * generate bad decoder output, we don't run it here. */ + s->vdsp.prefetch(src[0] + off, s->linesize, 4); + off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64; + s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2); + } +} + +/** + * Apply motion vectors to prediction buffer, chapter 18. 
+ */ +static av_always_inline +void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3], + VP8Macroblock *mb, int mb_x, int mb_y) +{ + int x_off = mb_x << 4, y_off = mb_y << 4; + int width = 16 * s->mb_width, height = 16 * s->mb_height; + const ThreadFrame *ref = &s->framep[mb->ref_frame]->tf; + const VP8mv *bmv = mb->bmv; + + switch (mb->partitioning) { + case VP8_SPLITMVMODE_NONE: + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 0, 0, 16, 16, width, height, &mb->mv); + break; + case VP8_SPLITMVMODE_4x4: { + int x, y; + VP8mv uvmv; + + /* Y */ + for (y = 0; y < 4; y++) { + for (x = 0; x < 4; x++) { + vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4, + ref, &bmv[4 * y + x], + 4 * x + x_off, 4 * y + y_off, 4, 4, + width, height, s->linesize, + s->put_pixels_tab[2]); + } + } + + /* U/V */ + x_off >>= 1; + y_off >>= 1; + width >>= 1; + height >>= 1; + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + uvmv.x = mb->bmv[2 * y * 4 + 2 * x ].x + + mb->bmv[2 * y * 4 + 2 * x + 1].x + + mb->bmv[(2 * y + 1) * 4 + 2 * x ].x + + mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x; + uvmv.y = mb->bmv[2 * y * 4 + 2 * x ].y + + mb->bmv[2 * y * 4 + 2 * x + 1].y + + mb->bmv[(2 * y + 1) * 4 + 2 * x ].y + + mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y; + uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2; + uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2; + if (s->profile == 3) { + uvmv.x &= ~7; + uvmv.y &= ~7; + } + vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4, + dst[2] + 4 * y * s->uvlinesize + x * 4, ref, + &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4, + width, height, s->uvlinesize, + s->put_pixels_tab[2]); + } + } + break; + } + case VP8_SPLITMVMODE_16x8: + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 0, 0, 16, 8, width, height, &bmv[0]); + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 0, 8, 16, 8, width, height, &bmv[1]); + break; + case VP8_SPLITMVMODE_8x16: + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 0, 0, 8, 16, width, height, &bmv[0]); + vp8_mc_part(s, 
td, dst, ref, x_off, y_off, + 8, 0, 8, 16, width, height, &bmv[1]); + break; + case VP8_SPLITMVMODE_8x8: + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 0, 0, 8, 8, width, height, &bmv[0]); + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 8, 0, 8, 8, width, height, &bmv[1]); + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 0, 8, 8, 8, width, height, &bmv[2]); + vp8_mc_part(s, td, dst, ref, x_off, y_off, + 8, 8, 8, 8, width, height, &bmv[3]); + break; + } +} + +static av_always_inline +void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3], + const VP8Macroblock *mb) +{ + int x, y, ch; + + if (mb->mode != MODE_I4x4) { + uint8_t *y_dst = dst[0]; + for (y = 0; y < 4; y++) { + uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]); + if (nnz4) { + if (nnz4 & ~0x01010101) { + for (x = 0; x < 4; x++) { + if ((uint8_t) nnz4 == 1) + s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x, + td->block[y][x], + s->linesize); + else if ((uint8_t) nnz4 > 1) + s->vp8dsp.vp8_idct_add(y_dst + 4 * x, + td->block[y][x], + s->linesize); + nnz4 >>= 8; + if (!nnz4) + break; + } + } else { + s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize); + } + } + y_dst += 4 * s->linesize; + } + } + + for (ch = 0; ch < 2; ch++) { + uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]); + if (nnz4) { + uint8_t *ch_dst = dst[1 + ch]; + if (nnz4 & ~0x01010101) { + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + if ((uint8_t) nnz4 == 1) + s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x, + td->block[4 + ch][(y << 1) + x], + s->uvlinesize); + else if ((uint8_t) nnz4 > 1) + s->vp8dsp.vp8_idct_add(ch_dst + 4 * x, + td->block[4 + ch][(y << 1) + x], + s->uvlinesize); + nnz4 >>= 8; + if (!nnz4) + goto chroma_idct_end; + } + ch_dst += 4 * s->uvlinesize; + } + } else { + s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize); + } + } +chroma_idct_end: + ; + } +} + +static av_always_inline +void filter_level_for_mb(const VP8Context *s, const VP8Macroblock *mb, + VP8FilterStrength *f, int 
is_vp7) +{ + int interior_limit, filter_level; + + if (s->segmentation.enabled) { + filter_level = s->segmentation.filter_level[mb->segment]; + if (!s->segmentation.absolute_vals) + filter_level += s->filter.level; + } else + filter_level = s->filter.level; + + if (s->lf_delta.enabled) { + filter_level += s->lf_delta.ref[mb->ref_frame]; + filter_level += s->lf_delta.mode[mb->mode]; + } + + filter_level = av_clip_uintp2(filter_level, 6); + + interior_limit = filter_level; + if (s->filter.sharpness) { + interior_limit >>= (s->filter.sharpness + 3) >> 2; + interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness); + } + interior_limit = FFMAX(interior_limit, 1); + + f->filter_level = filter_level; + f->inner_limit = interior_limit; + f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 || + mb->mode == VP8_MVMODE_SPLIT; +} + +static av_always_inline +void filter_mb(const VP8Context *s, uint8_t *const dst[3], const VP8FilterStrength *f, + int mb_x, int mb_y, int is_vp7) +{ + int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh; + int filter_level = f->filter_level; + int inner_limit = f->inner_limit; + int inner_filter = f->inner_filter; + ptrdiff_t linesize = s->linesize; + ptrdiff_t uvlinesize = s->uvlinesize; + static const uint8_t hev_thresh_lut[2][64] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2 } + }; + + if (!filter_level) + return; + + if (is_vp7) { + bedge_lim_y = filter_level; + bedge_lim_uv = filter_level * 2; + mbedge_lim = filter_level + 2; + } else { + bedge_lim_y = + bedge_lim_uv = filter_level * 2 + inner_limit; + mbedge_lim = bedge_lim_y + 4; + } + + hev_thresh = 
hev_thresh_lut[s->keyframe][filter_level]; + + if (mb_x) { + s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize, + mbedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize, + mbedge_lim, inner_limit, hev_thresh); + } + +#define H_LOOP_FILTER_16Y_INNER(cond) \ + if (cond && inner_filter) { \ + s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 4, linesize, \ + bedge_lim_y, inner_limit, \ + hev_thresh); \ + s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 8, linesize, \ + bedge_lim_y, inner_limit, \ + hev_thresh); \ + s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize, \ + bedge_lim_y, inner_limit, \ + hev_thresh); \ + s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4, \ + uvlinesize, bedge_lim_uv, \ + inner_limit, hev_thresh); \ + } + + H_LOOP_FILTER_16Y_INNER(!is_vp7) + + if (mb_y) { + s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize, + mbedge_lim, inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize, + mbedge_lim, inner_limit, hev_thresh); + } + + if (inner_filter) { + s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 4 * linesize, + linesize, bedge_lim_y, + inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 8 * linesize, + linesize, bedge_lim_y, + inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize, + linesize, bedge_lim_y, + inner_limit, hev_thresh); + s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize, + dst[2] + 4 * uvlinesize, + uvlinesize, bedge_lim_uv, + inner_limit, hev_thresh); + } + + H_LOOP_FILTER_16Y_INNER(is_vp7) +} + +static av_always_inline +void filter_mb_simple(const VP8Context *s, uint8_t *dst, const VP8FilterStrength *f, + int mb_x, int mb_y) +{ + int mbedge_lim, bedge_lim; + int filter_level = f->filter_level; + int inner_limit = f->inner_limit; + int inner_filter = f->inner_filter; + ptrdiff_t linesize = s->linesize; + + if (!filter_level) + return; + + bedge_lim = 2 * filter_level + inner_limit; + 
mbedge_lim = bedge_lim + 4; + + if (mb_x) + s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim); + if (inner_filter) { + s->vp8dsp.vp8_h_loop_filter_simple(dst + 4, linesize, bedge_lim); + s->vp8dsp.vp8_h_loop_filter_simple(dst + 8, linesize, bedge_lim); + s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim); + } + + if (mb_y) + s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim); + if (inner_filter) { + s->vp8dsp.vp8_v_loop_filter_simple(dst + 4 * linesize, linesize, bedge_lim); + s->vp8dsp.vp8_v_loop_filter_simple(dst + 8 * linesize, linesize, bedge_lim); + s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim); + } +} + +#define MARGIN (16 << 2) +static av_always_inline +int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe, + const VP8Frame *prev_frame, int is_vp7) +{ + VP8Context *s = avctx->priv_data; + int mb_x, mb_y; + + s->mv_bounds.mv_min.y = -MARGIN; + s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN; + for (mb_y = 0; mb_y < s->mb_height; mb_y++) { + VP8Macroblock *mb = s->macroblocks_base + + ((s->mb_width + 1) * (mb_y + 1) + 1); + int mb_xy = mb_y * s->mb_width; + + AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101); + + s->mv_bounds.mv_min.x = -MARGIN; + s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN; + + for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) { + if (vpx_rac_is_end(&s->c)) { + return AVERROR_INVALIDDATA; + } + if (mb_y == 0) + AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top, + DC_PRED * 0x01010101); + decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy, + prev_frame && prev_frame->seg_map ? 
+ prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7); + s->mv_bounds.mv_min.x -= 64; + s->mv_bounds.mv_max.x -= 64; + } + s->mv_bounds.mv_min.y -= 64; + s->mv_bounds.mv_max.y -= 64; + } + return 0; +} + +static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame, + const VP8Frame *prev_frame) +{ + return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7); +} + +static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame, + const VP8Frame *prev_frame) +{ + return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8); +} + +#if HAVE_THREADS +#define check_thread_pos(td, otd, mb_x_check, mb_y_check) \ + do { \ + int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF); \ + if (atomic_load(&otd->thread_mb_pos) < tmp) { \ + pthread_mutex_lock(&otd->lock); \ + atomic_store(&td->wait_mb_pos, tmp); \ + do { \ + if (atomic_load(&otd->thread_mb_pos) >= tmp) \ + break; \ + pthread_cond_wait(&otd->cond, &otd->lock); \ + } while (1); \ + atomic_store(&td->wait_mb_pos, INT_MAX); \ + pthread_mutex_unlock(&otd->lock); \ + } \ + } while (0) + +#define update_pos(td, mb_y, mb_x) \ + do { \ + int pos = (mb_y << 16) | (mb_x & 0xFFFF); \ + int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \ + (num_jobs > 1); \ + int is_null = !next_td || !prev_td; \ + int pos_check = (is_null) ? 
1 : \ + (next_td != td && pos >= atomic_load(&next_td->wait_mb_pos)) || \ + (prev_td != td && pos >= atomic_load(&prev_td->wait_mb_pos)); \ + atomic_store(&td->thread_mb_pos, pos); \ + if (sliced_threading && pos_check) { \ + pthread_mutex_lock(&td->lock); \ + pthread_cond_broadcast(&td->cond); \ + pthread_mutex_unlock(&td->lock); \ + } \ + } while (0) +#else +#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0) +#define update_pos(td, mb_y, mb_x) while(0) +#endif + +static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata, + int jobnr, int threadnr, int is_vp7) +{ + VP8Context *s = avctx->priv_data; + VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr]; + int mb_y = atomic_load(&td->thread_mb_pos) >> 16; + int mb_x, mb_xy = mb_y * s->mb_width; + int num_jobs = s->num_jobs; + const VP8Frame *prev_frame = s->prev_frame; + VP8Frame *curframe = s->curframe; + VPXRangeCoder *coeff_c = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)]; + + VP8Macroblock *mb; + uint8_t *dst[3] = { + curframe->tf.f->data[0] + 16 * mb_y * s->linesize, + curframe->tf.f->data[1] + 8 * mb_y * s->uvlinesize, + curframe->tf.f->data[2] + 8 * mb_y * s->uvlinesize + }; + + if (vpx_rac_is_end(&s->c)) + return AVERROR_INVALIDDATA; + + if (mb_y == 0) + prev_td = td; + else + prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs]; + if (mb_y == s->mb_height - 1) + next_td = td; + else + next_td = &s->thread_data[(jobnr + 1) % num_jobs]; + if (s->mb_layout == 1) + mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1); + else { + // Make sure the previous frame has read its segmentation map, + // if we re-use the same map. 
+ if (prev_frame && s->segmentation.enabled && + !s->segmentation.update_map) + ff_thread_await_progress(&prev_frame->tf, mb_y, 0); + mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2; + memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock + AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101); + } + + if (!is_vp7 || mb_y == 0) + memset(td->left_nnz, 0, sizeof(td->left_nnz)); + + td->mv_bounds.mv_min.x = -MARGIN; + td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN; + + for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) { + if (vpx_rac_is_end(&s->c)) + return AVERROR_INVALIDDATA; + // Wait for previous thread to read mb_x+2, and reach mb_y-1. + if (prev_td != td) { + if (threadnr != 0) { + check_thread_pos(td, prev_td, + mb_x + (is_vp7 ? 2 : 1), + mb_y - (is_vp7 ? 2 : 1)); + } else { + check_thread_pos(td, prev_td, + mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3, + mb_y - (is_vp7 ? 2 : 1)); + } + } + + s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64, + s->linesize, 4); + s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64, + dst[2] - dst[1], 2); + + if (!s->mb_layout) + decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy, + prev_frame && prev_frame->seg_map ? 
+ prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7); + + prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_PREVIOUS); + + if (!mb->skip) { + if (vpx_rac_is_end(coeff_c)) + return AVERROR_INVALIDDATA; + decode_mb_coeffs(s, td, coeff_c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7); + } + + if (mb->mode <= MODE_I4x4) + intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7); + else + inter_predict(s, td, dst, mb, mb_x, mb_y); + + prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_GOLDEN); + + if (!mb->skip) { + idct_mb(s, td, dst, mb); + } else { + AV_ZERO64(td->left_nnz); + AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned + + /* Reset DC block predictors if they would exist + * if the mb had coefficients */ + if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { + td->left_nnz[8] = 0; + s->top_nnz[mb_x][8] = 0; + } + } + + if (s->deblock_filter) + filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7); + + if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) { + if (s->filter.simple) + backup_mb_border(s->top_border[mb_x + 1], dst[0], + NULL, NULL, s->linesize, 0, 1); + else + backup_mb_border(s->top_border[mb_x + 1], dst[0], + dst[1], dst[2], s->linesize, s->uvlinesize, 0); + } + + prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_ALTREF); + + dst[0] += 16; + dst[1] += 8; + dst[2] += 8; + td->mv_bounds.mv_min.x -= 64; + td->mv_bounds.mv_max.x -= 64; + + if (mb_x == s->mb_width + 1) { + update_pos(td, mb_y, s->mb_width + 3); + } else { + update_pos(td, mb_y, mb_x); + } + } + return 0; +} + +static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata, + int jobnr, int threadnr) +{ + return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1); +} + +static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata, + int jobnr, int threadnr) +{ + return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0); +} + +static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata, + int 
jobnr, int threadnr, int is_vp7) +{ + VP8Context *s = avctx->priv_data; + VP8ThreadData *td = &s->thread_data[threadnr]; + int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs; + AVFrame *curframe = s->curframe->tf.f; + VP8Macroblock *mb; + VP8ThreadData *prev_td, *next_td; + uint8_t *dst[3] = { + curframe->data[0] + 16 * mb_y * s->linesize, + curframe->data[1] + 8 * mb_y * s->uvlinesize, + curframe->data[2] + 8 * mb_y * s->uvlinesize + }; + + if (s->mb_layout == 1) + mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1); + else + mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2; + + if (mb_y == 0) + prev_td = td; + else + prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs]; + if (mb_y == s->mb_height - 1) + next_td = td; + else + next_td = &s->thread_data[(jobnr + 1) % num_jobs]; + + for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) { + const VP8FilterStrength *f = &td->filter_strength[mb_x]; + if (prev_td != td) + check_thread_pos(td, prev_td, + (mb_x + 1) + (s->mb_width + 3), mb_y - 1); + if (next_td != td) + if (next_td != &s->thread_data[0]) + check_thread_pos(td, next_td, mb_x + 1, mb_y + 1); + + if (num_jobs == 1) { + if (s->filter.simple) + backup_mb_border(s->top_border[mb_x + 1], dst[0], + NULL, NULL, s->linesize, 0, 1); + else + backup_mb_border(s->top_border[mb_x + 1], dst[0], + dst[1], dst[2], s->linesize, s->uvlinesize, 0); + } + + if (s->filter.simple) + filter_mb_simple(s, dst[0], f, mb_x, mb_y); + else + filter_mb(s, dst, f, mb_x, mb_y, is_vp7); + dst[0] += 16; + dst[1] += 8; + dst[2] += 8; + + update_pos(td, mb_y, (s->mb_width + 3) + mb_x); + } +} + +static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata, + int jobnr, int threadnr) +{ + filter_mb_row(avctx, tdata, jobnr, threadnr, 1); +} + +static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata, + int jobnr, int threadnr) +{ + filter_mb_row(avctx, tdata, jobnr, threadnr, 0); +} + +static av_always_inline +int 
vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr, + int threadnr, int is_vp7) +{ + const VP8Context *s = avctx->priv_data; + VP8ThreadData *td = &s->thread_data[jobnr]; + VP8ThreadData *next_td = NULL, *prev_td = NULL; + VP8Frame *curframe = s->curframe; + int mb_y, num_jobs = s->num_jobs; + int ret; + + td->thread_nr = threadnr; + td->mv_bounds.mv_min.y = -MARGIN - 64 * threadnr; + td->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr; + for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) { + atomic_store(&td->thread_mb_pos, mb_y << 16); + ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr); + if (ret < 0) { + update_pos(td, s->mb_height, INT_MAX & 0xFFFF); + return ret; + } + if (s->deblock_filter) + s->filter_mb_row(avctx, tdata, jobnr, threadnr); + update_pos(td, mb_y, INT_MAX & 0xFFFF); + + td->mv_bounds.mv_min.y -= 64 * num_jobs; + td->mv_bounds.mv_max.y -= 64 * num_jobs; + + if (avctx->active_thread_type == FF_THREAD_FRAME) + ff_thread_report_progress(&curframe->tf, mb_y, 0); + } + + return 0; +} + +static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, + int jobnr, int threadnr) +{ + return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7); +} + +static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, + int jobnr, int threadnr) +{ + return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8); +} + +static av_always_inline +int vp78_decode_frame(AVCodecContext *avctx, AVFrame *rframe, int *got_frame, + const AVPacket *avpkt, int is_vp7) +{ + VP8Context *s = avctx->priv_data; + int ret, i, referenced, num_jobs; + enum AVDiscard skip_thresh; + VP8Frame *av_uninit(curframe), *prev_frame; + + if (is_vp7) + ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size); + else + ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size); + + if (ret < 0) + goto err; + + if (s->actually_webp) { + // avctx->pix_fmt already set in caller. 
+ } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) { + s->pix_fmt = get_pixel_format(s); + if (s->pix_fmt < 0) { + ret = AVERROR(EINVAL); + goto err; + } + avctx->pix_fmt = s->pix_fmt; + } + + prev_frame = s->framep[VP8_FRAME_CURRENT]; + + referenced = s->update_last || s->update_golden == VP8_FRAME_CURRENT || + s->update_altref == VP8_FRAME_CURRENT; + + skip_thresh = !referenced ? AVDISCARD_NONREF + : !s->keyframe ? AVDISCARD_NONKEY + : AVDISCARD_ALL; + + if (avctx->skip_frame >= skip_thresh) { + s->invisible = 1; + memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4); + goto skip_decode; + } + s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh; + + // release no longer referenced frames + for (i = 0; i < 5; i++) + if (s->frames[i].tf.f->buf[0] && + &s->frames[i] != prev_frame && + &s->frames[i] != s->framep[VP8_FRAME_PREVIOUS] && + &s->frames[i] != s->framep[VP8_FRAME_GOLDEN] && + &s->frames[i] != s->framep[VP8_FRAME_ALTREF]) + vp8_release_frame(s, &s->frames[i]); + + curframe = s->framep[VP8_FRAME_CURRENT] = vp8_find_free_buffer(s); + + if (!s->colorspace) + avctx->colorspace = AVCOL_SPC_BT470BG; + if (s->fullrange) + avctx->color_range = AVCOL_RANGE_JPEG; + else + avctx->color_range = AVCOL_RANGE_MPEG; + + /* Given that arithmetic probabilities are updated every frame, it's quite + * likely that the values we have on a random interframe are complete + * junk if we didn't start decode on a keyframe. So just don't display + * anything rather than junk. */ + if (!s->keyframe && (!s->framep[VP8_FRAME_PREVIOUS] || + !s->framep[VP8_FRAME_GOLDEN] || + !s->framep[VP8_FRAME_ALTREF])) { + av_log(avctx, AV_LOG_WARNING, + "Discarding interframe without a prior keyframe!\n"); + ret = AVERROR_INVALIDDATA; + goto err; + } + + curframe->tf.f->key_frame = s->keyframe; + curframe->tf.f->pict_type = s->keyframe ? 
AV_PICTURE_TYPE_I + : AV_PICTURE_TYPE_P; + if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0) + goto err; + + // check if golden and altref are swapped + if (s->update_altref != VP8_FRAME_NONE) + s->next_framep[VP8_FRAME_ALTREF] = s->framep[s->update_altref]; + else + s->next_framep[VP8_FRAME_ALTREF] = s->framep[VP8_FRAME_ALTREF]; + + if (s->update_golden != VP8_FRAME_NONE) + s->next_framep[VP8_FRAME_GOLDEN] = s->framep[s->update_golden]; + else + s->next_framep[VP8_FRAME_GOLDEN] = s->framep[VP8_FRAME_GOLDEN]; + + if (s->update_last) + s->next_framep[VP8_FRAME_PREVIOUS] = curframe; + else + s->next_framep[VP8_FRAME_PREVIOUS] = s->framep[VP8_FRAME_PREVIOUS]; + + s->next_framep[VP8_FRAME_CURRENT] = curframe; + + if (ffcodec(avctx->codec)->update_thread_context) + ff_thread_finish_setup(avctx); + + if (avctx->hwaccel) { + ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size); + if (ret < 0) + goto err; + + ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size); + if (ret < 0) + goto err; + + ret = avctx->hwaccel->end_frame(avctx); + if (ret < 0) + goto err; + + } else { + s->linesize = curframe->tf.f->linesize[0]; + s->uvlinesize = curframe->tf.f->linesize[1]; + + memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz)); + /* Zero macroblock structures for top/top-left prediction + * from outside the frame. */ + if (!s->mb_layout) + memset(s->macroblocks + s->mb_height * 2 - 1, 0, + (s->mb_width + 1) * sizeof(*s->macroblocks)); + if (!s->mb_layout && s->keyframe) + memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4); + + memset(s->ref_count, 0, sizeof(s->ref_count)); + + if (s->mb_layout == 1) { + // Make sure the previous frame has read its segmentation map, + // if we re-use the same map. 
+ if (prev_frame && s->segmentation.enabled && + !s->segmentation.update_map) + ff_thread_await_progress(&prev_frame->tf, 1, 0); + if (is_vp7) + ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame); + else + ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame); + if (ret < 0) + goto err; + } + + if (avctx->active_thread_type == FF_THREAD_FRAME) + num_jobs = 1; + else + num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count); + s->num_jobs = num_jobs; + s->curframe = curframe; + s->prev_frame = prev_frame; + s->mv_bounds.mv_min.y = -MARGIN; + s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN; + for (i = 0; i < MAX_THREADS; i++) { + VP8ThreadData *td = &s->thread_data[i]; + atomic_init(&td->thread_mb_pos, 0); + atomic_init(&td->wait_mb_pos, INT_MAX); + } + if (is_vp7) + avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL, + num_jobs); + else + avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, + num_jobs); + } + + ff_thread_report_progress(&curframe->tf, INT_MAX, 0); + memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4); + +skip_decode: + // if future frames don't use the updated probabilities, + // reset them to the values we saved + if (!s->update_probabilities) + s->prob[0] = s->prob[1]; + + if (!s->invisible) { + if ((ret = av_frame_ref(rframe, curframe->tf.f)) < 0) + return ret; + *got_frame = 1; + } + + return avpkt->size; +err: + memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4); + return ret; +} + +int ff_vp8_decode_frame(AVCodecContext *avctx, AVFrame *frame, + int *got_frame, AVPacket *avpkt) +{ + return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP8); +} + +#if CONFIG_VP7_DECODER +static int vp7_decode_frame(AVCodecContext *avctx, AVFrame *frame, + int *got_frame, AVPacket *avpkt) +{ + return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP7); +} +#endif /* CONFIG_VP7_DECODER */ + +av_cold int ff_vp8_decode_free(AVCodecContext *avctx) +{ + 
VP8Context *s = avctx->priv_data; + int i; + + vp8_decode_flush_impl(avctx, 1); + for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) + av_frame_free(&s->frames[i].tf.f); + + return 0; +} + +/** + * Allocate an AVFrame for every entry of s->frames. + * + * @return 0 on success, AVERROR(ENOMEM) as soon as one allocation fails. + * Frames allocated before the failure are left in place; the + * caller is expected to release them. + */ +static av_cold int vp8_init_frames(VP8Context *s) +{ + int i; + for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) { + s->frames[i].tf.f = av_frame_alloc(); + if (!s->frames[i].tf.f) + return AVERROR(ENOMEM); + } + return 0; +} + +/** + * Shared initialisation for the VP7 and VP8 decoders. + * + * Stores avctx in the context, resets s->pix_fmt to AV_PIX_FMT_NONE and + * avctx->pix_fmt to AV_PIX_FMT_YUV420P, initialises the DSP contexts, + * installs the per-codec row decode and filter callbacks, copies + * ff_zigzag_scan into prob[0].scan and allocates s->frames through + * vp8_init_frames(). + * + * @param is_vp7 nonzero selects the VP7 branch, zero the VP8 branch; a + * branch is only taken when the matching CONFIG_*_DECODER + * is enabled + * @return 0 on success; if frame allocation fails, the error from + * vp8_init_frames(), returned after ff_vp8_decode_free() has run + */ +static av_always_inline +int vp78_decode_init(AVCodecContext *avctx, int is_vp7) +{ + VP8Context *s = avctx->priv_data; + int ret; + + s->avctx = avctx; + s->vp7 = avctx->codec->id == AV_CODEC_ID_VP7; + s->pix_fmt = AV_PIX_FMT_NONE; + avctx->pix_fmt = AV_PIX_FMT_YUV420P; + + ff_videodsp_init(&s->vdsp, 8); + + ff_vp78dsp_init(&s->vp8dsp); + if (CONFIG_VP7_DECODER && is_vp7) { + ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1); + ff_vp7dsp_init(&s->vp8dsp); + s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter; + s->filter_mb_row = vp7_filter_mb_row; + } else if (CONFIG_VP8_DECODER && !is_vp7) { + ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1); + ff_vp8dsp_init(&s->vp8dsp); + s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter; + s->filter_mb_row = vp8_filter_mb_row; + } + + /* does not change for VP8 */ + memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan)); + + /* on failure, free whatever vp8_init_frames() managed to allocate */ + if ((ret = vp8_init_frames(s)) < 0) { + ff_vp8_decode_free(avctx); + return ret; + } + + return 0; +} + +#if CONFIG_VP7_DECODER +/* VP7 entry point: vp78_decode_init() with is_vp7 = IS_VP7. */ +static int vp7_decode_init(AVCodecContext *avctx) +{ + return vp78_decode_init(avctx, IS_VP7); +} +#endif /* CONFIG_VP7_DECODER */ + +/* VP8 entry point: vp78_decode_init() with is_vp7 = IS_VP8. */ +av_cold int ff_vp8_decode_init(AVCodecContext *avctx) +{ + return vp78_decode_init(avctx, IS_VP8); +} + +#if CONFIG_VP8_DECODER +#if HAVE_THREADS +#define REBASE(pic) ((pic) ? 
(pic) - &s_src->frames[0] + &s->frames[0] : NULL) + +/** + * Copy decoder state from the context of src into the context of dst. + * + * If dst already has macroblock buffers and the macroblock dimensions + * differ from those of src, the buffers are freed and the new dimensions + * adopted. The pixel format, segmentation, loop filter deltas and sign + * biases are then copied, and prob[0] is taken from the src entry + * selected by !update_probabilities. Every src frame that has a buffer is + * referenced into the dst slot with the same index, and src's + * next_framep pointers are translated by REBASE() into pointers to the + * corresponding dst frames. + * + * @return 0 on success, or the negative error returned by vp8_ref_frame() + */ +static int vp8_decode_update_thread_context(AVCodecContext *dst, + const AVCodecContext *src) +{ + VP8Context *s = dst->priv_data, *s_src = src->priv_data; + int i; + + if (s->macroblocks_base && + (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) { + free_buffers(s); + s->mb_width = s_src->mb_width; + s->mb_height = s_src->mb_height; + } + + s->pix_fmt = s_src->pix_fmt; + s->prob[0] = s_src->prob[!s_src->update_probabilities]; + s->segmentation = s_src->segmentation; + s->lf_delta = s_src->lf_delta; + memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias)); + + for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) { + if (s_src->frames[i].tf.f->buf[0]) { + int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]); + if (ret < 0) + return ret; + } + } + + /* framep of this context starts from what src set up as its next state */ + s->framep[0] = REBASE(s_src->next_framep[0]); + s->framep[1] = REBASE(s_src->next_framep[1]); + s->framep[2] = REBASE(s_src->next_framep[2]); + s->framep[3] = REBASE(s_src->next_framep[3]); + + return 0; +} +#endif /* HAVE_THREADS */ +#endif /* CONFIG_VP8_DECODER */ + +#if CONFIG_VP7_DECODER +const FFCodec ff_vp7_decoder = { + .p.name = "vp7", + CODEC_LONG_NAME("On2 VP7"), + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_VP7, + .priv_data_size = sizeof(VP8Context), + .init = vp7_decode_init, + .close = ff_vp8_decode_free, + FF_CODEC_DECODE_CB(vp7_decode_frame), + .p.capabilities = AV_CODEC_CAP_DR1, + .flush = vp8_decode_flush, +}; +#endif /* CONFIG_VP7_DECODER */ + +#if CONFIG_VP8_DECODER +const FFCodec ff_vp8_decoder = { + .p.name = "vp8", + CODEC_LONG_NAME("On2 VP8"), + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_VP8, + .priv_data_size = sizeof(VP8Context), + .init = ff_vp8_decode_init, + .close = ff_vp8_decode_free, + FF_CODEC_DECODE_CB(ff_vp8_decode_frame), + .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | + AV_CODEC_CAP_SLICE_THREADS, + .caps_internal = FF_CODEC_CAP_ALLOCATE_PROGRESS, + .flush = 
vp8_decode_flush, + UPDATE_THREAD_CONTEXT(vp8_decode_update_thread_context), + .hw_configs = (const AVCodecHWConfigInternal *const []) { +#if CONFIG_VP8_VAAPI_HWACCEL + HWACCEL_VAAPI(vp8), +#endif +#if CONFIG_VP8_NVDEC_HWACCEL + HWACCEL_NVDEC(vp8), +#endif + NULL + }, +}; +#endif /* CONFIG_VP8_DECODER */ diff --git a/media/ffvpx/libavcodec/vp8.h b/media/ffvpx/libavcodec/vp8.h new file mode 100644 index 0000000000..6f29156b53 --- /dev/null +++ b/media/ffvpx/libavcodec/vp8.h @@ -0,0 +1,361 @@ +/* + * VP8 compatible video decoder + * + * Copyright (C) 2010 David Conrad + * Copyright (C) 2010 Ronald S. Bultje + * Copyright (C) 2010 Fiona Glaser + * Copyright (C) 2012 Daniel Kang + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VP8_H +#define AVCODEC_VP8_H + +#include <stdatomic.h> + +#include "libavutil/buffer.h" +#include "libavutil/mem_internal.h" +#include "libavutil/thread.h" + +#include "h264pred.h" +#include "threadframe.h" +#include "videodsp.h" +#include "vp8dsp.h" +#include "vpx_rac.h" + +#define VP8_MAX_QUANT 127 + +typedef enum { + VP8_FRAME_NONE = -1, + VP8_FRAME_CURRENT = 0, + VP8_FRAME_PREVIOUS = 1, + VP8_FRAME_GOLDEN = 2, + VP8_FRAME_ALTREF = 3, +} VP8FrameType; + +enum dct_token { + DCT_0, + DCT_1, + DCT_2, + DCT_3, + DCT_4, + DCT_CAT1, + DCT_CAT2, + DCT_CAT3, + DCT_CAT4, + DCT_CAT5, + DCT_CAT6, + DCT_EOB, + + NUM_DCT_TOKENS +}; + +// used to signal 4x4 intra pred in luma MBs +#define MODE_I4x4 4 + +enum inter_mvmode { + VP8_MVMODE_ZERO = MODE_I4x4 + 1, + VP8_MVMODE_MV, + VP8_MVMODE_SPLIT +}; + +enum inter_splitmvmode { + VP8_SPLITMVMODE_16x8 = 0, ///< 2 16x8 blocks (vertical) + VP8_SPLITMVMODE_8x16, ///< 2 8x16 blocks (horizontal) + VP8_SPLITMVMODE_8x8, ///< 2x2 blocks of 8x8px each + VP8_SPLITMVMODE_4x4, ///< 4x4 blocks of 4x4px each + VP8_SPLITMVMODE_NONE, ///< (only used in prediction) no split MVs +}; + +typedef struct VP8mv { + DECLARE_ALIGNED(4, int16_t, x); + int16_t y; +} VP8mv; + +typedef struct VP8FilterStrength { + uint8_t filter_level; + uint8_t inner_limit; + uint8_t inner_filter; +} VP8FilterStrength; + +typedef struct VP8Macroblock { + uint8_t skip; + // TODO: make it possible to check for at least (i4x4 or split_mv) + // in one op. are others needed? 
+ uint8_t mode; + uint8_t ref_frame; + uint8_t partitioning; + uint8_t chroma_pred_mode; + uint8_t segment; + uint8_t intra4x4_pred_mode_mb[16]; + DECLARE_ALIGNED(4, uint8_t, intra4x4_pred_mode_top)[4]; + VP8mv mv; + VP8mv bmv[16]; +} VP8Macroblock; + +typedef struct VP8intmv { + int x; + int y; +} VP8intmv; + +typedef struct VP8mvbounds { + VP8intmv mv_min; + VP8intmv mv_max; +} VP8mvbounds; + +typedef struct VP8ThreadData { + DECLARE_ALIGNED(16, int16_t, block)[6][4][16]; + DECLARE_ALIGNED(16, int16_t, block_dc)[16]; + /** + * This is the index plus one of the last non-zero coeff + * for each of the blocks in the current macroblock. + * So, 0 -> no coeffs + * 1 -> dc-only (special transform) + * 2+-> full transform + */ + DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4]; + /** + * For coeff decode, we need to know whether the above block had non-zero + * coefficients. This means for each macroblock, we need data for 4 luma + * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9 + * per macroblock. We keep the last row in top_nnz. + */ + DECLARE_ALIGNED(8, uint8_t, left_nnz)[9]; + int thread_nr; +#if HAVE_THREADS + pthread_mutex_t lock; + pthread_cond_t cond; +#endif + atomic_int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF) + atomic_int wait_mb_pos; // What the current thread is waiting on. 
+ +#define EDGE_EMU_LINESIZE 32 + DECLARE_ALIGNED(16, uint8_t, edge_emu_buffer)[21 * EDGE_EMU_LINESIZE]; + VP8FilterStrength *filter_strength; + VP8mvbounds mv_bounds; +} VP8ThreadData; + +typedef struct VP8Frame { + ThreadFrame tf; + AVBufferRef *seg_map; + + AVBufferRef *hwaccel_priv_buf; + void *hwaccel_picture_private; +} VP8Frame; + +#define MAX_THREADS 8 +typedef struct VP8Context { + VP8ThreadData *thread_data; + AVCodecContext *avctx; + enum AVPixelFormat pix_fmt; + int actually_webp; + + VP8Frame *framep[4]; + VP8Frame *next_framep[4]; + VP8Frame *curframe; + VP8Frame *prev_frame; + + uint16_t mb_width; /* number of horizontal MB */ + uint16_t mb_height; /* number of vertical MB */ + ptrdiff_t linesize; + ptrdiff_t uvlinesize; + + uint8_t keyframe; + uint8_t deblock_filter; + uint8_t mbskip_enabled; + uint8_t profile; + VP8mvbounds mv_bounds; + + int8_t sign_bias[4]; ///< one state [0, 1] per ref frame type + int ref_count[3]; + + /** + * Base parameters for segmentation, i.e. per-macroblock parameters. + * These must be kept unchanged even if segmentation is not used for + * a frame, since the values persist between interframes. + */ + struct { + uint8_t enabled; + uint8_t absolute_vals; + uint8_t update_map; + uint8_t update_feature_data; + int8_t base_quant[4]; + int8_t filter_level[4]; ///< base loop filter level + } segmentation; + + struct { + uint8_t simple; + uint8_t level; + uint8_t sharpness; + } filter; + + VP8Macroblock *macroblocks; + + uint8_t *intra4x4_pred_mode_top; + uint8_t intra4x4_pred_mode_left[4]; + + /** + * Macroblocks can have one of 4 different quants in a frame when + * segmentation is enabled. + * If segmentation is disabled, only the first segment's values are used. + */ + struct { + // [0] - DC qmul [1] - AC qmul + int16_t luma_qmul[2]; + int16_t luma_dc_qmul[2]; ///< luma dc-only block quant + int16_t chroma_qmul[2]; + } qmat[4]; + + // Raw quantisation values, which may be needed by hwaccel decode. 
+ struct { + int yac_qi; + int ydc_delta; + int y2dc_delta; + int y2ac_delta; + int uvdc_delta; + int uvac_delta; + } quant; + + struct { + uint8_t enabled; ///< whether each mb can have a different strength based on mode/ref + uint8_t update; + + /** + * filter strength adjustment for the following macroblock modes: + * [0-3] - i16x16 (always zero) + * [4] - i4x4 + * [5] - zero mv + * [6] - inter modes except for zero or split mv + * [7] - split mv + * i16x16 modes never have any adjustment + */ + int8_t mode[VP8_MVMODE_SPLIT + 1]; + + /** + * filter strength adjustment for macroblocks that reference: + * [0] - intra / VP8_FRAME_CURRENT + * [1] - VP8_FRAME_PREVIOUS + * [2] - VP8_FRAME_GOLDEN + * [3] - altref / VP8_FRAME_ALTREF + */ + int8_t ref[4]; + } lf_delta; + + uint8_t (*top_border)[16 + 8 + 8]; + uint8_t (*top_nnz)[9]; + + VPXRangeCoder c; ///< header context, includes mb modes and motion vectors + + /* This contains the entropy coder state at the end of the header + * block, in the form specified by the standard. For use by + * hwaccels, so that a hardware decoder has the information to + * start decoding at the macroblock layer. + */ + struct { + const uint8_t *input; + uint32_t range; + uint32_t value; + int bit_count; + } coder_state_at_header_end; + + int header_partition_size; + + /** + * These are all of the updatable probabilities for binary decisions. + * They are only implicitly reset on keyframes, making it quite likely + * for an interframe to desync if a prior frame's header was corrupt + * or missing outright! 
+ */ + struct { + uint8_t segmentid[3]; + uint8_t mbskip; + uint8_t intra; + uint8_t last; + uint8_t golden; + uint8_t pred16x16[4]; + uint8_t pred8x8c[3]; + uint8_t token[4][16][3][NUM_DCT_TOKENS - 1]; + uint8_t mvc[2][19]; + uint8_t scan[16]; + } prob[2]; + + VP8Macroblock *macroblocks_base; + int invisible; + int update_last; ///< update VP8_FRAME_PREVIOUS with the current one + int update_golden; ///< VP8_FRAME_NONE if not updated, or which frame to copy if so + int update_altref; + + /** + * If this flag is not set, all the probability updates + * are discarded after this frame is decoded. + */ + int update_probabilities; + + /** + * All coefficients are contained in separate arith coding contexts. + * There can be 1, 2, 4, or 8 of these after the header context. + */ + int num_coeff_partitions; + VPXRangeCoder coeff_partition[8]; + int coeff_partition_size[8]; + VideoDSPContext vdsp; + VP8DSPContext vp8dsp; + H264PredContext hpc; + vp8_mc_func put_pixels_tab[3][3][3]; + VP8Frame frames[5]; + + uint8_t colorspace; ///< 0 is the only value allowed (meaning bt601) + uint8_t fullrange; ///< whether we can skip clamping in dsp functions + + int num_jobs; + /** + * This describes the macroblock memory layout. + * 0 -> Only width+height*2+1 macroblocks allocated (frame/single thread). + * 1 -> Macroblocks for entire frame allocated (sliced thread). 
+ */ + int mb_layout; + + int (*decode_mb_row_no_filter)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr); + void (*filter_mb_row)(AVCodecContext *avctx, void *tdata, int jobnr, int threadnr); + + int vp7; + + /** + * Interframe DC prediction (VP7) + * [0] VP8_FRAME_PREVIOUS + * [1] VP8_FRAME_GOLDEN + */ + uint16_t inter_dc_pred[2][2]; + + /** + * Macroblock features (VP7) + */ + uint8_t feature_enabled[4]; + uint8_t feature_present_prob[4]; + uint8_t feature_index_prob[4][3]; + uint8_t feature_value[4][4]; +} VP8Context; + +int ff_vp8_decode_init(AVCodecContext *avctx); + +int ff_vp8_decode_frame(AVCodecContext *avctx, AVFrame *frame, + int *got_frame, AVPacket *avpkt); + +int ff_vp8_decode_free(AVCodecContext *avctx); + +#endif /* AVCODEC_VP8_H */ diff --git a/media/ffvpx/libavcodec/vp89_rac.h b/media/ffvpx/libavcodec/vp89_rac.h new file mode 100644 index 0000000000..bc0924c387 --- /dev/null +++ b/media/ffvpx/libavcodec/vp89_rac.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Range decoder functions common to VP8 and VP9 + */ + +#ifndef AVCODEC_VP89_RAC_H +#define AVCODEC_VP89_RAC_H + +#include <stdint.h> + +#include "libavutil/attributes.h" + +#include "vpx_rac.h" + +/** + * Read one binary decision from the range coder by calling + * vpx_rac_get_prob() with a fixed probability argument of 128. + */ +// rounding is different than vpx_rac_get, is vpx_rac_get wrong? +static av_always_inline int vp89_rac_get(VPXRangeCoder *c) +{ + return vpx_rac_get_prob(c, 128); +} + +/** + * Read an integer one vp89_rac_get() decision at a time, most significant + * bit first. + * + * @param bits number of decisions to read; 0 yields 0. The loop counts + * down with bits--, so a negative count is not guarded against. + * @return the accumulated value + */ +static av_unused int vp89_rac_get_uint(VPXRangeCoder *c, int bits) +{ + int value = 0; + + while (bits--) { + value = (value << 1) | vp89_rac_get(c); + } + + return value; +} + +/** + * Decode a symbol by walking a binary tree. + * + * Starting at node 0, each step reads a decision with probability + * probs[i] and moves to tree[i][decision]. The walk stops at the first + * index that is not positive. + * + * @return the negated final index + */ +// how probabilities are associated with decisions is different I think +// well, the new scheme fits in the old but this way has one fewer branch per decision +static av_always_inline int vp89_rac_get_tree(VPXRangeCoder *c, const int8_t (*tree)[2], + const uint8_t *probs) +{ + int i = 0; + + do { + i = tree[i][vpx_rac_get_prob(c, probs[i])]; + } while (i > 0); + + return -i; +} + +#endif /* AVCODEC_VP89_RAC_H */ diff --git a/media/ffvpx/libavcodec/vp8_parser.c b/media/ffvpx/libavcodec/vp8_parser.c new file mode 100644 index 0000000000..98b752bfb9 --- /dev/null +++ b/media/ffvpx/libavcodec/vp8_parser.c @@ -0,0 +1,79 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/intreadwrite.h" + +#include "avcodec.h" + +/** + * Fill parser and codec context fields from the first bytes of a packet. + * + * The input buffer is always handed back unchanged through *poutbuf and + * *poutbuf_size, and buf_size is returned on every path, including when + * the data is too short or fails validation. + * + * Bit 0 of buf[0] is the frame type (0 marks a key frame) and bits 1-3 + * are the profile; a profile above 3 is logged and causes a return before + * any context field is written. For key frames of at least 10 bytes, the + * 24-bit value read at buf + 3 must equal 0x2a019d; the 16-bit values read + * at buf + 6 and buf + 8, masked to 14 bits, give width and height, and + * the coded dimensions are those values passed through FFALIGN(x, 16). + */ +static int parse(AVCodecParserContext *s, + AVCodecContext *avctx, + const uint8_t **poutbuf, int *poutbuf_size, + const uint8_t *buf, int buf_size) +{ + unsigned int frame_type; + unsigned int profile; + + *poutbuf = buf; + *poutbuf_size = buf_size; + + /* fewer than 3 bytes: nothing is inspected */ + if (buf_size < 3) + return buf_size; + + frame_type = buf[0] & 1; + profile = (buf[0] >> 1) & 7; + if (profile > 3) { + av_log(avctx, AV_LOG_ERROR, "Invalid profile %u.\n", profile); + return buf_size; + } + + avctx->profile = profile; + s->key_frame = frame_type == 0; + s->pict_type = frame_type ? AV_PICTURE_TYPE_P : AV_PICTURE_TYPE_I; + s->format = AV_PIX_FMT_YUV420P; + s->field_order = AV_FIELD_PROGRESSIVE; + s->picture_structure = AV_PICTURE_STRUCTURE_FRAME; + + /* only key frames are read for sync code and dimensions */ + if (frame_type == 0) { + unsigned int sync_code; + unsigned int width, height; + + if (buf_size < 10) + return buf_size; + + sync_code = AV_RL24(buf + 3); + if (sync_code != 0x2a019d) { + av_log(avctx, AV_LOG_ERROR, "Invalid sync code %06x.\n", sync_code); + return buf_size; + } + + width = AV_RL16(buf + 6) & 0x3fff; + height = AV_RL16(buf + 8) & 0x3fff; + + s->width = width; + s->height = height; + s->coded_width = FFALIGN(width, 16); + s->coded_height = FFALIGN(height, 16); + } + + return buf_size; +} + +/* Parser registration: handles AV_CODEC_ID_VP8 with parse() as its callback. */ +const AVCodecParser ff_vp8_parser = { + .codec_ids = { AV_CODEC_ID_VP8 }, + .parser_parse = parse, +}; diff --git a/media/ffvpx/libavcodec/vp8data.h b/media/ffvpx/libavcodec/vp8data.h new file mode 100644 index 0000000000..1fcce134eb --- /dev/null +++ b/media/ffvpx/libavcodec/vp8data.h @@ -0,0 +1,824 @@ +/* + * Copyright (C) 2010 David Conrad + * Copyright (C) 2010 Ronald S. Bultje + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * VP8 compatible video decoder + */ + +#ifndef AVCODEC_VP8DATA_H +#define AVCODEC_VP8DATA_H + +#include "vp8.h" +#include "h264pred.h" + +static const uint8_t vp7_pred4x4_mode[] = { + [DC_PRED8x8] = DC_PRED, + [VERT_PRED8x8] = TM_VP8_PRED, + [HOR_PRED8x8] = TM_VP8_PRED, + [PLANE_PRED8x8] = TM_VP8_PRED, +}; + +static const uint8_t vp8_pred4x4_mode[] = { + [DC_PRED8x8] = DC_PRED, + [VERT_PRED8x8] = VERT_PRED, + [HOR_PRED8x8] = HOR_PRED, + [PLANE_PRED8x8] = TM_VP8_PRED, +}; + +static const int8_t vp8_pred16x16_tree_intra[4][2] = { + { -MODE_I4x4, 1 }, // '0' + { 2, 3 }, + { -DC_PRED8x8, -VERT_PRED8x8 }, // '100', '101' + { -HOR_PRED8x8, -PLANE_PRED8x8 }, // '110', '111' +}; + +static const int8_t vp8_pred16x16_tree_inter[4][2] = { + { -DC_PRED8x8, 1 }, // '0' + { 2, 3 }, + { -VERT_PRED8x8, -HOR_PRED8x8 }, // '100', '101' + { -PLANE_PRED8x8, -MODE_I4x4 }, // '110', '111' +}; + +typedef struct VP7MVPred { + int8_t yoffset; + int8_t xoffset; + uint8_t subblock; + uint8_t score; +} VP7MVPred; + +#define VP7_MV_PRED_COUNT 12 +static const VP7MVPred vp7_mv_pred[VP7_MV_PRED_COUNT] = { + { -1, 0, 12, 8 }, + { 0, -1, 3, 8 }, + { -1, -1, 15, 2 }, + { -1, 1, 12, 2 }, + { -2, 0, 12, 2 }, + { 0, -2, 3, 2 }, + { -1, -2, 15, 1 }, + { 
-2, -1, 15, 1 }, + { -2, 1, 12, 1 }, + { -1, 2, 12, 1 }, + { -2, -2, 15, 1 }, + { -2, 2, 12, 1 }, +}; + +static const int vp7_mode_contexts[31][4] = { + { 3, 3, 1, 246 }, + { 7, 89, 66, 239 }, + { 10, 90, 78, 238 }, + { 14, 118, 95, 241 }, + { 14, 123, 106, 238 }, + { 20, 140, 109, 240 }, + { 13, 155, 103, 238 }, + { 21, 158, 99, 240 }, + { 27, 82, 108, 232 }, + { 19, 99, 123, 217 }, + { 45, 139, 148, 236 }, + { 50, 117, 144, 235 }, + { 57, 128, 164, 238 }, + { 69, 139, 171, 239 }, + { 74, 154, 179, 238 }, + { 112, 165, 186, 242 }, + { 98, 143, 185, 245 }, + { 105, 153, 190, 250 }, + { 124, 167, 192, 245 }, + { 131, 186, 203, 246 }, + { 59, 184, 222, 224 }, + { 148, 215, 214, 213 }, + { 137, 211, 210, 219 }, + { 190, 227, 128, 228 }, + { 183, 228, 128, 228 }, + { 194, 234, 128, 228 }, + { 202, 236, 128, 228 }, + { 205, 240, 128, 228 }, + { 205, 244, 128, 228 }, + { 225, 246, 128, 228 }, + { 233, 251, 128, 228 }, +}; + +static const int vp8_mode_contexts[6][4] = { + { 7, 1, 1, 143 }, + { 14, 18, 14, 107 }, + { 135, 64, 57, 68 }, + { 60, 56, 128, 65 }, + { 159, 134, 128, 34 }, + { 234, 188, 128, 28 }, +}; + +static const uint8_t vp8_mbsplits[5][16] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } +}; + +static const uint8_t vp8_mbfirstidx[4][16] = { + { 0, 8 }, + { 0, 2 }, + { 0, 2, 8, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } +}; + +static const uint8_t vp8_mbsplit_count[4] = { + 2, 2, 4, 16 +}; +static const uint8_t vp8_mbsplit_prob[3] = { + 110, 111, 150 +}; + +static const uint8_t vp7_submv_prob[3] = { + 180, 162, 25 +}; + +static const uint8_t vp8_submv_prob[5][3] = { + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1, 34 }, + { 208, 1, 1 } +}; + +static const uint8_t vp8_pred16x16_prob_intra[4] 
= { + 145, 156, 163, 128 +}; +static const uint8_t vp8_pred16x16_prob_inter[4] = { + 112, 86, 140, 37 +}; + +static const int8_t vp8_pred4x4_tree[9][2] = { + { -DC_PRED, 1 }, // '0' + { -TM_VP8_PRED, 2 }, // '10' + { -VERT_PRED, 3 }, // '110' + { 4, 6 }, + { -HOR_PRED, 5 }, // '11100' + { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '111010', '111011' + { -DIAG_DOWN_LEFT_PRED, 7 }, // '11110' + { -VERT_LEFT_PRED, 8 }, // '111110' + { -HOR_DOWN_PRED, -HOR_UP_PRED }, // '1111110', '1111111' +}; + +static const int8_t vp8_pred8x8c_tree[3][2] = { + { -DC_PRED8x8, 1 }, // '0' + { -VERT_PRED8x8, 2 }, // '10 + { -HOR_PRED8x8, -PLANE_PRED8x8 }, // '110', '111' +}; + +static const uint8_t vp8_pred8x8c_prob_intra[3] = { + 142, 114, 183 +}; +static const uint8_t vp8_pred8x8c_prob_inter[3] = { + 162, 101, 204 +}; +static const uint8_t vp8_pred4x4_prob_inter[9] = { + 120, 90, 79, 133, 87, 85, 80, 111, 151 +}; + +static const uint8_t vp8_pred4x4_prob_intra[10][10][9] = { + { + { 39, 53, 200, 87, 26, 21, 43, 232, 171 }, + { 56, 34, 51, 104, 114, 102, 29, 93, 77 }, + { 88, 88, 147, 150, 42, 46, 45, 196, 205 }, + { 107, 54, 32, 26, 51, 1, 81, 43, 31 }, + { 39, 28, 85, 171, 58, 165, 90, 98, 64 }, + { 34, 22, 116, 206, 23, 34, 43, 166, 73 }, + { 34, 19, 21, 102, 132, 188, 16, 76, 124 }, + { 68, 25, 106, 22, 64, 171, 36, 225, 114 }, + { 62, 18, 78, 95, 85, 57, 50, 48, 51 }, + { 43, 97, 183, 117, 85, 38, 35, 179, 61 }, + }, + { + { 112, 113, 77, 85, 179, 255, 38, 120, 114 }, + { 40, 42, 1, 196, 245, 209, 10, 25, 109 }, + { 193, 101, 35, 159, 215, 111, 89, 46, 111 }, + { 100, 80, 8, 43, 154, 1, 51, 26, 71 }, + { 88, 43, 29, 140, 166, 213, 37, 43, 154 }, + { 61, 63, 30, 155, 67, 45, 68, 1, 209 }, + { 41, 40, 5, 102, 211, 183, 4, 1, 221 }, + { 142, 78, 78, 16, 255, 128, 34, 197, 171 }, + { 51, 50, 17, 168, 209, 192, 23, 25, 82 }, + { 60, 148, 31, 172, 219, 228, 21, 18, 111 }, + }, + { + { 175, 69, 143, 80, 85, 82, 72, 155, 103 }, + { 56, 58, 10, 171, 218, 189, 17, 13, 152 }, + { 231, 
120, 48, 89, 115, 113, 120, 152, 112 }, + { 144, 71, 10, 38, 171, 213, 144, 34, 26 }, + { 114, 26, 17, 163, 44, 195, 21, 10, 173 }, + { 121, 24, 80, 195, 26, 62, 44, 64, 85 }, + { 63, 20, 8, 114, 114, 208, 12, 9, 226 }, + { 170, 46, 55, 19, 136, 160, 33, 206, 71 }, + { 81, 40, 11, 96, 182, 84, 29, 16, 36 }, + { 152, 179, 64, 126, 170, 118, 46, 70, 95 }, + }, + { + { 75, 79, 123, 47, 51, 128, 81, 171, 1 }, + { 57, 17, 5, 71, 102, 57, 53, 41, 49 }, + { 125, 98, 42, 88, 104, 85, 117, 175, 82 }, + { 115, 21, 2, 10, 102, 255, 166, 23, 6 }, + { 38, 33, 13, 121, 57, 73, 26, 1, 85 }, + { 41, 10, 67, 138, 77, 110, 90, 47, 114 }, + { 57, 18, 10, 102, 102, 213, 34, 20, 43 }, + { 101, 29, 16, 10, 85, 128, 101, 196, 26 }, + { 117, 20, 15, 36, 163, 128, 68, 1, 26 }, + { 95, 84, 53, 89, 128, 100, 113, 101, 45 }, + }, + { + { 63, 59, 90, 180, 59, 166, 93, 73, 154 }, + { 40, 40, 21, 116, 143, 209, 34, 39, 175 }, + { 138, 31, 36, 171, 27, 166, 38, 44, 229 }, + { 57, 46, 22, 24, 128, 1, 54, 17, 37 }, + { 47, 15, 16, 183, 34, 223, 49, 45, 183 }, + { 46, 17, 33, 183, 6, 98, 15, 32, 183 }, + { 40, 3, 9, 115, 51, 192, 18, 6, 223 }, + { 65, 32, 73, 115, 28, 128, 23, 128, 205 }, + { 87, 37, 9, 115, 59, 77, 64, 21, 47 }, + { 67, 87, 58, 169, 82, 115, 26, 59, 179 }, + }, + { + { 54, 57, 112, 184, 5, 41, 38, 166, 213 }, + { 30, 34, 26, 133, 152, 116, 10, 32, 134 }, + { 104, 55, 44, 218, 9, 54, 53, 130, 226 }, + { 75, 32, 12, 51, 192, 255, 160, 43, 51 }, + { 39, 19, 53, 221, 26, 114, 32, 73, 255 }, + { 31, 9, 65, 234, 2, 15, 1, 118, 73 }, + { 56, 21, 23, 111, 59, 205, 45, 37, 192 }, + { 88, 31, 35, 67, 102, 85, 55, 186, 85 }, + { 55, 38, 70, 124, 73, 102, 1, 34, 98 }, + { 64, 90, 70, 205, 40, 41, 23, 26, 57 }, + }, + { + { 86, 40, 64, 135, 148, 224, 45, 183, 128 }, + { 22, 26, 17, 131, 240, 154, 14, 1, 209 }, + { 164, 50, 31, 137, 154, 133, 25, 35, 218 }, + { 83, 12, 13, 54, 192, 255, 68, 47, 28 }, + { 45, 16, 21, 91, 64, 222, 7, 1, 197 }, + { 56, 21, 39, 155, 60, 138, 23, 102, 213 }, + { 18, 
11, 7, 63, 144, 171, 4, 4, 246 }, + { 85, 26, 85, 85, 128, 128, 32, 146, 171 }, + { 35, 27, 10, 146, 174, 171, 12, 26, 128 }, + { 51, 103, 44, 131, 131, 123, 31, 6, 158 }, + }, + { + { 68, 45, 128, 34, 1, 47, 11, 245, 171 }, + { 62, 17, 19, 70, 146, 85, 55, 62, 70 }, + { 102, 61, 71, 37, 34, 53, 31, 243, 192 }, + { 75, 15, 9, 9, 64, 255, 184, 119, 16 }, + { 37, 43, 37, 154, 100, 163, 85, 160, 1 }, + { 63, 9, 92, 136, 28, 64, 32, 201, 85 }, + { 56, 8, 17, 132, 137, 255, 55, 116, 128 }, + { 86, 6, 28, 5, 64, 255, 25, 248, 1 }, + { 58, 15, 20, 82, 135, 57, 26, 121, 40 }, + { 69, 60, 71, 38, 73, 119, 28, 222, 37 }, + }, + { + { 101, 75, 128, 139, 118, 146, 116, 128, 85 }, + { 56, 41, 15, 176, 236, 85, 37, 9, 62 }, + { 190, 80, 35, 99, 180, 80, 126, 54, 45 }, + { 146, 36, 19, 30, 171, 255, 97, 27, 20 }, + { 71, 30, 17, 119, 118, 255, 17, 18, 138 }, + { 101, 38, 60, 138, 55, 70, 43, 26, 142 }, + { 32, 41, 20, 117, 151, 142, 20, 21, 163 }, + { 138, 45, 61, 62, 219, 1, 81, 188, 64 }, + { 112, 19, 12, 61, 195, 128, 48, 4, 24 }, + { 85, 126, 47, 87, 176, 51, 41, 20, 32 }, + }, + { + { 66, 102, 167, 99, 74, 62, 40, 234, 128 }, + { 41, 53, 9, 178, 241, 141, 26, 8, 107 }, + { 134, 183, 89, 137, 98, 101, 106, 165, 148 }, + { 104, 79, 12, 27, 217, 255, 87, 17, 7 }, + { 74, 43, 26, 146, 73, 166, 49, 23, 157 }, + { 65, 38, 105, 160, 51, 52, 31, 115, 128 }, + { 47, 41, 14, 110, 182, 183, 21, 17, 194 }, + { 87, 68, 71, 44, 114, 51, 15, 186, 23 }, + { 66, 45, 25, 102, 197, 189, 23, 18, 22 }, + { 72, 187, 100, 130, 157, 111, 32, 75, 80 }, + }, +}; + +static const uint8_t vp8_coeff_band[16] = { + 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7 +}; + +/* Inverse of vp8_coeff_band: mappings of bands to coefficient indexes. + * Each list is -1-terminated. 
*/ +static const int8_t vp8_coeff_band_indexes[8][10] = { + { 0, -1 }, + { 1, -1 }, + { 2, -1 }, + { 3, -1 }, + { 5, -1 }, + { 6, -1 }, + { 4, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, + { 15, -1 } +}; + +static const uint8_t vp8_dct_cat1_prob[] = { + 159, 0 +}; +static const uint8_t vp8_dct_cat2_prob[] = { + 165, 145, 0 +}; +static const uint8_t vp8_dct_cat3_prob[] = { + 173, 148, 140, 0 +}; +static const uint8_t vp8_dct_cat4_prob[] = { + 176, 155, 140, 135, 0 +}; +static const uint8_t vp8_dct_cat5_prob[] = { + 180, 157, 141, 134, 130, 0 +}; +static const uint8_t vp8_dct_cat6_prob[] = { + 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 +}; + +// only used for cat3 and above; cat 1 and 2 are referenced directly +const uint8_t *const ff_vp8_dct_cat_prob[] = { + vp8_dct_cat3_prob, + vp8_dct_cat4_prob, + vp8_dct_cat5_prob, + vp8_dct_cat6_prob, +}; + +static const uint8_t vp8_token_default_probs[4][8][3][NUM_DCT_TOKENS - 1] = { + { + { + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, + }, + { + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, + }, + { + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, + }, + { + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, + }, + { + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 
128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, + }, + { + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, + }, + { + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + }, + { + { + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, + }, + { + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, + }, + { + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }, + }, + { + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, + }, + { + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, + }, + { + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, + }, + { + { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, + }, + { + { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, + }, + }, + { + { + { 253, 9, 248, 
251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, + }, + { + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, + }, + { + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, + }, + { + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + }, + { + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + }, + { + { + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, + }, + { + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, + }, + { + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 
71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, + }, + { + { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, + }, + { + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, + }, + { + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, + }, + { + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, + }, + { + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + }, +}; + +static const uint8_t vp8_token_update_probs[4][8][3][NUM_DCT_TOKENS - 1] = { + { + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 }, + { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, + { + { + { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 }, + { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }, + }, + { + { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, + { + { + { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }, + }, + { + { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, + { + { + { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 }, + { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 246, 253, 253, 
255, 255, 255, 255, 255, 255, 255, 255 }, + { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + { + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }, + }, + }, +}; + +static const uint8_t vp8_dc_qlookup[VP8_MAX_QUANT + 1] = { + 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, + 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157, +}; + +static const uint16_t vp8_ac_qlookup[VP8_MAX_QUANT + 1] = { + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152, + 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, + 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284, +}; + +static const uint8_t vp8_mv_update_prob[2][19] = { + { 237, + 246, + 253, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 250, 250, 252, /* VP8 only: */ 254, 254 }, + { 231, + 243, + 245, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 251, 251, 254, /* VP8 only: */ 254, 254 } +}; + +static const uint8_t vp7_mv_default_prob[2][17] = { + { 162, + 128, + 225, 146, 172, 147, 214, 39, 156, + 247, 210, 135, 68, 138, 220, 239, 246 }, + { 164, + 128, + 204, 170, 119, 235, 140, 230, 228, + 244, 184, 201, 44, 173, 221, 239, 253 } +}; + +static const uint8_t vp8_mv_default_prob[2][19] = { + { 162, + 128, + 225, 146, 172, 147, 214, 39, 156, + 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 }, + { 164, + 128, + 204, 170, 119, 235, 140, 230, 228, + 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 } +}; + +static const uint8_t vp7_feature_value_size[2][4] = { + { 7, 6, 0, 8 }, + { 7, 6, 0, 5 }, +}; + +static const int8_t vp7_feature_index_tree[4][2] = +{ + { 1, 2 }, + { -0, -1 }, // '00', '01' + { -2, -3 }, // '10', '11' +}; + +static const uint16_t vp7_ydc_qlookup[] = { + 4, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, 39, 39, 40, 41, + 41, 42, 43, 43, 44, 45, 45, 46, 47, 48, 48, 49, 50, 51, 52, + 53, 53, 54, 56, 57, 58, 59, 60, 62, 63, 65, 66, 68, 70, 72, + 74, 76, 79, 81, 84, 87, 90, 93, 96, 100, 104, 108, 112, 116, 121, + 126, 
131, 136, 142, 148, 154, 160, 167, 174, 182, 189, 198, 206, 215, 224, + 234, 244, 254, 265, 277, 288, 301, 313, 327, 340, 355, 370, 385, 401, 417, + 434, 452, 470, 489, 509, 529, 550, 572, +}; + +static const uint16_t vp7_yac_qlookup[] = { + 4, 4, 5, 5, 6, 6, 7, 8, 9, 10, 11, 12, 13, 15, + 16, 17, 19, 20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, + 37, 38, 40, 41, 42, 44, 45, 46, 48, 49, 50, 51, 53, 54, + 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 67, 68, 69, 70, + 72, 73, 75, 76, 78, 80, 82, 84, 86, 88, 91, 93, 96, 99, + 102, 105, 109, 112, 116, 121, 125, 130, 135, 140, 146, 152, 158, 165, + 172, 180, 188, 196, 205, 214, 224, 234, 245, 256, 268, 281, 294, 308, + 322, 337, 353, 369, 386, 404, 423, 443, 463, 484, 506, 529, 553, 578, + 604, 631, 659, 688, 718, 749, 781, 814, 849, 885, 922, 960, 1000, 1041, + 1083, 1127, +}; + +static const uint16_t vp7_y2dc_qlookup[] = { + 7, 9, 11, 13, 15, 17, 19, 21, 23, 26, 28, 30, 33, 35, + 37, 39, 42, 44, 46, 48, 51, 53, 55, 57, 59, 61, 63, 65, + 67, 69, 70, 72, 74, 75, 77, 78, 80, 81, 83, 84, 85, 87, + 88, 89, 90, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 104, + 105, 106, 108, 109, 111, 113, 114, 116, 118, 120, 123, 125, 128, 131, + 134, 137, 140, 144, 148, 152, 156, 161, 166, 171, 176, 182, 188, 195, + 202, 209, 217, 225, 234, 243, 253, 263, 274, 285, 297, 309, 322, 336, + 350, 365, 381, 397, 414, 432, 450, 470, 490, 511, 533, 556, 579, 604, + 630, 656, 684, 713, 742, 773, 805, 838, 873, 908, 945, 983, 1022, 1063, + 1105, 1148, +}; + +static const uint16_t vp7_y2ac_qlookup[] = { + 7, 9, 11, 13, 16, 18, 21, 24, 26, 29, 32, 35, + 38, 41, 43, 46, 49, 52, 55, 58, 61, 64, 66, 69, + 72, 74, 77, 79, 82, 84, 86, 88, 91, 93, 95, 97, + 98, 100, 102, 104, 105, 107, 109, 110, 112, 113, 115, 116, + 117, 119, 120, 122, 123, 125, 127, 128, 130, 132, 134, 136, + 138, 141, 143, 146, 149, 152, 155, 158, 162, 166, 171, 175, + 180, 185, 191, 197, 204, 210, 218, 226, 234, 243, 252, 262, + 273, 284, 295, 308, 321, 335, 350, 365, 381, 398, 416, 435, + 
455, 476, 497, 520, 544, 569, 595, 622, 650, 680, 711, 743, + 776, 811, 848, 885, 925, 965, 1008, 1052, 1097, 1144, 1193, 1244, + 1297, 1351, 1407, 1466, 1526, 1588, 1652, 1719, +}; + +#endif /* AVCODEC_VP8DATA_H */ diff --git a/media/ffvpx/libavcodec/vp8dsp.c b/media/ffvpx/libavcodec/vp8dsp.c new file mode 100644 index 0000000000..7a85e9f4ca --- /dev/null +++ b/media/ffvpx/libavcodec/vp8dsp.c @@ -0,0 +1,753 @@ +/* + * Copyright (C) 2010 David Conrad + * Copyright (C) 2010 Ronald S. Bultje + * Copyright (C) 2014 Peter Ross + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * VP8 compatible video decoder + */ + +#include "config_components.h" + +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" + +#include "mathops.h" +#include "vp8dsp.h" + +#define MK_IDCT_DC_ADD4_C(name) \ +static void name ## _idct_dc_add4uv_c(uint8_t *dst, int16_t block[4][16], \ + ptrdiff_t stride) \ +{ \ + name ## _idct_dc_add_c(dst + stride * 0 + 0, block[0], stride); \ + name ## _idct_dc_add_c(dst + stride * 0 + 4, block[1], stride); \ + name ## _idct_dc_add_c(dst + stride * 4 + 0, block[2], stride); \ + name ## _idct_dc_add_c(dst + stride * 4 + 4, block[3], stride); \ +} \ + \ +static void name ## _idct_dc_add4y_c(uint8_t *dst, int16_t block[4][16], \ + ptrdiff_t stride) \ +{ \ + name ## _idct_dc_add_c(dst + 0, block[0], stride); \ + name ## _idct_dc_add_c(dst + 4, block[1], stride); \ + name ## _idct_dc_add_c(dst + 8, block[2], stride); \ + name ## _idct_dc_add_c(dst + 12, block[3], stride); \ +} + +#if CONFIG_VP7_DECODER +static void vp7_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16]) +{ + int i; + unsigned a1, b1, c1, d1; + int16_t tmp[16]; + + for (i = 0; i < 4; i++) { + a1 = (dc[i * 4 + 0] + dc[i * 4 + 2]) * 23170; + b1 = (dc[i * 4 + 0] - dc[i * 4 + 2]) * 23170; + c1 = dc[i * 4 + 1] * 12540 - dc[i * 4 + 3] * 30274; + d1 = dc[i * 4 + 1] * 30274 + dc[i * 4 + 3] * 12540; + tmp[i * 4 + 0] = (int)(a1 + d1) >> 14; + tmp[i * 4 + 3] = (int)(a1 - d1) >> 14; + tmp[i * 4 + 1] = (int)(b1 + c1) >> 14; + tmp[i * 4 + 2] = (int)(b1 - c1) >> 14; + } + + for (i = 0; i < 4; i++) { + a1 = (tmp[i + 0] + tmp[i + 8]) * 23170; + b1 = (tmp[i + 0] - tmp[i + 8]) * 23170; + c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274; + d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540; + AV_ZERO64(dc + i * 4); + block[0][i][0] = 
(int)(a1 + d1 + 0x20000) >> 18; + block[3][i][0] = (int)(a1 - d1 + 0x20000) >> 18; + block[1][i][0] = (int)(b1 + c1 + 0x20000) >> 18; + block[2][i][0] = (int)(b1 - c1 + 0x20000) >> 18; + } +} + +static void vp7_luma_dc_wht_dc_c(int16_t block[4][4][16], int16_t dc[16]) +{ + int i, val = (23170 * (23170 * dc[0] >> 14) + 0x20000) >> 18; + dc[0] = 0; + + for (i = 0; i < 4; i++) { + block[i][0][0] = val; + block[i][1][0] = val; + block[i][2][0] = val; + block[i][3][0] = val; + } +} + +static void vp7_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride) +{ + int i; + unsigned a1, b1, c1, d1; + int16_t tmp[16]; + + for (i = 0; i < 4; i++) { + a1 = (block[i * 4 + 0] + block[i * 4 + 2]) * 23170; + b1 = (block[i * 4 + 0] - block[i * 4 + 2]) * 23170; + c1 = block[i * 4 + 1] * 12540 - block[i * 4 + 3] * 30274; + d1 = block[i * 4 + 1] * 30274 + block[i * 4 + 3] * 12540; + AV_ZERO64(block + i * 4); + tmp[i * 4 + 0] = (int)(a1 + d1) >> 14; + tmp[i * 4 + 3] = (int)(a1 - d1) >> 14; + tmp[i * 4 + 1] = (int)(b1 + c1) >> 14; + tmp[i * 4 + 2] = (int)(b1 - c1) >> 14; + } + + for (i = 0; i < 4; i++) { + a1 = (tmp[i + 0] + tmp[i + 8]) * 23170; + b1 = (tmp[i + 0] - tmp[i + 8]) * 23170; + c1 = tmp[i + 4] * 12540 - tmp[i + 12] * 30274; + d1 = tmp[i + 4] * 30274 + tmp[i + 12] * 12540; + dst[0 * stride + i] = av_clip_uint8(dst[0 * stride + i] + + ((int)(a1 + d1 + 0x20000) >> 18)); + dst[3 * stride + i] = av_clip_uint8(dst[3 * stride + i] + + ((int)(a1 - d1 + 0x20000) >> 18)); + dst[1 * stride + i] = av_clip_uint8(dst[1 * stride + i] + + ((int)(b1 + c1 + 0x20000) >> 18)); + dst[2 * stride + i] = av_clip_uint8(dst[2 * stride + i] + + ((int)(b1 - c1 + 0x20000) >> 18)); + } +} + +static void vp7_idct_dc_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride) +{ + int i, dc = (23170 * (23170 * block[0] >> 14) + 0x20000) >> 18; + block[0] = 0; + + for (i = 0; i < 4; i++) { + dst[0] = av_clip_uint8(dst[0] + dc); + dst[1] = av_clip_uint8(dst[1] + dc); + dst[2] = av_clip_uint8(dst[2] + dc); + 
dst[3] = av_clip_uint8(dst[3] + dc); + dst += stride; + } +} + +MK_IDCT_DC_ADD4_C(vp7) +#endif /* CONFIG_VP7_DECODER */ + +// TODO: Maybe add dequant +#if CONFIG_VP8_DECODER +static void vp8_luma_dc_wht_c(int16_t block[4][4][16], int16_t dc[16]) +{ + int i, t0, t1, t2, t3; + + for (i = 0; i < 4; i++) { + t0 = dc[0 * 4 + i] + dc[3 * 4 + i]; + t1 = dc[1 * 4 + i] + dc[2 * 4 + i]; + t2 = dc[1 * 4 + i] - dc[2 * 4 + i]; + t3 = dc[0 * 4 + i] - dc[3 * 4 + i]; + + dc[0 * 4 + i] = t0 + t1; + dc[1 * 4 + i] = t3 + t2; + dc[2 * 4 + i] = t0 - t1; + dc[3 * 4 + i] = t3 - t2; + } + + for (i = 0; i < 4; i++) { + t0 = dc[i * 4 + 0] + dc[i * 4 + 3] + 3; // rounding + t1 = dc[i * 4 + 1] + dc[i * 4 + 2]; + t2 = dc[i * 4 + 1] - dc[i * 4 + 2]; + t3 = dc[i * 4 + 0] - dc[i * 4 + 3] + 3; // rounding + AV_ZERO64(dc + i * 4); + + block[i][0][0] = (t0 + t1) >> 3; + block[i][1][0] = (t3 + t2) >> 3; + block[i][2][0] = (t0 - t1) >> 3; + block[i][3][0] = (t3 - t2) >> 3; + } +} + +static void vp8_luma_dc_wht_dc_c(int16_t block[4][4][16], int16_t dc[16]) +{ + int i, val = (dc[0] + 3) >> 3; + dc[0] = 0; + + for (i = 0; i < 4; i++) { + block[i][0][0] = val; + block[i][1][0] = val; + block[i][2][0] = val; + block[i][3][0] = val; + } +} + +#define MUL_20091(a) ((((a) * 20091) >> 16) + (a)) +#define MUL_35468(a) (((a) * 35468) >> 16) + +static void vp8_idct_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride) +{ + int i, t0, t1, t2, t3; + int16_t tmp[16]; + + for (i = 0; i < 4; i++) { + t0 = block[0 * 4 + i] + block[2 * 4 + i]; + t1 = block[0 * 4 + i] - block[2 * 4 + i]; + t2 = MUL_35468(block[1 * 4 + i]) - MUL_20091(block[3 * 4 + i]); + t3 = MUL_20091(block[1 * 4 + i]) + MUL_35468(block[3 * 4 + i]); + block[0 * 4 + i] = 0; + block[1 * 4 + i] = 0; + block[2 * 4 + i] = 0; + block[3 * 4 + i] = 0; + + tmp[i * 4 + 0] = t0 + t3; + tmp[i * 4 + 1] = t1 + t2; + tmp[i * 4 + 2] = t1 - t2; + tmp[i * 4 + 3] = t0 - t3; + } + + for (i = 0; i < 4; i++) { + t0 = tmp[0 * 4 + i] + tmp[2 * 4 + i]; + t1 = tmp[0 * 4 + i] 
- tmp[2 * 4 + i]; + t2 = MUL_35468(tmp[1 * 4 + i]) - MUL_20091(tmp[3 * 4 + i]); + t3 = MUL_20091(tmp[1 * 4 + i]) + MUL_35468(tmp[3 * 4 + i]); + + dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3)); + dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3)); + dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3)); + dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3)); + dst += stride; + } +} + +static void vp8_idct_dc_add_c(uint8_t *dst, int16_t block[16], ptrdiff_t stride) +{ + int i, dc = (block[0] + 4) >> 3; + block[0] = 0; + + for (i = 0; i < 4; i++) { + dst[0] = av_clip_uint8(dst[0] + dc); + dst[1] = av_clip_uint8(dst[1] + dc); + dst[2] = av_clip_uint8(dst[2] + dc); + dst[3] = av_clip_uint8(dst[3] + dc); + dst += stride; + } +} + +MK_IDCT_DC_ADD4_C(vp8) +#endif /* CONFIG_VP8_DECODER */ + +// because I like only having two parameters to pass functions... +#define LOAD_PIXELS \ + int av_unused p3 = p[-4 * stride]; \ + int av_unused p2 = p[-3 * stride]; \ + int av_unused p1 = p[-2 * stride]; \ + int av_unused p0 = p[-1 * stride]; \ + int av_unused q0 = p[ 0 * stride]; \ + int av_unused q1 = p[ 1 * stride]; \ + int av_unused q2 = p[ 2 * stride]; \ + int av_unused q3 = p[ 3 * stride]; + +#define clip_int8(n) (cm[(n) + 0x80] - 0x80) + +static av_always_inline void filter_common(uint8_t *p, ptrdiff_t stride, + int is4tap, int is_vp7) +{ + LOAD_PIXELS + int a, f1, f2; + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; + + a = 3 * (q0 - p0); + + if (is4tap) + a += clip_int8(p1 - q1); + + a = clip_int8(a); + + // We deviate from the spec here with c(a+3) >> 3 + // since that's what libvpx does. + f1 = FFMIN(a + 4, 127) >> 3; + + if (is_vp7) + f2 = f1 - ((a & 7) == 4); + else + f2 = FFMIN(a + 3, 127) >> 3; + + // Despite what the spec says, we do need to clamp here to + // be bitexact with libvpx. 
+ p[-1 * stride] = cm[p0 + f2]; + p[ 0 * stride] = cm[q0 - f1]; + + // only used for _inner on blocks without high edge variance + if (!is4tap) { + a = (f1 + 1) >> 1; + p[-2 * stride] = cm[p1 + a]; + p[ 1 * stride] = cm[q1 - a]; + } +} + +static av_always_inline void vp7_filter_common(uint8_t *p, ptrdiff_t stride, + int is4tap) +{ + filter_common(p, stride, is4tap, IS_VP7); +} + +static av_always_inline void vp8_filter_common(uint8_t *p, ptrdiff_t stride, + int is4tap) +{ + filter_common(p, stride, is4tap, IS_VP8); +} + +static av_always_inline int vp7_simple_limit(uint8_t *p, ptrdiff_t stride, + int flim) +{ + LOAD_PIXELS + return FFABS(p0 - q0) <= flim; +} + +static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride, + int flim) +{ + LOAD_PIXELS + return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim; +} + +/** + * E - limit at the macroblock edge + * I - limit for interior difference + */ +#define NORMAL_LIMIT(vpn) \ +static av_always_inline int vp ## vpn ## _normal_limit(uint8_t *p, \ + ptrdiff_t stride, \ + int E, int I) \ +{ \ + LOAD_PIXELS \ + return vp ## vpn ## _simple_limit(p, stride, E) && \ + FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I && \ + FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I && \ + FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I; \ +} + +NORMAL_LIMIT(7) +NORMAL_LIMIT(8) + +// high edge variance +static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh) +{ + LOAD_PIXELS + return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh; +} + +static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride) +{ + int a0, a1, a2, w; + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; + + LOAD_PIXELS + + w = clip_int8(p1 - q1); + w = clip_int8(w + 3 * (q0 - p0)); + + a0 = (27 * w + 63) >> 7; + a1 = (18 * w + 63) >> 7; + a2 = (9 * w + 63) >> 7; + + p[-3 * stride] = cm[p2 + a2]; + p[-2 * stride] = cm[p1 + a1]; + p[-1 * stride] = cm[p0 + a0]; + p[ 0 * stride] = cm[q0 - a0]; + p[ 1 * stride] = cm[q1 - a1]; + p[ 2 * stride] 
= cm[q2 - a2]; +} + +#define LOOP_FILTER(vpn, dir, size, stridea, strideb, maybe_inline) \ +static maybe_inline \ +void vpn ## _ ## dir ## _loop_filter ## size ## _c(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) \ +{ \ + int i; \ + for (i = 0; i < size; i++) \ + if (vpn ## _normal_limit(dst + i * stridea, strideb, \ + flim_E, flim_I)) { \ + if (hev(dst + i * stridea, strideb, hev_thresh)) \ + vpn ## _filter_common(dst + i * stridea, strideb, 1); \ + else \ + filter_mbedge(dst + i * stridea, strideb); \ + } \ +} \ + \ +static maybe_inline \ +void vpn ## _ ## dir ## _loop_filter ## size ## _inner_c(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim_E, \ + int flim_I, \ + int hev_thresh) \ +{ \ + int i; \ + for (i = 0; i < size; i++) \ + if (vpn ## _normal_limit(dst + i * stridea, strideb, \ + flim_E, flim_I)) { \ + int hv = hev(dst + i * stridea, strideb, hev_thresh); \ + if (hv) \ + vpn ## _filter_common(dst + i * stridea, strideb, 1); \ + else \ + vpn ## _filter_common(dst + i * stridea, strideb, 0); \ + } \ +} + +#define UV_LOOP_FILTER(vpn, dir, stridea, strideb) \ +LOOP_FILTER(vpn, dir, 8, stridea, strideb, av_always_inline) \ +static void vpn ## _ ## dir ## _loop_filter8uv_c(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t stride, int fE, \ + int fI, int hev_thresh) \ +{ \ + vpn ## _ ## dir ## _loop_filter8_c(dstU, stride, fE, fI, hev_thresh); \ + vpn ## _ ## dir ## _loop_filter8_c(dstV, stride, fE, fI, hev_thresh); \ +} \ + \ +static void vpn ## _ ## dir ## _loop_filter8uv_inner_c(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t stride, \ + int fE, int fI, \ + int hev_thresh) \ +{ \ + vpn ## _ ## dir ## _loop_filter8_inner_c(dstU, stride, fE, fI, \ + hev_thresh); \ + vpn ## _ ## dir ## _loop_filter8_inner_c(dstV, stride, fE, fI, \ + hev_thresh); \ +} + +#define LOOP_FILTER_SIMPLE(vpn) \ +static void vpn ## _v_loop_filter_simple_c(uint8_t *dst, ptrdiff_t stride, \ + int flim) \ +{ \ + int i; \ + for (i = 0; i < 16; i++) \ + if 
(vpn ## _simple_limit(dst + i, stride, flim)) \ + vpn ## _filter_common(dst + i, stride, 1); \ +} \ + \ +static void vpn ## _h_loop_filter_simple_c(uint8_t *dst, ptrdiff_t stride, \ + int flim) \ +{ \ + int i; \ + for (i = 0; i < 16; i++) \ + if (vpn ## _simple_limit(dst + i * stride, 1, flim)) \ + vpn ## _filter_common(dst + i * stride, 1, 1); \ +} + +#define LOOP_FILTERS(vpn) \ + LOOP_FILTER(vpn, v, 16, 1, stride, ) \ + LOOP_FILTER(vpn, h, 16, stride, 1, ) \ + UV_LOOP_FILTER(vpn, v, 1, stride) \ + UV_LOOP_FILTER(vpn, h, stride, 1) \ + LOOP_FILTER_SIMPLE(vpn) \ + +static const uint8_t subpel_filters[7][6] = { + { 0, 6, 123, 12, 1, 0 }, + { 2, 11, 108, 36, 8, 1 }, + { 0, 9, 93, 50, 6, 0 }, + { 3, 16, 77, 77, 16, 3 }, + { 0, 6, 50, 93, 9, 0 }, + { 1, 8, 36, 108, 11, 2 }, + { 0, 1, 12, 123, 6, 0 }, +}; + +#define PUT_PIXELS(WIDTH) \ +static void put_vp8_pixels ## WIDTH ## _c(uint8_t *dst, ptrdiff_t dststride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int h, int x, int y) \ +{ \ + int i; \ + for (i = 0; i < h; i++, dst += dststride, src += srcstride) \ + memcpy(dst, src, WIDTH); \ +} + +PUT_PIXELS(16) +PUT_PIXELS(8) +PUT_PIXELS(4) + +#define FILTER_6TAP(src, F, stride) \ + cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \ + F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \ + F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7] + +#define FILTER_4TAP(src, F, stride) \ + cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \ + F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7] + +#define VP8_EPEL_H(SIZE, TAPS) \ +static void put_vp8_epel ## SIZE ## _h ## TAPS ## _c(uint8_t *dst, \ + ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const uint8_t *filter = subpel_filters[mx - 1]; \ + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \ + int x, y; \ + for (y = 0; y < h; y++) { \ + for (x = 0; x < SIZE; x++) \ + dst[x] = FILTER_ ## TAPS ## TAP(src, 
filter, 1); \ + dst += dststride; \ + src += srcstride; \ + } \ +} + +#define VP8_EPEL_V(SIZE, TAPS) \ +static void put_vp8_epel ## SIZE ## _v ## TAPS ## _c(uint8_t *dst, \ + ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, int my) \ +{ \ + const uint8_t *filter = subpel_filters[my - 1]; \ + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \ + int x, y; \ + for (y = 0; y < h; y++) { \ + for (x = 0; x < SIZE; x++) \ + dst[x] = FILTER_ ## TAPS ## TAP(src, filter, srcstride); \ + dst += dststride; \ + src += srcstride; \ + } \ +} + +#define VP8_EPEL_HV(SIZE, HTAPS, VTAPS) \ +static void \ +put_vp8_epel ## SIZE ## _h ## HTAPS ## v ## VTAPS ## _c(uint8_t *dst, \ + ptrdiff_t dststride, \ + const uint8_t *src, \ + ptrdiff_t srcstride, \ + int h, int mx, \ + int my) \ +{ \ + const uint8_t *filter = subpel_filters[mx - 1]; \ + const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \ + int x, y; \ + uint8_t tmp_array[(2 * SIZE + VTAPS - 1) * SIZE]; \ + uint8_t *tmp = tmp_array; \ + src -= (2 - (VTAPS == 4)) * srcstride; \ + \ + for (y = 0; y < h + VTAPS - 1; y++) { \ + for (x = 0; x < SIZE; x++) \ + tmp[x] = FILTER_ ## HTAPS ## TAP(src, filter, 1); \ + tmp += SIZE; \ + src += srcstride; \ + } \ + tmp = tmp_array + (2 - (VTAPS == 4)) * SIZE; \ + filter = subpel_filters[my - 1]; \ + \ + for (y = 0; y < h; y++) { \ + for (x = 0; x < SIZE; x++) \ + dst[x] = FILTER_ ## VTAPS ## TAP(tmp, filter, SIZE); \ + dst += dststride; \ + tmp += SIZE; \ + } \ +} + +VP8_EPEL_H(16, 4) +VP8_EPEL_H(8, 4) +VP8_EPEL_H(4, 4) +VP8_EPEL_H(16, 6) +VP8_EPEL_H(8, 6) +VP8_EPEL_H(4, 6) +VP8_EPEL_V(16, 4) +VP8_EPEL_V(8, 4) +VP8_EPEL_V(4, 4) +VP8_EPEL_V(16, 6) +VP8_EPEL_V(8, 6) +VP8_EPEL_V(4, 6) + +VP8_EPEL_HV(16, 4, 4) +VP8_EPEL_HV(8, 4, 4) +VP8_EPEL_HV(4, 4, 4) +VP8_EPEL_HV(16, 4, 6) +VP8_EPEL_HV(8, 4, 6) +VP8_EPEL_HV(4, 4, 6) +VP8_EPEL_HV(16, 6, 4) +VP8_EPEL_HV(8, 6, 4) +VP8_EPEL_HV(4, 6, 4) +VP8_EPEL_HV(16, 6, 6) +VP8_EPEL_HV(8, 6, 6) +VP8_EPEL_HV(4, 6, 6) + +#define 
VP8_BILINEAR(SIZE) \ +static void put_vp8_bilinear ## SIZE ## _h_c(uint8_t *dst, ptrdiff_t dstride, \ + const uint8_t *src, ptrdiff_t sstride, \ + int h, int mx, int my) \ +{ \ + int a = 8 - mx, b = mx; \ + int x, y; \ + for (y = 0; y < h; y++) { \ + for (x = 0; x < SIZE; x++) \ + dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3; \ + dst += dstride; \ + src += sstride; \ + } \ +} \ + \ +static void put_vp8_bilinear ## SIZE ## _v_c(uint8_t *dst, ptrdiff_t dstride, \ + const uint8_t *src, ptrdiff_t sstride, \ + int h, int mx, int my) \ +{ \ + int c = 8 - my, d = my; \ + int x, y; \ + for (y = 0; y < h; y++) { \ + for (x = 0; x < SIZE; x++) \ + dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3; \ + dst += dstride; \ + src += sstride; \ + } \ +} \ + \ +static void put_vp8_bilinear ## SIZE ## _hv_c(uint8_t *dst, \ + ptrdiff_t dstride, \ + const uint8_t *src, \ + ptrdiff_t sstride, \ + int h, int mx, int my) \ +{ \ + int a = 8 - mx, b = mx; \ + int c = 8 - my, d = my; \ + int x, y; \ + uint8_t tmp_array[(2 * SIZE + 1) * SIZE]; \ + uint8_t *tmp = tmp_array; \ + for (y = 0; y < h + 1; y++) { \ + for (x = 0; x < SIZE; x++) \ + tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3; \ + tmp += SIZE; \ + src += sstride; \ + } \ + tmp = tmp_array; \ + for (y = 0; y < h; y++) { \ + for (x = 0; x < SIZE; x++) \ + dst[x] = (c * tmp[x] + d * tmp[x + SIZE] + 4) >> 3; \ + dst += dstride; \ + tmp += SIZE; \ + } \ +} + +VP8_BILINEAR(16) +VP8_BILINEAR(8) +VP8_BILINEAR(4) + +#define VP78_MC_FUNC(IDX, SIZE) \ + dsp->put_vp8_epel_pixels_tab[IDX][0][0] = put_vp8_pixels ## SIZE ## _c; \ + dsp->put_vp8_epel_pixels_tab[IDX][0][1] = put_vp8_epel ## SIZE ## _h4_c; \ + dsp->put_vp8_epel_pixels_tab[IDX][0][2] = put_vp8_epel ## SIZE ## _h6_c; \ + dsp->put_vp8_epel_pixels_tab[IDX][1][0] = put_vp8_epel ## SIZE ## _v4_c; \ + dsp->put_vp8_epel_pixels_tab[IDX][1][1] = put_vp8_epel ## SIZE ## _h4v4_c; \ + dsp->put_vp8_epel_pixels_tab[IDX][1][2] = put_vp8_epel ## SIZE ## _h6v4_c; \ + 
dsp->put_vp8_epel_pixels_tab[IDX][2][0] = put_vp8_epel ## SIZE ## _v6_c; \ + dsp->put_vp8_epel_pixels_tab[IDX][2][1] = put_vp8_epel ## SIZE ## _h4v6_c; \ + dsp->put_vp8_epel_pixels_tab[IDX][2][2] = put_vp8_epel ## SIZE ## _h6v6_c + +#define VP78_BILINEAR_MC_FUNC(IDX, SIZE) \ + dsp->put_vp8_bilinear_pixels_tab[IDX][0][0] = put_vp8_pixels ## SIZE ## _c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][0][1] = put_vp8_bilinear ## SIZE ## _h_c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][0][2] = put_vp8_bilinear ## SIZE ## _h_c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][1][0] = put_vp8_bilinear ## SIZE ## _v_c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][1][1] = put_vp8_bilinear ## SIZE ## _hv_c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][1][2] = put_vp8_bilinear ## SIZE ## _hv_c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][2][0] = put_vp8_bilinear ## SIZE ## _v_c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][2][1] = put_vp8_bilinear ## SIZE ## _hv_c; \ + dsp->put_vp8_bilinear_pixels_tab[IDX][2][2] = put_vp8_bilinear ## SIZE ## _hv_c + +av_cold void ff_vp78dsp_init(VP8DSPContext *dsp) +{ + VP78_MC_FUNC(0, 16); + VP78_MC_FUNC(1, 8); + VP78_MC_FUNC(2, 4); + + VP78_BILINEAR_MC_FUNC(0, 16); + VP78_BILINEAR_MC_FUNC(1, 8); + VP78_BILINEAR_MC_FUNC(2, 4); + +#if ARCH_AARCH64 + ff_vp78dsp_init_aarch64(dsp); +#elif ARCH_ARM + ff_vp78dsp_init_arm(dsp); +#elif ARCH_PPC + ff_vp78dsp_init_ppc(dsp); +#elif ARCH_X86 + ff_vp78dsp_init_x86(dsp); +#endif +} + +#if CONFIG_VP7_DECODER +LOOP_FILTERS(vp7) + +av_cold void ff_vp7dsp_init(VP8DSPContext *dsp) +{ + dsp->vp8_luma_dc_wht = vp7_luma_dc_wht_c; + dsp->vp8_luma_dc_wht_dc = vp7_luma_dc_wht_dc_c; + dsp->vp8_idct_add = vp7_idct_add_c; + dsp->vp8_idct_dc_add = vp7_idct_dc_add_c; + dsp->vp8_idct_dc_add4y = vp7_idct_dc_add4y_c; + dsp->vp8_idct_dc_add4uv = vp7_idct_dc_add4uv_c; + + dsp->vp8_v_loop_filter16y = vp7_v_loop_filter16_c; + dsp->vp8_h_loop_filter16y = vp7_h_loop_filter16_c; + dsp->vp8_v_loop_filter8uv = vp7_v_loop_filter8uv_c; + 
dsp->vp8_h_loop_filter8uv = vp7_h_loop_filter8uv_c; + + dsp->vp8_v_loop_filter16y_inner = vp7_v_loop_filter16_inner_c; + dsp->vp8_h_loop_filter16y_inner = vp7_h_loop_filter16_inner_c; + dsp->vp8_v_loop_filter8uv_inner = vp7_v_loop_filter8uv_inner_c; + dsp->vp8_h_loop_filter8uv_inner = vp7_h_loop_filter8uv_inner_c; + + dsp->vp8_v_loop_filter_simple = vp7_v_loop_filter_simple_c; + dsp->vp8_h_loop_filter_simple = vp7_h_loop_filter_simple_c; +} +#endif /* CONFIG_VP7_DECODER */ + +#if CONFIG_VP8_DECODER +LOOP_FILTERS(vp8) + +av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) +{ + dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c; + dsp->vp8_luma_dc_wht_dc = vp8_luma_dc_wht_dc_c; + dsp->vp8_idct_add = vp8_idct_add_c; + dsp->vp8_idct_dc_add = vp8_idct_dc_add_c; + dsp->vp8_idct_dc_add4y = vp8_idct_dc_add4y_c; + dsp->vp8_idct_dc_add4uv = vp8_idct_dc_add4uv_c; + + dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c; + dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c; + dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_c; + dsp->vp8_h_loop_filter8uv = vp8_h_loop_filter8uv_c; + + dsp->vp8_v_loop_filter16y_inner = vp8_v_loop_filter16_inner_c; + dsp->vp8_h_loop_filter16y_inner = vp8_h_loop_filter16_inner_c; + dsp->vp8_v_loop_filter8uv_inner = vp8_v_loop_filter8uv_inner_c; + dsp->vp8_h_loop_filter8uv_inner = vp8_h_loop_filter8uv_inner_c; + + dsp->vp8_v_loop_filter_simple = vp8_v_loop_filter_simple_c; + dsp->vp8_h_loop_filter_simple = vp8_h_loop_filter_simple_c; + +#if ARCH_AARCH64 + ff_vp8dsp_init_aarch64(dsp); +#elif ARCH_ARM + ff_vp8dsp_init_arm(dsp); +#elif ARCH_X86 + ff_vp8dsp_init_x86(dsp); +#elif ARCH_MIPS + ff_vp8dsp_init_mips(dsp); +#elif ARCH_LOONGARCH + ff_vp8dsp_init_loongarch(dsp); +#endif +} +#endif /* CONFIG_VP8_DECODER */ diff --git a/media/ffvpx/libavcodec/vp8dsp.h b/media/ffvpx/libavcodec/vp8dsp.h new file mode 100644 index 0000000000..16b5e9c35b --- /dev/null +++ b/media/ffvpx/libavcodec/vp8dsp.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2010 David Conrad + * Copyright (C) 2010 
Ronald S. Bultje + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * VP8 compatible video decoder + */ + +#ifndef AVCODEC_VP8DSP_H +#define AVCODEC_VP8DSP_H + +#include <stddef.h> +#include <stdint.h> + +typedef void (*vp8_mc_func)(uint8_t *dst /* align 8 */, ptrdiff_t dstStride, + const uint8_t *src /* align 1 */, ptrdiff_t srcStride, + int h, int x, int y); + +typedef struct VP8DSPContext { + void (*vp8_luma_dc_wht)(int16_t block[4][4][16], int16_t dc[16]); + void (*vp8_luma_dc_wht_dc)(int16_t block[4][4][16], int16_t dc[16]); + void (*vp8_idct_add)(uint8_t *dst, int16_t block[16], ptrdiff_t stride); + void (*vp8_idct_dc_add)(uint8_t *dst, int16_t block[16], ptrdiff_t stride); + void (*vp8_idct_dc_add4y)(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); + void (*vp8_idct_dc_add4uv)(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); + + // loop filter applied to edges between macroblocks + void (*vp8_v_loop_filter16y)(uint8_t *dst, ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_h_loop_filter16y)(uint8_t *dst, ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_v_loop_filter8uv)(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + 
void (*vp8_h_loop_filter8uv)(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + + // loop filter applied to inner macroblock edges + void (*vp8_v_loop_filter16y_inner)(uint8_t *dst, ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_h_loop_filter16y_inner)(uint8_t *dst, ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_v_loop_filter8uv_inner)(uint8_t *dstU, uint8_t *dstV, + ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + void (*vp8_h_loop_filter8uv_inner)(uint8_t *dstU, uint8_t *dstV, + ptrdiff_t stride, + int flim_E, int flim_I, int hev_thresh); + + void (*vp8_v_loop_filter_simple)(uint8_t *dst, ptrdiff_t stride, int flim); + void (*vp8_h_loop_filter_simple)(uint8_t *dst, ptrdiff_t stride, int flim); + + /** + * first dimension: 4-log2(width) + * second dimension: 0 if no vertical interpolation is needed; + * 1 4-tap vertical interpolation filter (my & 1) + * 2 6-tap vertical interpolation filter (!(my & 1)) + * third dimension: same as second dimension, for horizontal interpolation + * so something like put_vp8_epel_pixels_tab[4-log2(width)][2*!!my-(my&1)][2*!!mx-(mx&1)](..., mx, my) + */ + vp8_mc_func put_vp8_epel_pixels_tab[3][3][3]; + vp8_mc_func put_vp8_bilinear_pixels_tab[3][3][3]; +} VP8DSPContext; + +void ff_vp7dsp_init(VP8DSPContext *c); + +void ff_vp78dsp_init(VP8DSPContext *c); +void ff_vp78dsp_init_aarch64(VP8DSPContext *c); +void ff_vp78dsp_init_arm(VP8DSPContext *c); +void ff_vp78dsp_init_ppc(VP8DSPContext *c); +void ff_vp78dsp_init_x86(VP8DSPContext *c); + +void ff_vp8dsp_init(VP8DSPContext *c); +void ff_vp8dsp_init_aarch64(VP8DSPContext *c); +void ff_vp8dsp_init_arm(VP8DSPContext *c); +void ff_vp8dsp_init_x86(VP8DSPContext *c); +void ff_vp8dsp_init_mips(VP8DSPContext *c); +void ff_vp8dsp_init_loongarch(VP8DSPContext *c); + +#define IS_VP7 1 +#define IS_VP8 0 + +#endif /* AVCODEC_VP8DSP_H */ diff --git a/media/ffvpx/libavcodec/vp9.c 
b/media/ffvpx/libavcodec/vp9.c new file mode 100644 index 0000000000..7c0a246446 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9.c @@ -0,0 +1,1914 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément Bœsch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config_components.h" + +#include "avcodec.h" +#include "codec_internal.h" +#include "decode.h" +#include "get_bits.h" +#include "hwconfig.h" +#include "profiles.h" +#include "thread.h" +#include "threadframe.h" +#include "pthread_internal.h" + +#include "videodsp.h" +#include "vp89_rac.h" +#include "vp9.h" +#include "vp9data.h" +#include "vp9dec.h" +#include "vpx_rac.h" +#include "libavutil/avassert.h" +#include "libavutil/pixdesc.h" +#include "libavutil/video_enc_params.h" + +#define VP9_SYNCCODE 0x498342 + +#if HAVE_THREADS +DEFINE_OFFSET_ARRAY(VP9Context, vp9_context, pthread_init_cnt, + (offsetof(VP9Context, progress_mutex)), + (offsetof(VP9Context, progress_cond))); + +static int vp9_alloc_entries(AVCodecContext *avctx, int n) { + VP9Context *s = avctx->priv_data; + int i; + + if (avctx->active_thread_type & FF_THREAD_SLICE) { + if (s->entries) + av_freep(&s->entries); + + s->entries = 
av_malloc_array(n, sizeof(atomic_int)); + if (!s->entries) + return AVERROR(ENOMEM); + + for (i = 0; i < n; i++) + atomic_init(&s->entries[i], 0); + } + return 0; +} + +static void vp9_report_tile_progress(VP9Context *s, int field, int n) { + pthread_mutex_lock(&s->progress_mutex); + atomic_fetch_add_explicit(&s->entries[field], n, memory_order_release); + pthread_cond_signal(&s->progress_cond); + pthread_mutex_unlock(&s->progress_mutex); +} + +static void vp9_await_tile_progress(VP9Context *s, int field, int n) { + if (atomic_load_explicit(&s->entries[field], memory_order_acquire) >= n) + return; + + pthread_mutex_lock(&s->progress_mutex); + while (atomic_load_explicit(&s->entries[field], memory_order_relaxed) != n) + pthread_cond_wait(&s->progress_cond, &s->progress_mutex); + pthread_mutex_unlock(&s->progress_mutex); +} +#else +static int vp9_alloc_entries(AVCodecContext *avctx, int n) { return 0; } +#endif + +static void vp9_tile_data_free(VP9TileData *td) +{ + av_freep(&td->b_base); + av_freep(&td->block_base); + av_freep(&td->block_structure); +} + +static void vp9_frame_unref(AVCodecContext *avctx, VP9Frame *f) +{ + ff_thread_release_ext_buffer(avctx, &f->tf); + av_buffer_unref(&f->extradata); + av_buffer_unref(&f->hwaccel_priv_buf); + f->segmentation_map = NULL; + f->hwaccel_picture_private = NULL; +} + +static int vp9_frame_alloc(AVCodecContext *avctx, VP9Frame *f) +{ + VP9Context *s = avctx->priv_data; + int ret, sz; + + ret = ff_thread_get_ext_buffer(avctx, &f->tf, AV_GET_BUFFER_FLAG_REF); + if (ret < 0) + return ret; + + sz = 64 * s->sb_cols * s->sb_rows; + if (sz != s->frame_extradata_pool_size) { + av_buffer_pool_uninit(&s->frame_extradata_pool); + s->frame_extradata_pool = av_buffer_pool_init(sz * (1 + sizeof(VP9mvrefPair)), NULL); + if (!s->frame_extradata_pool) { + s->frame_extradata_pool_size = 0; + goto fail; + } + s->frame_extradata_pool_size = sz; + } + f->extradata = av_buffer_pool_get(s->frame_extradata_pool); + if (!f->extradata) { + goto 
fail; + } + memset(f->extradata->data, 0, f->extradata->size); + + f->segmentation_map = f->extradata->data; + f->mv = (VP9mvrefPair *) (f->extradata->data + sz); + + if (avctx->hwaccel) { + const AVHWAccel *hwaccel = avctx->hwaccel; + av_assert0(!f->hwaccel_picture_private); + if (hwaccel->frame_priv_data_size) { + f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size); + if (!f->hwaccel_priv_buf) + goto fail; + f->hwaccel_picture_private = f->hwaccel_priv_buf->data; + } + } + + return 0; + +fail: + vp9_frame_unref(avctx, f); + return AVERROR(ENOMEM); +} + +static int vp9_frame_ref(AVCodecContext *avctx, VP9Frame *dst, VP9Frame *src) +{ + int ret; + + ret = ff_thread_ref_frame(&dst->tf, &src->tf); + if (ret < 0) + return ret; + + dst->extradata = av_buffer_ref(src->extradata); + if (!dst->extradata) + goto fail; + + dst->segmentation_map = src->segmentation_map; + dst->mv = src->mv; + dst->uses_2pass = src->uses_2pass; + + if (src->hwaccel_picture_private) { + dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf); + if (!dst->hwaccel_priv_buf) + goto fail; + dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data; + } + + return 0; + +fail: + vp9_frame_unref(avctx, dst); + return AVERROR(ENOMEM); +} + +static int update_size(AVCodecContext *avctx, int w, int h) +{ +#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + \ + CONFIG_VP9_D3D11VA_HWACCEL * 2 + \ + CONFIG_VP9_NVDEC_HWACCEL + \ + CONFIG_VP9_VAAPI_HWACCEL + \ + CONFIG_VP9_VDPAU_HWACCEL + \ + CONFIG_VP9_VIDEOTOOLBOX_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; + VP9Context *s = avctx->priv_data; + uint8_t *p; + int bytesperpixel = s->bytesperpixel, ret, cols, rows; + int lflvl_len, i; + + av_assert0(w > 0 && h > 0); + + if (!(s->pix_fmt == s->gf_fmt && w == s->w && h == s->h)) { + if ((ret = ff_set_dimensions(avctx, w, h)) < 0) + return ret; + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUV420P10: +#if CONFIG_VP9_DXVA2_HWACCEL + 
*fmtp++ = AV_PIX_FMT_DXVA2_VLD; +#endif +#if CONFIG_VP9_D3D11VA_HWACCEL + *fmtp++ = AV_PIX_FMT_D3D11VA_VLD; + *fmtp++ = AV_PIX_FMT_D3D11; +#endif +#if CONFIG_VP9_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; +#endif +#if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; +#endif +#if CONFIG_VP9_VDPAU_HWACCEL + *fmtp++ = AV_PIX_FMT_VDPAU; +#endif +#if CONFIG_VP9_VIDEOTOOLBOX_HWACCEL + *fmtp++ = AV_PIX_FMT_VIDEOTOOLBOX; +#endif + break; + case AV_PIX_FMT_YUV420P12: +#if CONFIG_VP9_NVDEC_HWACCEL + *fmtp++ = AV_PIX_FMT_CUDA; +#endif +#if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; +#endif +#if CONFIG_VP9_VDPAU_HWACCEL + *fmtp++ = AV_PIX_FMT_VDPAU; +#endif + break; + case AV_PIX_FMT_YUV444P: + case AV_PIX_FMT_YUV444P10: + case AV_PIX_FMT_YUV444P12: +#if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; +#endif + break; + } + + *fmtp++ = s->pix_fmt; + *fmtp = AV_PIX_FMT_NONE; + + ret = ff_thread_get_format(avctx, pix_fmts); + if (ret < 0) + return ret; + + avctx->pix_fmt = ret; + s->gf_fmt = s->pix_fmt; + s->w = w; + s->h = h; + } + + cols = (w + 7) >> 3; + rows = (h + 7) >> 3; + + if (s->intra_pred_data[0] && cols == s->cols && rows == s->rows && s->pix_fmt == s->last_fmt) + return 0; + + s->last_fmt = s->pix_fmt; + s->sb_cols = (w + 63) >> 6; + s->sb_rows = (h + 63) >> 6; + s->cols = (w + 7) >> 3; + s->rows = (h + 7) >> 3; + lflvl_len = avctx->active_thread_type == FF_THREAD_SLICE ? s->sb_rows : 1; + +#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var) + av_freep(&s->intra_pred_data[0]); + // FIXME we slightly over-allocate here for subsampled chroma, but a little + // bit of padding shouldn't affect performance... 
+ p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel + + lflvl_len * sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx))); + if (!p) + return AVERROR(ENOMEM); + assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel); + assign(s->intra_pred_data[1], uint8_t *, 64 * bytesperpixel); + assign(s->intra_pred_data[2], uint8_t *, 64 * bytesperpixel); + assign(s->above_y_nnz_ctx, uint8_t *, 16); + assign(s->above_mode_ctx, uint8_t *, 16); + assign(s->above_mv_ctx, VP9mv(*)[2], 16); + assign(s->above_uv_nnz_ctx[0], uint8_t *, 16); + assign(s->above_uv_nnz_ctx[1], uint8_t *, 16); + assign(s->above_partition_ctx, uint8_t *, 8); + assign(s->above_skip_ctx, uint8_t *, 8); + assign(s->above_txfm_ctx, uint8_t *, 8); + assign(s->above_segpred_ctx, uint8_t *, 8); + assign(s->above_intra_ctx, uint8_t *, 8); + assign(s->above_comp_ctx, uint8_t *, 8); + assign(s->above_ref_ctx, uint8_t *, 8); + assign(s->above_filter_ctx, uint8_t *, 8); + assign(s->lflvl, VP9Filter *, lflvl_len); +#undef assign + + if (s->td) { + for (i = 0; i < s->active_tile_cols; i++) + vp9_tile_data_free(&s->td[i]); + } + + if (s->s.h.bpp != s->last_bpp) { + ff_vp9dsp_init(&s->dsp, s->s.h.bpp, avctx->flags & AV_CODEC_FLAG_BITEXACT); + ff_videodsp_init(&s->vdsp, s->s.h.bpp); + s->last_bpp = s->s.h.bpp; + } + + return 0; +} + +static int update_block_buffers(AVCodecContext *avctx) +{ + int i; + VP9Context *s = avctx->priv_data; + int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel; + VP9TileData *td = &s->td[0]; + + if (td->b_base && td->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass) + return 0; + + vp9_tile_data_free(td); + chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v); + chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v); + if (s->s.frames[CUR_FRAME].uses_2pass) { + int sbs = s->sb_cols * s->sb_rows; + + td->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block)); + td->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) + + 
16 * 16 + 2 * chroma_eobs) * sbs); + if (!td->b_base || !td->block_base) + return AVERROR(ENOMEM); + td->uvblock_base[0] = td->block_base + sbs * 64 * 64 * bytesperpixel; + td->uvblock_base[1] = td->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel; + td->eob_base = (uint8_t *) (td->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel); + td->uveob_base[0] = td->eob_base + 16 * 16 * sbs; + td->uveob_base[1] = td->uveob_base[0] + chroma_eobs * sbs; + + if (avctx->export_side_data & AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS) { + td->block_structure = av_malloc_array(s->cols * s->rows, sizeof(*td->block_structure)); + if (!td->block_structure) + return AVERROR(ENOMEM); + } + } else { + for (i = 1; i < s->active_tile_cols; i++) + vp9_tile_data_free(&s->td[i]); + + for (i = 0; i < s->active_tile_cols; i++) { + s->td[i].b_base = av_malloc(sizeof(VP9Block)); + s->td[i].block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) + + 16 * 16 + 2 * chroma_eobs); + if (!s->td[i].b_base || !s->td[i].block_base) + return AVERROR(ENOMEM); + s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * bytesperpixel; + s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + chroma_blocks * bytesperpixel; + s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] + chroma_blocks * bytesperpixel); + s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16; + s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + chroma_eobs; + + if (avctx->export_side_data & AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS) { + s->td[i].block_structure = av_malloc_array(s->cols * s->rows, sizeof(*td->block_structure)); + if (!s->td[i].block_structure) + return AVERROR(ENOMEM); + } + } + } + s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass; + + return 0; +} + +// The sign bit is at the end, not the start, of a bit sequence +static av_always_inline int get_sbits_inv(GetBitContext *gb, int n) +{ + int v = get_bits(gb, n); + return get_bits1(gb) ? 
-v : v; +} + +static av_always_inline int inv_recenter_nonneg(int v, int m) +{ + if (v > 2 * m) + return v; + if (v & 1) + return m - ((v + 1) >> 1); + return m + (v >> 1); +} + +// differential forward probability updates +static int update_prob(VPXRangeCoder *c, int p) +{ + static const uint8_t inv_map_table[255] = { + 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, + 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115, + 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130, + 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145, + 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, + 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, + 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, + 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, + 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, + 252, 253, 253, + }; + int d; + + /* This code is trying to do a differential probability update. For a + * current probability A in the range [1, 255], the difference to a new + * probability of any value can be expressed differentially as 1-A, 255-A + * where some part of this (absolute range) exists both in positive as + * well as the negative part, whereas another part only exists in one + * half. We're trying to code this shared part differentially, i.e. 
+ * times two where the value of the lowest bit specifies the sign, and + * the single part is then coded on top of this. This absolute difference + * then again has a value of [0, 254], but a bigger value in this range + * indicates that we're further away from the original value A, so we + * can code this as a VLC code, since higher values are increasingly + * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough' + * updates vs. the 'fine, exact' updates further down the range, which + * adds one extra dimension to this differential update model. */ + + if (!vp89_rac_get(c)) { + d = vp89_rac_get_uint(c, 4) + 0; + } else if (!vp89_rac_get(c)) { + d = vp89_rac_get_uint(c, 4) + 16; + } else if (!vp89_rac_get(c)) { + d = vp89_rac_get_uint(c, 5) + 32; + } else { + d = vp89_rac_get_uint(c, 7); + if (d >= 65) + d = (d << 1) - 65 + vp89_rac_get(c); + d += 64; + av_assert2(d < FF_ARRAY_ELEMS(inv_map_table)); + } + + return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) : + 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p); +} + +static int read_colorspace_details(AVCodecContext *avctx) +{ + static const enum AVColorSpace colorspaces[8] = { + AVCOL_SPC_UNSPECIFIED, AVCOL_SPC_BT470BG, AVCOL_SPC_BT709, AVCOL_SPC_SMPTE170M, + AVCOL_SPC_SMPTE240M, AVCOL_SPC_BT2020_NCL, AVCOL_SPC_RESERVED, AVCOL_SPC_RGB, + }; + VP9Context *s = avctx->priv_data; + int bits = avctx->profile <= 1 ? 
0 : 1 + get_bits1(&s->gb); // 0:8, 1:10, 2:12 + + s->bpp_index = bits; + s->s.h.bpp = 8 + bits * 2; + s->bytesperpixel = (7 + s->s.h.bpp) >> 3; + avctx->colorspace = colorspaces[get_bits(&s->gb, 3)]; + if (avctx->colorspace == AVCOL_SPC_RGB) { // RGB = profile 1 + static const enum AVPixelFormat pix_fmt_rgb[3] = { + AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12 + }; + s->ss_h = s->ss_v = 0; + avctx->color_range = AVCOL_RANGE_JPEG; + s->pix_fmt = pix_fmt_rgb[bits]; + if (avctx->profile & 1) { + if (get_bits1(&s->gb)) { + av_log(avctx, AV_LOG_ERROR, "Reserved bit set in RGB\n"); + return AVERROR_INVALIDDATA; + } + } else { + av_log(avctx, AV_LOG_ERROR, "RGB not supported in profile %d\n", + avctx->profile); + return AVERROR_INVALIDDATA; + } + } else { + static const enum AVPixelFormat pix_fmt_for_ss[3][2 /* v */][2 /* h */] = { + { { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV422P }, + { AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV420P } }, + { { AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV422P10 }, + { AV_PIX_FMT_YUV440P10, AV_PIX_FMT_YUV420P10 } }, + { { AV_PIX_FMT_YUV444P12, AV_PIX_FMT_YUV422P12 }, + { AV_PIX_FMT_YUV440P12, AV_PIX_FMT_YUV420P12 } } + }; + avctx->color_range = get_bits1(&s->gb) ? 
AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG; + if (avctx->profile & 1) { + s->ss_h = get_bits1(&s->gb); + s->ss_v = get_bits1(&s->gb); + s->pix_fmt = pix_fmt_for_ss[bits][s->ss_v][s->ss_h]; + if (s->pix_fmt == AV_PIX_FMT_YUV420P) { + av_log(avctx, AV_LOG_ERROR, "YUV 4:2:0 not supported in profile %d\n", + avctx->profile); + return AVERROR_INVALIDDATA; + } else if (get_bits1(&s->gb)) { + av_log(avctx, AV_LOG_ERROR, "Profile %d color details reserved bit set\n", + avctx->profile); + return AVERROR_INVALIDDATA; + } + } else { + s->ss_h = s->ss_v = 1; + s->pix_fmt = pix_fmt_for_ss[bits][1][1]; + } + } + + return 0; +} + +static int decode_frame_header(AVCodecContext *avctx, + const uint8_t *data, int size, int *ref) +{ + VP9Context *s = avctx->priv_data; + int c, i, j, k, l, m, n, w, h, max, size2, ret, sharp; + int last_invisible; + const uint8_t *data2; + + /* general header */ + if ((ret = init_get_bits8(&s->gb, data, size)) < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n"); + return ret; + } + if (get_bits(&s->gb, 2) != 0x2) { // frame marker + av_log(avctx, AV_LOG_ERROR, "Invalid frame marker\n"); + return AVERROR_INVALIDDATA; + } + avctx->profile = get_bits1(&s->gb); + avctx->profile |= get_bits1(&s->gb) << 1; + if (avctx->profile == 3) avctx->profile += get_bits1(&s->gb); + if (avctx->profile > 3) { + av_log(avctx, AV_LOG_ERROR, "Profile %d is not yet supported\n", avctx->profile); + return AVERROR_INVALIDDATA; + } + s->s.h.profile = avctx->profile; + if (get_bits1(&s->gb)) { + *ref = get_bits(&s->gb, 3); + return 0; + } + + s->last_keyframe = s->s.h.keyframe; + s->s.h.keyframe = !get_bits1(&s->gb); + + last_invisible = s->s.h.invisible; + s->s.h.invisible = !get_bits1(&s->gb); + s->s.h.errorres = get_bits1(&s->gb); + s->s.h.use_last_frame_mvs = !s->s.h.errorres && !last_invisible; + + if (s->s.h.keyframe) { + if (get_bits(&s->gb, 24) != VP9_SYNCCODE) { // synccode + av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n"); + return 
AVERROR_INVALIDDATA; + } + if ((ret = read_colorspace_details(avctx)) < 0) + return ret; + // for profile 1, here follows the subsampling bits + s->s.h.refreshrefmask = 0xff; + w = get_bits(&s->gb, 16) + 1; + h = get_bits(&s->gb, 16) + 1; + if (get_bits1(&s->gb)) // display size + skip_bits(&s->gb, 32); + } else { + s->s.h.intraonly = s->s.h.invisible ? get_bits1(&s->gb) : 0; + s->s.h.resetctx = s->s.h.errorres ? 0 : get_bits(&s->gb, 2); + if (s->s.h.intraonly) { + if (get_bits(&s->gb, 24) != VP9_SYNCCODE) { // synccode + av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n"); + return AVERROR_INVALIDDATA; + } + if (avctx->profile >= 1) { + if ((ret = read_colorspace_details(avctx)) < 0) + return ret; + } else { + s->ss_h = s->ss_v = 1; + s->s.h.bpp = 8; + s->bpp_index = 0; + s->bytesperpixel = 1; + s->pix_fmt = AV_PIX_FMT_YUV420P; + avctx->colorspace = AVCOL_SPC_BT470BG; + avctx->color_range = AVCOL_RANGE_MPEG; + } + s->s.h.refreshrefmask = get_bits(&s->gb, 8); + w = get_bits(&s->gb, 16) + 1; + h = get_bits(&s->gb, 16) + 1; + if (get_bits1(&s->gb)) // display size + skip_bits(&s->gb, 32); + } else { + s->s.h.refreshrefmask = get_bits(&s->gb, 8); + s->s.h.refidx[0] = get_bits(&s->gb, 3); + s->s.h.signbias[0] = get_bits1(&s->gb) && !s->s.h.errorres; + s->s.h.refidx[1] = get_bits(&s->gb, 3); + s->s.h.signbias[1] = get_bits1(&s->gb) && !s->s.h.errorres; + s->s.h.refidx[2] = get_bits(&s->gb, 3); + s->s.h.signbias[2] = get_bits1(&s->gb) && !s->s.h.errorres; + if (!s->s.refs[s->s.h.refidx[0]].f->buf[0] || + !s->s.refs[s->s.h.refidx[1]].f->buf[0] || + !s->s.refs[s->s.h.refidx[2]].f->buf[0]) { + av_log(avctx, AV_LOG_ERROR, "Not all references are available\n"); + return AVERROR_INVALIDDATA; + } + if (get_bits1(&s->gb)) { + w = s->s.refs[s->s.h.refidx[0]].f->width; + h = s->s.refs[s->s.h.refidx[0]].f->height; + } else if (get_bits1(&s->gb)) { + w = s->s.refs[s->s.h.refidx[1]].f->width; + h = s->s.refs[s->s.h.refidx[1]].f->height; + } else if (get_bits1(&s->gb)) { + w = 
s->s.refs[s->s.h.refidx[2]].f->width; + h = s->s.refs[s->s.h.refidx[2]].f->height; + } else { + w = get_bits(&s->gb, 16) + 1; + h = get_bits(&s->gb, 16) + 1; + } + // Note that in this code, "CUR_FRAME" is actually before we + // have formally allocated a frame, and thus actually represents + // the _last_ frame + s->s.h.use_last_frame_mvs &= s->s.frames[CUR_FRAME].tf.f->width == w && + s->s.frames[CUR_FRAME].tf.f->height == h; + if (get_bits1(&s->gb)) // display size + skip_bits(&s->gb, 32); + s->s.h.highprecisionmvs = get_bits1(&s->gb); + s->s.h.filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE : + get_bits(&s->gb, 2); + s->s.h.allowcompinter = s->s.h.signbias[0] != s->s.h.signbias[1] || + s->s.h.signbias[0] != s->s.h.signbias[2]; + if (s->s.h.allowcompinter) { + if (s->s.h.signbias[0] == s->s.h.signbias[1]) { + s->s.h.fixcompref = 2; + s->s.h.varcompref[0] = 0; + s->s.h.varcompref[1] = 1; + } else if (s->s.h.signbias[0] == s->s.h.signbias[2]) { + s->s.h.fixcompref = 1; + s->s.h.varcompref[0] = 0; + s->s.h.varcompref[1] = 2; + } else { + s->s.h.fixcompref = 0; + s->s.h.varcompref[0] = 1; + s->s.h.varcompref[1] = 2; + } + } + } + } + s->s.h.refreshctx = s->s.h.errorres ? 0 : get_bits1(&s->gb); + s->s.h.parallelmode = s->s.h.errorres ? 1 : get_bits1(&s->gb); + s->s.h.framectxid = c = get_bits(&s->gb, 2); + if (s->s.h.keyframe || s->s.h.intraonly) + s->s.h.framectxid = 0; // BUG: libvpx ignores this field in keyframes + + /* loopfilter header data */ + if (s->s.h.keyframe || s->s.h.errorres || s->s.h.intraonly) { + // reset loopfilter defaults + s->s.h.lf_delta.ref[0] = 1; + s->s.h.lf_delta.ref[1] = 0; + s->s.h.lf_delta.ref[2] = -1; + s->s.h.lf_delta.ref[3] = -1; + s->s.h.lf_delta.mode[0] = 0; + s->s.h.lf_delta.mode[1] = 0; + memset(s->s.h.segmentation.feat, 0, sizeof(s->s.h.segmentation.feat)); + } + s->s.h.filter.level = get_bits(&s->gb, 6); + sharp = get_bits(&s->gb, 3); + // if sharpness changed, reinit lim/mblim LUTs. 
if it didn't change, keep + // the old cache values since they are still valid + if (s->s.h.filter.sharpness != sharp) { + for (i = 1; i <= 63; i++) { + int limit = i; + + if (sharp > 0) { + limit >>= (sharp + 3) >> 2; + limit = FFMIN(limit, 9 - sharp); + } + limit = FFMAX(limit, 1); + + s->filter_lut.lim_lut[i] = limit; + s->filter_lut.mblim_lut[i] = 2 * (i + 2) + limit; + } + } + s->s.h.filter.sharpness = sharp; + if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) { + if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) { + for (i = 0; i < 4; i++) + if (get_bits1(&s->gb)) + s->s.h.lf_delta.ref[i] = get_sbits_inv(&s->gb, 6); + for (i = 0; i < 2; i++) + if (get_bits1(&s->gb)) + s->s.h.lf_delta.mode[i] = get_sbits_inv(&s->gb, 6); + } + } + + /* quantization header data */ + s->s.h.yac_qi = get_bits(&s->gb, 8); + s->s.h.ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0; + s->s.h.uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0; + s->s.h.uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0; + s->s.h.lossless = s->s.h.yac_qi == 0 && s->s.h.ydc_qdelta == 0 && + s->s.h.uvdc_qdelta == 0 && s->s.h.uvac_qdelta == 0; + if (s->s.h.lossless) + avctx->properties |= FF_CODEC_PROPERTY_LOSSLESS; + + /* segmentation header info */ + if ((s->s.h.segmentation.enabled = get_bits1(&s->gb))) { + if ((s->s.h.segmentation.update_map = get_bits1(&s->gb))) { + for (i = 0; i < 7; i++) + s->s.h.segmentation.prob[i] = get_bits1(&s->gb) ? + get_bits(&s->gb, 8) : 255; + if ((s->s.h.segmentation.temporal = get_bits1(&s->gb))) + for (i = 0; i < 3; i++) + s->s.h.segmentation.pred_prob[i] = get_bits1(&s->gb) ? 
+ get_bits(&s->gb, 8) : 255; + } + + if (get_bits1(&s->gb)) { + s->s.h.segmentation.absolute_vals = get_bits1(&s->gb); + for (i = 0; i < 8; i++) { + if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb))) + s->s.h.segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8); + if ((s->s.h.segmentation.feat[i].lf_enabled = get_bits1(&s->gb))) + s->s.h.segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6); + if ((s->s.h.segmentation.feat[i].ref_enabled = get_bits1(&s->gb))) + s->s.h.segmentation.feat[i].ref_val = get_bits(&s->gb, 2); + s->s.h.segmentation.feat[i].skip_enabled = get_bits1(&s->gb); + } + } + } + + // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas + for (i = 0; i < (s->s.h.segmentation.enabled ? 8 : 1); i++) { + int qyac, qydc, quvac, quvdc, lflvl, sh; + + if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].q_enabled) { + if (s->s.h.segmentation.absolute_vals) + qyac = av_clip_uintp2(s->s.h.segmentation.feat[i].q_val, 8); + else + qyac = av_clip_uintp2(s->s.h.yac_qi + s->s.h.segmentation.feat[i].q_val, 8); + } else { + qyac = s->s.h.yac_qi; + } + qydc = av_clip_uintp2(qyac + s->s.h.ydc_qdelta, 8); + quvdc = av_clip_uintp2(qyac + s->s.h.uvdc_qdelta, 8); + quvac = av_clip_uintp2(qyac + s->s.h.uvac_qdelta, 8); + qyac = av_clip_uintp2(qyac, 8); + + s->s.h.segmentation.feat[i].qmul[0][0] = ff_vp9_dc_qlookup[s->bpp_index][qydc]; + s->s.h.segmentation.feat[i].qmul[0][1] = ff_vp9_ac_qlookup[s->bpp_index][qyac]; + s->s.h.segmentation.feat[i].qmul[1][0] = ff_vp9_dc_qlookup[s->bpp_index][quvdc]; + s->s.h.segmentation.feat[i].qmul[1][1] = ff_vp9_ac_qlookup[s->bpp_index][quvac]; + + sh = s->s.h.filter.level >= 32; + if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[i].lf_enabled) { + if (s->s.h.segmentation.absolute_vals) + lflvl = av_clip_uintp2(s->s.h.segmentation.feat[i].lf_val, 6); + else + lflvl = av_clip_uintp2(s->s.h.filter.level + s->s.h.segmentation.feat[i].lf_val, 6); + } else { + lflvl = s->s.h.filter.level; + } + if 
(s->s.h.lf_delta.enabled) { + s->s.h.segmentation.feat[i].lflvl[0][0] = + s->s.h.segmentation.feat[i].lflvl[0][1] = + av_clip_uintp2(lflvl + (s->s.h.lf_delta.ref[0] * (1 << sh)), 6); + for (j = 1; j < 4; j++) { + s->s.h.segmentation.feat[i].lflvl[j][0] = + av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] + + s->s.h.lf_delta.mode[0]) * (1 << sh)), 6); + s->s.h.segmentation.feat[i].lflvl[j][1] = + av_clip_uintp2(lflvl + ((s->s.h.lf_delta.ref[j] + + s->s.h.lf_delta.mode[1]) * (1 << sh)), 6); + } + } else { + memset(s->s.h.segmentation.feat[i].lflvl, lflvl, + sizeof(s->s.h.segmentation.feat[i].lflvl)); + } + } + + /* tiling info */ + if ((ret = update_size(avctx, w, h)) < 0) { + av_log(avctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", + w, h, s->pix_fmt); + return ret; + } + for (s->s.h.tiling.log2_tile_cols = 0; + s->sb_cols > (64 << s->s.h.tiling.log2_tile_cols); + s->s.h.tiling.log2_tile_cols++) ; + for (max = 0; (s->sb_cols >> max) >= 4; max++) ; + max = FFMAX(0, max - 1); + while (max > s->s.h.tiling.log2_tile_cols) { + if (get_bits1(&s->gb)) + s->s.h.tiling.log2_tile_cols++; + else + break; + } + s->s.h.tiling.log2_tile_rows = decode012(&s->gb); + s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows; + if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) { + int n_range_coders; + VPXRangeCoder *rc; + + if (s->td) { + for (i = 0; i < s->active_tile_cols; i++) + vp9_tile_data_free(&s->td[i]); + av_freep(&s->td); + } + + s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols; + s->active_tile_cols = avctx->active_thread_type == FF_THREAD_SLICE ? 
+ s->s.h.tiling.tile_cols : 1; + vp9_alloc_entries(avctx, s->sb_rows); + if (avctx->active_thread_type == FF_THREAD_SLICE) { + n_range_coders = 4; // max_tile_rows + } else { + n_range_coders = s->s.h.tiling.tile_cols; + } + s->td = av_calloc(s->active_tile_cols, sizeof(VP9TileData) + + n_range_coders * sizeof(VPXRangeCoder)); + if (!s->td) + return AVERROR(ENOMEM); + rc = (VPXRangeCoder *) &s->td[s->active_tile_cols]; + for (i = 0; i < s->active_tile_cols; i++) { + s->td[i].s = s; + s->td[i].c_b = rc; + rc += n_range_coders; + } + } + + /* check reference frames */ + if (!s->s.h.keyframe && !s->s.h.intraonly) { + int valid_ref_frame = 0; + for (i = 0; i < 3; i++) { + AVFrame *ref = s->s.refs[s->s.h.refidx[i]].f; + int refw = ref->width, refh = ref->height; + + if (ref->format != avctx->pix_fmt) { + av_log(avctx, AV_LOG_ERROR, + "Ref pixfmt (%s) did not match current frame (%s)", + av_get_pix_fmt_name(ref->format), + av_get_pix_fmt_name(avctx->pix_fmt)); + return AVERROR_INVALIDDATA; + } else if (refw == w && refh == h) { + s->mvscale[i][0] = s->mvscale[i][1] = 0; + } else { + /* Check to make sure at least one of frames that */ + /* this frame references has valid dimensions */ + if (w * 2 < refw || h * 2 < refh || w > 16 * refw || h > 16 * refh) { + av_log(avctx, AV_LOG_WARNING, + "Invalid ref frame dimensions %dx%d for frame size %dx%d\n", + refw, refh, w, h); + s->mvscale[i][0] = s->mvscale[i][1] = REF_INVALID_SCALE; + continue; + } + s->mvscale[i][0] = (refw << 14) / w; + s->mvscale[i][1] = (refh << 14) / h; + s->mvstep[i][0] = 16 * s->mvscale[i][0] >> 14; + s->mvstep[i][1] = 16 * s->mvscale[i][1] >> 14; + } + valid_ref_frame++; + } + if (!valid_ref_frame) { + av_log(avctx, AV_LOG_ERROR, "No valid reference frame is found, bitstream not supported\n"); + return AVERROR_INVALIDDATA; + } + } + + if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) { + s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p = + s->prob_ctx[3].p = 
ff_vp9_default_probs; + memcpy(s->prob_ctx[0].coef, ff_vp9_default_coef_probs, + sizeof(ff_vp9_default_coef_probs)); + memcpy(s->prob_ctx[1].coef, ff_vp9_default_coef_probs, + sizeof(ff_vp9_default_coef_probs)); + memcpy(s->prob_ctx[2].coef, ff_vp9_default_coef_probs, + sizeof(ff_vp9_default_coef_probs)); + memcpy(s->prob_ctx[3].coef, ff_vp9_default_coef_probs, + sizeof(ff_vp9_default_coef_probs)); + } else if (s->s.h.intraonly && s->s.h.resetctx == 2) { + s->prob_ctx[c].p = ff_vp9_default_probs; + memcpy(s->prob_ctx[c].coef, ff_vp9_default_coef_probs, + sizeof(ff_vp9_default_coef_probs)); + } + + // next 16 bits is size of the rest of the header (arith-coded) + s->s.h.compressed_header_size = size2 = get_bits(&s->gb, 16); + s->s.h.uncompressed_header_size = (get_bits_count(&s->gb) + 7) / 8; + + data2 = align_get_bits(&s->gb); + if (size2 > size - (data2 - data)) { + av_log(avctx, AV_LOG_ERROR, "Invalid compressed header size\n"); + return AVERROR_INVALIDDATA; + } + ret = ff_vpx_init_range_decoder(&s->c, data2, size2); + if (ret < 0) + return ret; + + if (vpx_rac_get_prob_branchy(&s->c, 128)) { // marker bit + av_log(avctx, AV_LOG_ERROR, "Marker bit was set\n"); + return AVERROR_INVALIDDATA; + } + + for (i = 0; i < s->active_tile_cols; i++) { + if (s->s.h.keyframe || s->s.h.intraonly) { + memset(s->td[i].counts.coef, 0, sizeof(s->td[0].counts.coef)); + memset(s->td[i].counts.eob, 0, sizeof(s->td[0].counts.eob)); + } else { + memset(&s->td[i].counts, 0, sizeof(s->td[0].counts)); + } + s->td[i].nb_block_structure = 0; + } + + /* FIXME is it faster to not copy here, but do it down in the fw updates + * as explicit copies if the fw update is missing (and skip the copy upon + * fw update)? 
*/ + s->prob.p = s->prob_ctx[c].p; + + // txfm updates + if (s->s.h.lossless) { + s->s.h.txfmmode = TX_4X4; + } else { + s->s.h.txfmmode = vp89_rac_get_uint(&s->c, 2); + if (s->s.h.txfmmode == 3) + s->s.h.txfmmode += vp89_rac_get(&s->c); + + if (s->s.h.txfmmode == TX_SWITCHABLE) { + for (i = 0; i < 2; i++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]); + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.tx16p[i][j] = + update_prob(&s->c, s->prob.p.tx16p[i][j]); + for (i = 0; i < 2; i++) + for (j = 0; j < 3; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.tx32p[i][j] = + update_prob(&s->c, s->prob.p.tx32p[i][j]); + } + } + + // coef updates + for (i = 0; i < 4; i++) { + uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i]; + if (vp89_rac_get(&s->c)) { + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + for (l = 0; l < 6; l++) + for (m = 0; m < 6; m++) { + uint8_t *p = s->prob.coef[i][j][k][l][m]; + uint8_t *r = ref[j][k][l][m]; + if (m >= 3 && l == 0) // dc only has 3 pt + break; + for (n = 0; n < 3; n++) { + if (vpx_rac_get_prob_branchy(&s->c, 252)) + p[n] = update_prob(&s->c, r[n]); + else + p[n] = r[n]; + } + memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8); + } + } else { + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + for (l = 0; l < 6; l++) + for (m = 0; m < 6; m++) { + uint8_t *p = s->prob.coef[i][j][k][l][m]; + uint8_t *r = ref[j][k][l][m]; + if (m > 3 && l == 0) // dc only has 3 pt + break; + memcpy(p, r, 3); + memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8); + } + } + if (s->s.h.txfmmode == i) + break; + } + + // mode updates + for (i = 0; i < 3; i++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]); + if (!s->s.h.keyframe && !s->s.h.intraonly) { + for (i = 0; i < 7; i++) + for (j = 0; j < 3; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_mode[i][j] = + 
update_prob(&s->c, s->prob.p.mv_mode[i][j]); + + if (s->s.h.filtermode == FILTER_SWITCHABLE) + for (i = 0; i < 4; i++) + for (j = 0; j < 2; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.filter[i][j] = + update_prob(&s->c, s->prob.p.filter[i][j]); + + for (i = 0; i < 4; i++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]); + + if (s->s.h.allowcompinter) { + s->s.h.comppredmode = vp89_rac_get(&s->c); + if (s->s.h.comppredmode) + s->s.h.comppredmode += vp89_rac_get(&s->c); + if (s->s.h.comppredmode == PRED_SWITCHABLE) + for (i = 0; i < 5; i++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.comp[i] = + update_prob(&s->c, s->prob.p.comp[i]); + } else { + s->s.h.comppredmode = PRED_SINGLEREF; + } + + if (s->s.h.comppredmode != PRED_COMPREF) { + for (i = 0; i < 5; i++) { + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.single_ref[i][0] = + update_prob(&s->c, s->prob.p.single_ref[i][0]); + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.single_ref[i][1] = + update_prob(&s->c, s->prob.p.single_ref[i][1]); + } + } + + if (s->s.h.comppredmode != PRED_SINGLEREF) { + for (i = 0; i < 5; i++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.comp_ref[i] = + update_prob(&s->c, s->prob.p.comp_ref[i]); + } + + for (i = 0; i < 4; i++) + for (j = 0; j < 9; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.y_mode[i][j] = + update_prob(&s->c, s->prob.p.y_mode[i][j]); + + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) + for (k = 0; k < 3; k++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.partition[3 - i][j][k] = + update_prob(&s->c, + s->prob.p.partition[3 - i][j][k]); + + // mv fields don't use the update_prob subexp model for some reason + for (i = 0; i < 3; i++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_joint[i] = (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + + for (i = 0; i < 2; i++) { + if (vpx_rac_get_prob_branchy(&s->c, 252)) + 
s->prob.p.mv_comp[i].sign = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + + for (j = 0; j < 10; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_comp[i].classes[j] = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_comp[i].class0 = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + + for (j = 0; j < 10; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_comp[i].bits[j] = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) + for (k = 0; k < 3; k++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_comp[i].class0_fp[j][k] = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + + for (j = 0; j < 3; j++) + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_comp[i].fp[j] = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + } + + if (s->s.h.highprecisionmvs) { + for (i = 0; i < 2; i++) { + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_comp[i].class0_hp = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + + if (vpx_rac_get_prob_branchy(&s->c, 252)) + s->prob.p.mv_comp[i].hp = + (vp89_rac_get_uint(&s->c, 7) << 1) | 1; + } + } + } + + return (data2 - data) + size2; +}

/**
 * Read the partitioning of one block at level @p bl from the tile's range
 * coder (td->c) and decode everything inside it.
 *
 * row/col are in units of 8 pixels (pixel offsets below are always
 * "units * 8"); hbs is the distance, in those units, from the block origin
 * to its second half at this level.
 *
 * Three situations are distinguished:
 *  - both halves lie inside the frame: the full partition symbol is read with
 *    vp89_rac_get_tree();
 *  - only the right half (col + hbs) or only the bottom half (row + hbs) is
 *    inside: a single binary decision is read using p[1] resp. p[2];
 *  - neither is inside: PARTITION_SPLIT is implied and nothing is read.
 * At BL_8X8 the symbol is always read and the block is decoded directly.
 *
 * The chosen partition is accumulated in td->counts.partition.
 *
 * @param td    per-tile decoding state
 * @param row   block row, in 8-pixel units
 * @param col   block column, in 8-pixel units
 * @param lflvl loop-filter record forwarded to ff_vp9_decode_block()
 * @param yoff  byte offset of the block inside the luma plane
 * @param uvoff byte offset of the block inside the chroma planes
 * @param bl    block level being parsed
 */
static void decode_sb(VP9TileData *td, int row, int col, VP9Filter *lflvl,
                      ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
    const VP9Context *s = td->s;
    // 2-bit context: bit 0 from the above neighbour, bit 1 from the left one
    int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
            (((td->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
    const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ?
                           ff_vp9_default_kf_partition_probs[bl][c] :
                           s->prob.p.partition[bl][c];
    enum BlockPartition bp;
    ptrdiff_t hbs = 4 >> bl;
    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
    int bytesperpixel = s->bytesperpixel;

    if (bl == BL_8X8) {
        bp = vp89_rac_get_tree(td->c, ff_vp9_partition_tree, p);
        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
    } else if (col + hbs < s->cols) { // FIXME why not <=?
        if (row + hbs < s->rows) { // FIXME why not <=?
            bp = vp89_rac_get_tree(td->c, ff_vp9_partition_tree, p);
            switch (bp) {
            case PARTITION_NONE:
                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_H:
                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_V:
                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                yoff  += hbs * 8 * bytesperpixel;
                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
                ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                break;
            case PARTITION_SPLIT:
                decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(td, row, col + hbs, lflvl,
                          yoff + 8 * hbs * bytesperpixel,
                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb(td, row + hbs, col + hbs, lflvl,
                          yoff + 8 * hbs * bytesperpixel,
                          uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                break;
            default:
                av_assert0(0);
            }
        } else if (vpx_rac_get_prob_branchy(td->c, p[1])) {
            // bottom half is outside: only the two top quadrants exist
            bp = PARTITION_SPLIT;
            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
            decode_sb(td, row, col + hbs, lflvl,
                      yoff + 8 * hbs * bytesperpixel,
                      uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
        } else {
            bp = PARTITION_H;
            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
        }
    } else if (row + hbs < s->rows) { // FIXME why not <=?
        if (vpx_rac_get_prob_branchy(td->c, p[2])) {
            // right half is outside: only the two left quadrants exist
            bp = PARTITION_SPLIT;
            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 8 * uv_stride >> s->ss_v;
            decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
        } else {
            bp = PARTITION_V;
            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
        }
    } else {
        // only the top-left quadrant is inside the frame
        bp = PARTITION_SPLIT;
        decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
    }
    td->counts.partition[bl][c][bp]++;
}

/**
 * Counterpart of decode_sb() that reads no partition symbols: the level and
 * partition of each block are taken from the block record td->b points to
 * (b->bl, b->bp) instead of from the range coder. decode_tiles() uses this
 * variant when s->pass == 2.
 *
 * Parameters have the same meaning as for decode_sb().
 */
static void decode_sb_mem(VP9TileData *td, int row, int col, VP9Filter *lflvl,
                          ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
    const VP9Context *s = td->s;
    VP9Block *b = td->b;
    ptrdiff_t hbs = 4 >> bl;
    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
    ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
    int bytesperpixel = s->bytesperpixel;

    if (bl == BL_8X8) {
        av_assert2(b->bl == BL_8X8);
        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
    } else if (td->b->bl == bl) {
        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
        // second half of an H/V partition, if it is inside the frame
        if (b->bp == PARTITION_H && row + hbs < s->rows) {
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 8 * uv_stride >> s->ss_v;
            ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
        } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
            yoff  += hbs * 8 * bytesperpixel;
            uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
            ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
        }
    } else {
        // stored block is at a deeper level: descend into the quadrants that
        // are inside the frame
        decode_sb_mem(td, row, col, lflvl, yoff, uvoff, bl + 1);
        if (col + hbs < s->cols) { // FIXME why not <=?
            if (row + hbs < s->rows) {
                decode_sb_mem(td, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                yoff  += hbs * 8 * y_stride;
                uvoff += hbs * 8 * uv_stride >> s->ss_v;
                decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
                decode_sb_mem(td, row + hbs, col + hbs, lflvl,
                              yoff + 8 * hbs * bytesperpixel,
                              uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
            } else {
                yoff  += hbs * 8 * bytesperpixel;
                uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
                decode_sb_mem(td, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
            }
        } else if (row + hbs < s->rows) {
            yoff  += hbs * 8 * y_stride;
            uvoff += hbs * 8 * uv_stride >> s->ss_v;
            decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
        }
    }
}

/**
 * Compute the extent of tile @p idx along one axis.
 *
 * The n superblocks of the axis are divided into (1 << log2_n) tiles; the
 * resulting superblock indices are clamped to n and multiplied by 8, which is
 * the row/col unit the callers iterate in (they step by 8 per superblock).
 *
 * @param[out] start first position of the tile
 * @param[out] end   position one past the end of the tile
 * @param idx        tile index along this axis
 * @param log2_n     log2 of the number of tiles along this axis
 * @param n          number of superblocks along this axis
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int sb_start = ( idx      * n) >> log2_n;
    int sb_end   = ((idx + 1) * n) >> log2_n;
    *start = FFMIN(sb_start, n) << 3;
    *end   = FFMIN(sb_end,   n) << 3;
}

/**
 * Free s->intra_pred_data[0] and the contents of every active tile-data
 * entry. The s->td array itself is left allocated (vp9_decode_free() frees
 * it).
 */
static void free_buffers(VP9Context *s)
{
    int i;

    av_freep(&s->intra_pred_data[0]);
    // s->td can be NULL while active_tile_cols is already non-zero: the
    // header parser updates active_tile_cols before (re)allocating td and
    // returns ENOMEM if that allocation fails. Guard the loop the same way
    // the header parser does before freeing tile data.
    if (s->td) {
        for (i = 0; i < s->active_tile_cols; i++)
            vp9_tile_data_free(&s->td[i]);
    }
}

/**
 * Codec close callback: unreference and free the three internal frames and
 * the eight refs / next_refs slots, tear down the frame-extradata pool, the
 * tile buffers and (with threads) the progress entries, then free s->td.
 *
 * @return always 0
 */
static av_cold int vp9_decode_free(AVCodecContext *avctx)
{
    VP9Context *s = avctx->priv_data;
    int i;

    for (i = 0; i < 3; i++) {
        vp9_frame_unref(avctx, &s->s.frames[i]);
        av_frame_free(&s->s.frames[i].tf.f);
    }
    av_buffer_pool_uninit(&s->frame_extradata_pool);
    for (i = 0; i < 8; i++) {
        ff_thread_release_ext_buffer(avctx, &s->s.refs[i]);
        av_frame_free(&s->s.refs[i].f);
        ff_thread_release_ext_buffer(avctx, &s->next_refs[i]);
        av_frame_free(&s->next_refs[i].f);
    }

    // must run before s->td is freed: it walks the td array
    free_buffers(s);
#if HAVE_THREADS
    av_freep(&s->entries);
    ff_pthread_free(s, vp9_context_offsets);
#endif
    av_freep(&s->td);
    return 0;
}

/**
 * Decode all tiles of the current frame on the calling thread, using
 * s->td[0] as the only tile-data context.
 *
 * For each tile row, one range decoder per tile column is initialised in
 * td->c_b[] from the tile data: every tile except the very last one is
 * preceded by a 4-byte size read with AV_RB32(); the last tile takes all
 * remaining bytes. Then, per superblock row: every tile column is decoded
 * (decode_sb(), or decode_sb_mem() when s->pass == 2), and unless
 * s->pass == 1 the bottom pixel row is saved to s->intra_pred_data[], the
 * loop filter is run for the row, and progress is reported on the current
 * frame.
 *
 * @param data tile data (after the frame headers)
 * @param size number of bytes available at data
 * @return 0 on success, a negative AVERROR code on invalid data
 */
static int decode_tiles(AVCodecContext *avctx,
                        const uint8_t *data, int size)
{
    VP9Context *s = avctx->priv_data;
    VP9TileData *td = &s->td[0];
    int row, col, tile_row, tile_col, ret;
    int bytesperpixel;
    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    AVFrame *f;
    ptrdiff_t yoff, uvoff, ls_y, ls_uv;

    f = s->s.frames[CUR_FRAME].tf.f;
    ls_y = f->linesize[0];
    ls_uv =f->linesize[1];
    bytesperpixel = s->bytesperpixel;

    yoff = uvoff = 0;
    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
        set_tile_offset(&tile_row_start, &tile_row_end,
                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);

        // set up the range decoders of all tile columns in this tile row
        for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
            int64_t tile_size;

            if (tile_col == s->s.h.tiling.tile_cols - 1 &&
                tile_row == s->s.h.tiling.tile_rows - 1) {
                tile_size = size;
            } else {
                tile_size = AV_RB32(data);
                data += 4;
                size -= 4;
            }
            if (tile_size > size)
                return AVERROR_INVALIDDATA;
            ret = ff_vpx_init_range_decoder(&td->c_b[tile_col], data, tile_size);
            if (ret < 0)
                return ret;
            if (vpx_rac_get_prob_branchy(&td->c_b[tile_col], 128)) // marker bit
                return AVERROR_INVALIDDATA;
            data += tile_size;
            size -= tile_size;
        }

        for (row = tile_row_start; row < tile_row_end;
             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
            VP9Filter *lflvl_ptr = s->lflvl;
            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;

            for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
                set_tile_offset(&tile_col_start, &tile_col_end,
                                tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
                td->tile_col_start = tile_col_start;
                if (s->pass != 2) {
                    // reset the left-edge contexts at the start of each tile
                    memset(td->left_partition_ctx, 0, 8);
                    memset(td->left_skip_ctx, 0, 8);
                    if (s->s.h.keyframe || s->s.h.intraonly) {
                        memset(td->left_mode_ctx, DC_PRED, 16);
                    } else {
                        memset(td->left_mode_ctx, NEARESTMV, 8);
                    }
                    memset(td->left_y_nnz_ctx, 0, 16);
                    memset(td->left_uv_nnz_ctx, 0, 32);
                    memset(td->left_segpred_ctx, 0, 8);

                    td->c = &td->c_b[tile_col];
                }

                for (col = tile_col_start;
                     col < tile_col_end;
                     col += 8, yoff2 += 64 * bytesperpixel,
                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                    // FIXME integrate with lf code (i.e. zero after each
                    // use, similar to invtxfm coefficients, or similar)
                    if (s->pass != 1) {
                        memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
                    }

                    if (s->pass == 2) {
                        decode_sb_mem(td, row, col, lflvl_ptr,
                                      yoff2, uvoff2, BL_64X64);
                    } else {
                        if (vpx_rac_is_end(td->c)) {
                            return AVERROR_INVALIDDATA;
                        }
                        decode_sb(td, row, col, lflvl_ptr,
                                  yoff2, uvoff2, BL_64X64);
                    }
                }
            }

            if (s->pass == 1)
                continue;

            // backup pre-loopfilter reconstruction data for intra
            // prediction of next row of sb64s
            if (row + 8 < s->rows) {
                memcpy(s->intra_pred_data[0],
                       f->data[0] + yoff + 63 * ls_y,
                       8 * s->cols * bytesperpixel);
                memcpy(s->intra_pred_data[1],
                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
                       8 * s->cols * bytesperpixel >> s->ss_h);
                memcpy(s->intra_pred_data[2],
                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
                       8 * s->cols * bytesperpixel >> s->ss_h);
            }

            // loopfilter one row
            if (s->s.h.filter.level) {
                yoff2 = yoff;
                uvoff2 = uvoff;
                lflvl_ptr = s->lflvl;
                for (col = 0; col < s->cols;
                     col += 8, yoff2 += 64 * bytesperpixel,
                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                    ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
                                         yoff2, uvoff2);
                }
            }

            // FIXME maybe we can make this more finegrained by running the
            // loopfilter per-block instead of after each sbrow
            // In fact that would also make intra pred left preparation easier?
            ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
        }
    }
    return 0;
}

#if HAVE_THREADS
/**
 * Slice-threading job: decode tile column @p jobnr through all tile rows.
 *
 * The job uses s->td[jobnr]; its range decoders td->c_b[tile_row] are set up
 * by vp9_decode_frame() before the jobs are started. After each superblock
 * row the job saves its part of the bottom pixel row to
 * s->intra_pred_data[] and calls vp9_report_tile_progress(); the loop filter
 * is not run here (see loopfilter_proc()).
 *
 * @param tdata    unused
 * @param jobnr    tile column handled by this job
 * @param threadnr unused
 * @return always 0
 */
static av_always_inline
int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr,
                    int threadnr)
{
    VP9Context *s = avctx->priv_data;
    VP9TileData *td = &s->td[jobnr];
    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
    int bytesperpixel = s->bytesperpixel, row, col, tile_row;
    unsigned tile_cols_len;
    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    VP9Filter *lflvl_ptr_base;
    AVFrame *f;

    f = s->s.frames[CUR_FRAME].tf.f;
    ls_y = f->linesize[0];
    ls_uv =f->linesize[1];

    set_tile_offset(&tile_col_start, &tile_col_end,
                    jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols);
    td->tile_col_start  = tile_col_start;
    // start offsets of this tile column within a superblock row
    uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3);
    yoff = (64 * bytesperpixel)*(tile_col_start >> 3);
    lflvl_ptr_base = s->lflvl+(tile_col_start >> 3);

    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
        set_tile_offset(&tile_row_start, &tile_row_end,
                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);

        td->c = &td->c_b[tile_row];
        for (row = tile_row_start; row < tile_row_end;
             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
            VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3);

            memset(td->left_partition_ctx, 0, 8);
            memset(td->left_skip_ctx, 0, 8);
            if (s->s.h.keyframe || s->s.h.intraonly) {
                memset(td->left_mode_ctx, DC_PRED, 16);
            } else {
                memset(td->left_mode_ctx, NEARESTMV, 8);
            }
            memset(td->left_y_nnz_ctx, 0, 16);
            memset(td->left_uv_nnz_ctx, 0, 32);
            memset(td->left_segpred_ctx, 0, 8);

            for (col = tile_col_start;
                 col < tile_col_end;
                 col += 8, yoff2 += 64 * bytesperpixel,
                 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                // FIXME integrate with lf code (i.e. zero after each
                // use, similar to invtxfm coefficients, or similar)
                memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
                decode_sb(td, row, col, lflvl_ptr,
                          yoff2, uvoff2, BL_64X64);
            }

            // backup pre-loopfilter reconstruction data for intra
            // prediction of next row of sb64s
            tile_cols_len = tile_col_end - tile_col_start;
            if (row + 8 < s->rows) {
                memcpy(s->intra_pred_data[0] + (tile_col_start * 8 * bytesperpixel),
                       f->data[0] + yoff + 63 * ls_y,
                       8 * tile_cols_len * bytesperpixel);
                memcpy(s->intra_pred_data[1] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
                memcpy(s->intra_pred_data[2] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
            }

            vp9_report_tile_progress(s, row >> 3, 1);
        }
    }
    return 0;
}

/**
 * Main-thread function paired with decode_tiles_mt(): for each superblock
 * row, wait via vp9_await_tile_progress() with a count of tile_cols, then
 * loop-filter that row across the whole frame width if the frame's filter
 * level is non-zero.
 *
 * @return always 0
 */
static av_always_inline
int loopfilter_proc(AVCodecContext *avctx)
{
    VP9Context *s = avctx->priv_data;
    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
    VP9Filter *lflvl_ptr;
    int bytesperpixel = s->bytesperpixel, col, i;
    AVFrame *f;

    f = s->s.frames[CUR_FRAME].tf.f;
    ls_y = f->linesize[0];
    ls_uv =f->linesize[1];

    for (i = 0; i < s->sb_rows; i++) {
        vp9_await_tile_progress(s, i, s->s.h.tiling.tile_cols);

        if (s->s.h.filter.level) {
            yoff = (ls_y * 64)*i;
            uvoff =  (ls_uv * 64 >> s->ss_v)*i;
            lflvl_ptr = s->lflvl+s->sb_cols*i;
            for (col = 0; col < s->cols;
                 col += 8, yoff += 64 * bytesperpixel,
                 uvoff += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
                ff_vp9_loopfilter_sb(avctx, lflvl_ptr, i << 3, col,
                                     yoff, uvoff);
            }
        }
    }
    return 0;
}
#endif

/**
 * Attach AV_VIDEO_ENC_PARAMS_VP9 side data to @p frame.
 *
 * Frame-level fields are filled from the frame header (yac_qi and the DC/AC
 * deltas). Per-block entries are only produced when segmentation is enabled:
 * one entry per block recorded in td->block_structure[] of every active tile
 * column, carrying position, size and the segment's quantiser offset
 * (converted to a delta against par->qp when the segment values are
 * absolute).
 *
 * @return 0 on success, AVERROR(ENOMEM) if the side data cannot be created
 */
static int vp9_export_enc_params(VP9Context *s, VP9Frame *frame)
{
    AVVideoEncParams *par;
    unsigned int tile, nb_blocks = 0;

    if (s->s.h.segmentation.enabled) {
        for (tile = 0; tile < s->active_tile_cols; tile++)
            nb_blocks += s->td[tile].nb_block_structure;
    }

    par = av_video_enc_params_create_side_data(frame->tf.f,
        AV_VIDEO_ENC_PARAMS_VP9, nb_blocks);
    if (!par)
        return AVERROR(ENOMEM);

    par->qp             = s->s.h.yac_qi;
    par->delta_qp[0][0] = s->s.h.ydc_qdelta;
    par->delta_qp[1][0] = s->s.h.uvdc_qdelta;
    par->delta_qp[2][0] = s->s.h.uvdc_qdelta;
    par->delta_qp[1][1] = s->s.h.uvac_qdelta;
    par->delta_qp[2][1] = s->s.h.uvac_qdelta;

    if (nb_blocks) {
        unsigned int block = 0;
        unsigned int block_tile;

        for (tile = 0; tile < s->active_tile_cols; tile++) {
            VP9TileData *td = &s->td[tile];

            for (block_tile = 0; block_tile < td->nb_block_structure; block_tile++) {
                AVVideoBlockParams *b = av_video_enc_params_block(par, block++);
                unsigned int row = td->block_structure[block_tile].row;
                unsigned int col = td->block_structure[block_tile].col;
                uint8_t seg_id = frame->segmentation_map[row * 8 * s->sb_cols + col];

                b->src_x = col * 8;
                b->src_y = row * 8;
                b->w     = 1 << (3 + td->block_structure[block_tile].block_size_idx_x);
                b->h     = 1 << (3 + td->block_structure[block_tile].block_size_idx_y);

                if (s->s.h.segmentation.feat[seg_id].q_enabled) {
                    b->delta_qp = s->s.h.segmentation.feat[seg_id].q_val;
                    if (s->s.h.segmentation.absolute_vals)
                        b->delta_qp -= par->qp;
                }
            }
        }
    }

    return 0;
}

/**
 * Decode one packet.
 *
 * decode_frame_header() returns a negative error, 0, or the number of header
 * bytes consumed. On 0, the reference slot it reported through @c ref is
 * output directly and the reference set is carried over unchanged. Otherwise
 * the segmentation-map and MV-pair helper frames are updated from the
 * previous current frame, a new current frame is allocated, next_refs[] is
 * built according to refreshrefmask, and the tile data is decoded (via the
 * hwaccel callbacks, the slice-threaded path, or decode_tiles()). At the end
 * next_refs[] is copied into refs[] and, unless the frame is marked
 * invisible, the current frame is returned in @p frame.
 *
 * @return pkt->size on success, a negative AVERROR code on failure
 */
static int vp9_decode_frame(AVCodecContext *avctx, AVFrame *frame,
                            int *got_frame, AVPacket *pkt)
{
    const uint8_t *data = pkt->data;
    int size = pkt->size;
    VP9Context *s = avctx->priv_data;
    int ret, i, j, ref;
    // evaluated before the header is parsed, i.e. on the previous frame's
    // segmentation settings
    int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
                            (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
    AVFrame *f;

    if ((ret = decode_frame_header(avctx, data, size, &ref)) < 0) {
        return ret;
    } else if (ret == 0) {
        // no new frame data: output the requested reference as-is
        if (!s->s.refs[ref].f->buf[0]) {
            av_log(avctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
            return AVERROR_INVALIDDATA;
        }
        if ((ret = av_frame_ref(frame, s->s.refs[ref].f)) < 0)
            return ret;
        frame->pts     = pkt->pts;
        frame->pkt_dts = pkt->dts;
        for (i = 0; i < 8; i++) {
            if (s->next_refs[i].f->buf[0])
                ff_thread_release_ext_buffer(avctx, &s->next_refs[i]);
            if (s->s.refs[i].f->buf[0] &&
                (ret = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i])) < 0)
                return ret;
        }
        *got_frame = 1;
        return pkt->size;
    }
    data += ret;
    size -= ret;

    if (!retain_segmap_ref || s->s.h.keyframe || s->s.h.intraonly) {
        if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0])
            vp9_frame_unref(avctx, &s->s.frames[REF_FRAME_SEGMAP]);
        if (!s->s.h.keyframe && !s->s.h.intraonly && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
            (ret = vp9_frame_ref(avctx, &s->s.frames[REF_FRAME_SEGMAP], &s->s.frames[CUR_FRAME])) < 0)
            return ret;
    }
    if (s->s.frames[REF_FRAME_MVPAIR].tf.f->buf[0])
        vp9_frame_unref(avctx, &s->s.frames[REF_FRAME_MVPAIR]);
    if (!s->s.h.intraonly && !s->s.h.keyframe && !s->s.h.errorres && s->s.frames[CUR_FRAME].tf.f->buf[0] &&
        (ret = vp9_frame_ref(avctx, &s->s.frames[REF_FRAME_MVPAIR], &s->s.frames[CUR_FRAME])) < 0)
        return ret;
    if (s->s.frames[CUR_FRAME].tf.f->buf[0])
        vp9_frame_unref(avctx, &s->s.frames[CUR_FRAME]);
    if ((ret = vp9_frame_alloc(avctx, &s->s.frames[CUR_FRAME])) < 0)
        return ret;
    f = s->s.frames[CUR_FRAME].tf.f;
    f->key_frame = s->s.h.keyframe;
    f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;

    // drop the segmentation-map reference if the frame size changed
    if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
        (s->s.frames[REF_FRAME_MVPAIR].tf.f->width  != s->s.frames[CUR_FRAME].tf.f->width ||
         s->s.frames[REF_FRAME_MVPAIR].tf.f->height != s->s.frames[CUR_FRAME].tf.f->height)) {
        vp9_frame_unref(avctx, &s->s.frames[REF_FRAME_SEGMAP]);
    }

    // ref frame setup
    for (i = 0; i < 8; i++) {
        if (s->next_refs[i].f->buf[0])
            ff_thread_release_ext_buffer(avctx, &s->next_refs[i]);
        if (s->s.h.refreshrefmask & (1 << i)) {
            ret = ff_thread_ref_frame(&s->next_refs[i], &s->s.frames[CUR_FRAME].tf);
        } else if (s->s.refs[i].f->buf[0]) {
            ret = ff_thread_ref_frame(&s->next_refs[i], &s->s.refs[i]);
        }
        if (ret < 0)
            return ret;
    }

    if (avctx->hwaccel) {
        ret = avctx->hwaccel->start_frame(avctx, NULL, 0);
        if (ret < 0)
            return ret;
        ret = avctx->hwaccel->decode_slice(avctx, pkt->data, pkt->size);
        if (ret < 0)
            return ret;
        ret = avctx->hwaccel->end_frame(avctx);
        if (ret < 0)
            return ret;
        goto finish;
    }

    // main tile decode loop
    memset(s->above_partition_ctx, 0, s->cols);
    memset(s->above_skip_ctx, 0, s->cols);
    if (s->s.h.keyframe || s->s.h.intraonly) {
        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
    } else {
        memset(s->above_mode_ctx, NEARESTMV, s->cols);
    }
    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 16 >> s->ss_h);
    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 16 >> s->ss_h);
    memset(s->above_segpred_ctx, 0, s->cols);
    // two passes are used only with frame threading when probabilities are
    // adapted from this frame's counts (refreshctx set, parallelmode clear)
    s->pass = s->s.frames[CUR_FRAME].uses_2pass =
        avctx->active_thread_type == FF_THREAD_FRAME && s->s.h.refreshctx && !s->s.h.parallelmode;
    if ((ret = update_block_buffers(avctx)) < 0) {
        av_log(avctx, AV_LOG_ERROR,
               "Failed to allocate block buffers\n");
        return ret;
    }
    if (s->s.h.refreshctx && s->s.h.parallelmode) {
        int j, k, l, m;

        // no adaptation: the context is saved straight from the header
        // probabilities
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++)
                            memcpy(s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m],
                                   s->prob.coef[i][j][k][l][m], 3);
            if (s->s.h.txfmmode == i)
                break;
        }
        s->prob_ctx[s->s.h.framectxid].p = s->prob.p;
        ff_thread_finish_setup(avctx);
    } else if (!s->s.h.refreshctx) {
        ff_thread_finish_setup(avctx);
    }

#if HAVE_THREADS
    if (avctx->active_thread_type & FF_THREAD_SLICE) {
        for (i = 0; i < s->sb_rows; i++)
            atomic_store(&s->entries[i], 0);
    }
#endif

    do {
        // rewind the per-tile block/coefficient cursors to their bases
        for (i = 0; i < s->active_tile_cols; i++) {
            s->td[i].b = s->td[i].b_base;
            s->td[i].block = s->td[i].block_base;
            s->td[i].uvblock[0] = s->td[i].uvblock_base[0];
            s->td[i].uvblock[1] = s->td[i].uvblock_base[1];
            s->td[i].eob = s->td[i].eob_base;
            s->td[i].uveob[0] = s->td[i].uveob_base[0];
            s->td[i].uveob[1] = s->td[i].uveob_base[1];
            s->td[i].error_info = 0;
        }

#if HAVE_THREADS
        if (avctx->active_thread_type == FF_THREAD_SLICE) {
            int tile_row, tile_col;

            av_assert1(!s->pass);

            // same tile-size parsing as decode_tiles(), but the range
            // decoders are stored per tile column, indexed by tile row
            for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
                    int64_t tile_size;

                    if (tile_col == s->s.h.tiling.tile_cols - 1 &&
                        tile_row == s->s.h.tiling.tile_rows - 1) {
                        tile_size = size;
                    } else {
                        tile_size = AV_RB32(data);
                        data += 4;
                        size -= 4;
                    }
                    if (tile_size > size)
                        return AVERROR_INVALIDDATA;
                    ret = ff_vpx_init_range_decoder(&s->td[tile_col].c_b[tile_row], data, tile_size);
                    if (ret < 0)
                        return ret;
                    if (vpx_rac_get_prob_branchy(&s->td[tile_col].c_b[tile_row], 128)) // marker bit
                        return AVERROR_INVALIDDATA;
                    data += tile_size;
                    size -= tile_size;
                }
            }

            ff_slice_thread_execute_with_mainfunc(avctx, decode_tiles_mt, loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols);
        } else
#endif
        {
            ret = decode_tiles(avctx, data, size);
            if (ret < 0) {
                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
                return ret;
            }
        }

        // Sum all counts fields into td[0].counts for tile threading
        if (avctx->active_thread_type == FF_THREAD_SLICE)
            for (i = 1; i < s->s.h.tiling.tile_cols; i++)
                for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
                    ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];

        if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
            ff_vp9_adapt_probs(s);
            ff_thread_finish_setup(avctx);
        }
    } while (s->pass++ == 1); // runs a second time only when pass started at 1
    ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);

    if (s->td->error_info < 0) {
        av_log(avctx, AV_LOG_ERROR, "Failed to decode tile data\n");
        s->td->error_info = 0;
        return AVERROR_INVALIDDATA;
    }
    if (avctx->export_side_data & AV_CODEC_EXPORT_DATA_VIDEO_ENC_PARAMS) {
        ret = vp9_export_enc_params(s, &s->s.frames[CUR_FRAME]);
        if (ret < 0)
            return ret;
    }

finish:
    // ref frame setup
    for (i = 0; i < 8; i++) {
        if (s->s.refs[i].f->buf[0])
            ff_thread_release_ext_buffer(avctx, &s->s.refs[i]);
        if (s->next_refs[i].f->buf[0] &&
            (ret = ff_thread_ref_frame(&s->s.refs[i], &s->next_refs[i])) < 0)
            return ret;
    }

    if (!s->s.h.invisible) {
        if ((ret = av_frame_ref(frame, s->s.frames[CUR_FRAME].tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return pkt->size;
}

/**
 * Flush callback: unreference the three internal frames and release the
 * eight reference slots in s->s.refs[]. The frame structures themselves stay
 * allocated.
 */
static void vp9_decode_flush(AVCodecContext *avctx)
{
    VP9Context *s = avctx->priv_data;
    int i;

    for (i = 0; i < 3; i++)
        vp9_frame_unref(avctx, &s->s.frames[i]);
    for (i = 0; i < 8; i++)
        ff_thread_release_ext_buffer(avctx, &s->s.refs[i]);
}

static av_cold int vp9_decode_init(AVCodecContext *avctx) +{ + VP9Context *s = avctx->priv_data; + int ret; + + s->last_bpp = 0; + s->s.h.filter.sharpness = -1; + +#if HAVE_THREADS + if (avctx->active_thread_type & FF_THREAD_SLICE) { + ret = ff_pthread_init(s, vp9_context_offsets); + if (ret < 0) + return ret; + } +#endif + + for (int i = 0; i < 3; i++) { + s->s.frames[i].tf.f = av_frame_alloc(); + if (!s->s.frames[i].tf.f) + return AVERROR(ENOMEM); + } + for (int i = 0; i < 8; i++) { + s->s.refs[i].f = av_frame_alloc(); +
s->next_refs[i].f = av_frame_alloc(); + if (!s->s.refs[i].f || !s->next_refs[i].f) + return AVERROR(ENOMEM); + } + return 0; +} + +#if HAVE_THREADS +static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src) +{ + int i, ret; + VP9Context *s = dst->priv_data, *ssrc = src->priv_data; + + for (i = 0; i < 3; i++) { + if (s->s.frames[i].tf.f->buf[0]) + vp9_frame_unref(dst, &s->s.frames[i]); + if (ssrc->s.frames[i].tf.f->buf[0]) { + if ((ret = vp9_frame_ref(dst, &s->s.frames[i], &ssrc->s.frames[i])) < 0) + return ret; + } + } + for (i = 0; i < 8; i++) { + if (s->s.refs[i].f->buf[0]) + ff_thread_release_ext_buffer(dst, &s->s.refs[i]); + if (ssrc->next_refs[i].f->buf[0]) { + if ((ret = ff_thread_ref_frame(&s->s.refs[i], &ssrc->next_refs[i])) < 0) + return ret; + } + } + + s->s.h.invisible = ssrc->s.h.invisible; + s->s.h.keyframe = ssrc->s.h.keyframe; + s->s.h.intraonly = ssrc->s.h.intraonly; + s->ss_v = ssrc->ss_v; + s->ss_h = ssrc->ss_h; + s->s.h.segmentation.enabled = ssrc->s.h.segmentation.enabled; + s->s.h.segmentation.update_map = ssrc->s.h.segmentation.update_map; + s->s.h.segmentation.absolute_vals = ssrc->s.h.segmentation.absolute_vals; + s->bytesperpixel = ssrc->bytesperpixel; + s->gf_fmt = ssrc->gf_fmt; + s->w = ssrc->w; + s->h = ssrc->h; + s->s.h.bpp = ssrc->s.h.bpp; + s->bpp_index = ssrc->bpp_index; + s->pix_fmt = ssrc->pix_fmt; + memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx)); + memcpy(&s->s.h.lf_delta, &ssrc->s.h.lf_delta, sizeof(s->s.h.lf_delta)); + memcpy(&s->s.h.segmentation.feat, &ssrc->s.h.segmentation.feat, + sizeof(s->s.h.segmentation.feat)); + + return 0; +} +#endif + +const FFCodec ff_vp9_decoder = { + .p.name = "vp9", + CODEC_LONG_NAME("Google VP9"), + .p.type = AVMEDIA_TYPE_VIDEO, + .p.id = AV_CODEC_ID_VP9, + .priv_data_size = sizeof(VP9Context), + .init = vp9_decode_init, + .close = vp9_decode_free, + FF_CODEC_DECODE_CB(vp9_decode_frame), + .p.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS 
| AV_CODEC_CAP_SLICE_THREADS, + .caps_internal = FF_CODEC_CAP_INIT_CLEANUP | + FF_CODEC_CAP_SLICE_THREAD_HAS_MF | + FF_CODEC_CAP_ALLOCATE_PROGRESS, + .flush = vp9_decode_flush, + UPDATE_THREAD_CONTEXT(vp9_decode_update_thread_context), + .p.profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles), + .bsfs = "vp9_superframe_split", + .hw_configs = (const AVCodecHWConfigInternal *const []) { +#if CONFIG_VP9_DXVA2_HWACCEL + HWACCEL_DXVA2(vp9), +#endif +#if CONFIG_VP9_D3D11VA_HWACCEL + HWACCEL_D3D11VA(vp9), +#endif +#if CONFIG_VP9_D3D11VA2_HWACCEL + HWACCEL_D3D11VA2(vp9), +#endif +#if CONFIG_VP9_NVDEC_HWACCEL + HWACCEL_NVDEC(vp9), +#endif +#if CONFIG_VP9_VAAPI_HWACCEL + HWACCEL_VAAPI(vp9), +#endif +#if CONFIG_VP9_VDPAU_HWACCEL + HWACCEL_VDPAU(vp9), +#endif +#if CONFIG_VP9_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(vp9), +#endif + NULL + }, +}; diff --git a/media/ffvpx/libavcodec/vp9.h b/media/ffvpx/libavcodec/vp9.h new file mode 100644 index 0000000000..c8d07ad986 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9.h @@ -0,0 +1,73 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9_H
+#define AVCODEC_VP9_H
+
+/*
+ * Transform sizes and modes.
+ * TX_4X4..TX_32X32 take the values 0..3 and N_TXFM_SIZES (4) counts them.
+ * TX_SWITCHABLE deliberately shares the value of N_TXFM_SIZES, so
+ * N_TXFM_MODES evaluates to 5.
+ */
+enum TxfmMode {
+    TX_4X4,
+    TX_8X8,
+    TX_16X16,
+    TX_32X32,
+    N_TXFM_SIZES,
+    TX_SWITCHABLE = N_TXFM_SIZES,
+    N_TXFM_MODES
+};
+
+/*
+ * Transform type identifiers, values 0..3; N_TXFM_TYPES (4) is the count.
+ * NOTE(review): which half of each name is the row and which the column
+ * transform is not shown in this header - confirm against the users.
+ */
+enum TxfmType {
+    DCT_DCT,
+    DCT_ADST,
+    ADST_DCT,
+    ADST_ADST,
+    N_TXFM_TYPES
+};
+
+/*
+ * Intra prediction mode identifiers, values 0..14; N_INTRA_PRED_MODES (15)
+ * is the count. The numeric order matters to code that stores these values
+ * in byte arrays (e.g. memset() of a mode context with DC_PRED).
+ */
+enum IntraPredMode {
+    VERT_PRED,
+    HOR_PRED,
+    DC_PRED,
+    DIAG_DOWN_LEFT_PRED,
+    DIAG_DOWN_RIGHT_PRED,
+    VERT_RIGHT_PRED,
+    HOR_DOWN_PRED,
+    VERT_LEFT_PRED,
+    HOR_UP_PRED,
+    TM_VP8_PRED,
+    LEFT_DC_PRED,
+    TOP_DC_PRED,
+    DC_128_PRED,
+    DC_127_PRED,
+    DC_129_PRED,
+    N_INTRA_PRED_MODES
+};
+
+/*
+ * Interpolation filter identifiers, values 0..3; N_FILTERS (4) is the count.
+ * FILTER_SWITCHABLE deliberately shares the value of N_FILTERS.
+ */
+enum FilterMode {
+    FILTER_8TAP_SMOOTH,
+    FILTER_8TAP_REGULAR,
+    FILTER_8TAP_SHARP,
+    FILTER_BILINEAR,
+    N_FILTERS,
+    FILTER_SWITCHABLE = N_FILTERS,
+};
+
+#endif /* AVCODEC_VP9_H */
diff --git a/media/ffvpx/libavcodec/vp9_mc_template.c b/media/ffvpx/libavcodec/vp9_mc_template.c
new file mode 100644
index 0000000000..e654c0e5ed
--- /dev/null
+++ b/media/ffvpx/libavcodec/vp9_mc_template.c
@@ -0,0 +1,439 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * Component-wise rounded average of two / four motion vectors, yielding a
+ * VP9mv compound literal. The arguments are expanded without parentheses
+ * and evaluated more than once, so only pass plain lvalues.
+ */
+#define ROUNDED_DIV_MVx2(a, b) \
+    (VP9mv) { .x = ROUNDED_DIV(a.x + b.x, 2), .y = ROUNDED_DIV(a.y + b.y, 2) }
+#define ROUNDED_DIV_MVx4(a, b, c, d) \
+    (VP9mv) { .x = ROUNDED_DIV(a.x + b.x + c.x + d.x, 4), \
+              .y = ROUNDED_DIV(a.y + b.y + c.y + d.y, 4) }
+
+/*
+ * Inter prediction of the current block td->b into td->dst[0] (luma) and
+ * td->dst[1], td->dst[2] (chroma).
+ *
+ * FN(), SCALED, BYTES_PER_PIXEL, mc_luma_dir() and mc_chroma_dir() are not
+ * defined in this file; presumably the including file provides them
+ * (TODO confirm).
+ *
+ * - b->bs > BS_8x8: the block is predicted as 8x4, 4x8 or 4x4 sub-blocks,
+ *   each with its own vector b->mv[n][ref]. Where a chroma plane is
+ *   subsampled, the luma vectors covering one chroma block are merged with
+ *   ROUNDED_DIV_MVx2 / ROUNDED_DIV_MVx4. The dedicated 8x4 and 4x8 branches
+ *   are only compiled when SCALED == 0; otherwise every such block takes
+ *   the 4x4 path.
+ *   NOTE(review): the comparison relies on the block-size enum placing the
+ *   sub-8x8 sizes after BS_8x8 - the enum is not visible here.
+ * - otherwise: one vector, b->mv[0][ref], covers the whole block.
+ *
+ * For compound prediction (b->comp) every call is repeated for the second
+ * reference, using mc[..][..][1] and a final argument of 1 instead of 0.
+ */
+static void FN(inter_pred)(VP9TileData *td)
+{
+    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
+        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
+        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
+    };
+    const VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col;
+    const ThreadFrame *tref1 = &s->s.refs[s->s.h.refidx[b->ref[0]]], *tref2;
+    const AVFrame *ref1 = tref1->f, *ref2;
+    int w1 = ref1->width, h1 = ref1->height, w2, h2;
+    ptrdiff_t ls_y = td->y_stride, ls_uv = td->uv_stride;
+    int bytesperpixel = BYTES_PER_PIXEL;
+
+    // tref2/ref2/w2/h2 are only set, and only read, when b->comp is set
+    if (b->comp) {
+        tref2 = &s->s.refs[s->s.h.refidx[b->ref[1]]];
+        ref2  = tref2->f;
+        w2    = ref2->width;
+        h2    = ref2->height;
+    }
+
+    // y inter pred
+    if (b->bs > BS_8x8) {
+        VP9mv uvmv;
+
+#if SCALED == 0
+        if (b->bs == BS_8x4) {
+            // two 8x4 luma halves: top uses mv[0], bottom uses mv[2]
+            mc_luma_dir(td, mc[3][b->filter][0], td->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 8, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[3][b->filter][0],
+                        td->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],,,,, 8, 4, w1, h1, 0);
+            // from here on w1/h1 hold the (rounded-up) chroma plane size
+            w1 = (w1 + s->ss_h) >> s->ss_h;
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 2, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            } else {
+                mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << 3, col << (3 - s->ss_h),
+                              &b->mv[0][0],,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+                // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                // to get the motion vector for the bottom 4x4 block
+                // https://code.google.com/p/webm/issues/detail?id=993
+                if (s->ss_h == 0) {
+                    uvmv = b->mv[2][0];
+                } else {
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                }
+                mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][0],
+                              td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              (row << 3) + 4, col << (3 - s->ss_h),
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
+            }
+
+            // same sequence for the second reference
+            if (b->comp) {
+                mc_luma_dir(td, mc[3][b->filter][1], td->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 8, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[3][b->filter][1],
+                            td->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1],,,,, 8, 4, w2, h2, 1);
+                w2 = (w2 + s->ss_h) >> s->ss_h;
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 2, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << 3, col << (3 - s->ss_h),
+                                  &b->mv[0][1],,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                    // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
+                    // to get the motion vector for the bottom 4x4 block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    if (s->ss_h == 0) {
+                        uvmv = b->mv[2][1];
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                    }
+                    mc_chroma_dir(td, mc[3 + s->ss_h][b->filter][1],
+                                  td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  (row << 3) + 4, col << (3 - s->ss_h),
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
+                }
+            }
+        } else if (b->bs == BS_4x8) {
+            // two 4x8 luma halves: left uses mv[0], right uses mv[1]
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],,,,, 4, 8, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],,,,, 4, 8, w1, h1, 0);
+            h1 = (h1 + s->ss_v) >> s->ss_v;
+            if (s->ss_h) {
+                w1 = (w1 + 1) >> 1;
+                uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                mc_chroma_dir(td, mc[4][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 2,
+                              &uvmv,,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            } else {
+                mc_chroma_dir(td, mc[4][b->filter][0],
+                              td->dst[1], td->dst[2], ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), col << 3,
+                              &b->mv[0][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+                mc_chroma_dir(td, mc[4][b->filter][0],
+                              td->dst[1] + 4 * bytesperpixel,
+                              td->dst[2] + 4 * bytesperpixel, ls_uv,
+                              ref1->data[1], ref1->linesize[1],
+                              ref1->data[2], ref1->linesize[2], tref1,
+                              row << (3 - s->ss_v), (col << 3) + 4,
+                              &b->mv[1][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
+            }
+
+            // same sequence for the second reference
+            if (b->comp) {
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1],,,,, 4, 8, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1],,,,, 4, 8, w2, h2, 1);
+                h2 = (h2 + s->ss_v) >> s->ss_v;
+                if (s->ss_h) {
+                    w2 = (w2 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                    mc_chroma_dir(td, mc[4][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 2,
+                                  &uvmv,,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                } else {
+                    mc_chroma_dir(td, mc[4][b->filter][1],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), col << 3,
+                                  &b->mv[0][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                    mc_chroma_dir(td, mc[4][b->filter][1],
+                                  td->dst[1] + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref2->data[1], ref2->linesize[1],
+                                  ref2->data[2], ref2->linesize[2], tref2,
+                                  row << (3 - s->ss_v), (col << 3) + 4,
+                                  &b->mv[1][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
+                }
+            }
+        } else
+#endif
+        {
+#if SCALED == 0
+            av_assert2(b->bs == BS_4x4);
+#endif
+
+            // four 4x4 luma quadrants, mv[0..3] in raster order
+            // FIXME if two horizontally adjacent blocks have the same MV,
+            // do a w8 instead of a w4 call
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, col << 3, &b->mv[0][0],
+                        0, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0], td->dst[0] + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        row << 3, (col << 3) + 4, &b->mv[1][0],
+                        4, 0, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0],
+                        td->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, col << 3, &b->mv[2][0],
+                        0, 4, 8, 8, 4, 4, w1, h1, 0);
+            mc_luma_dir(td, mc[4][b->filter][0],
+                        td->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                        ref1->data[0], ref1->linesize[0], tref1,
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0],
+                        4, 4, 8, 8, 4, 4, w1, h1, 0);
+            if (s->ss_v) {
+                h1 = (h1 + 1) >> 1;
+                if (s->ss_h) {
+                    // both directions subsampled: one chroma block, average of all four MVs
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx4(b->mv[0][0], b->mv[1][0],
+                                            b->mv[2][0], b->mv[3][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 2,
+                                  &uvmv, 0, 0, 4, 4, 4, 4, w1, h1, 0);
+                } else {
+                    // vertical subsampling only: average each column's two MVs
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, col << 3,
+                                  &uvmv, 0, 0, 8, 4, 4, 4, w1, h1, 0);
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 2, (col << 3) + 4,
+                                  &uvmv, 4, 0, 8, 4, 4, 4, w1, h1, 0);
+                }
+            } else {
+                if (s->ss_h) {
+                    // horizontal subsampling only: average each row's MVs
+                    w1 = (w1 + 1) >> 1;
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[1][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 2,
+                                  &uvmv, 0, 0, 4, 8, 4, 4, w1, h1, 0);
+                    // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                    // bottom block
+                    // https://code.google.com/p/webm/issues/detail?id=993
+                    uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[2][0]);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 2,
+                                  &uvmv, 0, 4, 4, 8, 4, 4, w1, h1, 0);
+                } else {
+                    // no chroma subsampling: chroma mirrors the four luma quadrants
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1], td->dst[2], ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, col << 3,
+                                  &b->mv[0][0], 0, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  row << 3, (col << 3) + 4,
+                                  &b->mv[1][0], 4, 0, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, col << 3,
+                                  &b->mv[2][0], 0, 4, 8, 8, 4, 4, w1, h1, 0);
+                    mc_chroma_dir(td, mc[4][b->filter][0],
+                                  td->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                  td->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                  ref1->data[1], ref1->linesize[1],
+                                  ref1->data[2], ref1->linesize[2], tref1,
+                                  (row << 3) + 4, (col << 3) + 4,
+                                  &b->mv[3][0], 4, 4, 8, 8, 4, 4, w1, h1, 0);
+                }
+            }
+
+            // same sequence for the second reference
+            if (b->comp) {
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, col << 3, &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1], td->dst[0] + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1],
+                            td->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                mc_luma_dir(td, mc[4][b->filter][1],
+                            td->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
+                            ref2->data[0], ref2->linesize[0], tref2,
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                if (s->ss_v) {
+                    h2 = (h2 + 1) >> 1;
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx4(b->mv[0][1], b->mv[1][1],
+                                                b->mv[2][1], b->mv[3][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 2,
+                                      &uvmv, 0, 0, 4, 4, 4, 4, w2, h2, 1);
+                    } else {
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, col << 3,
+                                      &uvmv, 0, 0, 8, 4, 4, 4, w2, h2, 1);
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * bytesperpixel,
+                                      td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 2, (col << 3) + 4,
+                                      &uvmv, 4, 0, 8, 4, 4, 4, w2, h2, 1);
+                    }
+                } else {
+                    if (s->ss_h) {
+                        w2 = (w2 + 1) >> 1;
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[1][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 2,
+                                      &uvmv, 0, 0, 4, 8, 4, 4, w2, h2, 1);
+                        // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
+                        // bottom block
+                        // https://code.google.com/p/webm/issues/detail?id=993
+                        uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[2][1]);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 2,
+                                      &uvmv, 0, 4, 4, 8, 4, 4, w2, h2, 1);
+                    } else {
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1], td->dst[2], ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, col << 3,
+                                      &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * bytesperpixel,
+                                      td->dst[2] + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      row << 3, (col << 3) + 4,
+                                      &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * ls_uv, td->dst[2] + 4 * ls_uv, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, col << 3,
+                                      &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
+                        mc_chroma_dir(td, mc[4][b->filter][1],
+                                      td->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
+                                      td->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
+                                      ref2->data[1], ref2->linesize[1],
+                                      ref2->data[2], ref2->linesize[2], tref2,
+                                      (row << 3) + 4, (col << 3) + 4,
+                                      &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
+                    }
+                }
+            }
+        }
+    } else {
+        // single motion vector for the whole block; sizes come from the lookup tables
+        int bwl = bwlog_tab[0][b->bs];
+        int bw = ff_vp9_bwh_tab[0][b->bs][0] * 4;
+        int bh = ff_vp9_bwh_tab[0][b->bs][1] * 4;
+        int uvbw = ff_vp9_bwh_tab[s->ss_h][b->bs][0] * 4;
+        int uvbh = ff_vp9_bwh_tab[s->ss_v][b->bs][1] * 4;
+
+        mc_luma_dir(td, mc[bwl][b->filter][0], td->dst[0], ls_y,
+                    ref1->data[0], ref1->linesize[0], tref1,
+                    row << 3, col << 3, &b->mv[0][0], 0, 0, bw, bh, bw, bh, w1, h1, 0);
+        w1 = (w1 + s->ss_h) >> s->ss_h;
+        h1 = (h1 + s->ss_v) >> s->ss_v;
+        mc_chroma_dir(td, mc[bwl + s->ss_h][b->filter][0],
+                      td->dst[1], td->dst[2], ls_uv,
+                      ref1->data[1], ref1->linesize[1],
+                      ref1->data[2], ref1->linesize[2], tref1,
+                      row << (3 - s->ss_v), col << (3 - s->ss_h),
+                      &b->mv[0][0], 0, 0, uvbw, uvbh, uvbw, uvbh, w1, h1, 0);
+
+        if (b->comp) {
+            mc_luma_dir(td, mc[bwl][b->filter][1], td->dst[0], ls_y,
+                        ref2->data[0], ref2->linesize[0], tref2,
+                        row << 3, col << 3, &b->mv[0][1], 0, 0, bw, bh, bw, bh, w2, h2, 1);
+            w2 = (w2 + s->ss_h) >> s->ss_h;
+            h2 = (h2 + s->ss_v) >> s->ss_v;
+            mc_chroma_dir(td, mc[bwl + s->ss_h][b->filter][1],
+                          td->dst[1], td->dst[2], ls_uv,
+                          ref2->data[1], ref2->linesize[1],
+                          ref2->data[2], ref2->linesize[2], tref2,
+                          row << (3 - s->ss_v), col << (3 - s->ss_h),
+                          &b->mv[0][1], 0, 0, uvbw, uvbh, uvbw, uvbh, w2, h2, 1);
+        }
+    }
+}
diff --git a/media/ffvpx/libavcodec/vp9_parser.c b/media/ffvpx/libavcodec/vp9_parser.c
new file mode 100644
index 0000000000..ffcb93505f
--- /dev/null
+++ b/media/ffvpx/libavcodec/vp9_parser.c
@@ -0,0 +1,70 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/intreadwrite.h"
+#include "libavcodec/get_bits.h"
+#include "parser.h"
+
+/*
+ * Parser callback: passes the input through unchanged and peeks at the
+ * first header bits to fill in stream properties.
+ *
+ * *out_data / *out_size are always set to data / size, and size is always
+ * returned, including when the packet is empty, the bit reader cannot be
+ * initialised, or the profile is out of range.
+ *
+ * On a usable header it sets avctx->profile, ctx->pict_type and
+ * ctx->key_frame.
+ */
+static int parse(AVCodecParserContext *ctx,
+                 AVCodecContext *avctx,
+                 const uint8_t **out_data, int *out_size,
+                 const uint8_t *data, int size)
+{
+    GetBitContext gb;
+    int res, profile, keyframe;
+
+    *out_data = data;
+    *out_size = size;
+
+    if (!size || (res = init_get_bits8(&gb, data, size)) < 0)
+        return size; // parsers can't return errors
+    get_bits(&gb, 2); // frame marker
+    // profile is sent low bit first; a third bit follows only for value 3
+    profile  = get_bits1(&gb);
+    profile |= get_bits1(&gb) << 1;
+    if (profile == 3) profile += get_bits1(&gb);
+    if (profile > 3)
+        return size;
+
+    avctx->profile = profile;
+
+    // a set bit here means "not a key frame" (presumably the
+    // show-existing-frame flag - TODO confirm against the VP9 spec);
+    // otherwise the next bit is 0 for a key frame
+    if (get_bits1(&gb)) {
+        keyframe = 0;
+    } else {
+        keyframe = !get_bits1(&gb);
+    }
+
+    if (!keyframe) {
+        ctx->pict_type = AV_PICTURE_TYPE_P;
+        ctx->key_frame = 0;
+    } else {
+        ctx->pict_type = AV_PICTURE_TYPE_I;
+        ctx->key_frame = 1;
+    }
+
+    return size;
+}
+
+/* Parser registration for AV_CODEC_ID_VP9; only the parse callback is set. */
+const AVCodecParser ff_vp9_parser = {
+    .codec_ids    = { AV_CODEC_ID_VP9 },
+    .parser_parse = parse,
+};
diff --git a/media/ffvpx/libavcodec/vp9_superframe_split_bsf.c b/media/ffvpx/libavcodec/vp9_superframe_split_bsf.c
new file mode 100644
index 0000000000..cddd48119c
--- /dev/null
+++ b/media/ffvpx/libavcodec/vp9_superframe_split_bsf.c
@@ -0,0 +1,170 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * This bitstream filter splits VP9 superframes into packets containing
+ * just one frame.
+ */
+
+#include <stddef.h>
+
+#include "bsf.h"
+#include "bsf_internal.h"
+#include "bytestream.h"
+#include "get_bits.h"
+
+/* Per-filter state carried between calls while a superframe is split. */
+typedef struct VP9SFSplitContext {
+    AVPacket *buffer_pkt;       // input packet currently being split (data == NULL when idle)
+
+    int nb_frames;              // number of frames in the buffered superframe
+    int next_frame;             // index of the next frame to output
+    size_t next_frame_offset;   // byte offset of that frame inside buffer_pkt
+    int sizes[8];               // frame sizes; 8 entries because nb_frames = 1 + (marker & 7)
+} VP9SFSplitContext;
+
+/**
+ * Output one frame per call.
+ *
+ * When nothing is buffered, the next input packet is fetched and its last
+ * byte is checked for a superframe marker (top three bits 110). If the
+ * index at the end of the packet is consistent, the per-frame sizes are
+ * stored; otherwise the packet is passed through untouched. When a
+ * superframe is buffered, `out` becomes a reference to the buffered packet
+ * narrowed to the next frame, and the buffer is released once the last
+ * frame has been handed out.
+ *
+ * @return 0 on success, a negative error code otherwise (including
+ *         whatever ff_bsf_get_packet_ref() reports when no input is
+ *         available); AVERROR(EINVAL) for an invalid frame size in the index
+ */
+static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
+{
+    VP9SFSplitContext *s = ctx->priv_data;
+    AVPacket *in;
+    int i, j, ret, marker;
+    // non-zero when a previous call left part of a superframe buffered
+    int is_superframe = !!s->buffer_pkt->data;
+
+    if (!s->buffer_pkt->data) {
+        ret = ff_bsf_get_packet_ref(ctx, s->buffer_pkt);
+        if (ret < 0)
+            return ret;
+        in = s->buffer_pkt;
+
+        if (!in->size)
+            goto passthrough;
+
+        marker = in->data[in->size - 1];
+        if ((marker & 0xe0) == 0xc0) {
+            int length_size = 1 + ((marker >> 3) & 0x3);
+            int nb_frames   = 1 + (marker & 0x7);
+            // index = marker byte + size fields + repeated marker byte
+            int idx_size    = 2 + nb_frames * length_size;
+
+            if (in->size >= idx_size && in->data[in->size - idx_size] == marker) {
+                GetByteContext bc;
+                int64_t total_size = 0;
+
+                bytestream2_init(&bc, in->data + in->size + 1 - idx_size,
+                                 nb_frames * length_size);
+
+                for (i = 0; i < nb_frames; i++) {
+                    int frame_size = 0;
+                    // sizes are stored least significant byte first
+                    for (j = 0; j < length_size; j++)
+                        frame_size |= bytestream2_get_byte(&bc) << (j * 8);
+
+                    total_size += frame_size;
+                    // frames must be non-empty and fit in front of the index
+                    if (frame_size <= 0 || total_size > in->size - idx_size) {
+                        av_log(ctx, AV_LOG_ERROR,
+                               "Invalid frame size in a superframe: %d\n", frame_size);
+                        ret = AVERROR(EINVAL);
+                        goto fail;
+                    }
+                    s->sizes[i] = frame_size;
+                }
+                s->nb_frames         = nb_frames;
+                s->next_frame        = 0;
+                s->next_frame_offset = 0;
+                is_superframe        = 1;
+            }
+        }
+    }
+
+    if (is_superframe) {
+        GetBitContext gb;
+        int profile, invisible = 0;
+
+        ret = av_packet_ref(out, s->buffer_pkt);
+        if (ret < 0)
+            goto fail;
+
+        // narrow the reference to the current frame
+        out->data += s->next_frame_offset;
+        out->size  = s->sizes[s->next_frame];
+
+        s->next_frame_offset += out->size;
+        s->next_frame++;
+
+        // last frame handed out: drop the buffered superframe
+        if (s->next_frame >= s->nb_frames)
+            av_packet_unref(s->buffer_pkt);
+
+        ret = init_get_bits8(&gb, out->data, out->size);
+        if (ret < 0)
+            goto fail;
+
+        get_bits(&gb, 2); // frame_marker
+        profile  = get_bits1(&gb);
+        profile |= get_bits1(&gb) << 1;
+        if (profile == 3)
+            get_bits1(&gb);
+        // NOTE(review): the three bits below are presumably
+        // show_existing_frame, frame_type and show_frame - confirm
+        // against the VP9 uncompressed header layout
+        if (!get_bits1(&gb)) {
+            get_bits1(&gb);
+            invisible = !get_bits1(&gb);
+        }
+
+        // frames flagged invisible are emitted without a timestamp
+        if (invisible)
+            out->pts = AV_NOPTS_VALUE;
+
+    } else {
+passthrough:
+        av_packet_move_ref(out, s->buffer_pkt);
+    }
+
+    return 0;
+fail:
+    if (ret < 0)
+        av_packet_unref(out);
+    av_packet_unref(s->buffer_pkt);
+    return ret;
+}
+
+/* Allocate the packet used to hold a superframe between calls. */
+static int vp9_superframe_split_init(AVBSFContext *ctx)
+{
+    VP9SFSplitContext *s = ctx->priv_data;
+
+    s->buffer_pkt = av_packet_alloc();
+    if (!s->buffer_pkt)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+/* Drop any partially split superframe. */
+static void vp9_superframe_split_flush(AVBSFContext *ctx)
+{
+    VP9SFSplitContext *s = ctx->priv_data;
+    av_packet_unref(s->buffer_pkt);
+}
+
+/* Free the buffer packet allocated in init. */
+static void vp9_superframe_split_uninit(AVBSFContext *ctx)
+{
+    VP9SFSplitContext *s = ctx->priv_data;
+    av_packet_free(&s->buffer_pkt);
+}
+
+const FFBitStreamFilter ff_vp9_superframe_split_bsf = {
+    .p.name         = "vp9_superframe_split",
+    .p.codec_ids    = (const enum AVCodecID []){ AV_CODEC_ID_VP9, AV_CODEC_ID_NONE },
+    .priv_data_size = sizeof(VP9SFSplitContext),
+    .init           = vp9_superframe_split_init,
+    .flush          = vp9_superframe_split_flush,
+    .close          = vp9_superframe_split_uninit,
+    .filter         = vp9_superframe_split_filter,
+};
diff --git a/media/ffvpx/libavcodec/vp9block.c b/media/ffvpx/libavcodec/vp9block.c
new file mode 100644
index 0000000000..5743f048cc
--- /dev/null
+++ b/media/ffvpx/libavcodec/vp9block.c
@@ -0,0 +1,1457 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "threadframe.h"
+#include "vp89_rac.h"
+#include "vp9.h"
+#include "vp9data.h"
+#include "vp9dec.h"
+#include "vpx_rac.h"
+
+/**
+ * Fill a rectangle of w x h bytes with the value v.
+ *
+ * ptr points at the first row and is advanced by stride after each row.
+ * Only widths of 1, 2, 4 and 8 are handled; any other width leaves the
+ * buffer untouched. The rows are written with do/while loops, so h has to
+ * be at least 1.
+ */
+static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
+                                       ptrdiff_t stride, int v)
+{
+    switch (w) {
+    case 1:
+        do {
+            *ptr = v;
+            ptr += stride;
+        } while (--h);
+        break;
+    case 2: {
+        // replicate the byte into every lane of the wider store
+        int v16 = v * 0x0101;
+        do {
+            AV_WN16A(ptr, v16);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 4: {
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr, v32);
+            ptr += stride;
+        } while (--h);
+        break;
+    }
+    case 8: {
+#if HAVE_FAST_64BIT
+        uint64_t v64 = v * 0x0101010101010101ULL;
+        do {
+            AV_WN64A(ptr, v64);
+            ptr += stride;
+        } while (--h);
+#else
+        uint32_t v32 = v * 0x01010101;
+        do {
+            AV_WN32A(ptr, v32);
+            AV_WN32A(ptr + 4, v32);
+            ptr += stride;
+        } while (--h);
+#endif
+        break;
+    }
+    }
+}
+
+/**
+ * Parse the mode information of the current block (td->b, located at
+ * td->row / td->col) from the range coder td->c.
+ *
+ * In bitstream order this reads the segment id, the skip flag, the
+ * intra/inter flag, the transform size and then either the intra
+ * prediction modes or the inter references, inter modes, interpolation
+ * filter and motion vectors. Along the way it updates the symbol counters
+ * in td->counts, the above (s->above_*) and left (td->left_*) context
+ * arrays, the segmentation map of the current frame and the per-frame
+ * motion vector/reference buffer.
+ *
+ * The variables named c below are probability context indices derived
+ * from the above/left neighbours; have_a / have_l tell whether such a
+ * neighbour exists.
+ */
+static void decode_mode(VP9TileData *td)
+{
+    // values stored in the left/above partition contexts, per block size
+    // (used through dir##_ctx[b->bs] in SET_CTXS below)
+    static const uint8_t left_ctx[N_BS_SIZES] = {
+        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
+    };
+    static const uint8_t above_ctx[N_BS_SIZES] = {
+        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
+    };
+    // largest transform size allowed for each block size
+    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
+        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
+        TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
+    };
+    const VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col, row7 = td->row7;
+    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
+    // bw4/bh4: block size from ff_vp9_bwh_tab[1]; w4/h4: the same clipped
+    // to the frame dimensions s->cols / s->rows
+    int bw4 = ff_vp9_bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
+    int bh4 = ff_vp9_bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
+    int have_a = row > 0, have_l = col > td->tile_col_start;
+    int vref, filter_id;
+
+    /* segment id */
+    if (!s->s.h.segmentation.enabled) {
+        b->seg_id = 0;
+    } else if (s->s.h.keyframe || s->s.h.intraonly) {
+        b->seg_id = !s->s.h.segmentation.update_map ? 0 :
+                    vp89_rac_get_tree(td->c, ff_vp9_segmentation_tree,
+                                      s->s.h.segmentation.prob);
+    } else if (!s->s.h.segmentation.update_map ||
+               (s->s.h.segmentation.temporal &&
+                vpx_rac_get_prob_branchy(td->c,
+                    s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
+                                                  td->left_segpred_ctx[row7]]))) {
+        // the segment id is predicted from the reference frame's map: take
+        // the smallest id found in the area covered by this block
+        if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
+            int pred = 8, x;
+            uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
+
+            if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
+                ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
+            for (y = 0; y < h4; y++) {
+                int idx_base = (y + row) * 8 * s->sb_cols + col;
+                for (x = 0; x < w4; x++)
+                    pred = FFMIN(pred, refsegmap[idx_base + x]);
+            }
+            av_assert1(pred < 8);
+            b->seg_id = pred;
+        } else {
+            b->seg_id = 0;
+        }
+
+        memset(&s->above_segpred_ctx[col], 1, w4);
+        memset(&td->left_segpred_ctx[row7], 1, h4);
+    } else {
+        b->seg_id = vp89_rac_get_tree(td->c, ff_vp9_segmentation_tree,
+                                      s->s.h.segmentation.prob);
+
+        memset(&s->above_segpred_ctx[col], 0, w4);
+        memset(&td->left_segpred_ctx[row7], 0, h4);
+    }
+    // record the id in the current frame's segmentation map
+    if (s->s.h.segmentation.enabled &&
+        (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
+        setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
+                  bw4, bh4, 8 * s->sb_cols, b->seg_id);
+    }
+
+    /* skip flag: forced by the segment feature, otherwise coded */
+    b->skip = s->s.h.segmentation.enabled &&
+              s->s.h.segmentation.feat[b->seg_id].skip_enabled;
+    if (!b->skip) {
+        int c = td->left_skip_ctx[row7] + s->above_skip_ctx[col];
+        b->skip = vpx_rac_get_prob(td->c, s->prob.p.skip[c]);
+        td->counts.skip[c][b->skip]++;
+    }
+
+    /* intra/inter decision */
+    if (s->s.h.keyframe || s->s.h.intraonly) {
+        b->intra = 1;
+    } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
+        b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
+    } else {
+        int c, bit;
+
+        if (have_a && have_l) {
+            c = s->above_intra_ctx[col] + td->left_intra_ctx[row7];
+            c += (c == 2);
+        } else {
+            c = have_a ? 2 * s->above_intra_ctx[col] :
+                have_l ? 2 * td->left_intra_ctx[row7] : 0;
+        }
+        bit = vpx_rac_get_prob(td->c, s->prob.p.intra[c]);
+        td->counts.intra[c][bit]++;
+        b->intra = !bit;
+    }
+
+    /* transform size: coded only in TX_SWITCHABLE mode for blocks that
+     * are intra or not skipped; otherwise derived from the frame mode */
+    if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
+        int c;
+        if (have_a) {
+            if (have_l) {
+                c = (s->above_skip_ctx[col] ? max_tx :
+                     s->above_txfm_ctx[col]) +
+                    (td->left_skip_ctx[row7] ? max_tx :
+                     td->left_txfm_ctx[row7]) > max_tx;
+            } else {
+                c = s->above_skip_ctx[col] ? 1 :
+                    (s->above_txfm_ctx[col] * 2 > max_tx);
+            }
+        } else if (have_l) {
+            c = td->left_skip_ctx[row7] ? 1 :
+                (td->left_txfm_ctx[row7] * 2 > max_tx);
+        } else {
+            c = 1;
+        }
+        // the size is coded as up to three binary decisions, fewer when
+        // the largest size allowed for this block is smaller
+        switch (max_tx) {
+        case TX_32X32:
+            b->tx = vpx_rac_get_prob(td->c, s->prob.p.tx32p[c][0]);
+            if (b->tx) {
+                b->tx += vpx_rac_get_prob(td->c, s->prob.p.tx32p[c][1]);
+                if (b->tx == 2)
+                    b->tx += vpx_rac_get_prob(td->c, s->prob.p.tx32p[c][2]);
+            }
+            td->counts.tx32p[c][b->tx]++;
+            break;
+        case TX_16X16:
+            b->tx = vpx_rac_get_prob(td->c, s->prob.p.tx16p[c][0]);
+            if (b->tx)
+                b->tx += vpx_rac_get_prob(td->c, s->prob.p.tx16p[c][1]);
+            td->counts.tx16p[c][b->tx]++;
+            break;
+        case TX_8X8:
+            b->tx = vpx_rac_get_prob(td->c, s->prob.p.tx8p[c]);
+            td->counts.tx8p[c][b->tx]++;
+            break;
+        case TX_4X4:
+            b->tx = TX_4X4;
+            break;
+        }
+    } else {
+        b->tx = FFMIN(max_tx, s->s.h.txfmmode);
+    }
+
+    if (s->s.h.keyframe || s->s.h.intraonly) {
+        /* intra modes on keyframes / intra-only frames: fixed default
+         * probabilities indexed by the above and left modes */
+        uint8_t *a = &s->above_mode_ctx[col * 2];
+        uint8_t *l = &td->left_mode_ctx[(row7) << 1];
+
+        b->comp = 0;
+        if (b->bs > BS_8x8) {
+            // FIXME the memory storage intermediates here aren't really
+            // necessary, they're just there to make the code slightly
+            // simpler for now
+            b->mode[0] =
+            a[0]       = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                           ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                               ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
+                l[0] =
+                a[1] = b->mode[1];
+            } else {
+                l[0]       =
+                a[1]       =
+                b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) {
+                b->mode[2] =
+                a[0]       = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                               ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                                   ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
+                    l[1] =
+                    a[1] = b->mode[3];
+                } else {
+                    l[1]       =
+                    a[1]       =
+                    b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                l[1]       =
+                a[1]       =
+                b->mode[3] = b->mode[1];
+            }
+        } else {
+            b->mode[0] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                           ff_vp9_default_kf_ymode_probs[*a][*l]);
+            b->mode[3] =
+            b->mode[2] =
+            b->mode[1] = b->mode[0];
+            // FIXME this can probably be optimized
+            memset(a, b->mode[0], ff_vp9_bwh_tab[0][b->bs][0]);
+            memset(l, b->mode[0], ff_vp9_bwh_tab[0][b->bs][1]);
+        }
+        b->uvmode = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                      ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
+    } else if (b->intra) {
+        /* intra modes on inter frames: adaptive probabilities with
+         * counters */
+        b->comp = 0;
+        if (b->bs > BS_8x8) {
+            b->mode[0] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                           s->prob.p.y_mode[0]);
+            td->counts.y_mode[0][b->mode[0]]++;
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                               s->prob.p.y_mode[0]);
+                td->counts.y_mode[0][b->mode[1]]++;
+            } else {
+                b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                               s->prob.p.y_mode[0]);
+                td->counts.y_mode[0][b->mode[2]]++;
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                                   s->prob.p.y_mode[0]);
+                    td->counts.y_mode[0][b->mode[3]]++;
+                } else {
+                    b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                b->mode[3] = b->mode[1];
+            }
+        } else {
+            // probability set to use, per block size
+            static const uint8_t size_group[10] = {
+                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
+            };
+            int sz = size_group[b->bs];
+
+            b->mode[0] = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                           s->prob.p.y_mode[sz]);
+            b->mode[1] =
+            b->mode[2] =
+            b->mode[3] = b->mode[0];
+            td->counts.y_mode[sz][b->mode[3]]++;
+        }
+        b->uvmode = vp89_rac_get_tree(td->c, ff_vp9_intramode_tree,
+                                      s->prob.p.uv_mode[b->mode[3]]);
+        td->counts.uv_mode[b->mode[3]][b->uvmode]++;
+    } else {
+        /* inter block */
+        // inter mode context, indexed by the above and left mode contexts
+        static const uint8_t inter_mode_ctx_lut[14][14] = {
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
+        };
+
+        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
+            // the reference is dictated by the segment feature
+            av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
+            b->comp = 0;
+            b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
+        } else {
+            // read comp_pred flag
+            if (s->s.h.comppredmode != PRED_SWITCHABLE) {
+                b->comp = s->s.h.comppredmode == PRED_COMPREF;
+            } else {
+                int c;
+
+                // FIXME add intra as ref=0xff (or -1) to make these easier?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_comp_ctx[col] && td->left_comp_ctx[row7]) {
+                            c = 4;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 2 + (td->left_intra_ctx[row7] ||
+                                     td->left_ref_ctx[row7] == s->s.h.fixcompref);
+                        } else if (td->left_comp_ctx[row7]) {
+                            c = 2 + (s->above_intra_ctx[col] ||
+                                     s->above_ref_ctx[col] == s->s.h.fixcompref);
+                        } else {
+                            // NOTE(review): this is the only place that
+                            // indexes a left context with row & 7 instead
+                            // of row7 -- presumably equivalent; confirm
+                            c = (!s->above_intra_ctx[col] &&
+                                 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
+                                (!td->left_intra_ctx[row7] &&
+                                 td->left_ref_ctx[row & 7] == s->s.h.fixcompref);
+                        }
+                    } else {
+                        c = s->above_comp_ctx[col] ? 3 :
+                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
+                    }
+                } else if (have_l) {
+                    c = td->left_comp_ctx[row7] ? 3 :
+                        (!td->left_intra_ctx[row7] && td->left_ref_ctx[row7] == s->s.h.fixcompref);
+                } else {
+                    c = 1;
+                }
+                b->comp = vpx_rac_get_prob(td->c, s->prob.p.comp[c]);
+                td->counts.comp[c][b->comp]++;
+            }
+
+            // read actual references
+            // FIXME probably cache a few variables here to prevent repetitive
+            // memory accesses below
+            if (b->comp) { /* two references */
+                // one reference is fixed (fixcompref), the other one is
+                // selected by a single coded bit out of varcompref[]
+                int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
+
+                b->ref[fix_idx] = s->s.h.fixcompref;
+                // FIXME can this codeblob be replaced by some sort of LUT?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_intra_ctx[col]) {
+                            if (td->left_intra_ctx[row7]) {
+                                c = 2;
+                            } else {
+                                c = 1 + 2 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
+                            }
+                        } else if (td->left_intra_ctx[row7]) {
+                            c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
+                        } else {
+                            int refl = td->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
+
+                            if (refl == refa && refa == s->s.h.varcompref[1]) {
+                                c = 0;
+                            } else if (!td->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
+                                if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
+                                    (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
+                                    c = 4;
+                                } else {
+                                    c = (refa == refl) ? 3 : 1;
+                                }
+                            } else if (!td->left_comp_ctx[row7]) {
+                                if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refl == s->s.h.varcompref[1] &&
+                                         refa != s->s.h.varcompref[1]) ? 2 : 4;
+                                }
+                            } else if (!s->above_comp_ctx[col]) {
+                                if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refa == s->s.h.varcompref[1] &&
+                                         refl != s->s.h.varcompref[1]) ? 2 : 4;
+                                }
+                            } else {
+                                c = (refl == refa) ? 4 : 2;
+                            }
+                        }
+                    } else {
+                        if (s->above_intra_ctx[col]) {
+                            c = 2;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
+                        } else {
+                            c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
+                        }
+                    }
+                } else if (have_l) {
+                    if (td->left_intra_ctx[row7]) {
+                        c = 2;
+                    } else if (td->left_comp_ctx[row7]) {
+                        c = 4 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
+                    } else {
+                        c = 3 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vpx_rac_get_prob(td->c, s->prob.p.comp_ref[c]);
+                b->ref[var_idx] = s->s.h.varcompref[bit];
+                td->counts.comp_ref[c][bit]++;
+            } else /* single reference */ {
+                // up to two bits: the first selects reference 0, the
+                // second chooses between references 1 and 2
+                int bit, c;
+
+                if (have_a && !s->above_intra_ctx[col]) {
+                    if (have_l && !td->left_intra_ctx[row7]) {
+                        if (td->left_comp_ctx[row7]) {
+                            if (s->above_comp_ctx[col]) {
+                                c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7] ||
+                                         !s->above_ref_ctx[col]);
+                            } else {
+                                c = (3 * !s->above_ref_ctx[col]) +
+                                    (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
+                            }
+                        } else if (s->above_comp_ctx[col]) {
+                            c = (3 * !td->left_ref_ctx[row7]) +
+                                (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
+                        } else {
+                            c = 2 * !td->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
+                        }
+                    // NOTE(review): the outer condition already requires
+                    // !s->above_intra_ctx[col], so this arm looks
+                    // unreachable
+                    } else if (s->above_intra_ctx[col]) {
+                        c = 2;
+                    } else if (s->above_comp_ctx[col]) {
+                        c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
+                    } else {
+                        c = 4 * (!s->above_ref_ctx[col]);
+                    }
+                } else if (have_l && !td->left_intra_ctx[row7]) {
+                    // NOTE(review): likewise excluded by the condition
+                    // just above
+                    if (td->left_intra_ctx[row7]) {
+                        c = 2;
+                    } else if (td->left_comp_ctx[row7]) {
+                        c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
+                    } else {
+                        c = 4 * (!td->left_ref_ctx[row7]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vpx_rac_get_prob(td->c, s->prob.p.single_ref[c][0]);
+                td->counts.single_ref[c][0][bit]++;
+                if (!bit) {
+                    b->ref[0] = 0;
+                } else {
+                    // FIXME can this codeblob be replaced by some sort of LUT?
+                    if (have_a) {
+                        if (have_l) {
+                            if (td->left_intra_ctx[row7]) {
+                                if (s->above_intra_ctx[col]) {
+                                    c = 2;
+                                } else if (s->above_comp_ctx[col]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else if (!s->above_ref_ctx[col]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (s->above_intra_ctx[col]) {
+                                if (td->left_intra_ctx[row7]) {
+                                    c = 2;
+                                } else if (td->left_comp_ctx[row7]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 td->left_ref_ctx[row7] == 1);
+                                } else if (!td->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (td->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (s->above_comp_ctx[col]) {
+                                if (td->left_comp_ctx[row7]) {
+                                    if (td->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
+                                        c = 3 * (s->s.h.fixcompref == 1 ||
+                                                 td->left_ref_ctx[row7] == 1);
+                                    } else {
+                                        c = 2;
+                                    }
+                                } else if (!td->left_ref_ctx[row7]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else {
+                                    c = 3 * (td->left_ref_ctx[row7] == 1) +
+                                        (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (td->left_comp_ctx[row7]) {
+                                if (!s->above_ref_ctx[col]) {
+                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
+                                                 td->left_ref_ctx[row7] == 1);
+                                } else {
+                                    c = 3 * (s->above_ref_ctx[col] == 1) +
+                                        (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!s->above_ref_ctx[col]) {
+                                if (!td->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (td->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!td->left_ref_ctx[row7]) {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 2 * (td->left_ref_ctx[row7] == 1) +
+                                    2 * (s->above_ref_ctx[col] == 1);
+                            }
+                        } else {
+                            if (s->above_intra_ctx[col] ||
+                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
+                                c = 2;
+                            } else if (s->above_comp_ctx[col]) {
+                                c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            }
+                        }
+                    } else if (have_l) {
+                        if (td->left_intra_ctx[row7] ||
+                            (!td->left_comp_ctx[row7] && !td->left_ref_ctx[row7])) {
+                            c = 2;
+                        } else if (td->left_comp_ctx[row7]) {
+                            c = 3 * (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
+                        } else {
+                            c = 4 * (td->left_ref_ctx[row7] == 1);
+                        }
+                    } else {
+                        c = 2;
+                    }
+                    bit = vpx_rac_get_prob(td->c, s->prob.p.single_ref[c][1]);
+                    td->counts.single_ref[c][1][bit]++;
+                    b->ref[0] = 1 + bit;
+                }
+            }
+        }
+
+        /* inter mode for block sizes up to BS_8x8: one mode for the whole
+         * block (larger enum values read one mode per sub-block below) */
+        if (b->bs <= BS_8x8) {
+            if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
+                b->mode[0] =
+                b->mode[1] =
+                b->mode[2] =
+                b->mode[3] = ZEROMV;
+            } else {
+                static const uint8_t off[10] = {
+                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
+                };
+
+                // FIXME this needs to use the LUT tables from find_ref_mvs
+                // because not all are -1,0/0,-1
+                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
+                                          [td->left_mode_ctx[row7 + off[b->bs]]];
+
+                b->mode[0] = vp89_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
+                                               s->prob.p.mv_mode[c]);
+                b->mode[1] =
+                b->mode[2] =
+                b->mode[3] = b->mode[0];
+                // the counters are indexed by the mode value minus 10
+                td->counts.mv_mode[c][b->mode[0] - 10]++;
+            }
+        }
+
+        /* interpolation filter */
+        if (s->s.h.filtermode == FILTER_SWITCHABLE) {
+            int c;
+
+            // a neighbour contributes its filter only when its mode
+            // context is at least NEARESTMV
+            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
+                if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
+                    c = s->above_filter_ctx[col] == td->left_filter_ctx[row7] ?
+                        td->left_filter_ctx[row7] : 3;
+                } else {
+                    c = s->above_filter_ctx[col];
+                }
+            } else if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
+                c = td->left_filter_ctx[row7];
+            } else {
+                c = 3;
+            }
+
+            filter_id = vp89_rac_get_tree(td->c, ff_vp9_filter_tree,
+                                          s->prob.p.filter[c]);
+            td->counts.filter[c][filter_id]++;
+            b->filter = ff_vp9_filter_lut[filter_id];
+        } else {
+            b->filter = s->s.h.filtermode;
+        }
+
+        /* motion vectors */
+        if (b->bs > BS_8x8) {
+            // one mode and motion vector pair per sub-block; sub-blocks
+            // that are not coded for this block shape copy a neighbour
+            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][td->left_mode_ctx[row7]];
+
+            b->mode[0] = vp89_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
+                                           s->prob.p.mv_mode[c]);
+            td->counts.mv_mode[c][b->mode[0] - 10]++;
+            ff_vp9_fill_mv(td, b->mv[0], b->mode[0], 0);
+
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp89_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
+                                               s->prob.p.mv_mode[c]);
+                td->counts.mv_mode[c][b->mode[1] - 10]++;
+                ff_vp9_fill_mv(td, b->mv[1], b->mode[1], 1);
+            } else {
+                b->mode[1] = b->mode[0];
+                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            }
+
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp89_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
+                                               s->prob.p.mv_mode[c]);
+                td->counts.mv_mode[c][b->mode[2] - 10]++;
+                ff_vp9_fill_mv(td, b->mv[2], b->mode[2], 2);
+
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp89_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
+                                                   s->prob.p.mv_mode[c]);
+                    td->counts.mv_mode[c][b->mode[3] - 10]++;
+                    ff_vp9_fill_mv(td, b->mv[3], b->mode[3], 3);
+                } else {
+                    b->mode[3] = b->mode[2];
+                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
+                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+                b->mode[3] = b->mode[1];
+                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
+                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
+            }
+        } else {
+            // a single motion vector pair, replicated to all four slots
+            ff_vp9_fill_mv(td, b->mv[0], b->mode[0], -1);
+            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
+        }
+
+        // reference value stored in the above/left reference contexts
+        vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
+    }
+
+/* SPLAT_CTX(var, val, n): store the byte val into n consecutive bytes
+ * starting at var, using the widest aligned store available. */
+#if HAVE_FAST_64BIT
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                                    break; \
+    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
+    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
+    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
+    case 16: { \
+        uint64_t v64 = val * 0x0101010101010101ULL; \
+        AV_WN64A(              &var,     v64); \
+        AV_WN64A(&((uint8_t *) &var)[8], v64); \
+        break; \
+    } \
+    }
+#else
+#define SPLAT_CTX(var, val, n) \
+    switch (n) { \
+    case 1:  var = val;                         break; \
+    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
+    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
+    case 8: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,     v32); \
+        AV_WN32A(&((uint8_t *) &var)[4], v32); \
+        break; \
+    } \
+    case 16: { \
+        uint32_t v32 = val * 0x01010101; \
+        AV_WN32A(              &var,      v32); \
+        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
+        AV_WN32A(&((uint8_t *) &var)[12], v32); \
+        break; \
+    } \
+    }
+#endif
+
+    /* publish this block's decisions to the above context (block width)
+     * and the left context (block height) for later neighbours */
+    switch (ff_vp9_bwh_tab[1][b->bs][0]) {
+#define SET_CTXS(perf, dir, off, n) \
+    do { \
+        SPLAT_CTX(perf->dir##_skip_ctx[off],      b->skip,          n); \
+        SPLAT_CTX(perf->dir##_txfm_ctx[off],      b->tx,            n); \
+        SPLAT_CTX(perf->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
+        if (!s->s.h.keyframe && !s->s.h.intraonly) { \
+            SPLAT_CTX(perf->dir##_intra_ctx[off], b->intra,   n); \
+            SPLAT_CTX(perf->dir##_comp_ctx[off],  b->comp,    n); \
+            SPLAT_CTX(perf->dir##_mode_ctx[off],  b->mode[3], n); \
+            if (!b->intra) { \
+                SPLAT_CTX(perf->dir##_ref_ctx[off], vref, n); \
+                if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
+                    SPLAT_CTX(perf->dir##_filter_ctx[off], filter_id, n); \
+                } \
+            } \
+        } \
+    } while (0)
+    case 1: SET_CTXS(s, above, col, 1); break;
+    case 2: SET_CTXS(s, above, col, 2); break;
+    case 4: SET_CTXS(s, above, col, 4); break;
+    case 8: SET_CTXS(s, above, col, 8); break;
+    }
+    switch (ff_vp9_bwh_tab[1][b->bs][1]) {
+    case 1: SET_CTXS(td, left, row7, 1); break;
+    case 2: SET_CTXS(td, left, row7, 2); break;
+    case 4: SET_CTXS(td, left, row7, 4); break;
+    case 8: SET_CTXS(td, left, row7, 8); break;
+    }
+#undef SPLAT_CTX
+#undef SET_CTXS
+
+    /* motion vector contexts for later neighbours (two entries per
+     * row/column unit) */
+    if (!s->s.h.keyframe && !s->s.h.intraonly) {
+        if (b->bs > BS_8x8) {
+            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
+            AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
+            AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][0], mv0);
+            AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][1], mv1);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
+        } else {
+            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            for (n = 0; n < w4 * 2; n++) {
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
+            }
+            for (n = 0; n < h4 * 2; n++) {
+                AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][0], mv0);
+                AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][1], mv1);
+            }
+        }
+    }
+
+    /* store references and motion vectors in the current frame's mv
+     * buffer; a reference of -1 marks an unused slot */
+    // FIXME kinda ugly
+    for (y = 0; y < h4; y++) {
+        int x, o = (row + y) * s->sb_cols * 8 + col;
+        VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
+
+        if (b->intra) {
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] =
+                mv[x].ref[1] = -1;
+            }
+        } else if (b->comp) {
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] = b->ref[0];
+                mv[x].ref[1] = b->ref[1];
+                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
+                AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
+            }
+        } else {
+            for (x = 0; x < w4; x++) {
+                mv[x].ref[0] = b->ref[0];
+                mv[x].ref[1] = -1;
+                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
+            }
+        }
+    }
+}
+
+// FIXME merge cnt/eob arguments?
+/**
+ * Decode the coefficient tokens of one transform block.
+ *
+ * Tokens are read from the range coder c until an end-of-block token is
+ * seen or n_coeffs positions have been processed. Each non-zero value is
+ * multiplied by qmul[0] for the first position and qmul[1] for all others
+ * (and halved when is_tx32x32 is set), then stored in coef at the position
+ * given by scan[]. The stored element is 16 bits wide when is8bitsperpixel
+ * is set and 32 bits wide otherwise. cnt and eob are symbol counters that
+ * are incremented as tokens are read.
+ *
+ * @param p   token probabilities, indexed by band and context
+ * @param nnz initial context
+ * @param nb  for each position, the two positions whose cached token
+ *            class forms the context of the next token
+ * @return the position index at which decoding stopped
+ */
+static av_always_inline int
+decode_coeffs_b_generic(VPXRangeCoder *c, int16_t *coef, int n_coeffs,
+                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
+                        unsigned (*eob)[6][2], const uint8_t (*p)[6][11],
+                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
+                        const int16_t *band_counts, const int16_t *qmul)
+{
+    int i = 0, band = 0, band_left = band_counts[band];
+    const uint8_t *tp = p[0][nnz];
+    // token class (0..5) of every position decoded so far, by coefficient
+    // position; read back through nb[] to form contexts
+    uint8_t cache[1024];
+
+    do {
+        int val, rc;
+
+        val = vpx_rac_get_prob_branchy(c, tp[0]); // eob
+        eob[band][nnz][val]++;
+        if (!val)
+            break;
+
+        // re-entered after a zero token: no eob check directly after a zero
+skip_eob:
+        if (!vpx_rac_get_prob_branchy(c, tp[1])) { // zero
+            cnt[band][nnz][0]++;
+            if (!--band_left)
+                band_left = band_counts[++band];
+            cache[scan[i]] = 0;
+            nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
+            tp = p[band][nnz];
+            if (++i == n_coeffs)
+                break; //invalid input; blocks should end with EOB
+            goto skip_eob;
+        }
+
+        rc = scan[i];
+        if (!vpx_rac_get_prob_branchy(c, tp[2])) { // one
+            cnt[band][nnz][1]++;
+            val = 1;
+            cache[rc] = 1;
+        } else {
+            cnt[band][nnz][2]++;
+            if (!vpx_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
+                if (!vpx_rac_get_prob_branchy(c, tp[4])) {
+                    cache[rc] = val = 2;
+                } else {
+                    val = 3 + vpx_rac_get_prob(c, tp[5]);
+                    cache[rc] = 3;
+                }
+            } else if (!vpx_rac_get_prob_branchy(c, tp[6])) { // cat1/2
+                // from here on the value is a base plus extra bits read
+                // with fixed probabilities, most significant bit first
+                cache[rc] = 4;
+                if (!vpx_rac_get_prob_branchy(c, tp[7])) {
+                    val = vpx_rac_get_prob(c, 159) + 5;
+                } else {
+                    val = (vpx_rac_get_prob(c, 165) << 1) + 7;
+                    val += vpx_rac_get_prob(c, 145);
+                }
+            } else { // cat 3-6
+                cache[rc] = 5;
+                if (!vpx_rac_get_prob_branchy(c, tp[8])) {
+                    if (!vpx_rac_get_prob_branchy(c, tp[9])) {
+                        val = 11 + (vpx_rac_get_prob(c, 173) << 2);
+                        val += (vpx_rac_get_prob(c, 148) << 1);
+                        val += vpx_rac_get_prob(c, 140);
+                    } else {
+                        val = 19 + (vpx_rac_get_prob(c, 176) << 3);
+                        val += (vpx_rac_get_prob(c, 155) << 2);
+                        val += (vpx_rac_get_prob(c, 140) << 1);
+                        val += vpx_rac_get_prob(c, 135);
+                    }
+                } else if (!vpx_rac_get_prob_branchy(c, tp[10])) {
+                    val = (vpx_rac_get_prob(c, 180) << 4) + 35;
+                    val += (vpx_rac_get_prob(c, 157) << 3);
+                    val += (vpx_rac_get_prob(c, 141) << 2);
+                    val += (vpx_rac_get_prob(c, 134) << 1);
+                    val += vpx_rac_get_prob(c, 130);
+                } else {
+                    // the largest category has more extra bits when the
+                    // stream is not 8 bits per pixel: two more, or four
+                    // more when bpp is 12
+                    val = 67;
+                    if (!is8bitsperpixel) {
+                        if (bpp == 12) {
+                            val += vpx_rac_get_prob(c, 255) << 17;
+                            val += vpx_rac_get_prob(c, 255) << 16;
+                        }
+                        val += (vpx_rac_get_prob(c, 255) << 15);
+                        val += (vpx_rac_get_prob(c, 255) << 14);
+                    }
+                    val += (vpx_rac_get_prob(c, 254) << 13);
+                    val += (vpx_rac_get_prob(c, 254) << 12);
+                    val += (vpx_rac_get_prob(c, 254) << 11);
+                    val += (vpx_rac_get_prob(c, 252) << 10);
+                    val += (vpx_rac_get_prob(c, 249) << 9);
+                    val += (vpx_rac_get_prob(c, 243) << 8);
+                    val += (vpx_rac_get_prob(c, 230) << 7);
+                    val += (vpx_rac_get_prob(c, 196) << 6);
+                    val += (vpx_rac_get_prob(c, 177) << 5);
+                    val += (vpx_rac_get_prob(c, 153) << 4);
+                    val += (vpx_rac_get_prob(c, 140) << 3);
+                    val += (vpx_rac_get_prob(c, 133) << 2);
+                    val += (vpx_rac_get_prob(c, 130) << 1);
+                    val += vpx_rac_get_prob(c, 129);
+                }
+            }
+        }
+/* 16-bit store for 8 bits per pixel, 32-bit store (two int16_t slots per
+ * coefficient) otherwise */
+#define STORE_COEF(c, i, v) do { \
+    if (is8bitsperpixel) { \
+        c[i] = v; \
+    } else { \
+        AV_WN32A(&c[i * 2], v); \
+    } \
+} while (0)
+        if (!--band_left)
+            band_left = band_counts[++band];
+        // a sign bit follows the magnitude; the product is computed as
+        // unsigned
+        if (is_tx32x32)
+            STORE_COEF(coef, rc, (int)((vp89_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]) / 2);
+        else
+            STORE_COEF(coef, rc, (vp89_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]);
+        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
+        tp = p[band][nnz];
+    } while (++i < n_coeffs);
+
+    return i;
+}
+
+/* The four wrappers below instantiate decode_coeffs_b_generic() with
+ * constant is_tx32x32 / is8bitsperpixel arguments; the generic function
+ * is av_always_inline. */
+
+/** 8 bits per pixel, is_tx32x32 = 0. */
+static int decode_coeffs_b_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                const uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                const int16_t (*nb)[2], const int16_t *band_counts,
+                                const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+/** 8 bits per pixel, is_tx32x32 = 1. */
+static int decode_coeffs_b32_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                  const uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                  const int16_t (*nb)[2], const int16_t *band_counts,
+                                  const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+/** More than 8 bits per pixel (bpp taken from the frame header), is_tx32x32 = 0. */
+static int decode_coeffs_b_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                 const uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                 const int16_t (*nb)[2], const int16_t *band_counts,
+                                 const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 0, td->s->s.h.bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+/** More than 8 bits per pixel (bpp taken from the frame header), is_tx32x32 = 1. */
+static int decode_coeffs_b32_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
+                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
+                                   const uint8_t (*p)[6][11], int nnz, const int16_t *scan,
+                                   const int16_t (*nb)[2], const int16_t *band_counts,
+                                   const int16_t *qmul)
+{
+    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 0, td->s->s.h.bpp, cnt, eob, p,
+                                   nnz, scan, nb, band_counts, qmul);
+}
+
+/**
+ * Decode all coefficients of the current block: first the luma transform
+ * blocks, then those of the two chroma planes.
+ *
+ * Coefficients go to td->block / td->uvblock[], the per-transform-block
+ * return values of the token decoder go to td->eob / td->uveob[], and the
+ * above/left non-zero contexts are updated.
+ *
+ * @return non-zero if any transform block reported a non-zero count
+ */
+static av_always_inline int decode_coeffs(VP9TileData *td, int is8bitsperpixel)
+{
+    const VP9Context *s = td->s;
+    VP9Block *b = td->b;
+    int row = td->row, col = td->col;
+    const uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
+    unsigned (*c)[6][3] = td->counts.coef[b->tx][0 /* y */][!b->intra];
+    unsigned (*e)[6][2] = td->counts.eob[b->tx][0 /* y */][!b->intra];
+    // w4/h4 and end_x/end_y are twice the units of ff_vp9_bwh_tab[1];
+    // end_x/end_y are additionally clipped to the frame
+    int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1;
+    int end_x = FFMIN(2 * (s->cols - col), w4);
+    int end_y = FFMIN(2 * (s->rows - row), h4);
+    int n, pl, x, y, ret;
+    const int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
+    // lossless streams use a separate set of scan tables
+    int tx = 4 * s->s.h.lossless + b->tx;
+    const int16_t * const *yscans = ff_vp9_scans[tx];
+    const int16_t (* const * ynbs)[2] = ff_vp9_scans_nb[tx];
+    const int16_t *uvscan = ff_vp9_scans[b->uvtx][DCT_DCT];
+    const int16_t (*uvnb)[2] = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
+    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
+    uint8_t *l = &td->left_y_nnz_ctx[(row & 7) << 1];
+    // positions per band, one row per transform size
+    static const int16_t band_counts[4][8] = {
+        { 1, 2, 3, 4,  3,   16 - 13 },
+        { 1, 2, 3, 4, 11,   64 - 21 },
+        { 1, 2, 3, 4, 11,  256 - 21 },
+        { 1, 2, 3, 4, 11, 1024 - 21 },
+    };
+    const int16_t *y_band_counts = band_counts[b->tx];
+    const int16_t *uv_band_counts = band_counts[b->uvtx];
+    int bytesperpixel = is8bitsperpixel ? 1 : 2;
+    int total_coeff = 0;
+
+/* MERGE / MERGE_CTX: collapse each group of step context bytes into a
+ * single 0/1 flag stored in the group's first byte */
+#define MERGE(la, end, step, rd) \
+    for (n = 0; n < end; n += step) \
+        la[n] = !!rd(&la[n])
+#define MERGE_CTX(step, rd) \
+    do { \
+        MERGE(l, end_y, step, rd); \
+        MERGE(a, end_x, step, rd); \
+    } while (0)
+
+/* decode every luma transform block of the block, in raster order */
+#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[mode_index]]; \
+            ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (td, td->block + 16 * n * bytesperpixel, 16 * step * step, \
+                                     c, e, p, a[x] + l[y], yscans[txtp], \
+                                     ynbs[txtp], y_band_counts, qmul[0]); \
+            a[x] = l[y] = !!ret; \
+            total_coeff |= !!ret; \
+            if (step >= 4) { \
+                AV_WN16A(&td->eob[n], ret); \
+            } else { \
+                td->eob[n] = ret; \
+            } \
+        } \
+    }
+
+/* SPLAT / SPLAT_CTX: copy each group's flag back to all bytes of the
+ * group; cond selects the wide aligned stores, otherwise a memset limited
+ * to end is used */
+#define SPLAT(la, end, step, cond) \
+    if (step == 2) { \
+        for (n = 1; n < end; n += step) \
+            la[n] = la[n - 1]; \
+    } else if (step == 4) { \
+        if (cond) { \
+            for (n = 0; n < end; n += step) \
+                AV_WN32A(&la[n], la[n] * 0x01010101); \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
+        } \
+    } else /* step == 8 */ { \
+        if (cond) { \
+            if (HAVE_FAST_64BIT) { \
+                for (n = 0; n < end; n += step) \
+                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
+            } else { \
+                for (n = 0; n < end; n += step) { \
+                    uint32_t v32 = la[n] * 0x01010101; \
+                    AV_WN32A(&la[n],     v32); \
+                    AV_WN32A(&la[n + 4], v32); \
+                } \
+            } \
+        } else { \
+            for (n = 0; n < end; n += step) \
+                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
+        } \
+    }
+#define SPLAT_CTX(step) \
+    do { \
+        SPLAT(a, end_x, step, end_x == w4); \
+        SPLAT(l, end_y, step, end_y == h4); \
+    } while (0)
+
+    /* y tokens */
+    switch (b->tx) {
+    case TX_4X4:
+        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
+        break;
+    case TX_8X8:
+        MERGE_CTX(2, AV_RN16A);
+        DECODE_Y_COEF_LOOP(2, 0,);
+        SPLAT_CTX(2);
+        break;
+    case TX_16X16:
+        MERGE_CTX(4, AV_RN32A);
+        DECODE_Y_COEF_LOOP(4, 0,);
+        SPLAT_CTX(4);
+        break;
+    case TX_32X32:
+        MERGE_CTX(8, AV_RN64A);
+        DECODE_Y_COEF_LOOP(8, 0, 32);
+        SPLAT_CTX(8);
+        break;
+    }
+
+/* chroma counterpart of DECODE_Y_COEF_LOOP: fixed DCT_DCT scan, plane pl */
+#define DECODE_UV_COEF_LOOP(step, v) \
+    for (n = 0, y = 0; y < end_y; y += step) { \
+        for (x = 0; x < end_x; x += step, n += step * step) { \
+            ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
+                                    (td, td->uvblock[pl] + 16 * n * bytesperpixel, \
+                                     16 * step * step, c, e, p, a[x] + l[y], \
+                                     uvscan, uvnb, uv_band_counts, qmul[1]); \
+            a[x] = l[y] = !!ret; \
+            total_coeff |= !!ret; \
+            if (step >= 4) { \
+                AV_WN16A(&td->uveob[pl][n], ret); \
+            } else { \
+                td->uveob[pl][n] = ret; \
+            } \
+        } \
+    }
+
+    /* uv tokens: switch to the chroma tables and scale the dimensions by
+     * the subsampling shifts */
+    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
+    c = td->counts.coef[b->uvtx][1 /* uv */][!b->intra];
+    e = td->counts.eob[b->uvtx][1 /* uv */][!b->intra];
+    w4 >>= s->ss_h;
+    end_x >>= s->ss_h;
+    h4 >>= s->ss_v;
+    end_y >>= s->ss_v;
+    for (pl = 0; pl < 2; pl++) {
+        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
+        l = &td->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
+        switch (b->uvtx) {
+        case TX_4X4:
+            DECODE_UV_COEF_LOOP(1,);
+            break;
+        case TX_8X8:
+            MERGE_CTX(2, AV_RN16A);
+            DECODE_UV_COEF_LOOP(2,);
+            SPLAT_CTX(2);
+            break;
+        case TX_16X16:
+            MERGE_CTX(4, AV_RN32A);
+            DECODE_UV_COEF_LOOP(4,);
+            SPLAT_CTX(4);
+            break;
+        case TX_32X32:
+            MERGE_CTX(8, AV_RN64A);
+            DECODE_UV_COEF_LOOP(8, 32);
+            SPLAT_CTX(8);
+            break;
+        }
+    }
+
+    return total_coeff;
+}
+
+/** decode_coeffs() specialised for 8 bits per pixel. */
+static int decode_coeffs_8bpp(VP9TileData *td)
+{
+    return decode_coeffs(td, 1);
+}
+
+/** decode_coeffs() specialised for more than 8 bits per pixel. */
+static int decode_coeffs_16bpp(VP9TileData *td)
+{
+    return decode_coeffs(td, 0);
+}
+
+static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
+                                        int row_and_7, int col_and_7,
+                                        int w, int h, int col_end, int row_end,
+                                        enum TxfmMode tx, int skip_inter)
+{
+    static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
+    static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
+
+    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
+    // we make VP9Filter.mask uint64_t (i.e.
row/col all single variable) + // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then + // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7) + + // the intended behaviour of the vp9 loopfilter is to work on 8-pixel + // edges. This means that for UV, we work on two subsampled blocks at + // a time, and we only use the topleft block's mode information to set + // things like block strength. Thus, for any block size smaller than + // 16x16, ignore the odd portion of the block. + if (tx == TX_4X4 && (ss_v | ss_h)) { + if (h == ss_v) { + if (row_and_7 & 1) + return; + if (!row_end) + h += 1; + } + if (w == ss_h) { + if (col_and_7 & 1) + return; + if (!col_end) + w += 1; + } + } + + if (tx == TX_4X4 && !skip_inter) { + int t = 1 << col_and_7, m_col = (t << w) - t, y; + // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide + int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8; + + for (y = row_and_7; y < h + row_and_7; y++) { + int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]); + + mask[0][y][1] |= m_row_8; + mask[0][y][2] |= m_row_4; + // for odd lines, if the odd col is not being filtered, + // skip odd row also: + // .---. <-- a + // | | + // |___| <-- b + // ^ ^ + // c d + // + // if a/c are even row/col and b/d are odd, and d is skipped, + // e.g. 
right edge of size-66x66.webm, then skip b also (bug) + if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) { + mask[1][y][col_mask_id] |= (t << (w - 1)) - t; + } else { + mask[1][y][col_mask_id] |= m_col; + } + if (!ss_h) + mask[0][y][3] |= m_col; + if (!ss_v) { + if (ss_h && (col_end & 1)) + mask[1][y][3] |= (t << (w - 1)) - t; + else + mask[1][y][3] |= m_col; + } + } + } else { + int y, t = 1 << col_and_7, m_col = (t << w) - t; + + if (!skip_inter) { + int mask_id = (tx == TX_8X8); + int l2 = tx + ss_h - 1, step1d; + static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 }; + int m_row = m_col & masks[l2]; + + // at odd UV col/row edges tx16/tx32 loopfilter edges, force + // 8wd loopfilter to prevent going off the visible edge. + if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) { + int m_row_16 = ((t << (w - 1)) - t) & masks[l2]; + int m_row_8 = m_row - m_row_16; + + for (y = row_and_7; y < h + row_and_7; y++) { + mask[0][y][0] |= m_row_16; + mask[0][y][1] |= m_row_8; + } + } else { + for (y = row_and_7; y < h + row_and_7; y++) + mask[0][y][mask_id] |= m_row; + } + + l2 = tx + ss_v - 1; + step1d = 1 << l2; + if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) { + for (y = row_and_7; y < h + row_and_7 - 1; y += step1d) + mask[1][y][0] |= m_col; + if (y - row_and_7 == h - 1) + mask[1][y][1] |= m_col; + } else { + for (y = row_and_7; y < h + row_and_7; y += step1d) + mask[1][y][mask_id] |= m_col; + } + } else if (tx != TX_4X4) { + int mask_id; + + mask_id = (tx == TX_8X8) || (h == ss_v); + mask[1][row_and_7][mask_id] |= m_col; + mask_id = (tx == TX_8X8) || (w == ss_h); + for (y = row_and_7; y < h + row_and_7; y++) + mask[0][y][mask_id] |= t; + } else { + int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8; + + for (y = row_and_7; y < h + row_and_7; y++) { + mask[0][y][2] |= t4; + mask[0][y][1] |= t8; + } + mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col; + } + } +} + +void ff_vp9_decode_block(VP9TileData *td, int row, int col, + VP9Filter *lflvl, 
ptrdiff_t yoff, ptrdiff_t uvoff, + enum BlockLevel bl, enum BlockPartition bp) +{ + const VP9Context *s = td->s; + VP9Block *b = td->b; + enum BlockSize bs = bl * 3 + bp; + int bytesperpixel = s->bytesperpixel; + int w4 = ff_vp9_bwh_tab[1][bs][0], h4 = ff_vp9_bwh_tab[1][bs][1], lvl; + int emu[2]; + AVFrame *f = s->s.frames[CUR_FRAME].tf.f; + + td->row = row; + td->row7 = row & 7; + td->col = col; + td->col7 = col & 7; + + td->min_mv.x = -(128 + col * 64); + td->min_mv.y = -(128 + row * 64); + td->max_mv.x = 128 + (s->cols - col - w4) * 64; + td->max_mv.y = 128 + (s->rows - row - h4) * 64; + + if (s->pass < 2) { + b->bs = bs; + b->bl = bl; + b->bp = bp; + decode_mode(td); + b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) || + (s->ss_v && h4 * 2 == (1 << b->tx))); + + if (td->block_structure) { + td->block_structure[td->nb_block_structure].row = row; + td->block_structure[td->nb_block_structure].col = col; + td->block_structure[td->nb_block_structure].block_size_idx_x = av_log2(w4); + td->block_structure[td->nb_block_structure].block_size_idx_y = av_log2(h4); + td->nb_block_structure++; + } + + if (!b->skip) { + int has_coeffs; + + if (bytesperpixel == 1) { + has_coeffs = decode_coeffs_8bpp(td); + } else { + has_coeffs = decode_coeffs_16bpp(td); + } + if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) { + b->skip = 1; + memset(&s->above_skip_ctx[col], 1, w4); + memset(&td->left_skip_ctx[td->row7], 1, h4); + } + } else { + int row7 = td->row7; + +#define SPLAT_ZERO_CTX(v, n) \ + switch (n) { \ + case 1: v = 0; break; \ + case 2: AV_ZERO16(&v); break; \ + case 4: AV_ZERO32(&v); break; \ + case 8: AV_ZERO64(&v); break; \ + case 16: AV_ZERO128(&v); break; \ + } +#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \ + do { \ + SPLAT_ZERO_CTX(dir##_y_##var[off * 2], n * 2); \ + if (s->ss_##dir2) { \ + SPLAT_ZERO_CTX(dir##_uv_##var[0][off], n); \ + SPLAT_ZERO_CTX(dir##_uv_##var[1][off], n); \ + } else { \ + SPLAT_ZERO_CTX(dir##_uv_##var[0][off * 2], n * 2); \ + 
SPLAT_ZERO_CTX(dir##_uv_##var[1][off * 2], n * 2); \ + } \ + } while (0) + + switch (w4) { + case 1: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 1, h); break; + case 2: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 2, h); break; + case 4: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 4, h); break; + case 8: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 8, h); break; + } + switch (h4) { + case 1: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 1, v); break; + case 2: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 2, v); break; + case 4: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 4, v); break; + case 8: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 8, v); break; + } + } + + if (s->pass == 1) { + s->td[0].b++; + s->td[0].block += w4 * h4 * 64 * bytesperpixel; + s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v); + s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v); + s->td[0].eob += 4 * w4 * h4; + s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v); + s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v); + + return; + } + } + + // emulated overhangs if the stride of the target buffer can't hold. 
This + // makes it possible to support emu-edge and so on even if we have large block + // overhangs + emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] || + (row + h4) > s->rows; + emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] || + (row + h4) > s->rows; + if (emu[0]) { + td->dst[0] = td->tmp_y; + td->y_stride = 128; + } else { + td->dst[0] = f->data[0] + yoff; + td->y_stride = f->linesize[0]; + } + if (emu[1]) { + td->dst[1] = td->tmp_uv[0]; + td->dst[2] = td->tmp_uv[1]; + td->uv_stride = 128; + } else { + td->dst[1] = f->data[1] + uvoff; + td->dst[2] = f->data[2] + uvoff; + td->uv_stride = f->linesize[1]; + } + if (b->intra) { + if (s->s.h.bpp > 8) { + ff_vp9_intra_recon_16bpp(td, yoff, uvoff); + } else { + ff_vp9_intra_recon_8bpp(td, yoff, uvoff); + } + } else { + if (s->s.h.bpp > 8) { + ff_vp9_inter_recon_16bpp(td); + } else { + ff_vp9_inter_recon_8bpp(td); + } + } + if (emu[0]) { + int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0; + + for (n = 0; o < w; n++) { + int bw = 64 >> n; + + av_assert2(n <= 4); + if (w & bw) { + s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0], + td->tmp_y + o * bytesperpixel, 128, h, 0, 0); + o += bw; + } + } + } + if (emu[1]) { + int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h; + int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0; + + for (n = s->ss_h; o < w; n++) { + int bw = 64 >> n; + + av_assert2(n <= 4); + if (w & bw) { + s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1], + td->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0); + s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2], + td->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0); + o += bw; + } + } + } + + // pick filter level and find edges to apply filter to + if (s->s.h.filter.level && + (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 
0 : b->ref[0] + 1] + [b->mode[3] != ZEROMV]) > 0) { + int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4); + int skip_inter = !b->intra && b->skip, col7 = td->col7, row7 = td->row7; + + setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl); + mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter); + if (s->ss_h || s->ss_v) + mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end, + s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0, + s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0, + b->uvtx, skip_inter); + } + + if (s->pass == 2) { + s->td[0].b++; + s->td[0].block += w4 * h4 * 64 * bytesperpixel; + s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h); + s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h); + s->td[0].eob += 4 * w4 * h4; + s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h); + s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h); + } +} diff --git a/media/ffvpx/libavcodec/vp9data.c b/media/ffvpx/libavcodec/vp9data.c new file mode 100644 index 0000000000..7af8a97b1e --- /dev/null +++ b/media/ffvpx/libavcodec/vp9data.c @@ -0,0 +1,2247 @@ +/* + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément Bœsch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vp9.h" +#include "vp9data.h" + +const uint8_t ff_vp9_bwh_tab[2][N_BS_SIZES][2] = { + { + { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, + { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, + }, { + { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 }, + { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, + } +}; + +const int8_t ff_vp9_partition_tree[3][2] = { + { -PARTITION_NONE, 1 }, // '0' + { -PARTITION_H, 2 }, // '10' + { -PARTITION_V, -PARTITION_SPLIT }, // '110', '111' +}; + +const uint8_t ff_vp9_default_kf_partition_probs[4][4][3] = { + { /* 64x64 -> 32x32 */ + { 174, 35, 49 } /* a/l both not split */, + { 68, 11, 27 } /* a split, l not split */, + { 57, 15, 9 } /* l split, a not split */, + { 12, 3, 3 } /* a/l both split */ + }, { /* 32x32 -> 16x16 */ + { 150, 40, 39 } /* a/l both not split */, + { 78, 12, 26 } /* a split, l not split */, + { 67, 33, 11 } /* l split, a not split */, + { 24, 7, 5 } /* a/l both split */, + }, { /* 16x16 -> 8x8 */ + { 149, 53, 53 } /* a/l both not split */, + { 94, 20, 48 } /* a split, l not split */, + { 83, 53, 24 } /* l split, a not split */, + { 52, 18, 18 } /* a/l both split */, + }, { /* 8x8 -> 4x4 */ + { 158, 97, 94 } /* a/l both not split */, + { 93, 24, 99 } /* a split, l not split */, + { 85, 119, 44 } /* l split, a not split */, + { 62, 59, 67 } /* a/l both split */, + }, +}; + +const int8_t ff_vp9_segmentation_tree[7][2] = { + { 1, 2 }, + { 3, 4 }, + { 5, 6 }, + { -0, -1 }, // '00x' + { -2, -3 }, // '01x' + { -4, -5 }, // '10x' + { -6, -7 }, // '11x' +}; + +const int8_t ff_vp9_intramode_tree[9][2] = { + { -DC_PRED, 1 }, // '0' + { -TM_VP8_PRED, 2 }, // '10' + { -VERT_PRED, 3 }, // '110' + { 4, 6 }, + { -HOR_PRED, 5 }, // 
'11100' + { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '11101x' + { -DIAG_DOWN_LEFT_PRED, 7 }, // '11110' + { -VERT_LEFT_PRED, 8 }, // '111110' + { -HOR_DOWN_PRED, -HOR_UP_PRED }, // '111111x' +}; + +const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9] = { + { /* above = v */ + { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */, + { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */, + { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */, + { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */, + { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, + { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, + { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, + { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, + { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, + { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ + }, { /* above = h */ + { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */, + { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */, + { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */, + { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */, + { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, + { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, + { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, + { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, + { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, + { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ + }, { /* above = dc */ + { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, + { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */, + { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, + { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */, + { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, + { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, + { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* 
left = d153 */, + { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, + { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, + { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ + }, { /* above = d45 */ + { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */, + { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */, + { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */, + { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */, + { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, + { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */, + { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, + { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, + { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, + { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ + }, { /* above = d135 */ + { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */, + { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */, + { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */, + { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */, + { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, + { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, + { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, + { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, + { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, + { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ + }, { /* above = d117 */ + { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */, + { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */, + { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */, + { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */, + { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, + { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, + { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, + { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* 
left = d63 */, + { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, + { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ + }, { /* above = d153 */ + { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */, + { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */, + { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */, + { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */, + { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, + { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, + { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, + { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, + { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, + { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ + }, { /* above = d63 */ + { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */, + { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */, + { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */, + { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */, + { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, + { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, + { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, + { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, + { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, + { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ + }, { /* above = d27 */ + { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, + { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, + { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, + { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */, + { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, + { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, + { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, + { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, + { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* 
left = d27 */, + { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */ + }, { /* above = tm */ + { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */, + { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */, + { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */, + { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */, + { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, + { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, + { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, + { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, + { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, + { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ + } +}; + +const uint8_t ff_vp9_default_kf_uvmode_probs[10][9] = { + { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */, + { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */, + { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */, + { 120, 11, 50, 123, 163, 135, 64, 77, 103 } /* y = d45 */, + { 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */, + { 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */, + { 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */, + { 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */, + { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d27 */, + { 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */ +}; + +const int8_t ff_vp9_inter_mode_tree[3][2] = { + { -ZEROMV, 1 }, // '0' + { -NEARESTMV, 2 }, // '10' + { -NEARMV, -NEWMV }, // '11x' +}; + +const int8_t ff_vp9_filter_tree[2][2] = { + { -0, 1 }, // '0' + { -1, -2 }, // '1x' +}; + +const enum FilterMode ff_vp9_filter_lut[3] = { + FILTER_8TAP_REGULAR, + FILTER_8TAP_SMOOTH, + FILTER_8TAP_SHARP, +}; + +const int16_t ff_vp9_dc_qlookup[3][256] = { + { + 4, 8, 8, 9, 10, 11, 12, 12, + 13, 14, 15, 16, 17, 18, 19, 19, + 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, + 34, 35, 36, 37, 38, 38, 39, 40, + 41, 42, 43, 43, 44, 45, 46, 47, + 48, 
48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, + 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, + 74, 74, 75, 76, 77, 78, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, + 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, + 140, 142, 144, 146, 148, 150, 152, 154, + 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, + 199, 202, 205, 208, 211, 214, 217, 220, + 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, + 280, 284, 288, 292, 296, 300, 304, 309, + 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, + 389, 395, 400, 406, 411, 417, 423, 429, + 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, + 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, + 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, + }, { + 4, 9, 10, 13, 15, 17, 20, 22, + 25, 28, 31, 34, 37, 40, 43, 47, + 50, 53, 57, 60, 64, 68, 71, 75, + 78, 82, 86, 90, 93, 97, 101, 105, + 109, 113, 116, 120, 124, 128, 132, 136, + 140, 143, 147, 151, 155, 159, 163, 166, + 170, 174, 178, 182, 185, 189, 193, 197, + 200, 204, 208, 212, 215, 219, 223, 226, + 230, 233, 237, 241, 244, 248, 251, 255, + 259, 262, 266, 269, 273, 276, 280, 283, + 287, 290, 293, 297, 300, 304, 307, 310, + 314, 317, 321, 324, 327, 331, 334, 337, + 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, + 442, 448, 454, 460, 466, 472, 478, 484, + 490, 499, 507, 516, 525, 533, 542, 550, + 559, 567, 576, 584, 592, 601, 609, 617, + 625, 634, 644, 655, 666, 676, 687, 698, + 708, 718, 729, 739, 749, 759, 770, 782, + 795, 807, 819, 831, 844, 856, 868, 880, + 891, 906, 920, 933, 947, 961, 975, 988, + 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, + 1120, 1137, 1153, 1170, 1186, 1202, 1218, 1236, + 
1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, + 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, + 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, + 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, + 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, + 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, + 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, + 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, + }, { + 4, 12, 18, 25, 33, 41, 50, 60, + 70, 80, 91, 103, 115, 127, 140, 153, + 166, 180, 194, 208, 222, 237, 251, 266, + 281, 296, 312, 327, 343, 358, 374, 390, + 405, 421, 437, 453, 469, 484, 500, 516, + 532, 548, 564, 580, 596, 611, 627, 643, + 659, 674, 690, 706, 721, 737, 752, 768, + 783, 798, 814, 829, 844, 859, 874, 889, + 904, 919, 934, 949, 964, 978, 993, 1008, + 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, + 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, + 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544, + 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, + 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, + 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, + 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467, + 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, + 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, + 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, + 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, + 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, + 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517, + 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, + 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, + 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, + 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788, + 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, + 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, + 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, + 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387, + 
} +}; + +const int16_t ff_vp9_ac_qlookup[3][256] = { + { + 4, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 83, 84, 85, 86, + 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 155, 158, 161, 164, 167, 170, 173, + 176, 179, 182, 185, 188, 191, 194, 197, + 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, + 265, 270, 275, 280, 285, 290, 295, 300, + 305, 311, 317, 323, 329, 335, 341, 347, + 353, 359, 366, 373, 380, 387, 394, 401, + 408, 416, 424, 432, 440, 448, 456, 465, + 474, 483, 492, 501, 510, 520, 530, 540, + 550, 560, 571, 582, 593, 604, 615, 627, + 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, + 864, 881, 898, 915, 933, 951, 969, 988, + 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, + 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, + 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, + 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, + }, { + 4, 9, 11, 13, 16, 18, 21, 24, + 27, 30, 33, 37, 40, 44, 48, 51, + 55, 59, 63, 67, 71, 75, 79, 83, + 88, 92, 96, 100, 105, 109, 114, 118, + 122, 127, 131, 136, 140, 145, 149, 154, + 158, 163, 168, 172, 177, 181, 186, 190, + 195, 199, 204, 208, 213, 217, 222, 226, + 231, 235, 240, 244, 249, 253, 258, 262, + 267, 271, 275, 280, 284, 289, 293, 297, + 302, 306, 311, 315, 319, 324, 328, 332, + 337, 341, 345, 349, 354, 358, 362, 367, + 371, 375, 379, 384, 388, 392, 396, 401, + 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, + 539, 547, 555, 563, 571, 579, 588, 596, + 604, 616, 628, 640, 652, 664, 676, 688, + 700, 713, 
725, 737, 749, 761, 773, 785, + 797, 809, 825, 841, 857, 873, 889, 905, + 922, 938, 954, 970, 986, 1002, 1018, 1038, + 1058, 1078, 1098, 1118, 1138, 1158, 1178, 1198, + 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, + 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, + 1631, 1663, 1695, 1727, 1759, 1791, 1823, 1859, + 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, + 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, + 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, + 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, + 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, + 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372, + 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, + 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, + }, { + 4, 13, 19, 27, 35, 44, 54, 64, + 75, 87, 99, 112, 126, 139, 154, 168, + 183, 199, 214, 230, 247, 263, 280, 297, + 314, 331, 349, 366, 384, 402, 420, 438, + 456, 475, 493, 511, 530, 548, 567, 586, + 604, 623, 642, 660, 679, 698, 716, 735, + 753, 772, 791, 809, 828, 846, 865, 884, + 902, 920, 939, 957, 976, 994, 1012, 1030, + 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, + 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, + 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, + 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856, + 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, + 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, + 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, + 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137, + 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, + 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, + 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, + 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544, + 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, + 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, + 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, + 10220, 10412, 10604, 10812, 11020, 11228, 
11437, 11661, + 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565, + 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, + 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414, + 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, + 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070, + 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247, + } +}; + +const enum TxfmType ff_vp9_intra_txfm_type[14] = { + [VERT_PRED] = ADST_DCT, + [HOR_PRED] = DCT_ADST, + [DC_PRED] = DCT_DCT, + [DIAG_DOWN_LEFT_PRED] = DCT_DCT, + [DIAG_DOWN_RIGHT_PRED] = ADST_ADST, + [VERT_RIGHT_PRED] = ADST_DCT, + [HOR_DOWN_PRED] = DCT_ADST, + [VERT_LEFT_PRED] = ADST_DCT, + [HOR_UP_PRED] = DCT_ADST, + [TM_VP8_PRED] = ADST_ADST, + [NEARESTMV] = DCT_DCT, + [NEARMV] = DCT_DCT, + [ZEROMV] = DCT_DCT, + [NEWMV] = DCT_DCT, +}; + +const int16_t ff_vp9_default_scan_4x4[16] = { + 0, 1, 4, 5, + 2, 8, 3, 6, + 12, 9, 7, 10, + 13, 11, 14, 15, +}; + +const int16_t ff_vp9_col_scan_4x4[16] = { + 0, 1, 2, 4, + 3, 5, 6, 8, + 7, 9, 10, 12, + 13, 11, 14, 15, +}; + +const int16_t ff_vp9_row_scan_4x4[16] = { + 0, 4, 1, 8, + 5, 12, 9, 2, + 6, 13, 3, 10, + 7, 14, 11, 15, +}; + +const int16_t ff_vp9_default_scan_8x8[64] = { + 0, 1, 8, 2, 9, 16, 10, 3, + 17, 24, 18, 11, 4, 25, 32, 19, + 12, 26, 5, 33, 20, 27, 40, 13, + 34, 6, 41, 28, 21, 35, 42, 48, + 14, 7, 36, 29, 43, 56, 49, 22, + 15, 37, 50, 44, 57, 30, 23, 51, + 45, 58, 38, 31, 52, 59, 39, 46, + 53, 60, 47, 54, 61, 55, 62, 63, +}; + +const int16_t ff_vp9_col_scan_8x8[64] = { + 0, 1, 2, 8, 3, 9, 4, 10, + 16, 5, 11, 17, 12, 18, 6, 24, + 19, 13, 25, 7, 26, 20, 32, 14, + 27, 21, 33, 28, 34, 15, 22, 35, + 40, 29, 41, 36, 23, 30, 42, 37, + 48, 43, 31, 44, 49, 38, 50, 56, + 45, 39, 51, 57, 52, 46, 58, 53, + 59, 47, 60, 54, 61, 55, 62, 63, +}; + +const int16_t ff_vp9_row_scan_8x8[64] = { + 0, 8, 16, 1, 9, 24, 2, 17, + 32, 10, 25, 3, 40, 18, 11, 33, + 26, 19, 4, 48, 41, 34, 12, 27, + 56, 20, 5, 42, 35, 13, 49, 28, + 6, 21, 43, 36, 14, 50, 29, 57, + 7, 44, 22, 
37, 51, 15, 58, 30, + 23, 45, 52, 38, 59, 31, 46, 53, + 39, 60, 47, 61, 54, 62, 55, 63, +}; + +const int16_t ff_vp9_default_scan_16x16[256] = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 34, 19, 49, 20, 5, + 35, 64, 50, 36, 65, 21, 6, 51, 80, 66, 37, 22, 52, 7, 81, 67, + 38, 82, 53, 23, 96, 68, 8, 83, 97, 54, 39, 69, 112, 24, 98, 84, + 70, 55, 9, 40, 85, 99, 113, 128, 25, 114, 100, 71, 86, 56, 10, 41, + 115, 101, 129, 116, 72, 87, 26, 130, 144, 102, 57, 11, 42, 117, 131, 145, + 88, 103, 27, 73, 132, 118, 146, 58, 160, 12, 43, 133, 147, 104, 89, 119, + 161, 74, 148, 134, 28, 162, 59, 13, 176, 120, 149, 90, 135, 105, 163, 44, + 75, 177, 164, 29, 150, 121, 136, 178, 165, 14, 106, 60, 91, 151, 45, 179, + 192, 137, 166, 122, 76, 180, 152, 30, 61, 15, 107, 167, 181, 193, 92, 208, + 46, 138, 123, 153, 194, 77, 168, 182, 31, 195, 209, 183, 108, 139, 62, 154, + 47, 196, 93, 169, 210, 197, 224, 124, 184, 211, 78, 109, 170, 155, 63, 198, + 212, 185, 225, 240, 140, 94, 199, 125, 79, 213, 226, 171, 186, 156, 214, 200, + 110, 227, 141, 95, 241, 215, 228, 201, 126, 242, 187, 172, 157, 229, 111, 216, + 243, 142, 202, 230, 127, 217, 244, 173, 188, 231, 158, 203, 143, 245, 218, 232, + 189, 246, 159, 174, 233, 247, 219, 204, 175, 190, 248, 234, 205, 220, 249, 191, + 235, 221, 250, 206, 222, 251, 236, 207, 237, 223, 252, 238, 253, 239, 254, 255, +}; + +const int16_t ff_vp9_col_scan_16x16[256] = { + 0, 1, 2, 3, 16, 4, 17, 5, 18, 6, 19, 32, 20, 7, 33, 21, + 34, 8, 35, 22, 48, 36, 9, 49, 23, 50, 37, 10, 38, 51, 24, 64, + 52, 11, 65, 39, 25, 53, 66, 54, 40, 67, 12, 80, 26, 68, 55, 81, + 41, 69, 13, 27, 82, 56, 70, 83, 42, 14, 84, 96, 71, 28, 57, 85, + 97, 15, 72, 98, 43, 86, 58, 99, 29, 87, 100, 112, 73, 44, 101, 59, + 30, 113, 88, 114, 74, 128, 102, 45, 31, 115, 60, 103, 89, 116, 75, 129, + 117, 46, 104, 90, 61, 130, 118, 131, 132, 105, 76, 47, 119, 144, 91, 62, + 133, 106, 145, 120, 146, 134, 77, 147, 121, 92, 135, 148, 63, 107, 136, 122, + 93, 149, 160, 78, 150, 137, 108, 161, 162, 151, 
123, 79, 138, 163, 152, 94, + 164, 109, 165, 153, 124, 139, 176, 166, 95, 177, 167, 110, 154, 178, 125, 179, + 140, 168, 155, 111, 180, 192, 181, 169, 141, 126, 182, 193, 194, 156, 183, 170, + 195, 127, 142, 196, 184, 208, 197, 157, 171, 143, 185, 198, 209, 199, 210, 172, + 158, 186, 211, 224, 212, 200, 240, 159, 213, 225, 187, 201, 173, 226, 214, 215, + 227, 202, 228, 188, 241, 216, 174, 229, 242, 203, 243, 217, 230, 175, 189, 244, + 231, 204, 218, 232, 245, 219, 246, 190, 233, 205, 191, 247, 234, 248, 220, 206, + 249, 235, 221, 207, 250, 236, 222, 251, 223, 237, 238, 252, 239, 253, 254, 255, +}; + +const int16_t ff_vp9_row_scan_16x16[256] = { + 0, 16, 32, 1, 48, 17, 64, 33, 2, 80, 18, 49, 96, 34, 3, 65, + 19, 112, 50, 81, 35, 4, 128, 66, 20, 97, 51, 82, 5, 144, 36, 67, + 113, 98, 21, 52, 160, 83, 129, 37, 68, 6, 114, 176, 99, 53, 22, 84, + 145, 38, 69, 130, 7, 115, 192, 100, 54, 23, 85, 161, 146, 131, 39, 70, + 208, 116, 8, 101, 177, 55, 86, 24, 162, 147, 132, 71, 224, 117, 40, 102, + 9, 148, 56, 87, 193, 163, 240, 133, 178, 25, 118, 72, 41, 103, 164, 10, + 149, 88, 134, 209, 179, 57, 119, 194, 26, 73, 165, 150, 104, 42, 135, 11, + 180, 120, 89, 225, 195, 58, 27, 210, 151, 181, 166, 74, 43, 105, 12, 136, + 90, 59, 241, 121, 28, 196, 167, 211, 152, 44, 182, 137, 75, 13, 226, 106, + 122, 60, 197, 91, 168, 29, 183, 153, 14, 76, 212, 138, 45, 107, 15, 198, + 92, 227, 169, 30, 123, 154, 61, 242, 184, 213, 139, 46, 77, 31, 108, 170, + 199, 185, 124, 228, 93, 155, 214, 62, 140, 243, 78, 47, 200, 109, 186, 171, + 201, 94, 63, 215, 229, 156, 79, 125, 141, 110, 216, 187, 172, 244, 202, 230, + 217, 95, 157, 126, 245, 111, 142, 231, 188, 127, 158, 218, 173, 232, 246, 233, + 203, 143, 247, 174, 189, 159, 219, 204, 248, 234, 249, 175, 190, 220, 205, 250, + 235, 191, 221, 251, 236, 206, 252, 222, 207, 237, 223, 253, 238, 254, 239, 255, +}; + +const int16_t ff_vp9_default_scan_32x32[1024] = { + 0, 1, 32, 2, 33, 64, 3, 34, 65, 4, 96, 35, 66, 5, 36, 97, 67, 128, 98, 68, 37, 6, 
129, 99, 7, 160, 69, 38, 130, 100, 161, 131, + 39, 70, 8, 101, 162, 132, 192, 71, 40, 9, 102, 163, 133, 193, 72, 224, 103, 41, 164, 10, 194, 134, 165, 73, 104, 135, 225, 42, 195, 11, 256, 166, + 226, 196, 74, 105, 136, 43, 12, 167, 197, 227, 257, 75, 106, 137, 228, 44, 198, 168, 258, 288, 13, 229, 76, 107, 199, 138, 259, 169, 289, 45, 230, 260, + 200, 108, 14, 170, 139, 320, 290, 77, 231, 261, 46, 201, 140, 291, 109, 232, 321, 262, 171, 78, 292, 15, 322, 202, 263, 352, 172, 293, 233, 141, 323, 110, + 47, 203, 264, 234, 294, 353, 324, 16, 79, 204, 265, 295, 325, 173, 354, 142, 235, 384, 48, 296, 111, 266, 355, 326, 80, 17, 205, 236, 174, 356, 385, 327, + 143, 297, 267, 357, 386, 112, 49, 328, 298, 206, 416, 237, 358, 387, 81, 175, 18, 329, 359, 388, 299, 330, 389, 113, 417, 238, 360, 50, 207, 418, 390, 331, + 19, 448, 361, 82, 419, 391, 239, 51, 362, 420, 114, 449, 480, 421, 83, 363, 450, 422, 512, 451, 423, 115, 452, 481, 453, 482, 454, 544, 483, 455, 513, 484, + 514, 485, 515, 486, 545, 576, 487, 546, 547, 608, 577, 578, 579, 609, 610, 611, 20, 144, 268, 392, 516, 640, 21, 52, 145, 176, 269, 300, 393, 424, 517, 548, + 641, 672, 22, 53, 84, 146, 177, 208, 270, 301, 332, 394, 425, 456, 518, 549, 580, 642, 673, 704, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, 333, 364, + 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, 736, 55, 86, 117, 179, 210, 241, 303, 334, 365, 427, 458, 489, 551, 582, 613, 675, 706, 737, 87, 118, + 211, 242, 335, 366, 459, 490, 583, 614, 707, 738, 119, 243, 367, 491, 615, 739, 24, 148, 272, 396, 520, 644, 768, 25, 56, 149, 180, 273, 304, 397, 428, 521, + 552, 645, 676, 769, 800, 26, 57, 88, 150, 181, 212, 274, 305, 336, 398, 429, 460, 522, 553, 584, 646, 677, 708, 770, 801, 832, 27, 58, 89, 120, 151, 182, + 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, 802, 833, 864, 59, 90, 121, 183, 214, 245, 307, 338, 369, 431, + 462, 493, 555, 586, 617, 679, 710, 741, 803, 834, 865, 91, 122, 215, 
246, 339, 370, 463, 494, 587, 618, 711, 742, 835, 866, 123, 247, 371, 495, 619, 743, 867, + 28, 152, 276, 400, 524, 648, 772, 896, 29, 60, 153, 184, 277, 308, 401, 432, 525, 556, 649, 680, 773, 804, 897, 928, 30, 61, 92, 154, 185, 216, 278, 309, + 340, 402, 433, 464, 526, 557, 588, 650, 681, 712, 774, 805, 836, 898, 929, 960, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434, 465, 496, + 527, 558, 589, 620, 651, 682, 713, 744, 775, 806, 837, 868, 899, 930, 961, 992, 63, 94, 125, 187, 218, 249, 311, 342, 373, 435, 466, 497, 559, 590, 621, 683, + 714, 745, 807, 838, 869, 931, 962, 993, 95, 126, 219, 250, 343, 374, 467, 498, 591, 622, 715, 746, 839, 870, 963, 994, 127, 251, 375, 499, 623, 747, 871, 995, + 156, 280, 404, 528, 652, 776, 900, 157, 188, 281, 312, 405, 436, 529, 560, 653, 684, 777, 808, 901, 932, 158, 189, 220, 282, 313, 344, 406, 437, 468, 530, 561, + 592, 654, 685, 716, 778, 809, 840, 902, 933, 964, 159, 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, 748, 779, 810, + 841, 872, 903, 934, 965, 996, 191, 222, 253, 315, 346, 377, 439, 470, 501, 563, 594, 625, 687, 718, 749, 811, 842, 873, 935, 966, 997, 223, 254, 347, 378, 471, + 502, 595, 626, 719, 750, 843, 874, 967, 998, 255, 379, 503, 627, 751, 875, 999, 284, 408, 532, 656, 780, 904, 285, 316, 409, 440, 533, 564, 657, 688, 781, 812, + 905, 936, 286, 317, 348, 410, 441, 472, 534, 565, 596, 658, 689, 720, 782, 813, 844, 906, 937, 968, 287, 318, 349, 380, 411, 442, 473, 504, 535, 566, 597, 628, + 659, 690, 721, 752, 783, 814, 845, 876, 907, 938, 969, 1000, 319, 350, 381, 443, 474, 505, 567, 598, 629, 691, 722, 753, 815, 846, 877, 939, 970, 1001, 351, 382, + 475, 506, 599, 630, 723, 754, 847, 878, 971, 1002, 383, 507, 631, 755, 879, 1003, 412, 536, 660, 784, 908, 413, 444, 537, 568, 661, 692, 785, 816, 909, 940, 414, + 445, 476, 538, 569, 600, 662, 693, 724, 786, 817, 848, 910, 941, 972, 415, 446, 477, 508, 539, 570, 601, 632, 663, 694, 725, 756, 
787, 818, 849, 880, 911, 942, + 973, 1004, 447, 478, 509, 571, 602, 633, 695, 726, 757, 819, 850, 881, 943, 974, 1005, 479, 510, 603, 634, 727, 758, 851, 882, 975, 1006, 511, 635, 759, 883, 1007, + 540, 664, 788, 912, 541, 572, 665, 696, 789, 820, 913, 944, 542, 573, 604, 666, 697, 728, 790, 821, 852, 914, 945, 976, 543, 574, 605, 636, 667, 698, 729, 760, + 791, 822, 853, 884, 915, 946, 977, 1008, 575, 606, 637, 699, 730, 761, 823, 854, 885, 947, 978, 1009, 607, 638, 731, 762, 855, 886, 979, 1010, 639, 763, 887, 1011, + 668, 792, 916, 669, 700, 793, 824, 917, 948, 670, 701, 732, 794, 825, 856, 918, 949, 980, 671, 702, 733, 764, 795, 826, 857, 888, 919, 950, 981, 1012, 703, 734, + 765, 827, 858, 889, 951, 982, 1013, 735, 766, 859, 890, 983, 1014, 767, 891, 1015, 796, 920, 797, 828, 921, 952, 798, 829, 860, 922, 953, 984, 799, 830, 861, 892, + 923, 954, 985, 1016, 831, 862, 893, 955, 986, 1017, 863, 894, 987, 1018, 895, 1019, 924, 925, 956, 926, 957, 988, 927, 958, 989, 1020, 959, 990, 1021, 991, 1022, 1023, +}; + +const int16_t * const ff_vp9_scans[5][4] = { + { + ff_vp9_default_scan_4x4, ff_vp9_col_scan_4x4, + ff_vp9_row_scan_4x4, ff_vp9_default_scan_4x4 + }, { + ff_vp9_default_scan_8x8, ff_vp9_col_scan_8x8, + ff_vp9_row_scan_8x8, ff_vp9_default_scan_8x8 + }, { + ff_vp9_default_scan_16x16, ff_vp9_col_scan_16x16, + ff_vp9_row_scan_16x16, ff_vp9_default_scan_16x16 + }, { + ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32, + ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32 + }, { // lossless + ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4, + ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4 + } +}; + +const int16_t ff_vp9_default_scan_4x4_nb[16][2] = { + { 0, 0 }, { 0, 0 }, { 4, 1 }, { 1, 1 }, + { 4, 4 }, { 2, 2 }, { 5, 2 }, { 8, 8 }, + { 8, 5 }, { 6, 3 }, { 9, 6 }, { 12, 9 }, + { 10, 7 }, { 13, 10 }, { 14, 11 }, { 0, 0 }, +}; + +const int16_t ff_vp9_col_scan_4x4_nb[16][2] = { + { 0, 0 }, { 1, 1 }, { 0, 0 }, { 2, 2 }, + { 4, 4 }, { 5, 5 }, { 4, 4 }, { 6, 6 
}, + { 8, 8 }, { 9, 9 }, { 8, 8 }, { 12, 12 }, + { 10, 10 }, { 13, 13 }, { 14, 14 }, { 0, 0 }, +}; + +const int16_t ff_vp9_row_scan_4x4_nb[16][2] = { + { 0, 0 }, { 0, 0 }, { 4, 4 }, { 1, 1 }, + { 8, 8 }, { 5, 5 }, { 1, 1 }, { 2, 2 }, + { 9, 9 }, { 2, 2 }, { 6, 6 }, { 3, 3 }, + { 10, 10 }, { 7, 7 }, { 11, 11 }, { 0, 0 }, +}; + +const int16_t ff_vp9_default_scan_8x8_nb[64][2] = { + { 0, 0 }, { 0, 0 }, { 1, 1 }, { 8, 1 }, + { 8, 8 }, { 9, 2 }, { 2, 2 }, { 16, 9 }, + { 16, 16 }, { 17, 10 }, { 10, 3 }, { 3, 3 }, + { 24, 17 }, { 24, 24 }, { 18, 11 }, { 11, 4 }, + { 25, 18 }, { 4, 4 }, { 32, 25 }, { 19, 12 }, + { 26, 19 }, { 32, 32 }, { 12, 5 }, { 33, 26 }, + { 5, 5 }, { 40, 33 }, { 27, 20 }, { 20, 13 }, + { 34, 27 }, { 41, 34 }, { 40, 40 }, { 13, 6 }, + { 6, 6 }, { 35, 28 }, { 28, 21 }, { 42, 35 }, + { 48, 48 }, { 48, 41 }, { 21, 14 }, { 14, 7 }, + { 36, 29 }, { 49, 42 }, { 43, 36 }, { 56, 49 }, + { 29, 22 }, { 22, 15 }, { 50, 43 }, { 44, 37 }, + { 57, 50 }, { 37, 30 }, { 30, 23 }, { 51, 44 }, + { 58, 51 }, { 38, 31 }, { 45, 38 }, { 52, 45 }, + { 59, 52 }, { 46, 39 }, { 53, 46 }, { 60, 53 }, + { 54, 47 }, { 61, 54 }, { 62, 55 }, { 0, 0 }, +}; + +const int16_t ff_vp9_col_scan_8x8_nb[64][2] = { + { 0, 0 }, { 1, 1 }, { 0, 0 }, { 2, 2 }, + { 8, 8 }, { 3, 3 }, { 9, 9 }, { 8, 8 }, + { 4, 4 }, { 10, 10 }, { 16, 16 }, { 11, 11 }, + { 17, 17 }, { 5, 5 }, { 16, 16 }, { 18, 18 }, + { 12, 12 }, { 24, 24 }, { 6, 6 }, { 25, 25 }, + { 19, 19 }, { 24, 24 }, { 13, 13 }, { 26, 26 }, + { 20, 20 }, { 32, 32 }, { 27, 27 }, { 33, 33 }, + { 14, 14 }, { 21, 21 }, { 34, 34 }, { 32, 32 }, + { 28, 28 }, { 40, 40 }, { 35, 35 }, { 22, 22 }, + { 29, 29 }, { 41, 41 }, { 36, 36 }, { 40, 40 }, + { 42, 42 }, { 30, 30 }, { 43, 43 }, { 48, 48 }, + { 37, 37 }, { 49, 49 }, { 48, 48 }, { 44, 44 }, + { 38, 38 }, { 50, 50 }, { 56, 56 }, { 51, 51 }, + { 45, 45 }, { 57, 57 }, { 52, 52 }, { 58, 58 }, + { 46, 46 }, { 59, 59 }, { 53, 53 }, { 60, 60 }, + { 54, 54 }, { 61, 61 }, { 62, 62 }, { 0, 0 }, +}; + +const 
int16_t ff_vp9_row_scan_8x8_nb[64][2] = { + { 0, 0 }, { 8, 8 }, { 0, 0 }, { 1, 1 }, + { 16, 16 }, { 1, 1 }, { 9, 9 }, { 24, 24 }, + { 2, 2 }, { 17, 17 }, { 2, 2 }, { 32, 32 }, + { 10, 10 }, { 3, 3 }, { 25, 25 }, { 18, 18 }, + { 11, 11 }, { 3, 3 }, { 40, 40 }, { 33, 33 }, + { 26, 26 }, { 4, 4 }, { 19, 19 }, { 48, 48 }, + { 12, 12 }, { 4, 4 }, { 34, 34 }, { 27, 27 }, + { 5, 5 }, { 41, 41 }, { 20, 20 }, { 5, 5 }, + { 13, 13 }, { 35, 35 }, { 28, 28 }, { 6, 6 }, + { 42, 42 }, { 21, 21 }, { 49, 49 }, { 6, 6 }, + { 36, 36 }, { 14, 14 }, { 29, 29 }, { 43, 43 }, + { 7, 7 }, { 50, 50 }, { 22, 22 }, { 15, 15 }, + { 37, 37 }, { 44, 44 }, { 30, 30 }, { 51, 51 }, + { 23, 23 }, { 38, 38 }, { 45, 45 }, { 31, 31 }, + { 52, 52 }, { 39, 39 }, { 53, 53 }, { 46, 46 }, + { 54, 54 }, { 47, 47 }, { 55, 55 }, { 0, 0 }, +}; + +const int16_t ff_vp9_default_scan_16x16_nb[256][2] = { + { 0, 0 }, { 0, 0 }, { 1, 1 }, { 16, 1 }, + { 16, 16 }, { 2, 2 }, { 17, 2 }, { 32, 17 }, + { 32, 32 }, { 3, 3 }, { 33, 18 }, { 18, 3 }, + { 48, 33 }, { 19, 4 }, { 4, 4 }, { 34, 19 }, + { 48, 48 }, { 49, 34 }, { 35, 20 }, { 64, 49 }, + { 20, 5 }, { 5, 5 }, { 50, 35 }, { 64, 64 }, + { 65, 50 }, { 36, 21 }, { 21, 6 }, { 51, 36 }, + { 6, 6 }, { 80, 65 }, { 66, 51 }, { 37, 22 }, + { 81, 66 }, { 52, 37 }, { 22, 7 }, { 80, 80 }, + { 67, 52 }, { 7, 7 }, { 82, 67 }, { 96, 81 }, + { 53, 38 }, { 38, 23 }, { 68, 53 }, { 96, 96 }, + { 23, 8 }, { 97, 82 }, { 83, 68 }, { 69, 54 }, + { 54, 39 }, { 8, 8 }, { 39, 24 }, { 84, 69 }, + { 98, 83 }, { 112, 97 }, { 112, 112 }, { 24, 9 }, + { 113, 98 }, { 99, 84 }, { 70, 55 }, { 85, 70 }, + { 55, 40 }, { 9, 9 }, { 40, 25 }, { 114, 99 }, + { 100, 85 }, { 128, 113 }, { 115, 100 }, { 71, 56 }, + { 86, 71 }, { 25, 10 }, { 129, 114 }, { 128, 128 }, + { 101, 86 }, { 56, 41 }, { 10, 10 }, { 41, 26 }, + { 116, 101 }, { 130, 115 }, { 144, 129 }, { 87, 72 }, + { 102, 87 }, { 26, 11 }, { 72, 57 }, { 131, 116 }, + { 117, 102 }, { 145, 130 }, { 57, 42 }, { 144, 144 }, + { 11, 11 }, { 42, 27 }, { 132, 
117 }, { 146, 131 }, + { 103, 88 }, { 88, 73 }, { 118, 103 }, { 160, 145 }, + { 73, 58 }, { 147, 132 }, { 133, 118 }, { 27, 12 }, + { 161, 146 }, { 58, 43 }, { 12, 12 }, { 160, 160 }, + { 119, 104 }, { 148, 133 }, { 89, 74 }, { 134, 119 }, + { 104, 89 }, { 162, 147 }, { 43, 28 }, { 74, 59 }, + { 176, 161 }, { 163, 148 }, { 28, 13 }, { 149, 134 }, + { 120, 105 }, { 135, 120 }, { 177, 162 }, { 164, 149 }, + { 13, 13 }, { 105, 90 }, { 59, 44 }, { 90, 75 }, + { 150, 135 }, { 44, 29 }, { 178, 163 }, { 176, 176 }, + { 136, 121 }, { 165, 150 }, { 121, 106 }, { 75, 60 }, + { 179, 164 }, { 151, 136 }, { 29, 14 }, { 60, 45 }, + { 14, 14 }, { 106, 91 }, { 166, 151 }, { 180, 165 }, + { 192, 177 }, { 91, 76 }, { 192, 192 }, { 45, 30 }, + { 137, 122 }, { 122, 107 }, { 152, 137 }, { 193, 178 }, + { 76, 61 }, { 167, 152 }, { 181, 166 }, { 30, 15 }, + { 194, 179 }, { 208, 193 }, { 182, 167 }, { 107, 92 }, + { 138, 123 }, { 61, 46 }, { 153, 138 }, { 46, 31 }, + { 195, 180 }, { 92, 77 }, { 168, 153 }, { 209, 194 }, + { 196, 181 }, { 208, 208 }, { 123, 108 }, { 183, 168 }, + { 210, 195 }, { 77, 62 }, { 108, 93 }, { 169, 154 }, + { 154, 139 }, { 62, 47 }, { 197, 182 }, { 211, 196 }, + { 184, 169 }, { 224, 209 }, { 224, 224 }, { 139, 124 }, + { 93, 78 }, { 198, 183 }, { 124, 109 }, { 78, 63 }, + { 212, 197 }, { 225, 210 }, { 170, 155 }, { 185, 170 }, + { 155, 140 }, { 213, 198 }, { 199, 184 }, { 109, 94 }, + { 226, 211 }, { 140, 125 }, { 94, 79 }, { 240, 225 }, + { 214, 199 }, { 227, 212 }, { 200, 185 }, { 125, 110 }, + { 241, 226 }, { 186, 171 }, { 171, 156 }, { 156, 141 }, + { 228, 213 }, { 110, 95 }, { 215, 200 }, { 242, 227 }, + { 141, 126 }, { 201, 186 }, { 229, 214 }, { 126, 111 }, + { 216, 201 }, { 243, 228 }, { 172, 157 }, { 187, 172 }, + { 230, 215 }, { 157, 142 }, { 202, 187 }, { 142, 127 }, + { 244, 229 }, { 217, 202 }, { 231, 216 }, { 188, 173 }, + { 245, 230 }, { 158, 143 }, { 173, 158 }, { 232, 217 }, + { 246, 231 }, { 218, 203 }, { 203, 188 }, { 174, 159 }, + { 189, 174 
}, { 247, 232 }, { 233, 218 }, { 204, 189 }, + { 219, 204 }, { 248, 233 }, { 190, 175 }, { 234, 219 }, + { 220, 205 }, { 249, 234 }, { 205, 190 }, { 221, 206 }, + { 250, 235 }, { 235, 220 }, { 206, 191 }, { 236, 221 }, + { 222, 207 }, { 251, 236 }, { 237, 222 }, { 252, 237 }, + { 238, 223 }, { 253, 238 }, { 254, 239 }, { 0, 0 }, +}; + +const int16_t ff_vp9_col_scan_16x16_nb[256][2] = { + { 0, 0 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, + { 3, 3 }, { 16, 16 }, { 4, 4 }, { 17, 17 }, + { 5, 5 }, { 18, 18 }, { 16, 16 }, { 19, 19 }, + { 6, 6 }, { 32, 32 }, { 20, 20 }, { 33, 33 }, + { 7, 7 }, { 34, 34 }, { 21, 21 }, { 32, 32 }, + { 35, 35 }, { 8, 8 }, { 48, 48 }, { 22, 22 }, + { 49, 49 }, { 36, 36 }, { 9, 9 }, { 37, 37 }, + { 50, 50 }, { 23, 23 }, { 48, 48 }, { 51, 51 }, + { 10, 10 }, { 64, 64 }, { 38, 38 }, { 24, 24 }, + { 52, 52 }, { 65, 65 }, { 53, 53 }, { 39, 39 }, + { 66, 66 }, { 11, 11 }, { 64, 64 }, { 25, 25 }, + { 67, 67 }, { 54, 54 }, { 80, 80 }, { 40, 40 }, + { 68, 68 }, { 12, 12 }, { 26, 26 }, { 81, 81 }, + { 55, 55 }, { 69, 69 }, { 82, 82 }, { 41, 41 }, + { 13, 13 }, { 83, 83 }, { 80, 80 }, { 70, 70 }, + { 27, 27 }, { 56, 56 }, { 84, 84 }, { 96, 96 }, + { 14, 14 }, { 71, 71 }, { 97, 97 }, { 42, 42 }, + { 85, 85 }, { 57, 57 }, { 98, 98 }, { 28, 28 }, + { 86, 86 }, { 99, 99 }, { 96, 96 }, { 72, 72 }, + { 43, 43 }, { 100, 100 }, { 58, 58 }, { 29, 29 }, + { 112, 112 }, { 87, 87 }, { 113, 113 }, { 73, 73 }, + { 112, 112 }, { 101, 101 }, { 44, 44 }, { 30, 30 }, + { 114, 114 }, { 59, 59 }, { 102, 102 }, { 88, 88 }, + { 115, 115 }, { 74, 74 }, { 128, 128 }, { 116, 116 }, + { 45, 45 }, { 103, 103 }, { 89, 89 }, { 60, 60 }, + { 129, 129 }, { 117, 117 }, { 130, 130 }, { 131, 131 }, + { 104, 104 }, { 75, 75 }, { 46, 46 }, { 118, 118 }, + { 128, 128 }, { 90, 90 }, { 61, 61 }, { 132, 132 }, + { 105, 105 }, { 144, 144 }, { 119, 119 }, { 145, 145 }, + { 133, 133 }, { 76, 76 }, { 146, 146 }, { 120, 120 }, + { 91, 91 }, { 134, 134 }, { 147, 147 }, { 62, 62 }, + { 106, 106 }, { 135, 135 
}, { 121, 121 }, { 92, 92 }, + { 148, 148 }, { 144, 144 }, { 77, 77 }, { 149, 149 }, + { 136, 136 }, { 107, 107 }, { 160, 160 }, { 161, 161 }, + { 150, 150 }, { 122, 122 }, { 78, 78 }, { 137, 137 }, + { 162, 162 }, { 151, 151 }, { 93, 93 }, { 163, 163 }, + { 108, 108 }, { 164, 164 }, { 152, 152 }, { 123, 123 }, + { 138, 138 }, { 160, 160 }, { 165, 165 }, { 94, 94 }, + { 176, 176 }, { 166, 166 }, { 109, 109 }, { 153, 153 }, + { 177, 177 }, { 124, 124 }, { 178, 178 }, { 139, 139 }, + { 167, 167 }, { 154, 154 }, { 110, 110 }, { 179, 179 }, + { 176, 176 }, { 180, 180 }, { 168, 168 }, { 140, 140 }, + { 125, 125 }, { 181, 181 }, { 192, 192 }, { 193, 193 }, + { 155, 155 }, { 182, 182 }, { 169, 169 }, { 194, 194 }, + { 126, 126 }, { 141, 141 }, { 195, 195 }, { 183, 183 }, + { 192, 192 }, { 196, 196 }, { 156, 156 }, { 170, 170 }, + { 142, 142 }, { 184, 184 }, { 197, 197 }, { 208, 208 }, + { 198, 198 }, { 209, 209 }, { 171, 171 }, { 157, 157 }, + { 185, 185 }, { 210, 210 }, { 208, 208 }, { 211, 211 }, + { 199, 199 }, { 224, 224 }, { 158, 158 }, { 212, 212 }, + { 224, 224 }, { 186, 186 }, { 200, 200 }, { 172, 172 }, + { 225, 225 }, { 213, 213 }, { 214, 214 }, { 226, 226 }, + { 201, 201 }, { 227, 227 }, { 187, 187 }, { 240, 240 }, + { 215, 215 }, { 173, 173 }, { 228, 228 }, { 241, 241 }, + { 202, 202 }, { 242, 242 }, { 216, 216 }, { 229, 229 }, + { 174, 174 }, { 188, 188 }, { 243, 243 }, { 230, 230 }, + { 203, 203 }, { 217, 217 }, { 231, 231 }, { 244, 244 }, + { 218, 218 }, { 245, 245 }, { 189, 189 }, { 232, 232 }, + { 204, 204 }, { 190, 190 }, { 246, 246 }, { 233, 233 }, + { 247, 247 }, { 219, 219 }, { 205, 205 }, { 248, 248 }, + { 234, 234 }, { 220, 220 }, { 206, 206 }, { 249, 249 }, + { 235, 235 }, { 221, 221 }, { 250, 250 }, { 222, 222 }, + { 236, 236 }, { 237, 237 }, { 251, 251 }, { 238, 238 }, + { 252, 252 }, { 253, 253 }, { 254, 254 }, { 0, 0 }, +}; + +const int16_t ff_vp9_row_scan_16x16_nb[256][2] = { + { 0, 0 }, { 16, 16 }, { 0, 0 }, { 32, 32 }, + { 1, 1 }, { 48, 48 
}, { 17, 17 }, { 1, 1 }, + { 64, 64 }, { 2, 2 }, { 33, 33 }, { 80, 80 }, + { 18, 18 }, { 2, 2 }, { 49, 49 }, { 3, 3 }, + { 96, 96 }, { 34, 34 }, { 65, 65 }, { 19, 19 }, + { 3, 3 }, { 112, 112 }, { 50, 50 }, { 4, 4 }, + { 81, 81 }, { 35, 35 }, { 66, 66 }, { 4, 4 }, + { 128, 128 }, { 20, 20 }, { 51, 51 }, { 97, 97 }, + { 82, 82 }, { 5, 5 }, { 36, 36 }, { 144, 144 }, + { 67, 67 }, { 113, 113 }, { 21, 21 }, { 52, 52 }, + { 5, 5 }, { 98, 98 }, { 160, 160 }, { 83, 83 }, + { 37, 37 }, { 6, 6 }, { 68, 68 }, { 129, 129 }, + { 22, 22 }, { 53, 53 }, { 114, 114 }, { 6, 6 }, + { 99, 99 }, { 176, 176 }, { 84, 84 }, { 38, 38 }, + { 7, 7 }, { 69, 69 }, { 145, 145 }, { 130, 130 }, + { 115, 115 }, { 23, 23 }, { 54, 54 }, { 192, 192 }, + { 100, 100 }, { 7, 7 }, { 85, 85 }, { 161, 161 }, + { 39, 39 }, { 70, 70 }, { 8, 8 }, { 146, 146 }, + { 131, 131 }, { 116, 116 }, { 55, 55 }, { 208, 208 }, + { 101, 101 }, { 24, 24 }, { 86, 86 }, { 8, 8 }, + { 132, 132 }, { 40, 40 }, { 71, 71 }, { 177, 177 }, + { 147, 147 }, { 224, 224 }, { 117, 117 }, { 162, 162 }, + { 9, 9 }, { 102, 102 }, { 56, 56 }, { 25, 25 }, + { 87, 87 }, { 148, 148 }, { 9, 9 }, { 133, 133 }, + { 72, 72 }, { 118, 118 }, { 193, 193 }, { 163, 163 }, + { 41, 41 }, { 103, 103 }, { 178, 178 }, { 10, 10 }, + { 57, 57 }, { 149, 149 }, { 134, 134 }, { 88, 88 }, + { 26, 26 }, { 119, 119 }, { 10, 10 }, { 164, 164 }, + { 104, 104 }, { 73, 73 }, { 209, 209 }, { 179, 179 }, + { 42, 42 }, { 11, 11 }, { 194, 194 }, { 135, 135 }, + { 165, 165 }, { 150, 150 }, { 58, 58 }, { 27, 27 }, + { 89, 89 }, { 11, 11 }, { 120, 120 }, { 74, 74 }, + { 43, 43 }, { 225, 225 }, { 105, 105 }, { 12, 12 }, + { 180, 180 }, { 151, 151 }, { 195, 195 }, { 136, 136 }, + { 28, 28 }, { 166, 166 }, { 121, 121 }, { 59, 59 }, + { 12, 12 }, { 210, 210 }, { 90, 90 }, { 106, 106 }, + { 44, 44 }, { 181, 181 }, { 75, 75 }, { 152, 152 }, + { 13, 13 }, { 167, 167 }, { 137, 137 }, { 13, 13 }, + { 60, 60 }, { 196, 196 }, { 122, 122 }, { 29, 29 }, + { 91, 91 }, { 14, 14 }, { 182, 
182 }, { 76, 76 }, + { 211, 211 }, { 153, 153 }, { 14, 14 }, { 107, 107 }, + { 138, 138 }, { 45, 45 }, { 226, 226 }, { 168, 168 }, + { 197, 197 }, { 123, 123 }, { 30, 30 }, { 61, 61 }, + { 15, 15 }, { 92, 92 }, { 154, 154 }, { 183, 183 }, + { 169, 169 }, { 108, 108 }, { 212, 212 }, { 77, 77 }, + { 139, 139 }, { 198, 198 }, { 46, 46 }, { 124, 124 }, + { 227, 227 }, { 62, 62 }, { 31, 31 }, { 184, 184 }, + { 93, 93 }, { 170, 170 }, { 155, 155 }, { 185, 185 }, + { 78, 78 }, { 47, 47 }, { 199, 199 }, { 213, 213 }, + { 140, 140 }, { 63, 63 }, { 109, 109 }, { 125, 125 }, + { 94, 94 }, { 200, 200 }, { 171, 171 }, { 156, 156 }, + { 228, 228 }, { 186, 186 }, { 214, 214 }, { 201, 201 }, + { 79, 79 }, { 141, 141 }, { 110, 110 }, { 229, 229 }, + { 95, 95 }, { 126, 126 }, { 215, 215 }, { 172, 172 }, + { 111, 111 }, { 142, 142 }, { 202, 202 }, { 157, 157 }, + { 216, 216 }, { 230, 230 }, { 217, 217 }, { 187, 187 }, + { 127, 127 }, { 231, 231 }, { 158, 158 }, { 173, 173 }, + { 143, 143 }, { 203, 203 }, { 188, 188 }, { 232, 232 }, + { 218, 218 }, { 233, 233 }, { 159, 159 }, { 174, 174 }, + { 204, 204 }, { 189, 189 }, { 234, 234 }, { 219, 219 }, + { 175, 175 }, { 205, 205 }, { 235, 235 }, { 220, 220 }, + { 190, 190 }, { 236, 236 }, { 206, 206 }, { 191, 191 }, + { 221, 221 }, { 207, 207 }, { 237, 237 }, { 222, 222 }, + { 238, 238 }, { 223, 223 }, { 239, 239 }, { 0, 0 }, +}; + +const int16_t ff_vp9_default_scan_32x32_nb[1024][2] = { + { 0, 0 }, { 0, 0 }, { 1, 1 }, { 32, 1 }, + { 32, 32 }, { 2, 2 }, { 33, 2 }, { 64, 33 }, + { 3, 3 }, { 64, 64 }, { 34, 3 }, { 65, 34 }, + { 4, 4 }, { 35, 4 }, { 96, 65 }, { 66, 35 }, + { 96, 96 }, { 97, 66 }, { 67, 36 }, { 36, 5 }, + { 5, 5 }, { 128, 97 }, { 98, 67 }, { 6, 6 }, + { 128, 128 }, { 68, 37 }, { 37, 6 }, { 129, 98 }, + { 99, 68 }, { 160, 129 }, { 130, 99 }, { 38, 7 }, + { 69, 38 }, { 7, 7 }, { 100, 69 }, { 161, 130 }, + { 131, 100 }, { 160, 160 }, { 70, 39 }, { 39, 8 }, + { 8, 8 }, { 101, 70 }, { 162, 131 }, { 132, 101 }, + { 192, 161 }, { 71, 
40 }, { 192, 192 }, { 102, 71 }, + { 40, 9 }, { 163, 132 }, { 9, 9 }, { 193, 162 }, + { 133, 102 }, { 164, 133 }, { 72, 41 }, { 103, 72 }, + { 134, 103 }, { 224, 193 }, { 41, 10 }, { 194, 163 }, + { 10, 10 }, { 224, 224 }, { 165, 134 }, { 225, 194 }, + { 195, 164 }, { 73, 42 }, { 104, 73 }, { 135, 104 }, + { 42, 11 }, { 11, 11 }, { 166, 135 }, { 196, 165 }, + { 226, 195 }, { 256, 225 }, { 74, 43 }, { 105, 74 }, + { 136, 105 }, { 227, 196 }, { 43, 12 }, { 197, 166 }, + { 167, 136 }, { 257, 226 }, { 256, 256 }, { 12, 12 }, + { 228, 197 }, { 75, 44 }, { 106, 75 }, { 198, 167 }, + { 137, 106 }, { 258, 227 }, { 168, 137 }, { 288, 257 }, + { 44, 13 }, { 229, 198 }, { 259, 228 }, { 199, 168 }, + { 107, 76 }, { 13, 13 }, { 169, 138 }, { 138, 107 }, + { 288, 288 }, { 289, 258 }, { 76, 45 }, { 230, 199 }, + { 260, 229 }, { 45, 14 }, { 200, 169 }, { 139, 108 }, + { 290, 259 }, { 108, 77 }, { 231, 200 }, { 320, 289 }, + { 261, 230 }, { 170, 139 }, { 77, 46 }, { 291, 260 }, + { 14, 14 }, { 321, 290 }, { 201, 170 }, { 262, 231 }, + { 320, 320 }, { 171, 140 }, { 292, 261 }, { 232, 201 }, + { 140, 109 }, { 322, 291 }, { 109, 78 }, { 46, 15 }, + { 202, 171 }, { 263, 232 }, { 233, 202 }, { 293, 262 }, + { 352, 321 }, { 323, 292 }, { 15, 15 }, { 78, 47 }, + { 203, 172 }, { 264, 233 }, { 294, 263 }, { 324, 293 }, + { 172, 141 }, { 353, 322 }, { 141, 110 }, { 234, 203 }, + { 352, 352 }, { 47, 16 }, { 295, 264 }, { 110, 79 }, + { 265, 234 }, { 354, 323 }, { 325, 294 }, { 79, 48 }, + { 16, 16 }, { 204, 173 }, { 235, 204 }, { 173, 142 }, + { 355, 324 }, { 384, 353 }, { 326, 295 }, { 142, 111 }, + { 296, 265 }, { 266, 235 }, { 356, 325 }, { 385, 354 }, + { 111, 80 }, { 48, 17 }, { 327, 296 }, { 297, 266 }, + { 205, 174 }, { 384, 384 }, { 236, 205 }, { 357, 326 }, + { 386, 355 }, { 80, 49 }, { 174, 143 }, { 17, 17 }, + { 328, 297 }, { 358, 327 }, { 387, 356 }, { 298, 267 }, + { 329, 298 }, { 388, 357 }, { 112, 81 }, { 416, 385 }, + { 237, 206 }, { 359, 328 }, { 49, 18 }, { 206, 175 }, + { 
417, 386 }, { 389, 358 }, { 330, 299 }, { 18, 18 }, + { 416, 416 }, { 360, 329 }, { 81, 50 }, { 418, 387 }, + { 390, 359 }, { 238, 207 }, { 50, 19 }, { 361, 330 }, + { 419, 388 }, { 113, 82 }, { 448, 417 }, { 448, 448 }, + { 420, 389 }, { 82, 51 }, { 362, 331 }, { 449, 418 }, + { 421, 390 }, { 480, 480 }, { 450, 419 }, { 422, 391 }, + { 114, 83 }, { 451, 420 }, { 480, 449 }, { 452, 421 }, + { 481, 450 }, { 453, 422 }, { 512, 512 }, { 482, 451 }, + { 454, 423 }, { 512, 481 }, { 483, 452 }, { 513, 482 }, + { 484, 453 }, { 514, 483 }, { 485, 454 }, { 544, 513 }, + { 544, 544 }, { 486, 455 }, { 545, 514 }, { 546, 515 }, + { 576, 576 }, { 576, 545 }, { 577, 546 }, { 578, 547 }, + { 608, 577 }, { 609, 578 }, { 610, 579 }, { 19, 19 }, + { 143, 112 }, { 267, 236 }, { 391, 360 }, { 515, 484 }, + { 608, 608 }, { 20, 20 }, { 51, 20 }, { 144, 113 }, + { 175, 144 }, { 268, 237 }, { 299, 268 }, { 392, 361 }, + { 423, 392 }, { 516, 485 }, { 547, 516 }, { 640, 609 }, + { 640, 640 }, { 21, 21 }, { 52, 21 }, { 83, 52 }, + { 145, 114 }, { 176, 145 }, { 207, 176 }, { 269, 238 }, + { 300, 269 }, { 331, 300 }, { 393, 362 }, { 424, 393 }, + { 455, 424 }, { 517, 486 }, { 548, 517 }, { 579, 548 }, + { 641, 610 }, { 672, 641 }, { 672, 672 }, { 22, 22 }, + { 53, 22 }, { 84, 53 }, { 115, 84 }, { 146, 115 }, + { 177, 146 }, { 208, 177 }, { 239, 208 }, { 270, 239 }, + { 301, 270 }, { 332, 301 }, { 363, 332 }, { 394, 363 }, + { 425, 394 }, { 456, 425 }, { 487, 456 }, { 518, 487 }, + { 549, 518 }, { 580, 549 }, { 611, 580 }, { 642, 611 }, + { 673, 642 }, { 704, 673 }, { 704, 704 }, { 54, 23 }, + { 85, 54 }, { 116, 85 }, { 178, 147 }, { 209, 178 }, + { 240, 209 }, { 302, 271 }, { 333, 302 }, { 364, 333 }, + { 426, 395 }, { 457, 426 }, { 488, 457 }, { 550, 519 }, + { 581, 550 }, { 612, 581 }, { 674, 643 }, { 705, 674 }, + { 736, 705 }, { 86, 55 }, { 117, 86 }, { 210, 179 }, + { 241, 210 }, { 334, 303 }, { 365, 334 }, { 458, 427 }, + { 489, 458 }, { 582, 551 }, { 613, 582 }, { 706, 675 }, + { 737, 
706 }, { 118, 87 }, { 242, 211 }, { 366, 335 }, + { 490, 459 }, { 614, 583 }, { 738, 707 }, { 23, 23 }, + { 147, 116 }, { 271, 240 }, { 395, 364 }, { 519, 488 }, + { 643, 612 }, { 736, 736 }, { 24, 24 }, { 55, 24 }, + { 148, 117 }, { 179, 148 }, { 272, 241 }, { 303, 272 }, + { 396, 365 }, { 427, 396 }, { 520, 489 }, { 551, 520 }, + { 644, 613 }, { 675, 644 }, { 768, 737 }, { 768, 768 }, + { 25, 25 }, { 56, 25 }, { 87, 56 }, { 149, 118 }, + { 180, 149 }, { 211, 180 }, { 273, 242 }, { 304, 273 }, + { 335, 304 }, { 397, 366 }, { 428, 397 }, { 459, 428 }, + { 521, 490 }, { 552, 521 }, { 583, 552 }, { 645, 614 }, + { 676, 645 }, { 707, 676 }, { 769, 738 }, { 800, 769 }, + { 800, 800 }, { 26, 26 }, { 57, 26 }, { 88, 57 }, + { 119, 88 }, { 150, 119 }, { 181, 150 }, { 212, 181 }, + { 243, 212 }, { 274, 243 }, { 305, 274 }, { 336, 305 }, + { 367, 336 }, { 398, 367 }, { 429, 398 }, { 460, 429 }, + { 491, 460 }, { 522, 491 }, { 553, 522 }, { 584, 553 }, + { 615, 584 }, { 646, 615 }, { 677, 646 }, { 708, 677 }, + { 739, 708 }, { 770, 739 }, { 801, 770 }, { 832, 801 }, + { 832, 832 }, { 58, 27 }, { 89, 58 }, { 120, 89 }, + { 182, 151 }, { 213, 182 }, { 244, 213 }, { 306, 275 }, + { 337, 306 }, { 368, 337 }, { 430, 399 }, { 461, 430 }, + { 492, 461 }, { 554, 523 }, { 585, 554 }, { 616, 585 }, + { 678, 647 }, { 709, 678 }, { 740, 709 }, { 802, 771 }, + { 833, 802 }, { 864, 833 }, { 90, 59 }, { 121, 90 }, + { 214, 183 }, { 245, 214 }, { 338, 307 }, { 369, 338 }, + { 462, 431 }, { 493, 462 }, { 586, 555 }, { 617, 586 }, + { 710, 679 }, { 741, 710 }, { 834, 803 }, { 865, 834 }, + { 122, 91 }, { 246, 215 }, { 370, 339 }, { 494, 463 }, + { 618, 587 }, { 742, 711 }, { 866, 835 }, { 27, 27 }, + { 151, 120 }, { 275, 244 }, { 399, 368 }, { 523, 492 }, + { 647, 616 }, { 771, 740 }, { 864, 864 }, { 28, 28 }, + { 59, 28 }, { 152, 121 }, { 183, 152 }, { 276, 245 }, + { 307, 276 }, { 400, 369 }, { 431, 400 }, { 524, 493 }, + { 555, 524 }, { 648, 617 }, { 679, 648 }, { 772, 741 }, + { 803, 772 
}, { 896, 865 }, { 896, 896 }, { 29, 29 }, + { 60, 29 }, { 91, 60 }, { 153, 122 }, { 184, 153 }, + { 215, 184 }, { 277, 246 }, { 308, 277 }, { 339, 308 }, + { 401, 370 }, { 432, 401 }, { 463, 432 }, { 525, 494 }, + { 556, 525 }, { 587, 556 }, { 649, 618 }, { 680, 649 }, + { 711, 680 }, { 773, 742 }, { 804, 773 }, { 835, 804 }, + { 897, 866 }, { 928, 897 }, { 928, 928 }, { 30, 30 }, + { 61, 30 }, { 92, 61 }, { 123, 92 }, { 154, 123 }, + { 185, 154 }, { 216, 185 }, { 247, 216 }, { 278, 247 }, + { 309, 278 }, { 340, 309 }, { 371, 340 }, { 402, 371 }, + { 433, 402 }, { 464, 433 }, { 495, 464 }, { 526, 495 }, + { 557, 526 }, { 588, 557 }, { 619, 588 }, { 650, 619 }, + { 681, 650 }, { 712, 681 }, { 743, 712 }, { 774, 743 }, + { 805, 774 }, { 836, 805 }, { 867, 836 }, { 898, 867 }, + { 929, 898 }, { 960, 929 }, { 960, 960 }, { 62, 31 }, + { 93, 62 }, { 124, 93 }, { 186, 155 }, { 217, 186 }, + { 248, 217 }, { 310, 279 }, { 341, 310 }, { 372, 341 }, + { 434, 403 }, { 465, 434 }, { 496, 465 }, { 558, 527 }, + { 589, 558 }, { 620, 589 }, { 682, 651 }, { 713, 682 }, + { 744, 713 }, { 806, 775 }, { 837, 806 }, { 868, 837 }, + { 930, 899 }, { 961, 930 }, { 992, 961 }, { 94, 63 }, + { 125, 94 }, { 218, 187 }, { 249, 218 }, { 342, 311 }, + { 373, 342 }, { 466, 435 }, { 497, 466 }, { 590, 559 }, + { 621, 590 }, { 714, 683 }, { 745, 714 }, { 838, 807 }, + { 869, 838 }, { 962, 931 }, { 993, 962 }, { 126, 95 }, + { 250, 219 }, { 374, 343 }, { 498, 467 }, { 622, 591 }, + { 746, 715 }, { 870, 839 }, { 994, 963 }, { 155, 124 }, + { 279, 248 }, { 403, 372 }, { 527, 496 }, { 651, 620 }, + { 775, 744 }, { 899, 868 }, { 156, 125 }, { 187, 156 }, + { 280, 249 }, { 311, 280 }, { 404, 373 }, { 435, 404 }, + { 528, 497 }, { 559, 528 }, { 652, 621 }, { 683, 652 }, + { 776, 745 }, { 807, 776 }, { 900, 869 }, { 931, 900 }, + { 157, 126 }, { 188, 157 }, { 219, 188 }, { 281, 250 }, + { 312, 281 }, { 343, 312 }, { 405, 374 }, { 436, 405 }, + { 467, 436 }, { 529, 498 }, { 560, 529 }, { 591, 560 }, + { 
653, 622 }, { 684, 653 }, { 715, 684 }, { 777, 746 }, + { 808, 777 }, { 839, 808 }, { 901, 870 }, { 932, 901 }, + { 963, 932 }, { 158, 127 }, { 189, 158 }, { 220, 189 }, + { 251, 220 }, { 282, 251 }, { 313, 282 }, { 344, 313 }, + { 375, 344 }, { 406, 375 }, { 437, 406 }, { 468, 437 }, + { 499, 468 }, { 530, 499 }, { 561, 530 }, { 592, 561 }, + { 623, 592 }, { 654, 623 }, { 685, 654 }, { 716, 685 }, + { 747, 716 }, { 778, 747 }, { 809, 778 }, { 840, 809 }, + { 871, 840 }, { 902, 871 }, { 933, 902 }, { 964, 933 }, + { 995, 964 }, { 190, 159 }, { 221, 190 }, { 252, 221 }, + { 314, 283 }, { 345, 314 }, { 376, 345 }, { 438, 407 }, + { 469, 438 }, { 500, 469 }, { 562, 531 }, { 593, 562 }, + { 624, 593 }, { 686, 655 }, { 717, 686 }, { 748, 717 }, + { 810, 779 }, { 841, 810 }, { 872, 841 }, { 934, 903 }, + { 965, 934 }, { 996, 965 }, { 222, 191 }, { 253, 222 }, + { 346, 315 }, { 377, 346 }, { 470, 439 }, { 501, 470 }, + { 594, 563 }, { 625, 594 }, { 718, 687 }, { 749, 718 }, + { 842, 811 }, { 873, 842 }, { 966, 935 }, { 997, 966 }, + { 254, 223 }, { 378, 347 }, { 502, 471 }, { 626, 595 }, + { 750, 719 }, { 874, 843 }, { 998, 967 }, { 283, 252 }, + { 407, 376 }, { 531, 500 }, { 655, 624 }, { 779, 748 }, + { 903, 872 }, { 284, 253 }, { 315, 284 }, { 408, 377 }, + { 439, 408 }, { 532, 501 }, { 563, 532 }, { 656, 625 }, + { 687, 656 }, { 780, 749 }, { 811, 780 }, { 904, 873 }, + { 935, 904 }, { 285, 254 }, { 316, 285 }, { 347, 316 }, + { 409, 378 }, { 440, 409 }, { 471, 440 }, { 533, 502 }, + { 564, 533 }, { 595, 564 }, { 657, 626 }, { 688, 657 }, + { 719, 688 }, { 781, 750 }, { 812, 781 }, { 843, 812 }, + { 905, 874 }, { 936, 905 }, { 967, 936 }, { 286, 255 }, + { 317, 286 }, { 348, 317 }, { 379, 348 }, { 410, 379 }, + { 441, 410 }, { 472, 441 }, { 503, 472 }, { 534, 503 }, + { 565, 534 }, { 596, 565 }, { 627, 596 }, { 658, 627 }, + { 689, 658 }, { 720, 689 }, { 751, 720 }, { 782, 751 }, + { 813, 782 }, { 844, 813 }, { 875, 844 }, { 906, 875 }, + { 937, 906 }, { 968, 937 }, { 
999, 968 }, { 318, 287 }, + { 349, 318 }, { 380, 349 }, { 442, 411 }, { 473, 442 }, + { 504, 473 }, { 566, 535 }, { 597, 566 }, { 628, 597 }, + { 690, 659 }, { 721, 690 }, { 752, 721 }, { 814, 783 }, + { 845, 814 }, { 876, 845 }, { 938, 907 }, { 969, 938 }, + { 1000, 969 }, { 350, 319 }, { 381, 350 }, { 474, 443 }, + { 505, 474 }, { 598, 567 }, { 629, 598 }, { 722, 691 }, + { 753, 722 }, { 846, 815 }, { 877, 846 }, { 970, 939 }, + { 1001, 970 }, { 382, 351 }, { 506, 475 }, { 630, 599 }, + { 754, 723 }, { 878, 847 }, { 1002, 971 }, { 411, 380 }, + { 535, 504 }, { 659, 628 }, { 783, 752 }, { 907, 876 }, + { 412, 381 }, { 443, 412 }, { 536, 505 }, { 567, 536 }, + { 660, 629 }, { 691, 660 }, { 784, 753 }, { 815, 784 }, + { 908, 877 }, { 939, 908 }, { 413, 382 }, { 444, 413 }, + { 475, 444 }, { 537, 506 }, { 568, 537 }, { 599, 568 }, + { 661, 630 }, { 692, 661 }, { 723, 692 }, { 785, 754 }, + { 816, 785 }, { 847, 816 }, { 909, 878 }, { 940, 909 }, + { 971, 940 }, { 414, 383 }, { 445, 414 }, { 476, 445 }, + { 507, 476 }, { 538, 507 }, { 569, 538 }, { 600, 569 }, + { 631, 600 }, { 662, 631 }, { 693, 662 }, { 724, 693 }, + { 755, 724 }, { 786, 755 }, { 817, 786 }, { 848, 817 }, + { 879, 848 }, { 910, 879 }, { 941, 910 }, { 972, 941 }, + { 1003, 972 }, { 446, 415 }, { 477, 446 }, { 508, 477 }, + { 570, 539 }, { 601, 570 }, { 632, 601 }, { 694, 663 }, + { 725, 694 }, { 756, 725 }, { 818, 787 }, { 849, 818 }, + { 880, 849 }, { 942, 911 }, { 973, 942 }, { 1004, 973 }, + { 478, 447 }, { 509, 478 }, { 602, 571 }, { 633, 602 }, + { 726, 695 }, { 757, 726 }, { 850, 819 }, { 881, 850 }, + { 974, 943 }, { 1005, 974 }, { 510, 479 }, { 634, 603 }, + { 758, 727 }, { 882, 851 }, { 1006, 975 }, { 539, 508 }, + { 663, 632 }, { 787, 756 }, { 911, 880 }, { 540, 509 }, + { 571, 540 }, { 664, 633 }, { 695, 664 }, { 788, 757 }, + { 819, 788 }, { 912, 881 }, { 943, 912 }, { 541, 510 }, + { 572, 541 }, { 603, 572 }, { 665, 634 }, { 696, 665 }, + { 727, 696 }, { 789, 758 }, { 820, 789 }, { 851, 
820 }, + { 913, 882 }, { 944, 913 }, { 975, 944 }, { 542, 511 }, + { 573, 542 }, { 604, 573 }, { 635, 604 }, { 666, 635 }, + { 697, 666 }, { 728, 697 }, { 759, 728 }, { 790, 759 }, + { 821, 790 }, { 852, 821 }, { 883, 852 }, { 914, 883 }, + { 945, 914 }, { 976, 945 }, { 1007, 976 }, { 574, 543 }, + { 605, 574 }, { 636, 605 }, { 698, 667 }, { 729, 698 }, + { 760, 729 }, { 822, 791 }, { 853, 822 }, { 884, 853 }, + { 946, 915 }, { 977, 946 }, { 1008, 977 }, { 606, 575 }, + { 637, 606 }, { 730, 699 }, { 761, 730 }, { 854, 823 }, + { 885, 854 }, { 978, 947 }, { 1009, 978 }, { 638, 607 }, + { 762, 731 }, { 886, 855 }, { 1010, 979 }, { 667, 636 }, + { 791, 760 }, { 915, 884 }, { 668, 637 }, { 699, 668 }, + { 792, 761 }, { 823, 792 }, { 916, 885 }, { 947, 916 }, + { 669, 638 }, { 700, 669 }, { 731, 700 }, { 793, 762 }, + { 824, 793 }, { 855, 824 }, { 917, 886 }, { 948, 917 }, + { 979, 948 }, { 670, 639 }, { 701, 670 }, { 732, 701 }, + { 763, 732 }, { 794, 763 }, { 825, 794 }, { 856, 825 }, + { 887, 856 }, { 918, 887 }, { 949, 918 }, { 980, 949 }, + { 1011, 980 }, { 702, 671 }, { 733, 702 }, { 764, 733 }, + { 826, 795 }, { 857, 826 }, { 888, 857 }, { 950, 919 }, + { 981, 950 }, { 1012, 981 }, { 734, 703 }, { 765, 734 }, + { 858, 827 }, { 889, 858 }, { 982, 951 }, { 1013, 982 }, + { 766, 735 }, { 890, 859 }, { 1014, 983 }, { 795, 764 }, + { 919, 888 }, { 796, 765 }, { 827, 796 }, { 920, 889 }, + { 951, 920 }, { 797, 766 }, { 828, 797 }, { 859, 828 }, + { 921, 890 }, { 952, 921 }, { 983, 952 }, { 798, 767 }, + { 829, 798 }, { 860, 829 }, { 891, 860 }, { 922, 891 }, + { 953, 922 }, { 984, 953 }, { 1015, 984 }, { 830, 799 }, + { 861, 830 }, { 892, 861 }, { 954, 923 }, { 985, 954 }, + { 1016, 985 }, { 862, 831 }, { 893, 862 }, { 986, 955 }, + { 1017, 986 }, { 894, 863 }, { 1018, 987 }, { 923, 892 }, + { 924, 893 }, { 955, 924 }, { 925, 894 }, { 956, 925 }, + { 987, 956 }, { 926, 895 }, { 957, 926 }, { 988, 957 }, + { 1019, 988 }, { 958, 927 }, { 989, 958 }, { 1020, 989 }, + { 
990, 959 }, { 1021, 990 }, { 1022, 991 }, { 0, 0 }, +}; + +const int16_t (* const ff_vp9_scans_nb[5][4])[2] = { + { + ff_vp9_default_scan_4x4_nb, ff_vp9_col_scan_4x4_nb, + ff_vp9_row_scan_4x4_nb, ff_vp9_default_scan_4x4_nb + }, { + ff_vp9_default_scan_8x8_nb, ff_vp9_col_scan_8x8_nb, + ff_vp9_row_scan_8x8_nb, ff_vp9_default_scan_8x8_nb + }, { + ff_vp9_default_scan_16x16_nb, ff_vp9_col_scan_16x16_nb, + ff_vp9_row_scan_16x16_nb, ff_vp9_default_scan_16x16_nb + }, { + ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb, + ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb + }, { // lossless + ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb, + ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb + } +}; + +const uint8_t ff_vp9_model_pareto8[256][8] = { + { 6, 86, 128, 11, 87, 42, 91, 52 }, + { 3, 86, 128, 6, 86, 23, 88, 29 }, + { 6, 86, 128, 11, 87, 42, 91, 52 }, + { 9, 86, 129, 17, 88, 61, 94, 76 }, + { 12, 86, 129, 22, 88, 77, 97, 93 }, + { 15, 87, 129, 28, 89, 93, 100, 110 }, + { 17, 87, 129, 33, 90, 105, 103, 123 }, + { 20, 88, 130, 38, 91, 118, 106, 136 }, + { 23, 88, 130, 43, 91, 128, 108, 146 }, + { 26, 89, 131, 48, 92, 139, 111, 156 }, + { 28, 89, 131, 53, 93, 147, 114, 163 }, + { 31, 90, 131, 58, 94, 156, 117, 171 }, + { 34, 90, 131, 62, 94, 163, 119, 177 }, + { 37, 90, 132, 66, 95, 171, 122, 184 }, + { 39, 90, 132, 70, 96, 177, 124, 189 }, + { 42, 91, 132, 75, 97, 183, 127, 194 }, + { 44, 91, 132, 79, 97, 188, 129, 198 }, + { 47, 92, 133, 83, 98, 193, 132, 202 }, + { 49, 92, 133, 86, 99, 197, 134, 205 }, + { 52, 93, 133, 90, 100, 201, 137, 208 }, + { 54, 93, 133, 94, 100, 204, 139, 211 }, + { 57, 94, 134, 98, 101, 208, 142, 214 }, + { 59, 94, 134, 101, 102, 211, 144, 216 }, + { 62, 94, 135, 105, 103, 214, 146, 218 }, + { 64, 94, 135, 108, 103, 216, 148, 220 }, + { 66, 95, 135, 111, 104, 219, 151, 222 }, + { 68, 95, 135, 114, 105, 221, 153, 223 }, + { 71, 96, 136, 117, 106, 224, 155, 225 }, + { 73, 96, 136, 120, 106, 225, 157, 226 }, + { 
76, 97, 136, 123, 107, 227, 159, 228 }, + { 78, 97, 136, 126, 108, 229, 160, 229 }, + { 80, 98, 137, 129, 109, 231, 162, 231 }, + { 82, 98, 137, 131, 109, 232, 164, 232 }, + { 84, 98, 138, 134, 110, 234, 166, 233 }, + { 86, 98, 138, 137, 111, 235, 168, 234 }, + { 89, 99, 138, 140, 112, 236, 170, 235 }, + { 91, 99, 138, 142, 112, 237, 171, 235 }, + { 93, 100, 139, 145, 113, 238, 173, 236 }, + { 95, 100, 139, 147, 114, 239, 174, 237 }, + { 97, 101, 140, 149, 115, 240, 176, 238 }, + { 99, 101, 140, 151, 115, 241, 177, 238 }, + { 101, 102, 140, 154, 116, 242, 179, 239 }, + { 103, 102, 140, 156, 117, 242, 180, 239 }, + { 105, 103, 141, 158, 118, 243, 182, 240 }, + { 107, 103, 141, 160, 118, 243, 183, 240 }, + { 109, 104, 141, 162, 119, 244, 185, 241 }, + { 111, 104, 141, 164, 119, 244, 186, 241 }, + { 113, 104, 142, 166, 120, 245, 187, 242 }, + { 114, 104, 142, 168, 121, 245, 188, 242 }, + { 116, 105, 143, 170, 122, 246, 190, 243 }, + { 118, 105, 143, 171, 122, 246, 191, 243 }, + { 120, 106, 143, 173, 123, 247, 192, 244 }, + { 121, 106, 143, 175, 124, 247, 193, 244 }, + { 123, 107, 144, 177, 125, 248, 195, 244 }, + { 125, 107, 144, 178, 125, 248, 196, 244 }, + { 127, 108, 145, 180, 126, 249, 197, 245 }, + { 128, 108, 145, 181, 127, 249, 198, 245 }, + { 130, 109, 145, 183, 128, 249, 199, 245 }, + { 132, 109, 145, 184, 128, 249, 200, 245 }, + { 134, 110, 146, 186, 129, 250, 201, 246 }, + { 135, 110, 146, 187, 130, 250, 202, 246 }, + { 137, 111, 147, 189, 131, 251, 203, 246 }, + { 138, 111, 147, 190, 131, 251, 204, 246 }, + { 140, 112, 147, 192, 132, 251, 205, 247 }, + { 141, 112, 147, 193, 132, 251, 206, 247 }, + { 143, 113, 148, 194, 133, 251, 207, 247 }, + { 144, 113, 148, 195, 134, 251, 207, 247 }, + { 146, 114, 149, 197, 135, 252, 208, 248 }, + { 147, 114, 149, 198, 135, 252, 209, 248 }, + { 149, 115, 149, 199, 136, 252, 210, 248 }, + { 150, 115, 149, 200, 137, 252, 210, 248 }, + { 152, 115, 150, 201, 138, 252, 211, 248 }, + { 153, 115, 150, 202, 138, 252, 212, 248 }, 
+ { 155, 116, 151, 204, 139, 253, 213, 249 }, + { 156, 116, 151, 205, 139, 253, 213, 249 }, + { 158, 117, 151, 206, 140, 253, 214, 249 }, + { 159, 117, 151, 207, 141, 253, 215, 249 }, + { 161, 118, 152, 208, 142, 253, 216, 249 }, + { 162, 118, 152, 209, 142, 253, 216, 249 }, + { 163, 119, 153, 210, 143, 253, 217, 249 }, + { 164, 119, 153, 211, 143, 253, 217, 249 }, + { 166, 120, 153, 212, 144, 254, 218, 250 }, + { 167, 120, 153, 212, 145, 254, 219, 250 }, + { 168, 121, 154, 213, 146, 254, 220, 250 }, + { 169, 121, 154, 214, 146, 254, 220, 250 }, + { 171, 122, 155, 215, 147, 254, 221, 250 }, + { 172, 122, 155, 216, 147, 254, 221, 250 }, + { 173, 123, 155, 217, 148, 254, 222, 250 }, + { 174, 123, 155, 217, 149, 254, 222, 250 }, + { 176, 124, 156, 218, 150, 254, 223, 250 }, + { 177, 124, 156, 219, 150, 254, 223, 250 }, + { 178, 125, 157, 220, 151, 254, 224, 251 }, + { 179, 125, 157, 220, 151, 254, 224, 251 }, + { 180, 126, 157, 221, 152, 254, 225, 251 }, + { 181, 126, 157, 221, 152, 254, 225, 251 }, + { 183, 127, 158, 222, 153, 254, 226, 251 }, + { 184, 127, 158, 223, 154, 254, 226, 251 }, + { 185, 128, 159, 224, 155, 255, 227, 251 }, + { 186, 128, 159, 224, 155, 255, 227, 251 }, + { 187, 129, 160, 225, 156, 255, 228, 251 }, + { 188, 130, 160, 225, 156, 255, 228, 251 }, + { 189, 131, 160, 226, 157, 255, 228, 251 }, + { 190, 131, 160, 226, 158, 255, 228, 251 }, + { 191, 132, 161, 227, 159, 255, 229, 251 }, + { 192, 132, 161, 227, 159, 255, 229, 251 }, + { 193, 133, 162, 228, 160, 255, 230, 252 }, + { 194, 133, 162, 229, 160, 255, 230, 252 }, + { 195, 134, 163, 230, 161, 255, 231, 252 }, + { 196, 134, 163, 230, 161, 255, 231, 252 }, + { 197, 135, 163, 231, 162, 255, 231, 252 }, + { 198, 135, 163, 231, 162, 255, 231, 252 }, + { 199, 136, 164, 232, 163, 255, 232, 252 }, + { 200, 136, 164, 232, 164, 255, 232, 252 }, + { 201, 137, 165, 233, 165, 255, 233, 252 }, + { 201, 137, 165, 233, 165, 255, 233, 252 }, + { 202, 138, 166, 233, 166, 255, 233, 252 }, + { 203, 138, 166, 
233, 166, 255, 233, 252 }, + { 204, 139, 166, 234, 167, 255, 234, 252 }, + { 205, 139, 166, 234, 167, 255, 234, 252 }, + { 206, 140, 167, 235, 168, 255, 235, 252 }, + { 206, 140, 167, 235, 168, 255, 235, 252 }, + { 207, 141, 168, 236, 169, 255, 235, 252 }, + { 208, 141, 168, 236, 170, 255, 235, 252 }, + { 209, 142, 169, 237, 171, 255, 236, 252 }, + { 209, 143, 169, 237, 171, 255, 236, 252 }, + { 210, 144, 169, 237, 172, 255, 236, 252 }, + { 211, 144, 169, 237, 172, 255, 236, 252 }, + { 212, 145, 170, 238, 173, 255, 237, 252 }, + { 213, 145, 170, 238, 173, 255, 237, 252 }, + { 214, 146, 171, 239, 174, 255, 237, 253 }, + { 214, 146, 171, 239, 174, 255, 237, 253 }, + { 215, 147, 172, 240, 175, 255, 238, 253 }, + { 215, 147, 172, 240, 175, 255, 238, 253 }, + { 216, 148, 173, 240, 176, 255, 238, 253 }, + { 217, 148, 173, 240, 176, 255, 238, 253 }, + { 218, 149, 173, 241, 177, 255, 239, 253 }, + { 218, 149, 173, 241, 178, 255, 239, 253 }, + { 219, 150, 174, 241, 179, 255, 239, 253 }, + { 219, 151, 174, 241, 179, 255, 239, 253 }, + { 220, 152, 175, 242, 180, 255, 240, 253 }, + { 221, 152, 175, 242, 180, 255, 240, 253 }, + { 222, 153, 176, 242, 181, 255, 240, 253 }, + { 222, 153, 176, 242, 181, 255, 240, 253 }, + { 223, 154, 177, 243, 182, 255, 240, 253 }, + { 223, 154, 177, 243, 182, 255, 240, 253 }, + { 224, 155, 178, 244, 183, 255, 241, 253 }, + { 224, 155, 178, 244, 183, 255, 241, 253 }, + { 225, 156, 178, 244, 184, 255, 241, 253 }, + { 225, 157, 178, 244, 184, 255, 241, 253 }, + { 226, 158, 179, 244, 185, 255, 242, 253 }, + { 227, 158, 179, 244, 185, 255, 242, 253 }, + { 228, 159, 180, 245, 186, 255, 242, 253 }, + { 228, 159, 180, 245, 186, 255, 242, 253 }, + { 229, 160, 181, 245, 187, 255, 242, 253 }, + { 229, 160, 181, 245, 187, 255, 242, 253 }, + { 230, 161, 182, 246, 188, 255, 243, 253 }, + { 230, 162, 182, 246, 188, 255, 243, 253 }, + { 231, 163, 183, 246, 189, 255, 243, 253 }, + { 231, 163, 183, 246, 189, 255, 243, 253 }, + { 232, 164, 184, 247, 190, 255, 243, 
253 }, + { 232, 164, 184, 247, 190, 255, 243, 253 }, + { 233, 165, 185, 247, 191, 255, 244, 253 }, + { 233, 165, 185, 247, 191, 255, 244, 253 }, + { 234, 166, 185, 247, 192, 255, 244, 253 }, + { 234, 167, 185, 247, 192, 255, 244, 253 }, + { 235, 168, 186, 248, 193, 255, 244, 253 }, + { 235, 168, 186, 248, 193, 255, 244, 253 }, + { 236, 169, 187, 248, 194, 255, 244, 253 }, + { 236, 169, 187, 248, 194, 255, 244, 253 }, + { 236, 170, 188, 248, 195, 255, 245, 253 }, + { 236, 170, 188, 248, 195, 255, 245, 253 }, + { 237, 171, 189, 249, 196, 255, 245, 254 }, + { 237, 172, 189, 249, 196, 255, 245, 254 }, + { 238, 173, 190, 249, 197, 255, 245, 254 }, + { 238, 173, 190, 249, 197, 255, 245, 254 }, + { 239, 174, 191, 249, 198, 255, 245, 254 }, + { 239, 174, 191, 249, 198, 255, 245, 254 }, + { 240, 175, 192, 249, 199, 255, 246, 254 }, + { 240, 176, 192, 249, 199, 255, 246, 254 }, + { 240, 177, 193, 250, 200, 255, 246, 254 }, + { 240, 177, 193, 250, 200, 255, 246, 254 }, + { 241, 178, 194, 250, 201, 255, 246, 254 }, + { 241, 178, 194, 250, 201, 255, 246, 254 }, + { 242, 179, 195, 250, 202, 255, 246, 254 }, + { 242, 180, 195, 250, 202, 255, 246, 254 }, + { 242, 181, 196, 250, 203, 255, 247, 254 }, + { 242, 181, 196, 250, 203, 255, 247, 254 }, + { 243, 182, 197, 251, 204, 255, 247, 254 }, + { 243, 183, 197, 251, 204, 255, 247, 254 }, + { 244, 184, 198, 251, 205, 255, 247, 254 }, + { 244, 184, 198, 251, 205, 255, 247, 254 }, + { 244, 185, 199, 251, 206, 255, 247, 254 }, + { 244, 185, 199, 251, 206, 255, 247, 254 }, + { 245, 186, 200, 251, 207, 255, 247, 254 }, + { 245, 187, 200, 251, 207, 255, 247, 254 }, + { 246, 188, 201, 252, 207, 255, 248, 254 }, + { 246, 188, 201, 252, 207, 255, 248, 254 }, + { 246, 189, 202, 252, 208, 255, 248, 254 }, + { 246, 190, 202, 252, 208, 255, 248, 254 }, + { 247, 191, 203, 252, 209, 255, 248, 254 }, + { 247, 191, 203, 252, 209, 255, 248, 254 }, + { 247, 192, 204, 252, 210, 255, 248, 254 }, + { 247, 193, 204, 252, 210, 255, 248, 254 }, + { 248, 194, 
205, 252, 211, 255, 248, 254 }, + { 248, 194, 205, 252, 211, 255, 248, 254 }, + { 248, 195, 206, 252, 212, 255, 249, 254 }, + { 248, 196, 206, 252, 212, 255, 249, 254 }, + { 249, 197, 207, 253, 213, 255, 249, 254 }, + { 249, 197, 207, 253, 213, 255, 249, 254 }, + { 249, 198, 208, 253, 214, 255, 249, 254 }, + { 249, 199, 209, 253, 214, 255, 249, 254 }, + { 250, 200, 210, 253, 215, 255, 249, 254 }, + { 250, 200, 210, 253, 215, 255, 249, 254 }, + { 250, 201, 211, 253, 215, 255, 249, 254 }, + { 250, 202, 211, 253, 215, 255, 249, 254 }, + { 250, 203, 212, 253, 216, 255, 249, 254 }, + { 250, 203, 212, 253, 216, 255, 249, 254 }, + { 251, 204, 213, 253, 217, 255, 250, 254 }, + { 251, 205, 213, 253, 217, 255, 250, 254 }, + { 251, 206, 214, 254, 218, 255, 250, 254 }, + { 251, 206, 215, 254, 218, 255, 250, 254 }, + { 252, 207, 216, 254, 219, 255, 250, 254 }, + { 252, 208, 216, 254, 219, 255, 250, 254 }, + { 252, 209, 217, 254, 220, 255, 250, 254 }, + { 252, 210, 217, 254, 220, 255, 250, 254 }, + { 252, 211, 218, 254, 221, 255, 250, 254 }, + { 252, 212, 218, 254, 221, 255, 250, 254 }, + { 253, 213, 219, 254, 222, 255, 250, 254 }, + { 253, 213, 220, 254, 222, 255, 250, 254 }, + { 253, 214, 221, 254, 223, 255, 250, 254 }, + { 253, 215, 221, 254, 223, 255, 250, 254 }, + { 253, 216, 222, 254, 224, 255, 251, 254 }, + { 253, 217, 223, 254, 224, 255, 251, 254 }, + { 253, 218, 224, 254, 225, 255, 251, 254 }, + { 253, 219, 224, 254, 225, 255, 251, 254 }, + { 254, 220, 225, 254, 225, 255, 251, 254 }, + { 254, 221, 226, 254, 225, 255, 251, 254 }, + { 254, 222, 227, 255, 226, 255, 251, 254 }, + { 254, 223, 227, 255, 226, 255, 251, 254 }, + { 254, 224, 228, 255, 227, 255, 251, 254 }, + { 254, 225, 229, 255, 227, 255, 251, 254 }, + { 254, 226, 230, 255, 228, 255, 251, 254 }, + { 254, 227, 230, 255, 229, 255, 251, 254 }, + { 255, 228, 231, 255, 230, 255, 251, 254 }, + { 255, 229, 232, 255, 230, 255, 251, 254 }, + { 255, 230, 233, 255, 231, 255, 252, 254 }, + { 255, 231, 234, 255, 231, 255, 
252, 254 }, + { 255, 232, 235, 255, 232, 255, 252, 254 }, + { 255, 233, 236, 255, 232, 255, 252, 254 }, + { 255, 235, 237, 255, 233, 255, 252, 254 }, + { 255, 236, 238, 255, 234, 255, 252, 254 }, + { 255, 238, 240, 255, 235, 255, 252, 255 }, + { 255, 239, 241, 255, 235, 255, 252, 254 }, + { 255, 241, 243, 255, 236, 255, 252, 254 }, + { 255, 243, 245, 255, 237, 255, 252, 254 }, + { 255, 246, 247, 255, 239, 255, 253, 255 }, +}; + +const ProbContext ff_vp9_default_probs = { + { /* y_mode */ + { 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* bsize < 8x8 */, + { 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* bsize < 16x16 */, + { 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* bsize < 32x32 */, + { 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* bsize >= 32x32 */ + }, { /* uv_mode */ + { 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */, + { 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */, + { 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */, + { 97, 5, 44, 131, 176, 139, 48, 68, 97 } /* y = d45 */, + { 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */, + { 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */, + { 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */, + { 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */, + { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d27 */, + { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */ + }, { /* filter */ + { 235, 162, }, + { 36, 255, }, + { 34, 3, }, + { 149, 144, }, + }, { /* mv_mode */ + { 2, 173, 34 }, // 0 = both zero mv + { 7, 145, 85 }, // 1 = one zero mv + one a predicted mv + { 7, 166, 63 }, // 2 = two predicted mvs + { 7, 94, 66 }, // 3 = one predicted/zero and one new mv + { 8, 64, 46 }, // 4 = two new mvs + { 17, 81, 31 }, // 5 = one intra neighbor + x + { 25, 29, 30 }, // 6 = two intra neighbors + }, { /* intra */ + 9, 102, 187, 225 + }, { /* comp */ + 239, 183, 119, 96, 41 + }, { /* single_ref */ + { 33, 16 }, + { 77, 74 }, + { 142, 142 }, + { 172, 170 }, + { 238, 247 } + }, { /* comp_ref */ + 
50, 126, 123, 221, 226 + }, { /* tx32p */ + { 3, 136, 37, }, + { 5, 52, 13, }, + }, { /* tx16p */ + { 20, 152, }, + { 15, 101, }, + }, { /* tx8p */ + 100, 66 + }, { /* skip */ + 192, 128, 64 + }, { /* mv_joint */ + 32, 64, 96 + }, { + { /* mv vertical component */ + 128, /* sign */ + { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, /* class */ + 216, /* class0 */ + { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */ + { /* class0_fp */ + { 128, 128, 64 }, + { 96, 112, 64 } + }, + { 64, 96, 64 }, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ + }, { /* mv horizontal component */ + 128, /* sign */ + { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, /* class */ + 208, /* class0 */ + { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */ + { /* class0_fp */ + { 128, 128, 64 }, + { 96, 112, 64 } + }, + { 64, 96, 64 }, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ + } + }, { /* partition */ + { /* 64x64 -> 32x32 */ + { 222, 34, 30 } /* a/l both not split */, + { 72, 16, 44 } /* a split, l not split */, + { 58, 32, 12 } /* l split, a not split */, + { 10, 7, 6 } /* a/l both split */, + }, { /* 32x32 -> 16x16 */ + { 177, 58, 59 } /* a/l both not split */, + { 68, 26, 63 } /* a split, l not split */, + { 52, 79, 25 } /* l split, a not split */, + { 17, 14, 12 } /* a/l both split */, + }, { /* 16x16 -> 8x8 */ + { 174, 73, 87 } /* a/l both not split */, + { 92, 41, 83 } /* a split, l not split */, + { 82, 99, 50 } /* l split, a not split */, + { 53, 39, 39 } /* a/l both split */, + }, { /* 8x8 -> 4x4 */ + { 199, 122, 141 } /* a/l both not split */, + { 147, 63, 159 } /* a split, l not split */, + { 148, 133, 118 } /* l split, a not split */, + { 121, 104, 114 } /* a/l both split */, + } + }, +}; + +const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3] = { + { /* tx = 4x4 */ + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 195, 29, 183 }, + { 84, 49, 136 }, + { 8, 42, 71 } + }, { /* Coeff Band 1 */ + { 31, 107, 
169 }, + { 35, 99, 159 }, + { 17, 82, 140 }, + { 8, 66, 114 }, + { 2, 44, 76 }, + { 1, 19, 32 } + }, { /* Coeff Band 2 */ + { 40, 132, 201 }, + { 29, 114, 187 }, + { 13, 91, 157 }, + { 7, 75, 127 }, + { 3, 58, 95 }, + { 1, 28, 47 } + }, { /* Coeff Band 3 */ + { 69, 142, 221 }, + { 42, 122, 201 }, + { 15, 91, 159 }, + { 6, 67, 121 }, + { 1, 42, 77 }, + { 1, 17, 31 } + }, { /* Coeff Band 4 */ + { 102, 148, 228 }, + { 67, 117, 204 }, + { 17, 82, 154 }, + { 6, 59, 114 }, + { 2, 39, 75 }, + { 1, 15, 29 } + }, { /* Coeff Band 5 */ + { 156, 57, 233 }, + { 119, 57, 212 }, + { 58, 48, 163 }, + { 29, 40, 124 }, + { 12, 30, 81 }, + { 3, 12, 31 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 191, 107, 226 }, + { 124, 117, 204 }, + { 25, 99, 155 } + }, { /* Coeff Band 1 */ + { 29, 148, 210 }, + { 37, 126, 194 }, + { 8, 93, 157 }, + { 2, 68, 118 }, + { 1, 39, 69 }, + { 1, 17, 33 } + }, { /* Coeff Band 2 */ + { 41, 151, 213 }, + { 27, 123, 193 }, + { 3, 82, 144 }, + { 1, 58, 105 }, + { 1, 32, 60 }, + { 1, 13, 26 } + }, { /* Coeff Band 3 */ + { 59, 159, 220 }, + { 23, 126, 198 }, + { 4, 88, 151 }, + { 1, 66, 114 }, + { 1, 38, 71 }, + { 1, 18, 34 } + }, { /* Coeff Band 4 */ + { 114, 136, 232 }, + { 51, 114, 207 }, + { 11, 83, 155 }, + { 3, 56, 105 }, + { 1, 33, 65 }, + { 1, 17, 34 } + }, { /* Coeff Band 5 */ + { 149, 65, 234 }, + { 121, 57, 215 }, + { 61, 49, 166 }, + { 28, 36, 114 }, + { 12, 25, 76 }, + { 3, 16, 42 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 214, 49, 220 }, + { 132, 63, 188 }, + { 42, 65, 137 } + }, { /* Coeff Band 1 */ + { 85, 137, 221 }, + { 104, 131, 216 }, + { 49, 111, 192 }, + { 21, 87, 155 }, + { 2, 49, 87 }, + { 1, 16, 28 } + }, { /* Coeff Band 2 */ + { 89, 163, 230 }, + { 90, 137, 220 }, + { 29, 100, 183 }, + { 10, 70, 135 }, + { 2, 42, 81 }, + { 1, 17, 33 } + }, { /* Coeff Band 3 */ + { 108, 167, 237 }, + { 55, 133, 222 }, + { 15, 97, 179 }, + { 4, 72, 135 }, + { 1, 45, 85 }, + { 1, 19, 38 } + }, { /* Coeff Band 4 */ 
+ { 124, 146, 240 }, + { 66, 124, 224 }, + { 17, 88, 175 }, + { 4, 58, 122 }, + { 1, 36, 75 }, + { 1, 18, 37 } + }, { /* Coeff Band 5 */ + { 141, 79, 241 }, + { 126, 70, 227 }, + { 66, 58, 182 }, + { 30, 44, 136 }, + { 12, 34, 96 }, + { 2, 20, 47 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 229, 99, 249 }, + { 143, 111, 235 }, + { 46, 109, 192 } + }, { /* Coeff Band 1 */ + { 82, 158, 236 }, + { 94, 146, 224 }, + { 25, 117, 191 }, + { 9, 87, 149 }, + { 3, 56, 99 }, + { 1, 33, 57 } + }, { /* Coeff Band 2 */ + { 83, 167, 237 }, + { 68, 145, 222 }, + { 10, 103, 177 }, + { 2, 72, 131 }, + { 1, 41, 79 }, + { 1, 20, 39 } + }, { /* Coeff Band 3 */ + { 99, 167, 239 }, + { 47, 141, 224 }, + { 10, 104, 178 }, + { 2, 73, 133 }, + { 1, 44, 85 }, + { 1, 22, 47 } + }, { /* Coeff Band 4 */ + { 127, 145, 243 }, + { 71, 129, 228 }, + { 17, 93, 177 }, + { 3, 61, 124 }, + { 1, 41, 84 }, + { 1, 21, 52 } + }, { /* Coeff Band 5 */ + { 157, 78, 244 }, + { 140, 72, 231 }, + { 69, 58, 184 }, + { 31, 44, 137 }, + { 14, 38, 105 }, + { 8, 23, 61 } + } + } + } + }, { /* tx = 8x8 */ + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 125, 34, 187 }, + { 52, 41, 133 }, + { 6, 31, 56 } + }, { /* Coeff Band 1 */ + { 37, 109, 153 }, + { 51, 102, 147 }, + { 23, 87, 128 }, + { 8, 67, 101 }, + { 1, 41, 63 }, + { 1, 19, 29 } + }, { /* Coeff Band 2 */ + { 31, 154, 185 }, + { 17, 127, 175 }, + { 6, 96, 145 }, + { 2, 73, 114 }, + { 1, 51, 82 }, + { 1, 28, 45 } + }, { /* Coeff Band 3 */ + { 23, 163, 200 }, + { 10, 131, 185 }, + { 2, 93, 148 }, + { 1, 67, 111 }, + { 1, 41, 69 }, + { 1, 14, 24 } + }, { /* Coeff Band 4 */ + { 29, 176, 217 }, + { 12, 145, 201 }, + { 3, 101, 156 }, + { 1, 69, 111 }, + { 1, 39, 63 }, + { 1, 14, 23 } + }, { /* Coeff Band 5 */ + { 57, 192, 233 }, + { 25, 154, 215 }, + { 6, 109, 167 }, + { 3, 78, 118 }, + { 1, 48, 69 }, + { 1, 21, 29 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 202, 105, 245 }, + { 108, 106, 216 }, + { 18, 90, 144 } + }, { /* Coeff Band 1 
*/ + { 33, 172, 219 }, + { 64, 149, 206 }, + { 14, 117, 177 }, + { 5, 90, 141 }, + { 2, 61, 95 }, + { 1, 37, 57 } + }, { /* Coeff Band 2 */ + { 33, 179, 220 }, + { 11, 140, 198 }, + { 1, 89, 148 }, + { 1, 60, 104 }, + { 1, 33, 57 }, + { 1, 12, 21 } + }, { /* Coeff Band 3 */ + { 30, 181, 221 }, + { 8, 141, 198 }, + { 1, 87, 145 }, + { 1, 58, 100 }, + { 1, 31, 55 }, + { 1, 12, 20 } + }, { /* Coeff Band 4 */ + { 32, 186, 224 }, + { 7, 142, 198 }, + { 1, 86, 143 }, + { 1, 58, 100 }, + { 1, 31, 55 }, + { 1, 12, 22 } + }, { /* Coeff Band 5 */ + { 57, 192, 227 }, + { 20, 143, 204 }, + { 3, 96, 154 }, + { 1, 68, 112 }, + { 1, 42, 69 }, + { 1, 19, 32 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 212, 35, 215 }, + { 113, 47, 169 }, + { 29, 48, 105 } + }, { /* Coeff Band 1 */ + { 74, 129, 203 }, + { 106, 120, 203 }, + { 49, 107, 178 }, + { 19, 84, 144 }, + { 4, 50, 84 }, + { 1, 15, 25 } + }, { /* Coeff Band 2 */ + { 71, 172, 217 }, + { 44, 141, 209 }, + { 15, 102, 173 }, + { 6, 76, 133 }, + { 2, 51, 89 }, + { 1, 24, 42 } + }, { /* Coeff Band 3 */ + { 64, 185, 231 }, + { 31, 148, 216 }, + { 8, 103, 175 }, + { 3, 74, 131 }, + { 1, 46, 81 }, + { 1, 18, 30 } + }, { /* Coeff Band 4 */ + { 65, 196, 235 }, + { 25, 157, 221 }, + { 5, 105, 174 }, + { 1, 67, 120 }, + { 1, 38, 69 }, + { 1, 15, 30 } + }, { /* Coeff Band 5 */ + { 65, 204, 238 }, + { 30, 156, 224 }, + { 7, 107, 177 }, + { 2, 70, 124 }, + { 1, 42, 73 }, + { 1, 18, 34 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 225, 86, 251 }, + { 144, 104, 235 }, + { 42, 99, 181 } + }, { /* Coeff Band 1 */ + { 85, 175, 239 }, + { 112, 165, 229 }, + { 29, 136, 200 }, + { 12, 103, 162 }, + { 6, 77, 123 }, + { 2, 53, 84 } + }, { /* Coeff Band 2 */ + { 75, 183, 239 }, + { 30, 155, 221 }, + { 3, 106, 171 }, + { 1, 74, 128 }, + { 1, 44, 76 }, + { 1, 17, 28 } + }, { /* Coeff Band 3 */ + { 73, 185, 240 }, + { 27, 159, 222 }, + { 2, 107, 172 }, + { 1, 75, 127 }, + { 1, 42, 73 }, + { 1, 17, 29 } + }, { /* 
Coeff Band 4 */ + { 62, 190, 238 }, + { 21, 159, 222 }, + { 2, 107, 172 }, + { 1, 72, 122 }, + { 1, 40, 71 }, + { 1, 18, 32 } + }, { /* Coeff Band 5 */ + { 61, 199, 240 }, + { 27, 161, 226 }, + { 4, 113, 180 }, + { 1, 76, 129 }, + { 1, 46, 80 }, + { 1, 23, 41 } + } + } + } + }, { /* tx = 16x16 */ + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 7, 27, 153 }, + { 5, 30, 95 }, + { 1, 16, 30 } + }, { /* Coeff Band 1 */ + { 50, 75, 127 }, + { 57, 75, 124 }, + { 27, 67, 108 }, + { 10, 54, 86 }, + { 1, 33, 52 }, + { 1, 12, 18 } + }, { /* Coeff Band 2 */ + { 43, 125, 151 }, + { 26, 108, 148 }, + { 7, 83, 122 }, + { 2, 59, 89 }, + { 1, 38, 60 }, + { 1, 17, 27 } + }, { /* Coeff Band 3 */ + { 23, 144, 163 }, + { 13, 112, 154 }, + { 2, 75, 117 }, + { 1, 50, 81 }, + { 1, 31, 51 }, + { 1, 14, 23 } + }, { /* Coeff Band 4 */ + { 18, 162, 185 }, + { 6, 123, 171 }, + { 1, 78, 125 }, + { 1, 51, 86 }, + { 1, 31, 54 }, + { 1, 14, 23 } + }, { /* Coeff Band 5 */ + { 15, 199, 227 }, + { 3, 150, 204 }, + { 1, 91, 146 }, + { 1, 55, 95 }, + { 1, 30, 53 }, + { 1, 11, 20 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 19, 55, 240 }, + { 19, 59, 196 }, + { 3, 52, 105 } + }, { /* Coeff Band 1 */ + { 41, 166, 207 }, + { 104, 153, 199 }, + { 31, 123, 181 }, + { 14, 101, 152 }, + { 5, 72, 106 }, + { 1, 36, 52 } + }, { /* Coeff Band 2 */ + { 35, 176, 211 }, + { 12, 131, 190 }, + { 2, 88, 144 }, + { 1, 60, 101 }, + { 1, 36, 60 }, + { 1, 16, 28 } + }, { /* Coeff Band 3 */ + { 28, 183, 213 }, + { 8, 134, 191 }, + { 1, 86, 142 }, + { 1, 56, 96 }, + { 1, 30, 53 }, + { 1, 12, 20 } + }, { /* Coeff Band 4 */ + { 20, 190, 215 }, + { 4, 135, 192 }, + { 1, 84, 139 }, + { 1, 53, 91 }, + { 1, 28, 49 }, + { 1, 11, 20 } + }, { /* Coeff Band 5 */ + { 13, 196, 216 }, + { 2, 137, 192 }, + { 1, 86, 143 }, + { 1, 57, 99 }, + { 1, 32, 56 }, + { 1, 13, 24 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 211, 29, 217 }, + { 96, 47, 156 }, + { 22, 43, 87 } + }, { /* Coeff 
Band 1 */ + { 78, 120, 193 }, + { 111, 116, 186 }, + { 46, 102, 164 }, + { 15, 80, 128 }, + { 2, 49, 76 }, + { 1, 18, 28 } + }, { /* Coeff Band 2 */ + { 71, 161, 203 }, + { 42, 132, 192 }, + { 10, 98, 150 }, + { 3, 69, 109 }, + { 1, 44, 70 }, + { 1, 18, 29 } + }, { /* Coeff Band 3 */ + { 57, 186, 211 }, + { 30, 140, 196 }, + { 4, 93, 146 }, + { 1, 62, 102 }, + { 1, 38, 65 }, + { 1, 16, 27 } + }, { /* Coeff Band 4 */ + { 47, 199, 217 }, + { 14, 145, 196 }, + { 1, 88, 142 }, + { 1, 57, 98 }, + { 1, 36, 62 }, + { 1, 15, 26 } + }, { /* Coeff Band 5 */ + { 26, 219, 229 }, + { 5, 155, 207 }, + { 1, 94, 151 }, + { 1, 60, 104 }, + { 1, 36, 62 }, + { 1, 16, 28 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 233, 29, 248 }, + { 146, 47, 220 }, + { 43, 52, 140 } + }, { /* Coeff Band 1 */ + { 100, 163, 232 }, + { 179, 161, 222 }, + { 63, 142, 204 }, + { 37, 113, 174 }, + { 26, 89, 137 }, + { 18, 68, 97 } + }, { /* Coeff Band 2 */ + { 85, 181, 230 }, + { 32, 146, 209 }, + { 7, 100, 164 }, + { 3, 71, 121 }, + { 1, 45, 77 }, + { 1, 18, 30 } + }, { /* Coeff Band 3 */ + { 65, 187, 230 }, + { 20, 148, 207 }, + { 2, 97, 159 }, + { 1, 68, 116 }, + { 1, 40, 70 }, + { 1, 14, 29 } + }, { /* Coeff Band 4 */ + { 40, 194, 227 }, + { 8, 147, 204 }, + { 1, 94, 155 }, + { 1, 65, 112 }, + { 1, 39, 66 }, + { 1, 14, 26 } + }, { /* Coeff Band 5 */ + { 16, 208, 228 }, + { 3, 151, 207 }, + { 1, 98, 160 }, + { 1, 67, 117 }, + { 1, 41, 74 }, + { 1, 17, 31 } + } + } + } + }, { /* tx = 32x32 */ + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 17, 38, 140 }, + { 7, 34, 80 }, + { 1, 17, 29 } + }, { /* Coeff Band 1 */ + { 37, 75, 128 }, + { 41, 76, 128 }, + { 26, 66, 116 }, + { 12, 52, 94 }, + { 2, 32, 55 }, + { 1, 10, 16 } + }, { /* Coeff Band 2 */ + { 50, 127, 154 }, + { 37, 109, 152 }, + { 16, 82, 121 }, + { 5, 59, 85 }, + { 1, 35, 54 }, + { 1, 13, 20 } + }, { /* Coeff Band 3 */ + { 40, 142, 167 }, + { 17, 110, 157 }, + { 2, 71, 112 }, + { 1, 44, 72 }, + { 1, 27, 45 }, + { 1, 11, 17 
} + }, { /* Coeff Band 4 */ + { 30, 175, 188 }, + { 9, 124, 169 }, + { 1, 74, 116 }, + { 1, 48, 78 }, + { 1, 30, 49 }, + { 1, 11, 18 } + }, { /* Coeff Band 5 */ + { 10, 222, 223 }, + { 2, 150, 194 }, + { 1, 83, 128 }, + { 1, 48, 79 }, + { 1, 27, 45 }, + { 1, 11, 17 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 36, 41, 235 }, + { 29, 36, 193 }, + { 10, 27, 111 } + }, { /* Coeff Band 1 */ + { 85, 165, 222 }, + { 177, 162, 215 }, + { 110, 135, 195 }, + { 57, 113, 168 }, + { 23, 83, 120 }, + { 10, 49, 61 } + }, { /* Coeff Band 2 */ + { 85, 190, 223 }, + { 36, 139, 200 }, + { 5, 90, 146 }, + { 1, 60, 103 }, + { 1, 38, 65 }, + { 1, 18, 30 } + }, { /* Coeff Band 3 */ + { 72, 202, 223 }, + { 23, 141, 199 }, + { 2, 86, 140 }, + { 1, 56, 97 }, + { 1, 36, 61 }, + { 1, 16, 27 } + }, { /* Coeff Band 4 */ + { 55, 218, 225 }, + { 13, 145, 200 }, + { 1, 86, 141 }, + { 1, 57, 99 }, + { 1, 35, 61 }, + { 1, 13, 22 } + }, { /* Coeff Band 5 */ + { 15, 235, 212 }, + { 1, 132, 184 }, + { 1, 84, 139 }, + { 1, 57, 97 }, + { 1, 34, 56 }, + { 1, 14, 23 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 181, 21, 201 }, + { 61, 37, 123 }, + { 10, 38, 71 } + }, { /* Coeff Band 1 */ + { 47, 106, 172 }, + { 95, 104, 173 }, + { 42, 93, 159 }, + { 18, 77, 131 }, + { 4, 50, 81 }, + { 1, 17, 23 } + }, { /* Coeff Band 2 */ + { 62, 147, 199 }, + { 44, 130, 189 }, + { 28, 102, 154 }, + { 18, 75, 115 }, + { 2, 44, 65 }, + { 1, 12, 19 } + }, { /* Coeff Band 3 */ + { 55, 153, 210 }, + { 24, 130, 194 }, + { 3, 93, 146 }, + { 1, 61, 97 }, + { 1, 31, 50 }, + { 1, 10, 16 } + }, { /* Coeff Band 4 */ + { 49, 186, 223 }, + { 17, 148, 204 }, + { 1, 96, 142 }, + { 1, 53, 83 }, + { 1, 26, 44 }, + { 1, 11, 17 } + }, { /* Coeff Band 5 */ + { 13, 217, 212 }, + { 2, 136, 180 }, + { 1, 78, 124 }, + { 1, 50, 83 }, + { 1, 29, 49 }, + { 1, 14, 23 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 197, 13, 247 }, + { 82, 17, 222 }, + { 25, 17, 162 } + }, { /* Coeff Band 1 */ + { 126, 186, 
247 }, + { 234, 191, 243 }, + { 176, 177, 234 }, + { 104, 158, 220 }, + { 66, 128, 186 }, + { 55, 90, 137 } + }, { /* Coeff Band 2 */ + { 111, 197, 242 }, + { 46, 158, 219 }, + { 9, 104, 171 }, + { 2, 65, 125 }, + { 1, 44, 80 }, + { 1, 17, 91 } + }, { /* Coeff Band 3 */ + { 104, 208, 245 }, + { 39, 168, 224 }, + { 3, 109, 162 }, + { 1, 79, 124 }, + { 1, 50, 102 }, + { 1, 43, 102 } + }, { /* Coeff Band 4 */ + { 84, 220, 246 }, + { 31, 177, 231 }, + { 2, 115, 180 }, + { 1, 79, 134 }, + { 1, 55, 77 }, + { 1, 60, 79 } + }, { /* Coeff Band 5 */ + { 43, 243, 240 }, + { 8, 180, 217 }, + { 1, 115, 166 }, + { 1, 84, 121 }, + { 1, 51, 67 }, + { 1, 16, 6 } + } + } + } + } +}; + +const int8_t ff_vp9_mv_joint_tree[3][2] = { + { -MV_JOINT_ZERO, 1 }, // '0' + { -MV_JOINT_H, 2 }, // '10' + { -MV_JOINT_V, -MV_JOINT_HV }, // '11x' +}; + +const int8_t ff_vp9_mv_class_tree[10][2] = { + { -0, 1 }, // '0' + { -1, 2 }, // '10' + { 3, 4 }, + { -2, -3 }, // '110x' + { 5, 6 }, + { -4, -5 }, // '1110x' + { -6, 7 }, // '11110' + { 8, 9 }, + { -7, -8 }, // '111110x' + { -9, -10 }, // '111111x' +}; + +const int8_t ff_vp9_mv_fp_tree[3][2] = { + { -0, 1 }, // '0' + { -1, 2 }, // '10' + { -2, -3 }, // '11x' +}; diff --git a/media/ffvpx/libavcodec/vp9data.h b/media/ffvpx/libavcodec/vp9data.h new file mode 100644 index 0000000000..086dbdec06 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9data.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VP9DATA_H +#define AVCODEC_VP9DATA_H + +#include <stdint.h> + +#include "vp9dec.h" + +extern const uint8_t ff_vp9_bwh_tab[2][N_BS_SIZES][2]; +extern const int8_t ff_vp9_partition_tree[3][2]; +extern const uint8_t ff_vp9_default_kf_partition_probs[4][4][3]; +extern const int8_t ff_vp9_segmentation_tree[7][2]; +extern const int8_t ff_vp9_intramode_tree[9][2]; +extern const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9]; +extern const uint8_t ff_vp9_default_kf_uvmode_probs[10][9]; +extern const int8_t ff_vp9_inter_mode_tree[3][2]; +extern const int8_t ff_vp9_filter_tree[2][2]; +extern const enum FilterMode ff_vp9_filter_lut[3]; +extern const int16_t ff_vp9_dc_qlookup[3][256]; +extern const int16_t ff_vp9_ac_qlookup[3][256]; +extern const enum TxfmType ff_vp9_intra_txfm_type[14]; +extern const int16_t ff_vp9_default_scan_4x4[16]; +extern const int16_t ff_vp9_col_scan_4x4[16]; +extern const int16_t ff_vp9_row_scan_4x4[16]; +extern const int16_t ff_vp9_default_scan_8x8[64]; +extern const int16_t ff_vp9_col_scan_8x8[64]; +extern const int16_t ff_vp9_row_scan_8x8[64]; +extern const int16_t ff_vp9_default_scan_16x16[256]; +extern const int16_t ff_vp9_col_scan_16x16[256]; +extern const int16_t ff_vp9_row_scan_16x16[256]; +extern const int16_t ff_vp9_default_scan_32x32[1024]; +extern const int16_t * const ff_vp9_scans[5][4]; +extern const int16_t ff_vp9_default_scan_4x4_nb[16][2]; +extern const int16_t ff_vp9_col_scan_4x4_nb[16][2]; +extern const int16_t ff_vp9_row_scan_4x4_nb[16][2]; 
+extern const int16_t ff_vp9_default_scan_8x8_nb[64][2]; +extern const int16_t ff_vp9_col_scan_8x8_nb[64][2]; +extern const int16_t ff_vp9_row_scan_8x8_nb[64][2]; +extern const int16_t ff_vp9_default_scan_16x16_nb[256][2]; +extern const int16_t ff_vp9_col_scan_16x16_nb[256][2]; +extern const int16_t ff_vp9_row_scan_16x16_nb[256][2]; +extern const int16_t ff_vp9_default_scan_32x32_nb[1024][2]; +extern const int16_t (* const ff_vp9_scans_nb[5][4])[2]; +extern const uint8_t ff_vp9_model_pareto8[256][8]; +extern const ProbContext ff_vp9_default_probs; +extern const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3]; +extern const int8_t ff_vp9_mv_joint_tree[3][2]; +extern const int8_t ff_vp9_mv_class_tree[10][2]; +extern const int8_t ff_vp9_mv_fp_tree[3][2]; + +#endif /* AVCODEC_VP9DATA_H */ diff --git a/media/ffvpx/libavcodec/vp9dec.h b/media/ffvpx/libavcodec/vp9dec.h new file mode 100644 index 0000000000..de7aba0458 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9dec.h @@ -0,0 +1,259 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VP9DEC_H +#define AVCODEC_VP9DEC_H + +#include <stddef.h> +#include <stdint.h> +#include <stdatomic.h> + +#include "libavutil/buffer.h" +#include "libavutil/mem_internal.h" +#include "libavutil/thread.h" +#include "libavutil/internal.h" + +#include "get_bits.h" +#include "videodsp.h" +#include "vp9.h" +#include "vp9dsp.h" +#include "vp9shared.h" +#include "vpx_rac.h" + +#define REF_INVALID_SCALE 0xFFFF + +enum MVJoint { + MV_JOINT_ZERO, + MV_JOINT_H, + MV_JOINT_V, + MV_JOINT_HV, +}; + +typedef struct ProbContext { + uint8_t y_mode[4][9]; + uint8_t uv_mode[10][9]; + uint8_t filter[4][2]; + uint8_t mv_mode[7][3]; + uint8_t intra[4]; + uint8_t comp[5]; + uint8_t single_ref[5][2]; + uint8_t comp_ref[5]; + uint8_t tx32p[2][3]; + uint8_t tx16p[2][2]; + uint8_t tx8p[2]; + uint8_t skip[3]; + uint8_t mv_joint[3]; + struct { + uint8_t sign; + uint8_t classes[10]; + uint8_t class0; + uint8_t bits[10]; + uint8_t class0_fp[2][3]; + uint8_t fp[3]; + uint8_t class0_hp; + uint8_t hp; + } mv_comp[2]; + uint8_t partition[4][4][3]; +} ProbContext; + +typedef struct VP9Filter { + uint8_t level[8 * 8]; + uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */] + [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */]; +} VP9Filter; + +typedef struct VP9Block { + uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip; + enum FilterMode filter; + VP9mv mv[4 /* b_idx */][2 /* ref */]; + enum BlockSize bs; + enum TxfmMode tx, uvtx; + enum BlockLevel bl; + enum BlockPartition bp; +} VP9Block; + +typedef struct VP9TileData VP9TileData; + +typedef struct VP9Context { + VP9SharedContext s; + VP9TileData *td; + + VP9DSPContext dsp; + VideoDSPContext vdsp; + GetBitContext gb; + VPXRangeCoder c; + int pass, active_tile_cols; + +#if HAVE_THREADS + 
pthread_mutex_t progress_mutex; + pthread_cond_t progress_cond; + atomic_int *entries; + unsigned pthread_init_cnt; +#endif + + uint8_t ss_h, ss_v; + uint8_t last_bpp, bpp_index, bytesperpixel; + uint8_t last_keyframe; + // sb_cols/rows, rows/cols and last_fmt are used for allocating all internal + // arrays, and are thus per-thread. w/h and gf_fmt are synced between threads + // and are therefore per-stream. pix_fmt represents the value in the header + // of the currently processed frame. + int w, h; + enum AVPixelFormat pix_fmt, last_fmt, gf_fmt; + unsigned sb_cols, sb_rows, rows, cols; + ThreadFrame next_refs[8]; + + struct { + uint8_t lim_lut[64]; + uint8_t mblim_lut[64]; + } filter_lut; + struct { + ProbContext p; + uint8_t coef[4][2][2][6][6][3]; + } prob_ctx[4]; + struct { + ProbContext p; + uint8_t coef[4][2][2][6][6][11]; + } prob; + + // contextual (above) cache + uint8_t *above_partition_ctx; + uint8_t *above_mode_ctx; + // FIXME maybe merge some of the below in a flags field? 
+ uint8_t *above_y_nnz_ctx; + uint8_t *above_uv_nnz_ctx[2]; + uint8_t *above_skip_ctx; // 1bit + uint8_t *above_txfm_ctx; // 2bit + uint8_t *above_segpred_ctx; // 1bit + uint8_t *above_intra_ctx; // 1bit + uint8_t *above_comp_ctx; // 1bit + uint8_t *above_ref_ctx; // 2bit + uint8_t *above_filter_ctx; + VP9mv (*above_mv_ctx)[2]; + + // whole-frame cache + uint8_t *intra_pred_data[3]; + VP9Filter *lflvl; + + // block reconstruction intermediates + int block_alloc_using_2pass; + uint16_t mvscale[3][2]; + uint8_t mvstep[3][2]; + + // frame specific buffer pools + AVBufferPool *frame_extradata_pool; + int frame_extradata_pool_size; +} VP9Context; + +struct VP9TileData { + const VP9Context *s; + VPXRangeCoder *c_b; + VPXRangeCoder *c; + int row, row7, col, col7; + uint8_t *dst[3]; + ptrdiff_t y_stride, uv_stride; + VP9Block *b_base, *b; + unsigned tile_col_start; + + struct { + unsigned y_mode[4][10]; + unsigned uv_mode[10][10]; + unsigned filter[4][3]; + unsigned mv_mode[7][4]; + unsigned intra[4][2]; + unsigned comp[5][2]; + unsigned single_ref[5][2][2]; + unsigned comp_ref[5][2]; + unsigned tx32p[2][4]; + unsigned tx16p[2][3]; + unsigned tx8p[2][2]; + unsigned skip[3][2]; + unsigned mv_joint[4]; + struct { + unsigned sign[2]; + unsigned classes[11]; + unsigned class0[2]; + unsigned bits[10][2]; + unsigned class0_fp[2][4]; + unsigned fp[4]; + unsigned class0_hp[2]; + unsigned hp[2]; + } mv_comp[2]; + unsigned partition[4][4][4]; + unsigned coef[4][2][2][6][6][3]; + unsigned eob[4][2][2][6][6][2]; + } counts; + + // whole-frame cache + DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[135 * 144 * 2]; + + // contextual (left) cache + DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16]; + DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16]; + DECLARE_ALIGNED(16, VP9mv, left_mv_ctx)[16][2]; + DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16]; + DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8]; + DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8]; + DECLARE_ALIGNED(8, uint8_t, 
left_txfm_ctx)[8]; + DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8]; + DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8]; + DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8]; + DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8]; + DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8]; + // block reconstruction intermediates + DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2]; + DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2]; + struct { int x, y; } min_mv, max_mv; + int16_t *block_base, *block, *uvblock_base[2], *uvblock[2]; + uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2]; + + // error message + int error_info; + struct { + unsigned int row:13; + unsigned int col:13; + unsigned int block_size_idx_x:2; + unsigned int block_size_idx_y:2; + } *block_structure; + unsigned int nb_block_structure; +}; + +void ff_vp9_fill_mv(VP9TileData *td, VP9mv *mv, int mode, int sb); + +void ff_vp9_adapt_probs(VP9Context *s); + +void ff_vp9_decode_block(VP9TileData *td, int row, int col, + VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff, + enum BlockLevel bl, enum BlockPartition bp); + +void ff_vp9_loopfilter_sb(AVCodecContext *avctx, VP9Filter *lflvl, + int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff); + +void ff_vp9_intra_recon_8bpp(VP9TileData *td, + ptrdiff_t y_off, ptrdiff_t uv_off); +void ff_vp9_intra_recon_16bpp(VP9TileData *td, + ptrdiff_t y_off, ptrdiff_t uv_off); +void ff_vp9_inter_recon_8bpp(VP9TileData *td); +void ff_vp9_inter_recon_16bpp(VP9TileData *td); + +#endif /* AVCODEC_VP9DEC_H */ diff --git a/media/ffvpx/libavcodec/vp9dsp.c b/media/ffvpx/libavcodec/vp9dsp.c new file mode 100644 index 0000000000..d8ddf74d4f --- /dev/null +++ b/media/ffvpx/libavcodec/vp9dsp.c @@ -0,0 +1,110 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/mem_internal.h" + +#include "vp9dsp.h" + +const DECLARE_ALIGNED(16, int16_t, ff_vp9_subpel_filters)[3][16][8] = { + [FILTER_8TAP_REGULAR] = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { 0, 1, -5, 126, 8, -3, 1, 0 }, + { -1, 3, -10, 122, 18, -6, 2, 0 }, + { -1, 4, -13, 118, 27, -9, 3, -1 }, + { -1, 4, -16, 112, 37, -11, 4, -1 }, + { -1, 5, -18, 105, 48, -14, 4, -1 }, + { -1, 5, -19, 97, 58, -16, 5, -1 }, + { -1, 6, -19, 88, 68, -18, 5, -1 }, + { -1, 6, -19, 78, 78, -19, 6, -1 }, + { -1, 5, -18, 68, 88, -19, 6, -1 }, + { -1, 5, -16, 58, 97, -19, 5, -1 }, + { -1, 4, -14, 48, 105, -18, 5, -1 }, + { -1, 4, -11, 37, 112, -16, 4, -1 }, + { -1, 3, -9, 27, 118, -13, 4, -1 }, + { 0, 2, -6, 18, 122, -10, 3, -1 }, + { 0, 1, -3, 8, 126, -5, 1, 0 }, + }, [FILTER_8TAP_SHARP] = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { -1, 3, -7, 127, 8, -3, 1, 0 }, + { -2, 5, -13, 125, 17, -6, 3, -1 }, + { -3, 7, -17, 121, 27, -10, 5, -2 }, + { -4, 9, -20, 115, 37, -13, 6, -2 }, + { -4, 10, -23, 108, 48, -16, 8, -3 }, + { -4, 10, -24, 100, 59, -19, 9, -3 }, + { -4, 11, -24, 90, 70, -21, 10, -4 }, + { -4, 11, -23, 80, 80, -23, 11, -4 }, + { -4, 10, -21, 70, 90, -24, 11, 
-4 }, + { -3, 9, -19, 59, 100, -24, 10, -4 }, + { -3, 8, -16, 48, 108, -23, 10, -4 }, + { -2, 6, -13, 37, 115, -20, 9, -4 }, + { -2, 5, -10, 27, 121, -17, 7, -3 }, + { -1, 3, -6, 17, 125, -13, 5, -2 }, + { 0, 1, -3, 8, 127, -7, 3, -1 }, + }, [FILTER_8TAP_SMOOTH] = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { -3, -1, 32, 64, 38, 1, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, + { -2, -2, 26, 63, 43, 4, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, + { -2, -3, 21, 60, 49, 7, -4, 0 }, + { -1, -4, 18, 59, 51, 9, -4, 0 }, + { -1, -4, 16, 57, 53, 12, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, + { -1, -4, 12, 53, 57, 16, -4, -1 }, + { 0, -4, 9, 51, 59, 18, -4, -1 }, + { 0, -4, 7, 49, 60, 21, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, + { 0, -4, 4, 43, 63, 26, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, + { 0, -3, 1, 38, 64, 32, -1, -3 }, + } +}; + + +av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact) +{ + if (bpp == 8) { + ff_vp9dsp_init_8(dsp); + } else if (bpp == 10) { + ff_vp9dsp_init_10(dsp); + } else { + av_assert0(bpp == 12); + ff_vp9dsp_init_12(dsp); + } + +#if ARCH_AARCH64 + ff_vp9dsp_init_aarch64(dsp, bpp); +#elif ARCH_ARM + ff_vp9dsp_init_arm(dsp, bpp); +#elif ARCH_X86 + ff_vp9dsp_init_x86(dsp, bpp, bitexact); +#elif ARCH_MIPS + ff_vp9dsp_init_mips(dsp, bpp); +#elif ARCH_LOONGARCH + ff_vp9dsp_init_loongarch(dsp, bpp); +#endif +} diff --git a/media/ffvpx/libavcodec/vp9dsp.h b/media/ffvpx/libavcodec/vp9dsp.h new file mode 100644 index 0000000000..be0ac0b181 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9dsp.h @@ -0,0 +1,138 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VP9DSP_H +#define AVCODEC_VP9DSP_H + +#include <stddef.h> +#include <stdint.h> + +#include "libavcodec/vp9.h" +#include "libavutil/attributes_internal.h" + +typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + int h, int mx, int my); +typedef void (*vp9_scaled_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + int h, int mx, int my, int dx, int dy); + +typedef struct VP9DSPContext { + /* + * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32 + * dimension 2: intra prediction modes + * + * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels) + * stride is aligned by 16 pixels + * top[-1] is top/left; top[4,7] is top-right for 4x4 + */ + // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/ + // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place? 
+ // also needs to fit in with what H.264/VP8/etc do + void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst, + ptrdiff_t stride, + const uint8_t *left, + const uint8_t *top); + + /* + * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only) + * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst + * + * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels) + * stride is aligned by 16 pixels + * block is 16-byte aligned + * eob indicates the position (+1) of the last non-zero coefficient, + * in scan-order. This can be used to write faster versions, e.g. a + * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32, + * etc. + */ + // FIXME also write idct_add_block() versions for whole (inter) pred + // blocks, so we can do 2 4x4s at once + void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst, + ptrdiff_t stride, + int16_t *block, int eob); + + /* + * dimension 1: width of filter (0=4, 1=8, 2=16) + * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v) + * + * dst/stride are aligned by 8 + */ + void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, + int mb_lim, int lim, int hev_thr); + + /* + * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v) + * + * The width of filter is assumed to be 16; dst/stride are aligned by 16 + */ + void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride, + int mb_lim, int lim, int hev_thr); + + /* + * dimension 1/2: width of filter (0=4, 1=8) for each filter half + * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v) + * + * dst/stride are aligned by operation size + * this basically calls loop_filter[d1][d3][0](), followed by + * loop_filter[d2][d3][0]() on the next 8 pixels + * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the + * integer. 
+ */ + // FIXME perhaps a mix4 that operates on 32px (for AVX2) + void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, + int mb_lim, int lim, int hev_thr); + + /* + * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4) + * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin) + * dimension 3: averaging type (0: put, 1: avg) + * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin) + * dimension 5: y subpel interpolation (0: none, 1: 8tap/bilin) + * + * dst/stride are aligned by hsize + */ + vp9_mc_func mc[5][N_FILTERS][2][2][2]; + + /* + * for scalable MC, first 3 dimensions identical to above, the other two + * don't exist since it changes per stepsize. + */ + vp9_scaled_mc_func smc[5][N_FILTERS][2]; +} VP9DSPContext; + +extern const int16_t attribute_visibility_hidden ff_vp9_subpel_filters[3][16][8]; + +void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact); + +void ff_vp9dsp_init_8(VP9DSPContext *dsp); +void ff_vp9dsp_init_10(VP9DSPContext *dsp); +void ff_vp9dsp_init_12(VP9DSPContext *dsp); + +void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp); +void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp); +void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact); +void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp); +void ff_vp9dsp_init_loongarch(VP9DSPContext *dsp, int bpp); + +#endif /* AVCODEC_VP9DSP_H */ diff --git a/media/ffvpx/libavcodec/vp9dsp_10bpp.c b/media/ffvpx/libavcodec/vp9dsp_10bpp.c new file mode 100644 index 0000000000..62ce182070 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9dsp_10bpp.c @@ -0,0 +1,26 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BIT_DEPTH 10 +#define dctint int64_t +#include "vp9dsp_template.c" diff --git a/media/ffvpx/libavcodec/vp9dsp_12bpp.c b/media/ffvpx/libavcodec/vp9dsp_12bpp.c new file mode 100644 index 0000000000..2f36471c5b --- /dev/null +++ b/media/ffvpx/libavcodec/vp9dsp_12bpp.c @@ -0,0 +1,26 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BIT_DEPTH 12 +#define dctint int64_t +#include "vp9dsp_template.c" diff --git a/media/ffvpx/libavcodec/vp9dsp_8bpp.c b/media/ffvpx/libavcodec/vp9dsp_8bpp.c new file mode 100644 index 0000000000..4b219b06b0 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9dsp_8bpp.c @@ -0,0 +1,26 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define BIT_DEPTH 8 +#define dctint int +#include "vp9dsp_template.c" diff --git a/media/ffvpx/libavcodec/vp9dsp_template.c b/media/ffvpx/libavcodec/vp9dsp_template.c new file mode 100644 index 0000000000..9b11661704 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9dsp_template.c @@ -0,0 +1,2546 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément BÅ“sch <u pkh me> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "bit_depth_template.c" +#include "vp9dsp.h" + +#if BIT_DEPTH != 12 + +// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8) +// back with h264pred.[ch] + +static void vert_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 p4 = AV_RN4PA(top); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, p4); + AV_WN4PA(dst + stride * 1, p4); + AV_WN4PA(dst + stride * 2, p4); + AV_WN4PA(dst + stride * 3, p4); +} + +static void vert_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 p4a = AV_RN4PA(top + 0); + pixel4 p4b = AV_RN4PA(top + 4); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, p4a); + AV_WN4PA(dst + 4, p4b); + dst += stride; + } +} + +static void vert_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 p4a = AV_RN4PA(top + 0); + pixel4 p4b = AV_RN4PA(top + 4); + pixel4 p4c = 
AV_RN4PA(top + 8); + pixel4 p4d = AV_RN4PA(top + 12); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, p4a); + AV_WN4PA(dst + 4, p4b); + AV_WN4PA(dst + 8, p4c); + AV_WN4PA(dst + 12, p4d); + dst += stride; + } +} + +static void vert_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 p4a = AV_RN4PA(top + 0); + pixel4 p4b = AV_RN4PA(top + 4); + pixel4 p4c = AV_RN4PA(top + 8); + pixel4 p4d = AV_RN4PA(top + 12); + pixel4 p4e = AV_RN4PA(top + 16); + pixel4 p4f = AV_RN4PA(top + 20); + pixel4 p4g = AV_RN4PA(top + 24); + pixel4 p4h = AV_RN4PA(top + 28); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, p4a); + AV_WN4PA(dst + 4, p4b); + AV_WN4PA(dst + 8, p4c); + AV_WN4PA(dst + 12, p4d); + AV_WN4PA(dst + 16, p4e); + AV_WN4PA(dst + 20, p4f); + AV_WN4PA(dst + 24, p4g); + AV_WN4PA(dst + 28, p4h); + dst += stride; + } +} + +static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3])); + AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2])); + AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1])); + AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0])); +} + +static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]); + + AV_WN4PA(dst + 0, p4); + AV_WN4PA(dst + 4, p4); + dst += stride; + } +} + +static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + 
int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]); + + AV_WN4PA(dst + 0, p4); + AV_WN4PA(dst + 4, p4); + AV_WN4PA(dst + 8, p4); + AV_WN4PA(dst + 12, p4); + dst += stride; + } +} + +static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]); + + AV_WN4PA(dst + 0, p4); + AV_WN4PA(dst + 4, p4); + AV_WN4PA(dst + 8, p4); + AV_WN4PA(dst + 12, p4); + AV_WN4PA(dst + 16, p4); + AV_WN4PA(dst + 20, p4); + AV_WN4PA(dst + 24, p4); + AV_WN4PA(dst + 28, p4); + dst += stride; + } +} + +#endif /* BIT_DEPTH != 12 */ + +static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 4; y++) { + int l_m_tl = left[3 - y] - tl; + + dst[0] = av_clip_pixel(top[0] + l_m_tl); + dst[1] = av_clip_pixel(top[1] + l_m_tl); + dst[2] = av_clip_pixel(top[2] + l_m_tl); + dst[3] = av_clip_pixel(top[3] + l_m_tl); + dst += stride; + } +} + +static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + int l_m_tl = left[7 - y] - tl; + + dst[0] = av_clip_pixel(top[0] + l_m_tl); + dst[1] = av_clip_pixel(top[1] + l_m_tl); + dst[2] = av_clip_pixel(top[2] + l_m_tl); + dst[3] = av_clip_pixel(top[3] + l_m_tl); + dst[4] = av_clip_pixel(top[4] + l_m_tl); + dst[5] = av_clip_pixel(top[5] + l_m_tl); + dst[6] = av_clip_pixel(top[6] + l_m_tl); + dst[7] = av_clip_pixel(top[7] 
+ l_m_tl); + dst += stride; + } +} + +static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + int l_m_tl = left[15 - y] - tl; + + dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl); + dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl); + dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl); + dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl); + dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl); + dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl); + dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl); + dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl); + dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl); + dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl); + dst[10] = av_clip_pixel(top[10] + l_m_tl); + dst[11] = av_clip_pixel(top[11] + l_m_tl); + dst[12] = av_clip_pixel(top[12] + l_m_tl); + dst[13] = av_clip_pixel(top[13] + l_m_tl); + dst[14] = av_clip_pixel(top[14] + l_m_tl); + dst[15] = av_clip_pixel(top[15] + l_m_tl); + dst += stride; + } +} + +static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + int y, tl = top[-1]; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + int l_m_tl = left[31 - y] - tl; + + dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl); + dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl); + dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl); + dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl); + dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl); + dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl); + dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl); + dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl); + dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl); + dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl); + dst[10] = av_clip_pixel(top[10] + l_m_tl); + dst[11] = av_clip_pixel(top[11] + 
l_m_tl); + dst[12] = av_clip_pixel(top[12] + l_m_tl); + dst[13] = av_clip_pixel(top[13] + l_m_tl); + dst[14] = av_clip_pixel(top[14] + l_m_tl); + dst[15] = av_clip_pixel(top[15] + l_m_tl); + dst[16] = av_clip_pixel(top[16] + l_m_tl); + dst[17] = av_clip_pixel(top[17] + l_m_tl); + dst[18] = av_clip_pixel(top[18] + l_m_tl); + dst[19] = av_clip_pixel(top[19] + l_m_tl); + dst[20] = av_clip_pixel(top[20] + l_m_tl); + dst[21] = av_clip_pixel(top[21] + l_m_tl); + dst[22] = av_clip_pixel(top[22] + l_m_tl); + dst[23] = av_clip_pixel(top[23] + l_m_tl); + dst[24] = av_clip_pixel(top[24] + l_m_tl); + dst[25] = av_clip_pixel(top[25] + l_m_tl); + dst[26] = av_clip_pixel(top[26] + l_m_tl); + dst[27] = av_clip_pixel(top[27] + l_m_tl); + dst[28] = av_clip_pixel(top[28] + l_m_tl); + dst[29] = av_clip_pixel(top[29] + l_m_tl); + dst[30] = av_clip_pixel(top[30] + l_m_tl); + dst[31] = av_clip_pixel(top[31] + l_m_tl); + dst += stride; + } +} + +#if BIT_DEPTH != 12 + +static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + + top[0] + top[1] + top[2] + top[3] + 4) >> 3); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, dc); + AV_WN4PA(dst + stride * 1, dc); + AV_WN4PA(dst + stride * 2, dc); + AV_WN4PA(dst + stride * 3, dc); +} + +static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + + left[6] + left[7] + top[0] + top[1] + top[2] + top[3] + + top[4] + top[5] + top[6] + top[7] + 8) >> 4); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, 
dc); + dst += stride; + } +} + +static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] + + left[7] + left[8] + left[9] + left[10] + left[11] + left[12] + + left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] + + top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] + + top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + dst += stride; + } +} + +static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] + + left[7] + left[8] + left[9] + left[10] + left[11] + left[12] + + left[13] + left[14] + left[15] + left[16] + left[17] + left[18] + + left[19] + left[20] + left[21] + left[22] + left[23] + left[24] + + left[25] + left[26] + left[27] + left[28] + left[29] + left[30] + + left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] + + top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] + + top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] + + top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] + + top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + AV_WN4PA(dst + 16, dc); + AV_WN4PA(dst + 20, dc); + AV_WN4PA(dst + 24, 
dc); + AV_WN4PA(dst + 28, dc); + dst += stride; + } +} + +static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, dc); + AV_WN4PA(dst + stride * 1, dc); + AV_WN4PA(dst + stride * 2, dc); + AV_WN4PA(dst + stride * 3, dc); +} + +static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + + left[4] + left[5] + left[6] + left[7] + 4) >> 3); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + dst += stride; + } +} + +static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + + left[6] + left[7] + left[8] + left[9] + left[10] + left[11] + + left[12] + left[13] + left[14] + left[15] + 8) >> 4); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + dst += stride; + } +} + +static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + pixel4 dc = PIXEL_SPLAT_X4 + ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + + left[6] + left[7] + left[8] + left[9] + left[10] + left[11] + + left[12] + left[13] + left[14] + left[15] + left[16] + left[17] + + left[18] + left[19] + left[20] + left[21] + left[22] + 
left[23] + + left[24] + left[25] + left[26] + left[27] + left[28] + left[29] + + left[30] + left[31] + 16) >> 5); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + AV_WN4PA(dst + 16, dc); + AV_WN4PA(dst + 20, dc); + AV_WN4PA(dst + 24, dc); + AV_WN4PA(dst + 28, dc); + dst += stride; + } +} + +static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, dc); + AV_WN4PA(dst + stride * 1, dc); + AV_WN4PA(dst + stride * 2, dc); + AV_WN4PA(dst + stride * 3, dc); +} + +static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((top[0] + top[1] + top[2] + top[3] + + top[4] + top[5] + top[6] + top[7] + 4) >> 3); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + dst += stride; + } +} + +static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] + + top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + + top[12] + top[13] + top[14] + top[15] + 8) >> 4); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + dst += stride; + } +} + +static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const 
pixel *top = (const pixel *) _top; + pixel4 dc = PIXEL_SPLAT_X4 + ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] + + top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + + top[12] + top[13] + top[14] + top[15] + top[16] + top[17] + + top[18] + top[19] + top[20] + top[21] + top[22] + top[23] + + top[24] + top[25] + top[26] + top[27] + top[28] + top[29] + + top[30] + top[31] + 16) >> 5); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, dc); + AV_WN4PA(dst + 4, dc); + AV_WN4PA(dst + 8, dc); + AV_WN4PA(dst + 12, dc); + AV_WN4PA(dst + 16, dc); + AV_WN4PA(dst + 20, dc); + AV_WN4PA(dst + 24, dc); + AV_WN4PA(dst + 28, dc); + dst += stride; + } +} + +#endif /* BIT_DEPTH != 12 */ + +static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8)); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, val); + AV_WN4PA(dst + stride * 1, val); + AV_WN4PA(dst + stride * 2, val); + AV_WN4PA(dst + stride * 3, val); +} + +static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8)); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + dst += stride; + } +} + +static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8)); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + dst += stride; + } +} + +static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4(128 << 
(BIT_DEPTH - 8)); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + AV_WN4PA(dst + 16, val); + AV_WN4PA(dst + 20, val); + AV_WN4PA(dst + 24, val); + AV_WN4PA(dst + 28, val); + dst += stride; + } +} + +static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, val); + AV_WN4PA(dst + stride * 1, val); + AV_WN4PA(dst + stride * 2, val); + AV_WN4PA(dst + stride * 3, val);} + +static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + dst += stride; + } +} + +static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + dst += stride; + } +} + +static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + AV_WN4PA(dst + 16, val); + AV_WN4PA(dst + 20, val); + AV_WN4PA(dst + 24, val); + AV_WN4PA(dst + 28, val); + dst += stride; + } +} + +static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride, + 
const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + + stride /= sizeof(pixel); + AV_WN4PA(dst + stride * 0, val); + AV_WN4PA(dst + stride * 1, val); + AV_WN4PA(dst + stride * 2, val); + AV_WN4PA(dst + stride * 3, val); +} + +static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 8; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + dst += stride; + } +} + +static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 16; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + dst += stride; + } +} + +static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1); + int y; + + stride /= sizeof(pixel); + for (y = 0; y < 32; y++) { + AV_WN4PA(dst + 0, val); + AV_WN4PA(dst + 4, val); + AV_WN4PA(dst + 8, val); + AV_WN4PA(dst + 12, val); + AV_WN4PA(dst + 16, val); + AV_WN4PA(dst + 20, val); + AV_WN4PA(dst + 24, val); + AV_WN4PA(dst + 28, val); + dst += stride; + } +} + +#if BIT_DEPTH != 12 + +#if BIT_DEPTH == 8 +#define memset_bpc memset +#else +static inline void memset_bpc(uint16_t *dst, int val, int len) { + int n; + for (n = 0; n < len; n++) { + dst[n] = val; + } +} +#endif + +#define DST(x, y) dst[(x) + (y) * stride] + +static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; 
+ int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7]; + + stride /= sizeof(pixel); + DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2; + DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2; + DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2; + DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2; + DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2; + DST(3,3) = a7; // note: this is different from vp8 and such +} + +#define def_diag_downleft(size) \ +static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + int i, j; \ + pixel v[size - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) \ + v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \ + v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \ +\ + for (j = 0; j < size; j++) { \ + memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \ + memset_bpc(dst + j*stride + size - 1 - j, top[size - 1], j + 1); \ + } \ +} + +def_diag_downleft(8) +def_diag_downleft(16) +def_diag_downleft(32) + +static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + const pixel *left = (const pixel *) _left; + int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0]; + + stride /= sizeof(pixel); + DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2; + DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2; + DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2; + DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2; + DST(2,0) = DST(3,1) 
= (a0 + a1 * 2 + a2 + 2) >> 2; + DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2; +} + +#define def_diag_downright(size) \ +static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel v[size + size - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + v[i ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \ + v[size + 1 + i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \ + } \ + v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \ + v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \ + v[size ] = (top[-1] + top[0] * 2 + top[ 1] + 2) >> 2; \ +\ + for (j = 0; j < size; j++) \ + memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \ +} + +def_diag_downright(8) +def_diag_downright(16) +def_diag_downright(32) + +static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + const pixel *left = (const pixel *) _left; + int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + l0 = left[3], l1 = left[2], l2 = left[1]; + + stride /= sizeof(pixel); + DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2; + DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1; + DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2; + DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1; + DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2; + DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1; + DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(3,0) = (a2 + a3 + 1) >> 1; + DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2; +} + +#define def_vert_right(size) \ +static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *_top) \ 
+{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size/2 - 2; i++) { \ + vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \ + ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \ + } \ + vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \ + ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \ +\ + ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \ + vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \ + for (i = 0; i < size - 1; i++) { \ + ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \ + vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \ + } \ +\ + for (j = 0; j < size / 2; j++) { \ + memcpy(dst + j*2 *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \ + memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \ + } \ +} + +def_vert_right(8) +def_vert_right(16) +def_vert_right(32) + +static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + const pixel *left = (const pixel *) _left; + int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0], + tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2]; + + stride /= sizeof(pixel); + DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2; + DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1; + DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2; + DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1; + DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2; + DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1; + DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,3) = (l2 + l3 + 1) >> 1; + DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2; +} + +#define 
def_hor_down(size) \ +static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const pixel *) _top; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel v[size * 3 - 2]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + v[i*2 ] = (left[i + 1] + left[i + 0] + 1) >> 1; \ + v[i*2 + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \ + v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \ + } \ + v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \ + v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \ + v[size*2 - 1] = (top[0] + top[-1] * 2 + left[size - 1] + 2) >> 2; \ + v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \ +\ + for (j = 0; j < size; j++) \ + memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \ +} + +def_hor_down(8) +def_hor_down(16) +def_hor_down(32) + +static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *_top) +{ + pixel *dst = (pixel *) _dst; + const pixel *top = (const pixel *) _top; + int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3], + a4 = top[4], a5 = top[5], a6 = top[6]; + + stride /= sizeof(pixel); + DST(0,0) = (a0 + a1 + 1) >> 1; + DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2; + DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1; + DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2; + DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1; + DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2; + DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1; + DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2; + DST(3,2) = (a4 + a5 + 1) >> 1; + DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2; +} + +#define def_vert_left(size) \ +static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *left, const uint8_t *_top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *top = (const 
pixel *) _top; \ + int i, j; \ + pixel ve[size - 1], vo[size - 1]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + ve[i] = (top[i] + top[i + 1] + 1) >> 1; \ + vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \ + } \ + ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \ + vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \ +\ + for (j = 0; j < size / 2; j++) { \ + memcpy(dst + j*2 * stride, ve + j, (size - j - 1) * sizeof(pixel)); \ + memset_bpc(dst + j*2 * stride + size - j - 1, top[size - 1], j + 1); \ + memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \ + memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \ + } \ +} + +def_vert_left(8) +def_vert_left(16) +def_vert_left(32) + +static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride, + const uint8_t *_left, const uint8_t *top) +{ + pixel *dst = (pixel *) _dst; + const pixel *left = (const pixel *) _left; + int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3]; + + stride /= sizeof(pixel); + DST(0,0) = (l0 + l1 + 1) >> 1; + DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2; + DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1; + DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2; + DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1; + DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2; + DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3; +} + +#define def_hor_up(size) \ +static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \ + const uint8_t *_left, const uint8_t *top) \ +{ \ + pixel *dst = (pixel *) _dst; \ + const pixel *left = (const pixel *) _left; \ + int i, j; \ + pixel v[size*2 - 2]; \ +\ + stride /= sizeof(pixel); \ + for (i = 0; i < size - 2; i++) { \ + v[i*2 ] = (left[i] + left[i + 1] + 1) >> 1; \ + v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \ + } \ + v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \ + v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \ +\ 
+ for (j = 0; j < size / 2; j++) \ + memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \ + for (j = size / 2; j < size; j++) { \ + memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \ + memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \ + 2 + j*2 - size); \ + } \ +} + +def_hor_up(8) +def_hor_up(16) +def_hor_up(32) + +#undef DST + +#endif /* BIT_DEPTH != 12 */ + +#if BIT_DEPTH != 8 +void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp); +#endif +#if BIT_DEPTH != 10 +static +#endif +av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp) +{ +#define init_intra_pred_bd_aware(tx, sz) \ + dsp->intra_pred[tx][TM_VP8_PRED] = tm_##sz##_c; \ + dsp->intra_pred[tx][DC_128_PRED] = dc_128_##sz##_c; \ + dsp->intra_pred[tx][DC_127_PRED] = dc_127_##sz##_c; \ + dsp->intra_pred[tx][DC_129_PRED] = dc_129_##sz##_c + +#if BIT_DEPTH == 12 + ff_vp9dsp_intrapred_init_10(dsp); +#define init_intra_pred(tx, sz) \ + init_intra_pred_bd_aware(tx, sz) +#else + #define init_intra_pred(tx, sz) \ + dsp->intra_pred[tx][VERT_PRED] = vert_##sz##_c; \ + dsp->intra_pred[tx][HOR_PRED] = hor_##sz##_c; \ + dsp->intra_pred[tx][DC_PRED] = dc_##sz##_c; \ + dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = diag_downleft_##sz##_c; \ + dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \ + dsp->intra_pred[tx][VERT_RIGHT_PRED] = vert_right_##sz##_c; \ + dsp->intra_pred[tx][HOR_DOWN_PRED] = hor_down_##sz##_c; \ + dsp->intra_pred[tx][VERT_LEFT_PRED] = vert_left_##sz##_c; \ + dsp->intra_pred[tx][HOR_UP_PRED] = hor_up_##sz##_c; \ + dsp->intra_pred[tx][LEFT_DC_PRED] = dc_left_##sz##_c; \ + dsp->intra_pred[tx][TOP_DC_PRED] = dc_top_##sz##_c; \ + init_intra_pred_bd_aware(tx, sz) +#endif + + init_intra_pred(TX_4X4, 4x4); + init_intra_pred(TX_8X8, 8x8); + init_intra_pred(TX_16X16, 16x16); + init_intra_pred(TX_32X32, 32x32); + +#undef init_intra_pred +#undef init_intra_pred_bd_aware +} + +#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \ +static void 
type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \ + ptrdiff_t stride, \ + int16_t *_block, int eob) \ +{ \ + int i, j; \ + pixel *dst = (pixel *) _dst; \ + dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \ +\ + stride /= sizeof(pixel); \ + if (has_dconly && eob == 1) { \ + const int t = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \ + * 11585 + (1 << 13)) >> 14; \ + block[0] = 0; \ + for (i = 0; i < sz; i++) { \ + for (j = 0; j < sz; j++) \ + dst[j * stride] = av_clip_pixel(dst[j * stride] + \ + (bits ? \ + (int)(t + (1U << (bits - 1))) >> bits : \ + t)); \ + dst++; \ + } \ + return; \ + } \ +\ + for (i = 0; i < sz; i++) \ + type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \ + memset(block, 0, sz * sz * sizeof(*block)); \ + for (i = 0; i < sz; i++) { \ + type_b##sz##_1d(tmp + i, sz, out, 1); \ + for (j = 0; j < sz; j++) \ + dst[j * stride] = av_clip_pixel(dst[j * stride] + \ + (bits ? \ + (int)(out[j] + (1U << (bits - 1))) >> bits : \ + out[j])); \ + dst++; \ + } \ +} + +#define itxfm_wrap(sz, bits) \ +itxfm_wrapper(idct, idct, sz, bits, 1) \ +itxfm_wrapper(iadst, idct, sz, bits, 0) \ +itxfm_wrapper(idct, iadst, sz, bits, 0) \ +itxfm_wrapper(iadst, iadst, sz, bits, 0) + +#define IN(x) ((dctint) in[(x) * stride]) + +static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3; + + t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14; + t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14; + t2 = (IN(1) * 6270 - IN(3) * 15137 + (1 << 13)) >> 14; + t3 = (IN(1) * 15137 + IN(3) * 6270 + (1 << 13)) >> 14; + + out[0] = t0 + t3; + out[1] = t1 + t2; + out[2] = t1 - t2; + out[3] = t0 - t3; +} + +static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3; + + t0 = 5283 * IN(0) + 15212 * IN(2) + 9929 * IN(3); + t1 = 9929 * IN(0) - 5283 * IN(2) - 15212 * IN(3); + t2 = 13377 * (IN(0) - IN(2) + IN(3)); + t3 = 13377 * IN(1); + + 
out[0] = (t0 + t3 + (1 << 13)) >> 14; + out[1] = (t1 + t3 + (1 << 13)) >> 14; + out[2] = (t2 + (1 << 13)) >> 14; + out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14; +} + +itxfm_wrap(4, 4) + +static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a; + + t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14; + t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14; + t2a = (IN(2) * 6270 - IN(6) * 15137 + (1 << 13)) >> 14; + t3a = (IN(2) * 15137 + IN(6) * 6270 + (1 << 13)) >> 14; + t4a = (IN(1) * 3196 - IN(7) * 16069 + (1 << 13)) >> 14; + t5a = (IN(5) * 13623 - IN(3) * 9102 + (1 << 13)) >> 14; + t6a = (IN(5) * 9102 + IN(3) * 13623 + (1 << 13)) >> 14; + t7a = (IN(1) * 16069 + IN(7) * 3196 + (1 << 13)) >> 14; + + t0 = t0a + t3a; + t1 = t1a + t2a; + t2 = t1a - t2a; + t3 = t0a - t3a; + t4 = t4a + t5a; + t5a = t4a - t5a; + t7 = t7a + t6a; + t6a = t7a - t6a; + + t5 = ((t6a - t5a) * 11585 + (1 << 13)) >> 14; + t6 = ((t6a + t5a) * 11585 + (1 << 13)) >> 14; + + out[0] = t0 + t7; + out[1] = t1 + t6; + out[2] = t2 + t5; + out[3] = t3 + t4; + out[4] = t3 - t4; + out[5] = t2 - t5; + out[6] = t1 - t6; + out[7] = t0 - t7; +} + +static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a; + + t0a = 16305 * IN(7) + 1606 * IN(0); + t1a = 1606 * IN(7) - 16305 * IN(0); + t2a = 14449 * IN(5) + 7723 * IN(2); + t3a = 7723 * IN(5) - 14449 * IN(2); + t4a = 10394 * IN(3) + 12665 * IN(4); + t5a = 12665 * IN(3) - 10394 * IN(4); + t6a = 4756 * IN(1) + 15679 * IN(6); + t7a = 15679 * IN(1) - 4756 * IN(6); + + t0 = (t0a + t4a + (1 << 13)) >> 14; + t1 = (t1a + t5a + (1 << 13)) >> 14; + t2 = (t2a + t6a + (1 << 13)) >> 14; + t3 = (t3a + t7a + (1 << 13)) >> 14; + t4 = (t0a - t4a + (1 << 13)) >> 14; + t5 = (t1a - t5a + (1 << 13)) >> 14; + t6 = (t2a - t6a + (1 << 13)) >> 
14; + t7 = (t3a - t7a + (1 << 13)) >> 14; + + t4a = 15137U * t4 + 6270U * t5; + t5a = 6270U * t4 - 15137U * t5; + t6a = 15137U * t7 - 6270U * t6; + t7a = 6270U * t7 + 15137U * t6; + + out[0] = t0 + t2; + out[7] = -(t1 + t3); + t2 = t0 - t2; + t3 = t1 - t3; + + out[1] = -((dctint)((1U << 13) + t4a + t6a) >> 14); + out[6] = (dctint)((1U << 13) + t5a + t7a) >> 14; + t6 = (dctint)((1U << 13) + t4a - t6a) >> 14; + t7 = (dctint)((1U << 13) + t5a - t7a) >> 14; + + out[3] = -((dctint)((t2 + t3) * 11585U + (1 << 13)) >> 14); + out[4] = (dctint)((t2 - t3) * 11585U + (1 << 13)) >> 14; + out[2] = (dctint)((t6 + t7) * 11585U + (1 << 13)) >> 14; + out[5] = -((dctint)((t6 - t7) * 11585U + (1 << 13)) >> 14); +} + +itxfm_wrap(8, 5) + +static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a; + dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a; + + t0a = (dctint)((IN(0) + IN(8)) * 11585U + (1 << 13)) >> 14; + t1a = (dctint)((IN(0) - IN(8)) * 11585U + (1 << 13)) >> 14; + t2a = (dctint)(IN(4) * 6270U - IN(12) * 15137U + (1 << 13)) >> 14; + t3a = (dctint)(IN(4) * 15137U + IN(12) * 6270U + (1 << 13)) >> 14; + t4a = (dctint)(IN(2) * 3196U - IN(14) * 16069U + (1 << 13)) >> 14; + t7a = (dctint)(IN(2) * 16069U + IN(14) * 3196U + (1 << 13)) >> 14; + t5a = (dctint)(IN(10) * 13623U - IN(6) * 9102U + (1 << 13)) >> 14; + t6a = (dctint)(IN(10) * 9102U + IN(6) * 13623U + (1 << 13)) >> 14; + t8a = (dctint)(IN(1) * 1606U - IN(15) * 16305U + (1 << 13)) >> 14; + t15a = (dctint)(IN(1) * 16305U + IN(15) * 1606U + (1 << 13)) >> 14; + t9a = (dctint)(IN(9) * 12665U - IN(7) * 10394U + (1 << 13)) >> 14; + t14a = (dctint)(IN(9) * 10394U + IN(7) * 12665U + (1 << 13)) >> 14; + t10a = (dctint)(IN(5) * 7723U - IN(11) * 14449U + (1 << 13)) >> 14; + t13a = (dctint)(IN(5) * 14449U + IN(11) * 7723U + (1 << 13)) >> 14; + t11a = (dctint)(IN(13) 
* 15679U - IN(3) * 4756U + (1 << 13)) >> 14; + t12a = (dctint)(IN(13) * 4756U + IN(3) * 15679U + (1 << 13)) >> 14; + + t0 = t0a + t3a; + t1 = t1a + t2a; + t2 = t1a - t2a; + t3 = t0a - t3a; + t4 = t4a + t5a; + t5 = t4a - t5a; + t6 = t7a - t6a; + t7 = t7a + t6a; + t8 = t8a + t9a; + t9 = t8a - t9a; + t10 = t11a - t10a; + t11 = t11a + t10a; + t12 = t12a + t13a; + t13 = t12a - t13a; + t14 = t15a - t14a; + t15 = t15a + t14a; + + t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14; + t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14; + t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14; + t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14; + t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14; + t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14; + + t0a = t0 + t7; + t1a = t1 + t6a; + t2a = t2 + t5a; + t3a = t3 + t4; + t4 = t3 - t4; + t5 = t2 - t5a; + t6 = t1 - t6a; + t7 = t0 - t7; + t8a = t8 + t11; + t9 = t9a + t10a; + t10 = t9a - t10a; + t11a = t8 - t11; + t12a = t15 - t12; + t13 = t14a - t13a; + t14 = t14a + t13a; + t15a = t15 + t12; + + t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14; + t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14; + t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14; + t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14; + + out[ 0] = t0a + t15a; + out[ 1] = t1a + t14; + out[ 2] = t2a + t13a; + out[ 3] = t3a + t12; + out[ 4] = t4 + t11; + out[ 5] = t5 + t10a; + out[ 6] = t6 + t9; + out[ 7] = t7 + t8a; + out[ 8] = t7 - t8a; + out[ 9] = t6 - t9; + out[10] = t5 - t10a; + out[11] = t4 - t11; + out[12] = t3a - t12; + out[13] = t2a - t13a; + out[14] = t1a - t14; + out[15] = t0a - t15a; +} + +static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; + dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a; + dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a; + 
+ t0 = IN(15) * 16364U + IN(0) * 804U; + t1 = IN(15) * 804U - IN(0) * 16364U; + t2 = IN(13) * 15893U + IN(2) * 3981U; + t3 = IN(13) * 3981U - IN(2) * 15893U; + t4 = IN(11) * 14811U + IN(4) * 7005U; + t5 = IN(11) * 7005U - IN(4) * 14811U; + t6 = IN(9) * 13160U + IN(6) * 9760U; + t7 = IN(9) * 9760U - IN(6) * 13160U; + t8 = IN(7) * 11003U + IN(8) * 12140U; + t9 = IN(7) * 12140U - IN(8) * 11003U; + t10 = IN(5) * 8423U + IN(10) * 14053U; + t11 = IN(5) * 14053U - IN(10) * 8423U; + t12 = IN(3) * 5520U + IN(12) * 15426U; + t13 = IN(3) * 15426U - IN(12) * 5520U; + t14 = IN(1) * 2404U + IN(14) * 16207U; + t15 = IN(1) * 16207U - IN(14) * 2404U; + + t0a = (dctint)((1U << 13) + t0 + t8 ) >> 14; + t1a = (dctint)((1U << 13) + t1 + t9 ) >> 14; + t2a = (dctint)((1U << 13) + t2 + t10) >> 14; + t3a = (dctint)((1U << 13) + t3 + t11) >> 14; + t4a = (dctint)((1U << 13) + t4 + t12) >> 14; + t5a = (dctint)((1U << 13) + t5 + t13) >> 14; + t6a = (dctint)((1U << 13) + t6 + t14) >> 14; + t7a = (dctint)((1U << 13) + t7 + t15) >> 14; + t8a = (dctint)((1U << 13) + t0 - t8 ) >> 14; + t9a = (dctint)((1U << 13) + t1 - t9 ) >> 14; + t10a = (dctint)((1U << 13) + t2 - t10) >> 14; + t11a = (dctint)((1U << 13) + t3 - t11) >> 14; + t12a = (dctint)((1U << 13) + t4 - t12) >> 14; + t13a = (dctint)((1U << 13) + t5 - t13) >> 14; + t14a = (dctint)((1U << 13) + t6 - t14) >> 14; + t15a = (dctint)((1U << 13) + t7 - t15) >> 14; + + t8 = t8a * 16069U + t9a * 3196U; + t9 = t8a * 3196U - t9a * 16069U; + t10 = t10a * 9102U + t11a * 13623U; + t11 = t10a * 13623U - t11a * 9102U; + t12 = t13a * 16069U - t12a * 3196U; + t13 = t13a * 3196U + t12a * 16069U; + t14 = t15a * 9102U - t14a * 13623U; + t15 = t15a * 13623U + t14a * 9102U; + + t0 = t0a + t4a; + t1 = t1a + t5a; + t2 = t2a + t6a; + t3 = t3a + t7a; + t4 = t0a - t4a; + t5 = t1a - t5a; + t6 = t2a - t6a; + t7 = t3a - t7a; + t8a = (dctint)((1U << 13) + t8 + t12) >> 14; + t9a = (dctint)((1U << 13) + t9 + t13) >> 14; + t10a = (dctint)((1U << 13) + t10 + t14) >> 14; + t11a = 
(dctint)((1U << 13) + t11 + t15) >> 14; + t12a = (dctint)((1U << 13) + t8 - t12) >> 14; + t13a = (dctint)((1U << 13) + t9 - t13) >> 14; + t14a = (dctint)((1U << 13) + t10 - t14) >> 14; + t15a = (dctint)((1U << 13) + t11 - t15) >> 14; + + t4a = t4 * 15137U + t5 * 6270U; + t5a = t4 * 6270U - t5 * 15137U; + t6a = t7 * 15137U - t6 * 6270U; + t7a = t7 * 6270U + t6 * 15137U; + t12 = t12a * 15137U + t13a * 6270U; + t13 = t12a * 6270U - t13a * 15137U; + t14 = t15a * 15137U - t14a * 6270U; + t15 = t15a * 6270U + t14a * 15137U; + + out[ 0] = t0 + t2; + out[15] = -(t1 + t3); + t2a = t0 - t2; + t3a = t1 - t3; + out[ 3] = -((dctint)((1U << 13) + t4a + t6a) >> 14); + out[12] = (dctint)((1U << 13) + t5a + t7a) >> 14; + t6 = (dctint)((1U << 13) + t4a - t6a) >> 14; + t7 = (dctint)((1U << 13) + t5a - t7a) >> 14; + out[ 1] = -(t8a + t10a); + out[14] = t9a + t11a; + t10 = t8a - t10a; + t11 = t9a - t11a; + out[ 2] = (dctint)((1U << 13) + t12 + t14) >> 14; + out[13] = -((dctint)((1U << 13) + t13 + t15) >> 14); + t14a = (dctint)((1U << 13) + t12 - t14) >> 14; + t15a = (dctint)((1U << 13) + t13 - t15) >> 14; + + out[ 7] = (dctint)(-(t2a + t3a) * 11585U + (1 << 13)) >> 14; + out[ 8] = (dctint)( (t2a - t3a) * 11585U + (1 << 13)) >> 14; + out[ 4] = (dctint)( (t7 + t6) * 11585U + (1 << 13)) >> 14; + out[11] = (dctint)( (t7 - t6) * 11585U + (1 << 13)) >> 14; + out[ 6] = (dctint)( (t11 + t10) * 11585U + (1 << 13)) >> 14; + out[ 9] = (dctint)( (t11 - t10) * 11585U + (1 << 13)) >> 14; + out[ 5] = (dctint)(-(t14a + t15a) * 11585U + (1 << 13)) >> 14; + out[10] = (dctint)( (t14a - t15a) * 11585U + (1 << 13)) >> 14; +} + +itxfm_wrap(16, 6) + +static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride, + dctcoef *out, int pass) +{ + dctint t0a = (dctint)((IN(0) + IN(16)) * 11585U + (1 << 13)) >> 14; + dctint t1a = (dctint)((IN(0) - IN(16)) * 11585U + (1 << 13)) >> 14; + dctint t2a = (dctint)(IN( 8) * 6270U - IN(24) * 15137U + (1 << 13)) >> 14; + dctint t3a = (dctint)(IN( 8) * 15137U + 
IN(24) * 6270U + (1 << 13)) >> 14; + dctint t4a = (dctint)(IN( 4) * 3196U - IN(28) * 16069U + (1 << 13)) >> 14; + dctint t7a = (dctint)(IN( 4) * 16069U + IN(28) * 3196U + (1 << 13)) >> 14; + dctint t5a = (dctint)(IN(20) * 13623U - IN(12) * 9102U + (1 << 13)) >> 14; + dctint t6a = (dctint)(IN(20) * 9102U + IN(12) * 13623U + (1 << 13)) >> 14; + dctint t8a = (dctint)(IN( 2) * 1606U - IN(30) * 16305U + (1 << 13)) >> 14; + dctint t15a = (dctint)(IN( 2) * 16305U + IN(30) * 1606U + (1 << 13)) >> 14; + dctint t9a = (dctint)(IN(18) * 12665U - IN(14) * 10394U + (1 << 13)) >> 14; + dctint t14a = (dctint)(IN(18) * 10394U + IN(14) * 12665U + (1 << 13)) >> 14; + dctint t10a = (dctint)(IN(10) * 7723U - IN(22) * 14449U + (1 << 13)) >> 14; + dctint t13a = (dctint)(IN(10) * 14449U + IN(22) * 7723U + (1 << 13)) >> 14; + dctint t11a = (dctint)(IN(26) * 15679U - IN( 6) * 4756U + (1 << 13)) >> 14; + dctint t12a = (dctint)(IN(26) * 4756U + IN( 6) * 15679U + (1 << 13)) >> 14; + dctint t16a = (dctint)(IN( 1) * 804U - IN(31) * 16364U + (1 << 13)) >> 14; + dctint t31a = (dctint)(IN( 1) * 16364U + IN(31) * 804U + (1 << 13)) >> 14; + dctint t17a = (dctint)(IN(17) * 12140U - IN(15) * 11003U + (1 << 13)) >> 14; + dctint t30a = (dctint)(IN(17) * 11003U + IN(15) * 12140U + (1 << 13)) >> 14; + dctint t18a = (dctint)(IN( 9) * 7005U - IN(23) * 14811U + (1 << 13)) >> 14; + dctint t29a = (dctint)(IN( 9) * 14811U + IN(23) * 7005U + (1 << 13)) >> 14; + dctint t19a = (dctint)(IN(25) * 15426U - IN( 7) * 5520U + (1 << 13)) >> 14; + dctint t28a = (dctint)(IN(25) * 5520U + IN( 7) * 15426U + (1 << 13)) >> 14; + dctint t20a = (dctint)(IN( 5) * 3981U - IN(27) * 15893U + (1 << 13)) >> 14; + dctint t27a = (dctint)(IN( 5) * 15893U + IN(27) * 3981U + (1 << 13)) >> 14; + dctint t21a = (dctint)(IN(21) * 14053U - IN(11) * 8423U + (1 << 13)) >> 14; + dctint t26a = (dctint)(IN(21) * 8423U + IN(11) * 14053U + (1 << 13)) >> 14; + dctint t22a = (dctint)(IN(13) * 9760U - IN(19) * 13160U + (1 << 13)) >> 14; + dctint t25a = 
(dctint)(IN(13) * 13160U + IN(19) * 9760U + (1 << 13)) >> 14; + dctint t23a = (dctint)(IN(29) * 16207U - IN( 3) * 2404U + (1 << 13)) >> 14; + dctint t24a = (dctint)(IN(29) * 2404U + IN( 3) * 16207U + (1 << 13)) >> 14; + + dctint t0 = t0a + t3a; + dctint t1 = t1a + t2a; + dctint t2 = t1a - t2a; + dctint t3 = t0a - t3a; + dctint t4 = t4a + t5a; + dctint t5 = t4a - t5a; + dctint t6 = t7a - t6a; + dctint t7 = t7a + t6a; + dctint t8 = t8a + t9a; + dctint t9 = t8a - t9a; + dctint t10 = t11a - t10a; + dctint t11 = t11a + t10a; + dctint t12 = t12a + t13a; + dctint t13 = t12a - t13a; + dctint t14 = t15a - t14a; + dctint t15 = t15a + t14a; + dctint t16 = t16a + t17a; + dctint t17 = t16a - t17a; + dctint t18 = t19a - t18a; + dctint t19 = t19a + t18a; + dctint t20 = t20a + t21a; + dctint t21 = t20a - t21a; + dctint t22 = t23a - t22a; + dctint t23 = t23a + t22a; + dctint t24 = t24a + t25a; + dctint t25 = t24a - t25a; + dctint t26 = t27a - t26a; + dctint t27 = t27a + t26a; + dctint t28 = t28a + t29a; + dctint t29 = t28a - t29a; + dctint t30 = t31a - t30a; + dctint t31 = t31a + t30a; + + t5a = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14; + t6a = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14; + t9a = (dctint)( t14 * 6270U - t9 * 15137U + (1 << 13)) >> 14; + t14a = (dctint)( t14 * 15137U + t9 * 6270U + (1 << 13)) >> 14; + t10a = (dctint)(-(t13 * 15137U + t10 * 6270U) + (1 << 13)) >> 14; + t13a = (dctint)( t13 * 6270U - t10 * 15137U + (1 << 13)) >> 14; + t17a = (dctint)( t30 * 3196U - t17 * 16069U + (1 << 13)) >> 14; + t30a = (dctint)( t30 * 16069U + t17 * 3196U + (1 << 13)) >> 14; + t18a = (dctint)(-(t29 * 16069U + t18 * 3196U) + (1 << 13)) >> 14; + t29a = (dctint)( t29 * 3196U - t18 * 16069U + (1 << 13)) >> 14; + t21a = (dctint)( t26 * 13623U - t21 * 9102U + (1 << 13)) >> 14; + t26a = (dctint)( t26 * 9102U + t21 * 13623U + (1 << 13)) >> 14; + t22a = (dctint)(-(t25 * 9102U + t22 * 13623U) + (1 << 13)) >> 14; + t25a = (dctint)( t25 * 13623U - t22 * 9102U + (1 << 13)) >> 14; + + 
t0a = t0 + t7; + t1a = t1 + t6a; + t2a = t2 + t5a; + t3a = t3 + t4; + t4a = t3 - t4; + t5 = t2 - t5a; + t6 = t1 - t6a; + t7a = t0 - t7; + t8a = t8 + t11; + t9 = t9a + t10a; + t10 = t9a - t10a; + t11a = t8 - t11; + t12a = t15 - t12; + t13 = t14a - t13a; + t14 = t14a + t13a; + t15a = t15 + t12; + t16a = t16 + t19; + t17 = t17a + t18a; + t18 = t17a - t18a; + t19a = t16 - t19; + t20a = t23 - t20; + t21 = t22a - t21a; + t22 = t22a + t21a; + t23a = t23 + t20; + t24a = t24 + t27; + t25 = t25a + t26a; + t26 = t25a - t26a; + t27a = t24 - t27; + t28a = t31 - t28; + t29 = t30a - t29a; + t30 = t30a + t29a; + t31a = t31 + t28; + + t10a = (dctint)((t13 - t10) * 11585U + (1 << 13)) >> 14; + t13a = (dctint)((t13 + t10) * 11585U + (1 << 13)) >> 14; + t11 = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14; + t12 = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14; + t18a = (dctint)( t29 * 6270U - t18 * 15137U + (1 << 13)) >> 14; + t29a = (dctint)( t29 * 15137U + t18 * 6270U + (1 << 13)) >> 14; + t19 = (dctint)( t28a * 6270U - t19a * 15137U + (1 << 13)) >> 14; + t28 = (dctint)( t28a * 15137U + t19a * 6270U + (1 << 13)) >> 14; + t20 = (dctint)(-(t27a * 15137U + t20a * 6270U) + (1 << 13)) >> 14; + t27 = (dctint)( t27a * 6270U - t20a * 15137U + (1 << 13)) >> 14; + t21a = (dctint)(-(t26 * 15137U + t21 * 6270U) + (1 << 13)) >> 14; + t26a = (dctint)( t26 * 6270U - t21 * 15137U + (1 << 13)) >> 14; + + t0 = t0a + t15a; + t1 = t1a + t14; + t2 = t2a + t13a; + t3 = t3a + t12; + t4 = t4a + t11; + t5a = t5 + t10a; + t6a = t6 + t9; + t7 = t7a + t8a; + t8 = t7a - t8a; + t9a = t6 - t9; + t10 = t5 - t10a; + t11a = t4a - t11; + t12a = t3a - t12; + t13 = t2a - t13a; + t14a = t1a - t14; + t15 = t0a - t15a; + t16 = t16a + t23a; + t17a = t17 + t22; + t18 = t18a + t21a; + t19a = t19 + t20; + t20a = t19 - t20; + t21 = t18a - t21a; + t22a = t17 - t22; + t23 = t16a - t23a; + t24 = t31a - t24a; + t25a = t30 - t25; + t26 = t29a - t26a; + t27a = t28 - t27; + t28a = t28 + t27; + t29 = t29a + t26a; + t30a = t30 
+ t25;
+    t31 = t31a + t24a;
+    /* Last stage before the outputs: the difference and the sum of each pair are scaled by
+     * 11585 / 2^14 (11585 ~= 2^14 / sqrt(2)); the (1 << 13) term is the rounding bias for >> 14. */
+    t20 = (dctint)((t27a - t20a) * 11585U + (1 << 13)) >> 14;
+    t27 = (dctint)((t27a + t20a) * 11585U + (1 << 13)) >> 14;
+    t21a = (dctint)((t26 - t21 ) * 11585U + (1 << 13)) >> 14;
+    t26a = (dctint)((t26 + t21 ) * 11585U + (1 << 13)) >> 14;
+    t22 = (dctint)((t25a - t22a) * 11585U + (1 << 13)) >> 14;
+    t25 = (dctint)((t25a + t22a) * 11585U + (1 << 13)) >> 14;
+    t23a = (dctint)((t24 - t23 ) * 11585U + (1 << 13)) >> 14;
+    t24a = (dctint)((t24 + t23 ) * 11585U + (1 << 13)) >> 14;
+    /* Output butterfly: out[i] (i = 0..15) is the sum of two terms and out[31 - i] is the
+     * difference of the same two terms. */
+    out[ 0] = t0 + t31;
+    out[ 1] = t1 + t30a;
+    out[ 2] = t2 + t29;
+    out[ 3] = t3 + t28a;
+    out[ 4] = t4 + t27;
+    out[ 5] = t5a + t26a;
+    out[ 6] = t6a + t25;
+    out[ 7] = t7 + t24a;
+    out[ 8] = t8 + t23a;
+    out[ 9] = t9a + t22;
+    out[10] = t10 + t21a;
+    out[11] = t11a + t20;
+    out[12] = t12a + t19a;
+    out[13] = t13 + t18;
+    out[14] = t14a + t17a;
+    out[15] = t15 + t16;
+    out[16] = t15 - t16;
+    out[17] = t14a - t17a;
+    out[18] = t13 - t18;
+    out[19] = t12a - t19a;
+    out[20] = t11a - t20;
+    out[21] = t10 - t21a;
+    out[22] = t9a - t22;
+    out[23] = t8 - t23a;
+    out[24] = t7 - t24a;
+    out[25] = t6a - t25;
+    out[26] = t5a - t26a;
+    out[27] = t4 - t27;
+    out[28] = t3 - t28a;
+    out[29] = t2 - t29;
+    out[30] = t1 - t30a;
+    out[31] = t0 - t31;
+}
+
+itxfm_wrapper(idct, idct, 32, 6, 1)
+/*
+ * iwht4_1d: one 1-D pass of the 4-point transform that the iwht wrapper below is built on.
+ *
+ * in     - coefficients, read only through IN() (macro defined earlier in the file, outside
+ *          this chunk; `stride` is not referenced directly here, so presumably IN() uses it
+ *          -- confirm against the macro).
+ * out    - receives the four results in out[0..3].
+ * pass   - pass 0 pre-scales each input with >> 2; any other value uses the inputs as they are.
+ */
+static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
+                                      dctcoef *out, int pass)
+{
+    int t0, t1, t2, t3, t4;
+    /* Note the load order: t0, t1, t2, t3 come from inputs 0, 3, 1, 2 respectively. */
+    if (pass == 0) {
+        t0 = IN(0) >> 2;
+        t1 = IN(3) >> 2;
+        t2 = IN(1) >> 2;
+        t3 = IN(2) >> 2;
+    } else {
+        t0 = IN(0);
+        t1 = IN(3);
+        t2 = IN(1);
+        t3 = IN(2);
+    }
+    /* In-place add/subtract steps; t4 is the single halved term shared by t1 and t2. */
+    t0 += t2;
+    t3 -= t1;
+    t4 = (t0 - t3) >> 1;
+    t1 = t4 - t1;
+    t2 = t4 - t2;
+    t0 -= t1;
+    t3 += t2;
+
+    out[0] = t0;
+    out[1] = t1;
+    out[2] = t2;
+    out[3] = t3;
+}
+/* NOTE(review): presumably this expands to the iwht_iwht_4x4_add_c that vp9dsp_itxfm_init
+ * installs in the "lossless" slot; the itxfm_wrapper body is outside this chunk -- confirm. */
+itxfm_wrapper(iwht, iwht, 4, 0, 0)
+
+#undef IN
+#undef itxfm_wrapper
+#undef itxfm_wrap
+/* Fill dsp->itxfm_add[transform size][transform type] with the C inverse transforms
+ * generated above. */
+static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
+{
+#define init_itxfm(tx, sz) \
+    dsp->itxfm_add[tx][DCT_DCT] = 
idct_idct_##sz##_add_c; \ + dsp->itxfm_add[tx][DCT_ADST] = iadst_idct_##sz##_add_c; \ + dsp->itxfm_add[tx][ADST_DCT] = idct_iadst_##sz##_add_c; \ + dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c + +#define init_idct(tx, nm) \ + dsp->itxfm_add[tx][DCT_DCT] = \ + dsp->itxfm_add[tx][ADST_DCT] = \ + dsp->itxfm_add[tx][DCT_ADST] = \ + dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c + + init_itxfm(TX_4X4, 4x4); + init_itxfm(TX_8X8, 8x8); + init_itxfm(TX_16X16, 16x16); + init_idct(TX_32X32, idct_idct_32x32); + init_idct(4 /* lossless */, iwht_iwht_4x4); + +#undef init_itxfm +#undef init_idct +} + +static av_always_inline void loop_filter(pixel *dst, int E, int I, int H, + ptrdiff_t stridea, ptrdiff_t strideb, + int wd) +{ + int i, F = 1 << (BIT_DEPTH - 8); + + E <<= (BIT_DEPTH - 8); + I <<= (BIT_DEPTH - 8); + H <<= (BIT_DEPTH - 8); + for (i = 0; i < 8; i++, dst += stridea) { + int p7, p6, p5, p4; + int p3 = dst[strideb * -4], p2 = dst[strideb * -3]; + int p1 = dst[strideb * -2], p0 = dst[strideb * -1]; + int q0 = dst[strideb * +0], q1 = dst[strideb * +1]; + int q2 = dst[strideb * +2], q3 = dst[strideb * +3]; + int q4, q5, q6, q7; + int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I && + FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I && + FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I && + FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E; + int flat8out, flat8in; + + if (!fm) + continue; + + if (wd >= 16) { + p7 = dst[strideb * -8]; + p6 = dst[strideb * -7]; + p5 = dst[strideb * -6]; + p4 = dst[strideb * -5]; + q4 = dst[strideb * +4]; + q5 = dst[strideb * +5]; + q6 = dst[strideb * +6]; + q7 = dst[strideb * +7]; + + flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F && + FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F && + FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F && + FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F; + } + + if (wd >= 8) + flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F && + FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F && + FFABS(q2 - q0) <= F && FFABS(q3 - q0) 
<= F; + + if (wd >= 16 && flat8out && flat8in) { + dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 + + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4; + dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 + + p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4; + dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 + + p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4; + dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 + + p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4; + dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 + + p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4; + dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4; + dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4; + dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4; + dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4; + dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + + q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + + q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + + q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4; + dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + + q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4; + } else if (wd >= 8 && flat8in) { + dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3; + dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3; + dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3; + dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3; + dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 
+ q3 + q3 + 4) >> 3; + dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3; + } else { + int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H; + + if (hev) { + int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2; + f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1); + + f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + + dst[strideb * -1] = av_clip_pixel(p0 + f2); + dst[strideb * +0] = av_clip_pixel(q0 - f1); + } else { + int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2; + + f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3; + + dst[strideb * -1] = av_clip_pixel(p0 + f2); + dst[strideb * +0] = av_clip_pixel(q0 - f1); + + f = (f1 + 1) >> 1; + dst[strideb * -2] = av_clip_pixel(p1 + f); + dst[strideb * +1] = av_clip_pixel(q1 - f); + } + } + } +} + +#define lf_8_fn(dir, wd, stridea, strideb) \ +static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \ + ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + pixel *dst = (pixel *) _dst; \ + stride /= sizeof(pixel); \ + loop_filter(dst, E, I, H, stridea, strideb, wd); \ +} + +#define lf_8_fns(wd) \ +lf_8_fn(h, wd, stride, 1) \ +lf_8_fn(v, wd, 1, stride) + +lf_8_fns(4) +lf_8_fns(8) +lf_8_fns(16) + +#undef lf_8_fn +#undef lf_8_fns + +#define lf_16_fn(dir, stridea) \ +static void loop_filter_##dir##_16_16_c(uint8_t *dst, \ + ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \ + loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \ +} + +lf_16_fn(h, stride) +lf_16_fn(v, sizeof(pixel)) + +#undef lf_16_fn + +#define lf_mix_fn(dir, wd1, wd2, stridea) \ +static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \ + ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \ + loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 
8); \
+}
+/* Emit the h and v variants of one mixed-width filter. The second call in lf_mix_fn starts
+ * at dst + 8 * stridea: 8 rows on (h, stridea = stride) or 8 pixels on (v, stridea =
+ * sizeof(pixel)). E, I and H carry the first half's value in the low byte and the second
+ * half's in the bits above it. */
+#define lf_mix_fns(wd1, wd2) \
+lf_mix_fn(h, wd1, wd2, stride) \
+lf_mix_fn(v, wd1, wd2, sizeof(pixel))
+
+lf_mix_fns(4, 4)
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
+
+#undef lf_mix_fn
+#undef lf_mix_fns
+/*
+ * Install the C loop filters in dsp.
+ *   loop_filter_8[w][d]       w = 0/1/2 selects the wd 4/8/16 filter; d = 0 is h, d = 1 is v.
+ *   loop_filter_16[d]         the wd-16 filter applied to two adjacent 8-pixel halves.
+ *   loop_filter_mix2[a][b][d] a and b (0 or 1) select wd 4 or 8 for the first and second half.
+ */
+static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
+{
+    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
+    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
+    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
+    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
+    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
+    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
+
+    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
+    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
+
+    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
+    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
+    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
+    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
+    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
+    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
+    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
+    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
+}
+/* The full-pel copy/avg helpers below are compiled only when BIT_DEPTH != 12; the
+ * BIT_DEPTH == 12 build of ff_vp9dsp_mc_init calls ff_vp9dsp_mc_init_10() instead. */
+#if BIT_DEPTH != 12
+/*
+ * copy_c: copy a block of w pixels by h rows.
+ * dst and src are byte pointers and are advanced by the strides as given; each row copies
+ * w * sizeof(pixel) bytes. The do/while runs the body before testing --h, so h is expected
+ * to be >= 1.
+ */
+static av_always_inline void copy_c(uint8_t *dst, ptrdiff_t dst_stride,
+                                    const uint8_t *src, ptrdiff_t src_stride,
+                                    int w, int h)
+{
+    do {
+        memcpy(dst, src, w * sizeof(pixel));
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+/*
+ * avg_c: combine a w x h block of src into dst with rnd_avg_pixel4(), four pixels per step.
+ * The strides are divided by sizeof(pixel) because dst/src are stepped as pixel pointers.
+ * dst is read and written with AV_RN4PA / AV_WN4PA and src is read with AV_RN4P; these
+ * macros and rnd_avg_pixel4() are defined outside this chunk (the "A" suffix presumably
+ * means aligned access -- confirm). x advances by 4, so w is presumably a multiple of 4.
+ * As in copy_c, h is expected to be >= 1.
+ */
+static av_always_inline void avg_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                   const uint8_t *_src, ptrdiff_t src_stride,
+                                   int w, int h)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x += 4)
+            AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x])));
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+/* fpel_fn(type, sz) defines type<sz>_c, a fixed-width entry point that forwards to type##_c
+ * with w = sz. */
+#define fpel_fn(type, sz) \
+static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                         const uint8_t *src, ptrdiff_t src_stride, \
+                         int h, int mx, int my) \
+{ \
+    type##_c(dst, dst_stride, src, src_stride, sz, h); \
+}
+/* Instantiate copy<sz>_c and avg<sz>_c for one block width. The generated functions accept
+ * mx and my but do not use them. */
+#define copy_avg_fn(sz) \
+fpel_fn(copy, sz) \
+fpel_fn(avg, sz)
+
+copy_avg_fn(64)
+copy_avg_fn(32)
+copy_avg_fn(16)
+copy_avg_fn(8)
+copy_avg_fn(4)
+
+#undef fpel_fn
+#undef copy_avg_fn
+
+#endif /* BIT_DEPTH != 12 */
+/* 8-tap FIR: taps F[0..7] are applied to src[x - 3 * stride] .. src[x + 4 * stride]; the sum
+ * is biased by 64, shifted right by 7 and clipped with av_clip_pixel(). The macro arguments
+ * are expanded without parentheses, so pass only simple expressions. */
+#define FILTER_8TAP(src, x, F, stride) \
+    av_clip_pixel((F[0] * src[x + -3 * stride] + \
+                   F[1] * src[x + -2 * stride] + \
+                   F[2] * src[x + -1 * stride] + \
+                   F[3] * src[x + +0 * stride] + \
+                   F[4] * src[x + +1 * stride] + \
+                   F[5] * src[x + +2 * stride] + \
+                   F[6] * src[x + +3 * stride] + \
+                   F[7] * src[x + +4 * stride] + 64) >> 7)
+/*
+ * do_8tap_1d_c: apply FILTER_8TAP along one direction to a w x h block.
+ *
+ * dst_stride / src_stride - divided by sizeof(pixel) on entry to step the pixel pointers.
+ * ds     - distance between taps, in pixels (the instantiations below pass 1 for h and
+ *          src_stride / sizeof(pixel) for v).
+ * filter - the 8 tap coefficients.
+ * avg    - when non-zero the filtered value is combined with the existing dst pixel as
+ *          (dst + value + 1) >> 1; otherwise it overwrites dst.
+ * h is expected to be >= 1 (do/while loop).
+ */
+static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, ptrdiff_t ds,
+                                          const int16_t *filter, int avg)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_8TAP(src, x, filter, ds);
+            }
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+/* Define <opn>_8tap_1d_<dir>_c: a non-inlined wrapper that fixes the tap spacing (ds) and the
+ * avg flag (opa) of do_8tap_1d_c. */
+#define filter_8tap_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src, ptrdiff_t src_stride, \
+                                                int w, int h, const int16_t *filter) \
+{ \
+    do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
+}
+
+filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(put, 0, h, 1)
+filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+filter_8tap_1d_fn(avg, 1, h, 1)
+
+#undef filter_8tap_1d_fn
+/*
+ * do_8tap_2d_c: separable 2-D 8-tap filter.
+ * A horizontal pass with filterx writes h + 7 rows, starting 3 rows above the block, into a
+ * scratch buffer with a row pitch of 64 pixels; a vertical pass with filtery then filters
+ * that buffer into dst (averaging with dst when avg is non-zero). The scratch buffer holds
+ * 64 * 71 pixels, so this relies on w <= 64 and h <= 64.
+ */
+static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                          const uint8_t *_src, ptrdiff_t src_stride,
+                                          int w, int h, const int16_t *filterx,
+                                          const int16_t *filtery, int avg)
+{
+    
int tmp_h = h + 7; + pixel tmp[64 * 71], *tmp_ptr = tmp; + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + src -= src_stride * 3; + do { + int x; + + for (x = 0; x < w; x++) + tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1); + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp + 64 * 3; + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1; + } else { + dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64); + } + + tmp_ptr += 64; + dst += dst_stride; + } while (--h); +} + +#define filter_8tap_2d_fn(opn, opa) \ +static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, const int16_t *filterx, \ + const int16_t *filtery) \ +{ \ + do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \ +} + +filter_8tap_2d_fn(put, 0) +filter_8tap_2d_fn(avg, 1) + +#undef filter_8tap_2d_fn + +#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \ +static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \ + ff_vp9_subpel_filters[type_idx][dir_m]); \ +} + +#define filter_fn_2d(sz, type, type_idx, avg) \ +static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \ + ff_vp9_subpel_filters[type_idx][mx], \ + ff_vp9_subpel_filters[type_idx][my]); \ +} + +#if BIT_DEPTH != 12 + +#define FILTER_BILIN(src, x, mxy, stride) \ + (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4)) + +static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t 
*_src, ptrdiff_t src_stride,
+                                           int w, int h, ptrdiff_t ds, int mxy, int avg)
+{
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+    /* Convert both strides to units of pixel, since dst/src are stepped as pixel pointers. */
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(src, x, mxy, ds);
+            }
+
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+/* Define <opn>_bilin_1d_<dir>_c: a non-inlined wrapper that fixes the neighbour distance (ds)
+ * and the avg flag (opa) of do_bilin_1d_c. mxy is the weight FILTER_BILIN applies to the
+ * neighbour difference before its (+ 8) >> 4 rounding, i.e. it is in 1/16 units. */
+#define bilin_1d_fn(opn, opa, dir, ds) \
+static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                 const uint8_t *src, ptrdiff_t src_stride, \
+                                                 int w, int h, int mxy) \
+{ \
+    do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
+}
+
+bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
+bilin_1d_fn(put, 0, h, 1)
+bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
+bilin_1d_fn(avg, 1, h, 1)
+
+#undef bilin_1d_fn
+/*
+ * do_bilin_2d_c: separable 2-D bilinear interpolation of a w x h block.
+ *
+ * Pass 1 interpolates horizontally with weight mx into tmp for h + 1 rows (the vertical pass
+ * needs one row below the block); each row also reads the pixel at x + 1. Pass 2
+ * interpolates vertically with weight my from tmp (row pitch 64) into dst, combining with the
+ * existing dst pixel as (dst + value + 1) >> 1 when avg is non-zero.
+ *
+ * tmp holds 64 * 65 pixels, so this relies on w <= 64 and h <= 64. h is expected to be >= 1
+ * (do/while loops).
+ */
+static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
+                                           const uint8_t *_src, ptrdiff_t src_stride,
+                                           int w, int h, int mx, int my, int avg)
+{
+    pixel tmp[64 * 65], *tmp_ptr = tmp;
+    int tmp_h = h + 1;
+    pixel *dst = (pixel *) _dst;
+    const pixel *src = (const pixel *) _src;
+
+    dst_stride /= sizeof(pixel);
+    src_stride /= sizeof(pixel);
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+        tmp_ptr += 64;
+        src += src_stride;
+    } while (--tmp_h);
+    /* Rewind to the first scratch row for the vertical pass. */
+    tmp_ptr = tmp;
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            if (avg) {
+                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
+            } else {
+                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
+            }
+
+        tmp_ptr += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+/* Define <opn>_bilin_2d_hv_c: a non-inlined wrapper that fixes the avg flag (opa) of
+ * do_bilin_2d_c. */
+#define bilin_2d_fn(opn, opa) \
+static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int w, int h, int mx, int my) \
+{ \
+    do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
+}
+
+bilin_2d_fn(put, 0) +bilin_2d_fn(avg, 1) + +#undef bilin_2d_fn + +#define bilinf_fn_1d(sz, dir, dir_m, avg) \ +static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \ +} + +#define bilinf_fn_2d(sz, avg) \ +static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \ +} + +#else + +#define bilinf_fn_1d(a, b, c, d) +#define bilinf_fn_2d(a, b) + +#endif + +#define filter_fn(sz, avg) \ +filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \ +filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \ +filter_fn_2d(sz, regular, FILTER_8TAP_REGULAR, avg) \ +filter_fn_1d(sz, h, mx, smooth, FILTER_8TAP_SMOOTH, avg) \ +filter_fn_1d(sz, v, my, smooth, FILTER_8TAP_SMOOTH, avg) \ +filter_fn_2d(sz, smooth, FILTER_8TAP_SMOOTH, avg) \ +filter_fn_1d(sz, h, mx, sharp, FILTER_8TAP_SHARP, avg) \ +filter_fn_1d(sz, v, my, sharp, FILTER_8TAP_SHARP, avg) \ +filter_fn_2d(sz, sharp, FILTER_8TAP_SHARP, avg) \ +bilinf_fn_1d(sz, h, mx, avg) \ +bilinf_fn_1d(sz, v, my, avg) \ +bilinf_fn_2d(sz, avg) + +#define filter_fn_set(avg) \ +filter_fn(64, avg) \ +filter_fn(32, avg) \ +filter_fn(16, avg) \ +filter_fn(8, avg) \ +filter_fn(4, avg) + +filter_fn_set(put) +filter_fn_set(avg) + +#undef filter_fn +#undef filter_fn_set +#undef filter_fn_1d +#undef filter_fn_2d +#undef bilinf_fn_1d +#undef bilinf_fn_2d + +#if BIT_DEPTH != 8 +void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp); +#endif +#if BIT_DEPTH != 10 +static +#endif +av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp) +{ +#if BIT_DEPTH == 12 + ff_vp9dsp_mc_init_10(dsp); +#else /* BIT_DEPTH == 12 */ + +#define init_fpel(idx1, idx2, sz, type) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = 
type##sz##_c; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = type##sz##_c; \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = type##sz##_c + +#define init_copy_avg(idx, sz) \ + init_fpel(idx, 0, sz, copy); \ + init_fpel(idx, 1, sz, avg) + + init_copy_avg(0, 64); + init_copy_avg(1, 32); + init_copy_avg(2, 16); + init_copy_avg(3, 8); + init_copy_avg(4, 4); + +#undef init_copy_avg +#undef init_fpel + +#endif /* BIT_DEPTH == 12 */ + +#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c + +#if BIT_DEPTH == 12 +#define init_subpel1 init_subpel1_bd_aware +#else +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \ + init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c +#endif + +#define init_subpel2(idx, idxh, idxv, dir, type) \ + init_subpel1(0, idx, idxh, idxv, 64, dir, type); \ + init_subpel1(1, idx, idxh, idxv, 32, dir, type); \ + init_subpel1(2, idx, idxh, idxv, 16, dir, type); \ + init_subpel1(3, idx, idxh, idxv, 8, dir, type); \ + init_subpel1(4, idx, idxh, idxv, 4, dir, type) + +#define init_subpel3(idx, type) \ + init_subpel2(idx, 1, 1, hv, type); \ + init_subpel2(idx, 0, 1, v, type); \ + init_subpel2(idx, 1, 0, h, type) + + init_subpel3(0, put); + init_subpel3(1, avg); + +#undef init_subpel1 +#undef init_subpel2 +#undef init_subpel3 +#undef init_subpel1_bd_aware +} + +static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, int mx, int my, + int dx, int dy, int avg, + const int16_t (*filters)[8]) +{ + int tmp_h = (((h - 
1) * dy + my) >> 4) + 8; + pixel tmp[64 * 135], *tmp_ptr = tmp; + pixel *dst = (pixel *) _dst; + const pixel *src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + src -= src_stride * 3; + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1); + imx += dx; + ioff += imx >> 4; + imx &= 0xf; + } + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp + 64 * 3; + do { + int x; + const int16_t *filter = filters[my]; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1; + } else { + dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64); + } + + my += dy; + tmp_ptr += (my >> 4) * 64; + my &= 0xf; + dst += dst_stride; + } while (--h); +} + +#define scaled_filter_8tap_fn(opn, opa) \ +static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy, \ + const int16_t (*filters)[8]) \ +{ \ + do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \ + opa, filters); \ +} + +scaled_filter_8tap_fn(put, 0) +scaled_filter_8tap_fn(avg, 1) + +#undef scaled_filter_8tap_fn + +#undef FILTER_8TAP + +#define scaled_filter_fn(sz, type, type_idx, avg) \ +static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my, int dx, int dy) \ +{ \ + avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \ + ff_vp9_subpel_filters[type_idx]); \ +} + +#if BIT_DEPTH != 12 + +static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride, + const uint8_t *_src, ptrdiff_t src_stride, + int w, int h, int mx, int my, + int dx, int dy, int avg) +{ + pixel tmp[64 * 129], *tmp_ptr = tmp; + int tmp_h = (((h - 1) * dy + my) >> 4) + 2; + pixel *dst = (pixel *) _dst; + const pixel 
*src = (const pixel *) _src; + + dst_stride /= sizeof(pixel); + src_stride /= sizeof(pixel); + do { + int x; + int imx = mx, ioff = 0; + + for (x = 0; x < w; x++) { + tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1); + imx += dx; + ioff += imx >> 4; + imx &= 0xf; + } + + tmp_ptr += 64; + src += src_stride; + } while (--tmp_h); + + tmp_ptr = tmp; + do { + int x; + + for (x = 0; x < w; x++) + if (avg) { + dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1; + } else { + dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64); + } + + my += dy; + tmp_ptr += (my >> 4) * 64; + my &= 0xf; + dst += dst_stride; + } while (--h); +} + +#define scaled_bilin_fn(opn, opa) \ +static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int w, int h, int mx, int my, int dx, int dy) \ +{ \ + do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \ +} + +scaled_bilin_fn(put, 0) +scaled_bilin_fn(avg, 1) + +#undef scaled_bilin_fn + +#undef FILTER_BILIN + +#define scaled_bilinf_fn(sz, avg) \ +static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my, int dx, int dy) \ +{ \ + avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \ +} + +#else + +#define scaled_bilinf_fn(a, b) + +#endif + +#define scaled_filter_fns(sz, avg) \ +scaled_filter_fn(sz, regular, FILTER_8TAP_REGULAR, avg) \ +scaled_filter_fn(sz, smooth, FILTER_8TAP_SMOOTH, avg) \ +scaled_filter_fn(sz, sharp, FILTER_8TAP_SHARP, avg) \ +scaled_bilinf_fn(sz, avg) + +#define scaled_filter_fn_set(avg) \ +scaled_filter_fns(64, avg) \ +scaled_filter_fns(32, avg) \ +scaled_filter_fns(16, avg) \ +scaled_filter_fns(8, avg) \ +scaled_filter_fns(4, avg) + +scaled_filter_fn_set(put) +scaled_filter_fn_set(avg) + +#undef scaled_filter_fns +#undef scaled_filter_fn_set +#undef scaled_filter_fn +#undef scaled_bilinf_fn + +#if BIT_DEPTH != 8 +void 
ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
+#endif
+#if BIT_DEPTH != 10
+static
+#endif
+av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
+{   /* Fill dsp->smc[size][filter][op]: size 0..4 is block width 64/32/16/8/4 and op 0 is put,
+     * 1 is avg. The function has external linkage only when BIT_DEPTH == 10. FUNC() is defined
+     * outside this chunk; presumably it appends the bit-depth suffix -- confirm. */
+#define init_scaled_bd_aware(idx1, idx2, sz, type) \
+    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
+    dsp->smc[idx1][FILTER_8TAP_SHARP ][idx2] = type##_scaled_sharp_##sz##_c
+    /* BIT_DEPTH == 12 first runs the _10 init and then overrides only the three 8-tap entries;
+     * the other depths also install the bilinear entry themselves. */
+#if BIT_DEPTH == 12
+    ff_vp9dsp_scaled_mc_init_10(dsp);
+#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
+#else
+#define init_scaled(idx1, idx2, sz, type) \
+    init_scaled_bd_aware(idx1, idx2, sz, type); \
+    dsp->smc[idx1][FILTER_BILINEAR ][idx2] = type##_scaled_bilin_##sz##_c
+#endif
+
+#define init_scaled_put_avg(idx, sz) \
+    init_scaled(idx, 0, sz, put); \
+    init_scaled(idx, 1, sz, avg)
+
+    init_scaled_put_avg(0, 64);
+    init_scaled_put_avg(1, 32);
+    init_scaled_put_avg(2, 16);
+    init_scaled_put_avg(3, 8);
+    init_scaled_put_avg(4, 4);
+
+#undef init_scaled_put_avg
+#undef init_scaled
+#undef init_scaled_bd_aware
+}
+/* Entry point for this BIT_DEPTH instantiation: runs the five per-area init routines on dsp
+ * in order -- intra prediction (defined outside this chunk), inverse transforms, loop
+ * filters, motion compensation and scaled motion compensation. */
+av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp)
+{
+    FUNC(ff_vp9dsp_intrapred_init)(dsp);
+    vp9dsp_itxfm_init(dsp);
+    vp9dsp_loopfilter_init(dsp);
+    FUNC(ff_vp9dsp_mc_init)(dsp);
+    FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
+}
diff --git a/media/ffvpx/libavcodec/vp9lpf.c b/media/ffvpx/libavcodec/vp9lpf.c
new file mode 100644
index 0000000000..414cede852
--- /dev/null
+++ b/media/ffvpx/libavcodec/vp9lpf.c
@@ -0,0 +1,202 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vp9dec.h" + +static av_always_inline void filter_plane_cols(VP9Context *s, int col, int ss_h, int ss_v, + uint8_t *lvl, uint8_t (*mask)[4], + uint8_t *dst, ptrdiff_t ls) +{ + int y, x, bytesperpixel = s->bytesperpixel; + + // filter edges between columns (e.g. block1 | block2) + for (y = 0; y < 8; y += 2 << ss_v, dst += 16 * ls, lvl += 16 << ss_v) { + uint8_t *ptr = dst, *l = lvl, *hmask1 = mask[y], *hmask2 = mask[y + 1 + ss_v]; + unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3]; + unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3]; + unsigned hm = hm1 | hm2 | hm13 | hm23; + + for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8 * bytesperpixel >> ss_h) { + if (col || x > 1) { + if (hm1 & x) { + int L = *l, H = L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + if (hmask1[0] & x) { + if (hmask2[0] & x) { + av_assert2(l[8 << ss_v] == L); + s->dsp.loop_filter_16[0](ptr, ls, E, I, H); + } else { + s->dsp.loop_filter_8[2][0](ptr, ls, E, I, H); + } + } else if (hm2 & x) { + L = l[8 << ss_v]; + H |= (L >> 4) << 8; + E |= s->filter_lut.mblim_lut[L] << 8; + I |= s->filter_lut.lim_lut[L] << 8; + s->dsp.loop_filter_mix2[!!(hmask1[1] & x)] + [!!(hmask2[1] & x)] + [0](ptr, ls, E, I, H); + } else { + s->dsp.loop_filter_8[!!(hmask1[1] & x)] + [0](ptr, ls, E, I, H); + } + } else if (hm2 & x) { + int L = l[8 << ss_v], H = L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + s->dsp.loop_filter_8[!!(hmask2[1] & x)] + [0](ptr + 8 * 
ls, ls, E, I, H); + } + } + if (ss_h) { + if (x & 0xAA) + l += 2; + } else { + if (hm13 & x) { + int L = *l, H = L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + if (hm23 & x) { + L = l[8 << ss_v]; + H |= (L >> 4) << 8; + E |= s->filter_lut.mblim_lut[L] << 8; + I |= s->filter_lut.lim_lut[L] << 8; + s->dsp.loop_filter_mix2[0][0][0](ptr + 4 * bytesperpixel, ls, E, I, H); + } else { + s->dsp.loop_filter_8[0][0](ptr + 4 * bytesperpixel, ls, E, I, H); + } + } else if (hm23 & x) { + int L = l[8 << ss_v], H = L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + s->dsp.loop_filter_8[0][0](ptr + 8 * ls + 4 * bytesperpixel, ls, E, I, H); + } + l++; + } + } + } +} + +static av_always_inline void filter_plane_rows(VP9Context *s, int row, int ss_h, int ss_v, + uint8_t *lvl, uint8_t (*mask)[4], + uint8_t *dst, ptrdiff_t ls) +{ + int y, x, bytesperpixel = s->bytesperpixel; + + // block1 + // filter edges between rows (e.g. ------) + // block2 + for (y = 0; y < 8; y++, dst += 8 * ls >> ss_v) { + uint8_t *ptr = dst, *l = lvl, *vmask = mask[y]; + unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3]; + + for (x = 1; vm & ~(x - 1); x <<= (2 << ss_h), ptr += 16 * bytesperpixel, l += 2 << ss_h) { + if (row || y) { + if (vm & x) { + int L = *l, H = L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + if (vmask[0] & x) { + if (vmask[0] & (x << (1 + ss_h))) { + av_assert2(l[1 + ss_h] == L); + s->dsp.loop_filter_16[1](ptr, ls, E, I, H); + } else { + s->dsp.loop_filter_8[2][1](ptr, ls, E, I, H); + } + } else if (vm & (x << (1 + ss_h))) { + L = l[1 + ss_h]; + H |= (L >> 4) << 8; + E |= s->filter_lut.mblim_lut[L] << 8; + I |= s->filter_lut.lim_lut[L] << 8; + s->dsp.loop_filter_mix2[!!(vmask[1] & x)] + [!!(vmask[1] & (x << (1 + ss_h)))] + [1](ptr, ls, E, I, H); + } else { + s->dsp.loop_filter_8[!!(vmask[1] & x)] + [1](ptr, ls, E, I, H); + } + } else if (vm & (x << (1 + ss_h))) { + int L = l[1 + ss_h], H 
= L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + s->dsp.loop_filter_8[!!(vmask[1] & (x << (1 + ss_h)))] + [1](ptr + 8 * bytesperpixel, ls, E, I, H); + } + } + if (!ss_v) { + if (vm3 & x) { + int L = *l, H = L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + if (vm3 & (x << (1 + ss_h))) { + L = l[1 + ss_h]; + H |= (L >> 4) << 8; + E |= s->filter_lut.mblim_lut[L] << 8; + I |= s->filter_lut.lim_lut[L] << 8; + s->dsp.loop_filter_mix2[0][0][1](ptr + ls * 4, ls, E, I, H); + } else { + s->dsp.loop_filter_8[0][1](ptr + ls * 4, ls, E, I, H); + } + } else if (vm3 & (x << (1 + ss_h))) { + int L = l[1 + ss_h], H = L >> 4; + int E = s->filter_lut.mblim_lut[L], I = s->filter_lut.lim_lut[L]; + + s->dsp.loop_filter_8[0][1](ptr + ls * 4 + 8 * bytesperpixel, ls, E, I, H); + } + } + } + if (ss_v) { + if (y & 1) + lvl += 16; + } else { + lvl += 8; + } + } +} + +void ff_vp9_loopfilter_sb(AVCodecContext *avctx, VP9Filter *lflvl, + int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff) +{ + VP9Context *s = avctx->priv_data; + AVFrame *f = s->s.frames[CUR_FRAME].tf.f; + uint8_t *dst = f->data[0] + yoff; + ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1]; + uint8_t (*uv_masks)[8][4] = lflvl->mask[s->ss_h | s->ss_v]; + int p; + + /* FIXME: In how far can we interleave the v/h loopfilter calls? E.g. 
+ * if you think of them as acting on a 8x8 block max, we can interleave + * each v/h within the single x loop, but that only works if we work on + * 8 pixel blocks, and we won't always do that (we want at least 16px + * to use SSE2 optimizations, perhaps 32 for AVX2) */ + + filter_plane_cols(s, col, 0, 0, lflvl->level, lflvl->mask[0][0], dst, ls_y); + filter_plane_rows(s, row, 0, 0, lflvl->level, lflvl->mask[0][1], dst, ls_y); + + for (p = 0; p < 2; p++) { + dst = f->data[1 + p] + uvoff; + filter_plane_cols(s, col, s->ss_h, s->ss_v, lflvl->level, uv_masks[0], dst, ls_uv); + filter_plane_rows(s, row, s->ss_h, s->ss_v, lflvl->level, uv_masks[1], dst, ls_uv); + } +} diff --git a/media/ffvpx/libavcodec/vp9mvs.c b/media/ffvpx/libavcodec/vp9mvs.c new file mode 100644 index 0000000000..b93d878d6f --- /dev/null +++ b/media/ffvpx/libavcodec/vp9mvs.c @@ -0,0 +1,364 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément Bœsch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "threadframe.h" +#include "vp89_rac.h" +#include "vp9data.h" +#include "vp9dec.h" +#include "vpx_rac.h" + +static av_always_inline void clamp_mv(VP9mv *dst, const VP9mv *src, + VP9TileData *td) +{ + dst->x = av_clip(src->x, td->min_mv.x, td->max_mv.x); + dst->y = av_clip(src->y, td->min_mv.y, td->max_mv.y); +} + +static void find_ref_mvs(VP9TileData *td, + VP9mv *pmv, int ref, int z, int idx, int sb) +{ + static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = { + [BS_64x64] = { { 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 }, + { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 } }, + [BS_64x32] = { { 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 }, + { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 } }, + [BS_32x64] = { { -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 }, + { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 } }, + [BS_32x32] = { { 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 }, + { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 } }, + [BS_32x16] = { { 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 }, + { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 } }, + [BS_16x32] = { { -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 }, + { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 } }, + [BS_16x16] = { { 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 }, + { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 } }, + [BS_16x8] = { { 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 }, + { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 } }, + [BS_8x16] = { { -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 }, + { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 } }, + [BS_8x8] = { { 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 }, + { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } }, + [BS_8x4] = { { 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 }, + { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } }, + [BS_4x8] = { { 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 }, + { -2, 0 }, { -1, 
-2 }, { -2, -1 }, { -2, -2 } }, + [BS_4x4] = { { 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 }, + { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } }, + }; + const VP9Context *s = td->s; + VP9Block *b = td->b; + int row = td->row, col = td->col, row7 = td->row7; + const int8_t (*p)[2] = mv_ref_blk_off[b->bs]; +#define INVALID_MV 0x80008000U + uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV; + int i; + +#define RETURN_DIRECT_MV(mv) \ + do { \ + uint32_t m = AV_RN32A(&mv); \ + if (!idx) { \ + AV_WN32A(pmv, m); \ + return; \ + } else if (mem == INVALID_MV) { \ + mem = m; \ + } else if (m != mem) { \ + AV_WN32A(pmv, m); \ + return; \ + } \ + } while (0) + + if (sb >= 0) { + if (sb == 2 || sb == 1) { + RETURN_DIRECT_MV(b->mv[0][z]); + } else if (sb == 3) { + RETURN_DIRECT_MV(b->mv[2][z]); + RETURN_DIRECT_MV(b->mv[1][z]); + RETURN_DIRECT_MV(b->mv[0][z]); + } + +#define RETURN_MV(mv) \ + do { \ + if (sb > 0) { \ + VP9mv tmp; \ + uint32_t m; \ + av_assert2(idx == 1); \ + av_assert2(mem != INVALID_MV); \ + if (mem_sub8x8 == INVALID_MV) { \ + clamp_mv(&tmp, &mv, td); \ + m = AV_RN32A(&tmp); \ + if (m != mem) { \ + AV_WN32A(pmv, m); \ + return; \ + } \ + mem_sub8x8 = AV_RN32A(&mv); \ + } else if (mem_sub8x8 != AV_RN32A(&mv)) { \ + clamp_mv(&tmp, &mv, td); \ + m = AV_RN32A(&tmp); \ + if (m != mem) { \ + AV_WN32A(pmv, m); \ + } else { \ + /* BUG I'm pretty sure this isn't the intention */ \ + AV_WN32A(pmv, 0); \ + } \ + return; \ + } \ + } else { \ + uint32_t m = AV_RN32A(&mv); \ + if (!idx) { \ + clamp_mv(pmv, &mv, td); \ + return; \ + } else if (mem == INVALID_MV) { \ + mem = m; \ + } else if (m != mem) { \ + clamp_mv(pmv, &mv, td); \ + return; \ + } \ + } \ + } while (0) + + if (row > 0) { + VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col]; + if (mv->ref[0] == ref) + RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]); + else if (mv->ref[1] == ref) + RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]); + } + if (col > td->tile_col_start) { + 
VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1]; + if (mv->ref[0] == ref) + RETURN_MV(td->left_mv_ctx[2 * row7 + (sb >> 1)][0]); + else if (mv->ref[1] == ref) + RETURN_MV(td->left_mv_ctx[2 * row7 + (sb >> 1)][1]); + } + i = 2; + } else { + i = 0; + } + + // previously coded MVs in this neighborhood, using same reference frame + for (; i < 8; i++) { + int c = p[i][0] + col, r = p[i][1] + row; + + if (c >= td->tile_col_start && c < s->cols && + r >= 0 && r < s->rows) { + VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c]; + + if (mv->ref[0] == ref) + RETURN_MV(mv->mv[0]); + else if (mv->ref[1] == ref) + RETURN_MV(mv->mv[1]); + } + } + + // MV at this position in previous frame, using same reference frame + if (s->s.h.use_last_frame_mvs) { + VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col]; + + if (!s->s.frames[REF_FRAME_MVPAIR].uses_2pass) + ff_thread_await_progress(&s->s.frames[REF_FRAME_MVPAIR].tf, row >> 3, 0); + if (mv->ref[0] == ref) + RETURN_MV(mv->mv[0]); + else if (mv->ref[1] == ref) + RETURN_MV(mv->mv[1]); + } + +#define RETURN_SCALE_MV(mv, scale) \ + do { \ + if (scale) { \ + VP9mv mv_temp = { -mv.x, -mv.y }; \ + RETURN_MV(mv_temp); \ + } else { \ + RETURN_MV(mv); \ + } \ + } while (0) + + // previously coded MVs in this neighborhood, using different reference frame + for (i = 0; i < 8; i++) { + int c = p[i][0] + col, r = p[i][1] + row; + + if (c >= td->tile_col_start && c < s->cols && r >= 0 && r < s->rows) { + VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c]; + + if (mv->ref[0] != ref && mv->ref[0] >= 0) + RETURN_SCALE_MV(mv->mv[0], + s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]); + if (mv->ref[1] != ref && mv->ref[1] >= 0 && + // BUG - libvpx has this condition regardless of whether + // we used the first ref MV and pre-scaling + AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) { + RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != 
s->s.h.signbias[ref]); + } + } + } + + // MV at this position in previous frame, using different reference frame + if (s->s.h.use_last_frame_mvs) { + VP9mvrefPair *mv = &s->s.frames[REF_FRAME_MVPAIR].mv[row * s->sb_cols * 8 + col]; + + // no need to await_progress, because we already did that above + if (mv->ref[0] != ref && mv->ref[0] >= 0) + RETURN_SCALE_MV(mv->mv[0], s->s.h.signbias[mv->ref[0]] != s->s.h.signbias[ref]); + if (mv->ref[1] != ref && mv->ref[1] >= 0 && + // BUG - libvpx has this condition regardless of whether + // we used the first ref MV and pre-scaling + AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) { + RETURN_SCALE_MV(mv->mv[1], s->s.h.signbias[mv->ref[1]] != s->s.h.signbias[ref]); + } + } + + AV_ZERO32(pmv); + clamp_mv(pmv, pmv, td); +#undef INVALID_MV +#undef RETURN_MV +#undef RETURN_SCALE_MV +} + +static av_always_inline int read_mv_component(VP9TileData *td, int idx, int hp) +{ + const VP9Context *s = td->s; + int bit, sign = vpx_rac_get_prob(td->c, s->prob.p.mv_comp[idx].sign); + int n, c = vp89_rac_get_tree(td->c, ff_vp9_mv_class_tree, + s->prob.p.mv_comp[idx].classes); + + td->counts.mv_comp[idx].sign[sign]++; + td->counts.mv_comp[idx].classes[c]++; + if (c) { + int m; + + for (n = 0, m = 0; m < c; m++) { + bit = vpx_rac_get_prob(td->c, s->prob.p.mv_comp[idx].bits[m]); + n |= bit << m; + td->counts.mv_comp[idx].bits[m][bit]++; + } + n <<= 3; + bit = vp89_rac_get_tree(td->c, ff_vp9_mv_fp_tree, + s->prob.p.mv_comp[idx].fp); + n |= bit << 1; + td->counts.mv_comp[idx].fp[bit]++; + if (hp) { + bit = vpx_rac_get_prob(td->c, s->prob.p.mv_comp[idx].hp); + td->counts.mv_comp[idx].hp[bit]++; + n |= bit; + } else { + n |= 1; + // bug in libvpx - we count for bw entropy purposes even if the + // bit wasn't coded + td->counts.mv_comp[idx].hp[1]++; + } + n += 8 << c; + } else { + n = vpx_rac_get_prob(td->c, s->prob.p.mv_comp[idx].class0); + td->counts.mv_comp[idx].class0[n]++; + bit = vp89_rac_get_tree(td->c, ff_vp9_mv_fp_tree, + 
s->prob.p.mv_comp[idx].class0_fp[n]); + td->counts.mv_comp[idx].class0_fp[n][bit]++; + n = (n << 3) | (bit << 1); + if (hp) { + bit = vpx_rac_get_prob(td->c, s->prob.p.mv_comp[idx].class0_hp); + td->counts.mv_comp[idx].class0_hp[bit]++; + n |= bit; + } else { + n |= 1; + // bug in libvpx - we count for bw entropy purposes even if the + // bit wasn't coded + td->counts.mv_comp[idx].class0_hp[1]++; + } + } + + return sign ? -(n + 1) : (n + 1); +} + +void ff_vp9_fill_mv(VP9TileData *td, VP9mv *mv, int mode, int sb) +{ + const VP9Context *s = td->s; + VP9Block *b = td->b; + + if (mode == ZEROMV) { + AV_ZERO64(mv); + } else { + int hp; + + // FIXME cache this value and reuse for other subblocks + find_ref_mvs(td, &mv[0], b->ref[0], 0, mode == NEARMV, + mode == NEWMV ? -1 : sb); + // FIXME maybe move this code into find_ref_mvs() + if ((mode == NEWMV || sb == -1) && + !(hp = s->s.h.highprecisionmvs && + abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) { + if (mv[0].y & 1) { + if (mv[0].y < 0) + mv[0].y++; + else + mv[0].y--; + } + if (mv[0].x & 1) { + if (mv[0].x < 0) + mv[0].x++; + else + mv[0].x--; + } + } + if (mode == NEWMV) { + enum MVJoint j = vp89_rac_get_tree(td->c, ff_vp9_mv_joint_tree, + s->prob.p.mv_joint); + + td->counts.mv_joint[j]++; + if (j >= MV_JOINT_V) + mv[0].y += read_mv_component(td, 0, hp); + if (j & 1) + mv[0].x += read_mv_component(td, 1, hp); + } + + if (b->comp) { + // FIXME cache this value and reuse for other subblocks + find_ref_mvs(td, &mv[1], b->ref[1], 1, mode == NEARMV, + mode == NEWMV ? 
-1 : sb); + if ((mode == NEWMV || sb == -1) && + !(hp = s->s.h.highprecisionmvs && + abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) { + if (mv[1].y & 1) { + if (mv[1].y < 0) + mv[1].y++; + else + mv[1].y--; + } + if (mv[1].x & 1) { + if (mv[1].x < 0) + mv[1].x++; + else + mv[1].x--; + } + } + if (mode == NEWMV) { + enum MVJoint j = vp89_rac_get_tree(td->c, ff_vp9_mv_joint_tree, + s->prob.p.mv_joint); + + td->counts.mv_joint[j]++; + if (j >= MV_JOINT_V) + mv[1].y += read_mv_component(td, 0, hp); + if (j & 1) + mv[1].x += read_mv_component(td, 1, hp); + } + } + } +} diff --git a/media/ffvpx/libavcodec/vp9prob.c b/media/ffvpx/libavcodec/vp9prob.c new file mode 100644 index 0000000000..69a5180770 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9prob.c @@ -0,0 +1,272 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément Bœsch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vp9.h" +#include "vp9dec.h" + +static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1, + int max_count, int update_factor) +{ + unsigned ct = ct0 + ct1, p2, p1; + + if (!ct) + return; + + update_factor = FASTDIV(update_factor * FFMIN(ct, max_count), max_count); + p1 = *p; + p2 = ((((int64_t) ct0) << 8) + (ct >> 1)) / ct; + p2 = av_clip(p2, 1, 255); + + // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8 + *p = p1 + (((p2 - p1) * update_factor + 128) >> 8); +} + +void ff_vp9_adapt_probs(VP9Context *s) +{ + int i, j, k, l, m; + ProbContext *p = &s->prob_ctx[s->s.h.framectxid].p; + int uf = (s->s.h.keyframe || s->s.h.intraonly || !s->last_keyframe) ? 112 : 128; + + // coefficients + for (i = 0; i < 4; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + for (l = 0; l < 6; l++) + for (m = 0; m < 6; m++) { + uint8_t *pp = s->prob_ctx[s->s.h.framectxid].coef[i][j][k][l][m]; + unsigned *e = s->td[0].counts.eob[i][j][k][l][m]; + unsigned *c = s->td[0].counts.coef[i][j][k][l][m]; + + if (l == 0 && m >= 3) // dc only has 3 pt + break; + + adapt_prob(&pp[0], e[0], e[1], 24, uf); + adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf); + adapt_prob(&pp[2], c[1], c[2], 24, uf); + } + + if (s->s.h.keyframe || s->s.h.intraonly) { + memcpy(p->skip, s->prob.p.skip, sizeof(p->skip)); + memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p)); + memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p)); + memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p)); + return; + } + + // skip flag + for (i = 0; i < 3; i++) + adapt_prob(&p->skip[i], s->td[0].counts.skip[i][0], + s->td[0].counts.skip[i][1], 20, 128); + + // intra/inter flag + for (i = 0; i < 4; i++) + adapt_prob(&p->intra[i], s->td[0].counts.intra[i][0], + 
s->td[0].counts.intra[i][1], 20, 128); + + // comppred flag + if (s->s.h.comppredmode == PRED_SWITCHABLE) { + for (i = 0; i < 5; i++) + adapt_prob(&p->comp[i], s->td[0].counts.comp[i][0], + s->td[0].counts.comp[i][1], 20, 128); + } + + // reference frames + if (s->s.h.comppredmode != PRED_SINGLEREF) { + for (i = 0; i < 5; i++) + adapt_prob(&p->comp_ref[i], s->td[0].counts.comp_ref[i][0], + s->td[0].counts.comp_ref[i][1], 20, 128); + } + + if (s->s.h.comppredmode != PRED_COMPREF) { + for (i = 0; i < 5; i++) { + uint8_t *pp = p->single_ref[i]; + unsigned (*c)[2] = s->td[0].counts.single_ref[i]; + + adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128); + adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128); + } + } + + // block partitioning + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) { + uint8_t *pp = p->partition[i][j]; + unsigned *c = s->td[0].counts.partition[i][j]; + + adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128); + adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128); + adapt_prob(&pp[2], c[2], c[3], 20, 128); + } + + // tx size + if (s->s.h.txfmmode == TX_SWITCHABLE) { + for (i = 0; i < 2; i++) { + unsigned *c16 = s->td[0].counts.tx16p[i], *c32 = s->td[0].counts.tx32p[i]; + + adapt_prob(&p->tx8p[i], s->td[0].counts.tx8p[i][0], + s->td[0].counts.tx8p[i][1], 20, 128); + adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128); + adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128); + adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128); + adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128); + adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128); + } + } + + // interpolation filter + if (s->s.h.filtermode == FILTER_SWITCHABLE) { + for (i = 0; i < 4; i++) { + uint8_t *pp = p->filter[i]; + unsigned *c = s->td[0].counts.filter[i]; + + adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128); + adapt_prob(&pp[1], c[1], c[2], 20, 128); + } + } + + // inter modes + for (i = 0; i < 7; i++) { + uint8_t *pp = p->mv_mode[i]; + unsigned *c = 
s->td[0].counts.mv_mode[i]; + + adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128); + adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128); + adapt_prob(&pp[2], c[1], c[3], 20, 128); + } + + // mv joints + { + uint8_t *pp = p->mv_joint; + unsigned *c = s->td[0].counts.mv_joint; + + adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128); + adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128); + adapt_prob(&pp[2], c[2], c[3], 20, 128); + } + + // mv components + for (i = 0; i < 2; i++) { + uint8_t *pp; + unsigned *c, (*c2)[2], sum; + + adapt_prob(&p->mv_comp[i].sign, s->td[0].counts.mv_comp[i].sign[0], + s->td[0].counts.mv_comp[i].sign[1], 20, 128); + + pp = p->mv_comp[i].classes; + c = s->td[0].counts.mv_comp[i].classes; + sum = c[1] + c[2] + c[3] + c[4] + c[5] + + c[6] + c[7] + c[8] + c[9] + c[10]; + adapt_prob(&pp[0], c[0], sum, 20, 128); + sum -= c[1]; + adapt_prob(&pp[1], c[1], sum, 20, 128); + sum -= c[2] + c[3]; + adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128); + adapt_prob(&pp[3], c[2], c[3], 20, 128); + sum -= c[4] + c[5]; + adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128); + adapt_prob(&pp[5], c[4], c[5], 20, 128); + sum -= c[6]; + adapt_prob(&pp[6], c[6], sum, 20, 128); + adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128); + adapt_prob(&pp[8], c[7], c[8], 20, 128); + adapt_prob(&pp[9], c[9], c[10], 20, 128); + + adapt_prob(&p->mv_comp[i].class0, s->td[0].counts.mv_comp[i].class0[0], + s->td[0].counts.mv_comp[i].class0[1], 20, 128); + pp = p->mv_comp[i].bits; + c2 = s->td[0].counts.mv_comp[i].bits; + for (j = 0; j < 10; j++) + adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128); + + for (j = 0; j < 2; j++) { + pp = p->mv_comp[i].class0_fp[j]; + c = s->td[0].counts.mv_comp[i].class0_fp[j]; + adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128); + adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128); + adapt_prob(&pp[2], c[2], c[3], 20, 128); + } + pp = p->mv_comp[i].fp; + c = s->td[0].counts.mv_comp[i].fp; + adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128); + 
adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128); + adapt_prob(&pp[2], c[2], c[3], 20, 128); + + if (s->s.h.highprecisionmvs) { + adapt_prob(&p->mv_comp[i].class0_hp, + s->td[0].counts.mv_comp[i].class0_hp[0], + s->td[0].counts.mv_comp[i].class0_hp[1], 20, 128); + adapt_prob(&p->mv_comp[i].hp, s->td[0].counts.mv_comp[i].hp[0], + s->td[0].counts.mv_comp[i].hp[1], 20, 128); + } + } + + // y intra modes + for (i = 0; i < 4; i++) { + uint8_t *pp = p->y_mode[i]; + unsigned *c = s->td[0].counts.y_mode[i], sum, s2; + + sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9]; + adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128); + sum -= c[TM_VP8_PRED]; + adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128); + sum -= c[VERT_PRED]; + adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128); + s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED]; + sum -= s2; + adapt_prob(&pp[3], s2, sum, 20, 128); + s2 -= c[HOR_PRED]; + adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128); + adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], + 20, 128); + sum -= c[DIAG_DOWN_LEFT_PRED]; + adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128); + sum -= c[VERT_LEFT_PRED]; + adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128); + adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128); + } + + // uv intra modes + for (i = 0; i < 10; i++) { + uint8_t *pp = p->uv_mode[i]; + unsigned *c = s->td[0].counts.uv_mode[i], sum, s2; + + sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9]; + adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128); + sum -= c[TM_VP8_PRED]; + adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128); + sum -= c[VERT_PRED]; + adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128); + s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED]; + sum -= s2; + adapt_prob(&pp[3], s2, sum, 20, 128); + s2 -= c[HOR_PRED]; + adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128); + adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], + 20, 128); + sum -= c[DIAG_DOWN_LEFT_PRED]; 
+ adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128); + sum -= c[VERT_LEFT_PRED]; + adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128); + adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128); + } +} diff --git a/media/ffvpx/libavcodec/vp9recon.c b/media/ffvpx/libavcodec/vp9recon.c new file mode 100644 index 0000000000..073c04b47d --- /dev/null +++ b/media/ffvpx/libavcodec/vp9recon.c @@ -0,0 +1,654 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément Bœsch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/mem_internal.h" + +#include "threadframe.h" +#include "videodsp.h" +#include "vp9data.h" +#include "vp9dec.h" + +static av_always_inline int check_intra_mode(VP9TileData *td, int mode, uint8_t **a, + uint8_t *dst_edge, ptrdiff_t stride_edge, + uint8_t *dst_inner, ptrdiff_t stride_inner, + uint8_t *l, int col, int x, int w, + int row, int y, enum TxfmMode tx, + int p, int ss_h, int ss_v, int bytesperpixel) +{ + const VP9Context *s = td->s; + int have_top = row > 0 || y > 0; + int have_left = col > td->tile_col_start || x > 0; + int have_right = x < w - 1; + int bpp = s->s.h.bpp; + static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = { + [VERT_PRED] = { { DC_127_PRED, VERT_PRED }, + { DC_127_PRED, VERT_PRED } }, + [HOR_PRED] = { { DC_129_PRED, DC_129_PRED }, + { HOR_PRED, HOR_PRED } }, + [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED }, + { LEFT_DC_PRED, DC_PRED } }, + [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED }, + { DC_127_PRED, DIAG_DOWN_LEFT_PRED } }, + [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED }, + { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } }, + [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED }, + { VERT_RIGHT_PRED, VERT_RIGHT_PRED } }, + [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED }, + { HOR_DOWN_PRED, HOR_DOWN_PRED } }, + [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED }, + { DC_127_PRED, VERT_LEFT_PRED } }, + [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED }, + { HOR_UP_PRED, HOR_UP_PRED } }, + [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED }, + { HOR_PRED, TM_VP8_PRED } }, + }; + static const struct { + uint8_t needs_left:1; + uint8_t needs_top:1; + uint8_t needs_topleft:1; + uint8_t needs_topright:1; + 
uint8_t invert_left:1; + } edges[N_INTRA_PRED_MODES] = { + [VERT_PRED] = { .needs_top = 1 }, + [HOR_PRED] = { .needs_left = 1 }, + [DC_PRED] = { .needs_top = 1, .needs_left = 1 }, + [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 }, + [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, + .needs_topleft = 1 }, + [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, + .needs_topleft = 1 }, + [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, + .needs_topleft = 1 }, + [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 }, + [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 }, + [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, + .needs_topleft = 1 }, + [LEFT_DC_PRED] = { .needs_left = 1 }, + [TOP_DC_PRED] = { .needs_top = 1 }, + [DC_128_PRED] = { 0 }, + [DC_127_PRED] = { 0 }, + [DC_129_PRED] = { 0 } + }; + + av_assert2(mode >= 0 && mode < 10); + mode = mode_conv[mode][have_left][have_top]; + if (edges[mode].needs_top) { + uint8_t *top, *topleft; + int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4; + int n_px_need_tr = 0; + + if (tx == TX_4X4 && edges[mode].needs_topright && have_right) + n_px_need_tr = 4; + + // if top of sb64-row, use s->intra_pred_data[] instead of + // dst[-stride] for intra prediction (it contains pre- instead of + // post-loopfilter data) + if (have_top) { + top = !(row & 7) && !y ? + s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel : + y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner]; + if (have_left) + topleft = !(row & 7) && !y ? + s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel : + y == 0 || x == 0 ? 
&dst_edge[-stride_edge] : + &dst_inner[-stride_inner]; + } + + if (have_top && + (!edges[mode].needs_topleft || (have_left && top == topleft)) && + (tx != TX_4X4 || !edges[mode].needs_topright || have_right) && + n_px_need + n_px_need_tr <= n_px_have) { + *a = top; + } else { + if (have_top) { + if (n_px_need <= n_px_have) { + memcpy(*a, top, n_px_need * bytesperpixel); + } else { +#define memset_bpp(c, i1, v, i2, num) do { \ + if (bytesperpixel == 1) { \ + memset(&(c)[(i1)], (v)[(i2)], (num)); \ + } else { \ + int n, val = AV_RN16A(&(v)[(i2) * 2]); \ + for (n = 0; n < (num); n++) { \ + AV_WN16A(&(c)[((i1) + n) * 2], val); \ + } \ + } \ +} while (0) + memcpy(*a, top, n_px_have * bytesperpixel); + memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have); + } + } else { +#define memset_val(c, val, num) do { \ + if (bytesperpixel == 1) { \ + memset((c), (val), (num)); \ + } else { \ + int n; \ + for (n = 0; n < (num); n++) { \ + AV_WN16A(&(c)[n * 2], (val)); \ + } \ + } \ +} while (0) + memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need); + } + if (edges[mode].needs_topleft) { + if (have_left && have_top) { +#define assign_bpp(c, i1, v, i2) do { \ + if (bytesperpixel == 1) { \ + (c)[(i1)] = (v)[(i2)]; \ + } else { \ + AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \ + } \ +} while (0) + assign_bpp(*a, -1, topleft, -1); + } else { +#define assign_val(c, i, v) do { \ + if (bytesperpixel == 1) { \ + (c)[(i)] = (v); \ + } else { \ + AV_WN16A(&(c)[(i) * 2], (v)); \ + } \ +} while (0) + assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? 
+1 : -1)); + } + } + if (tx == TX_4X4 && edges[mode].needs_topright) { + if (have_top && have_right && + n_px_need + n_px_need_tr <= n_px_have) { + memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel); + } else { + memset_bpp(*a, 4, *a, 3, 4); + } + } + } + } + if (edges[mode].needs_left) { + if (have_left) { + int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4; + uint8_t *dst = x == 0 ? dst_edge : dst_inner; + ptrdiff_t stride = x == 0 ? stride_edge : stride_inner; + + if (edges[mode].invert_left) { + if (n_px_need <= n_px_have) { + for (i = 0; i < n_px_need; i++) + assign_bpp(l, i, &dst[i * stride], -1); + } else { + for (i = 0; i < n_px_have; i++) + assign_bpp(l, i, &dst[i * stride], -1); + memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have); + } + } else { + if (n_px_need <= n_px_have) { + for (i = 0; i < n_px_need; i++) + assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1); + } else { + for (i = 0; i < n_px_have; i++) + assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1); + memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have); + } + } + } else { + memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx); + } + } + + return mode; +} + +static av_always_inline void intra_recon(VP9TileData *td, ptrdiff_t y_off, + ptrdiff_t uv_off, int bytesperpixel) +{ + const VP9Context *s = td->s; + VP9Block *b = td->b; + int row = td->row, col = td->col; + int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n; + int h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2); + int end_x = FFMIN(2 * (s->cols - col), w4); + int end_y = FFMIN(2 * (s->rows - row), h4); + int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless; + int uvstep1d = 1 << b->uvtx, p; + uint8_t *dst = td->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off; + LOCAL_ALIGNED_32(uint8_t, a_buf, [96]); + LOCAL_ALIGNED_32(uint8_t, l, [64]); + + for (n = 0, y = 0; y < end_y; y += step1d) { 
+ uint8_t *ptr = dst, *ptr_r = dst_r; + for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel, + ptr_r += 4 * step1d * bytesperpixel, n += step) { + int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ? + y * 2 + x : 0]; + uint8_t *a = &a_buf[32]; + enum TxfmType txtp = ff_vp9_intra_txfm_type[mode]; + int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&td->eob[n]) : td->eob[n]; + + mode = check_intra_mode(td, mode, &a, ptr_r, + s->s.frames[CUR_FRAME].tf.f->linesize[0], + ptr, td->y_stride, l, + col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel); + s->dsp.intra_pred[b->tx][mode](ptr, td->y_stride, l, a); + if (eob) + s->dsp.itxfm_add[tx][txtp](ptr, td->y_stride, + td->block + 16 * n * bytesperpixel, eob); + } + dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0]; + dst += 4 * step1d * td->y_stride; + } + + // U/V + w4 >>= s->ss_h; + end_x >>= s->ss_h; + end_y >>= s->ss_v; + step = 1 << (b->uvtx * 2); + for (p = 0; p < 2; p++) { + dst = td->dst[1 + p]; + dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off; + for (n = 0, y = 0; y < end_y; y += uvstep1d) { + uint8_t *ptr = dst, *ptr_r = dst_r; + for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel, + ptr_r += 4 * uvstep1d * bytesperpixel, n += step) { + int mode = b->uvmode; + uint8_t *a = &a_buf[32]; + int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? 
AV_RN16A(&td->uveob[p][n]) : td->uveob[p][n]; + + mode = check_intra_mode(td, mode, &a, ptr_r, + s->s.frames[CUR_FRAME].tf.f->linesize[1], + ptr, td->uv_stride, l, col, x, w4, row, y, + b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel); + s->dsp.intra_pred[b->uvtx][mode](ptr, td->uv_stride, l, a); + if (eob) + s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, td->uv_stride, + td->uvblock[p] + 16 * n * bytesperpixel, eob); + } + dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1]; + dst += 4 * uvstep1d * td->uv_stride; + } + } +} + +void ff_vp9_intra_recon_8bpp(VP9TileData *td, ptrdiff_t y_off, ptrdiff_t uv_off) +{ + intra_recon(td, y_off, uv_off, 1); +} + +void ff_vp9_intra_recon_16bpp(VP9TileData *td, ptrdiff_t y_off, ptrdiff_t uv_off) +{ + intra_recon(td, y_off, uv_off, 2); +} + +static av_always_inline void mc_luma_unscaled(VP9TileData *td, const vp9_mc_func (*mc)[2], + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + const ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, const VP9mv *mv, + int bw, int bh, int w, int h, int bytesperpixel) +{ + const VP9Context *s = td->s; + int mx = mv->x, my = mv->y, th; + + y += my >> 3; + x += mx >> 3; + ref += y * ref_stride + x * bytesperpixel; + mx &= 7; + my &= 7; + // FIXME bilinear filter only needs 0/1 pixels, not 3/4 + // we use +7 because the last 7 pixels of each sbrow can be changed in + // the longest loopfilter of the next sbrow + th = (y + bh + 4 * !!my + 7) >> 6; + ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); + // The arm/aarch64 _hv filters read one more row than what actually is + // needed, so switch to emulated edge one pixel sooner vertically + // (!!my * 5) than horizontally (!!mx * 4). 
+ if (x < !!mx * 3 || y < !!my * 3 || + x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) { + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel, + 160, ref_stride, + bw + !!mx * 7, bh + !!my * 7, + x - !!mx * 3, y - !!my * 3, w, h); + ref = td->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel; + ref_stride = 160; + } + mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1); +} + +static av_always_inline void mc_chroma_unscaled(VP9TileData *td, const vp9_mc_func (*mc)[2], + uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t dst_stride, + const uint8_t *ref_u, ptrdiff_t src_stride_u, + const uint8_t *ref_v, ptrdiff_t src_stride_v, + const ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, const VP9mv *mv, + int bw, int bh, int w, int h, int bytesperpixel) +{ + const VP9Context *s = td->s; + int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th; + + y += my >> 4; + x += mx >> 4; + ref_u += y * src_stride_u + x * bytesperpixel; + ref_v += y * src_stride_v + x * bytesperpixel; + mx &= 15; + my &= 15; + // FIXME bilinear filter only needs 0/1 pixels, not 3/4 + // we use +7 because the last 7 pixels of each sbrow can be changed in + // the longest loopfilter of the next sbrow + th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v); + ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); + // The arm/aarch64 _hv filters read one more row than what actually is + // needed, so switch to emulated edge one pixel sooner vertically + // (!!my * 5) than horizontally (!!mx * 4). 
+ if (x < !!mx * 3 || y < !!my * 3 || + x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) { + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel, + 160, src_stride_u, + bw + !!mx * 7, bh + !!my * 7, + x - !!mx * 3, y - !!my * 3, w, h); + ref_u = td->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel; + mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my); + + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel, + 160, src_stride_v, + bw + !!mx * 7, bh + !!my * 7, + x - !!mx * 3, y - !!my * 3, w, h); + ref_v = td->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel; + mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my); + } else { + mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my); + mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my); + } +} + +#define mc_luma_dir(td, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \ + px, py, pw, ph, bw, bh, w, h, i) \ + mc_luma_unscaled(td, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \ + mv, bw, bh, w, h, bytesperpixel) +#define mc_chroma_dir(td, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \ + mc_chroma_unscaled(td, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, bw, bh, w, h, bytesperpixel) +#define SCALED 0 +#define FN(x) x##_8bpp +#define BYTES_PER_PIXEL 1 +#include "vp9_mc_template.c" +#undef FN +#undef BYTES_PER_PIXEL +#define FN(x) x##_16bpp +#define BYTES_PER_PIXEL 2 +#include "vp9_mc_template.c" +#undef mc_luma_dir +#undef mc_chroma_dir +#undef FN +#undef BYTES_PER_PIXEL +#undef SCALED + +static av_always_inline void mc_luma_scaled(VP9TileData *td, vp9_scaled_mc_func smc, + const vp9_mc_func (*mc)[2], + uint8_t *dst, ptrdiff_t dst_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + const ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, 
const VP9mv *in_mv, + int px, int py, int pw, int ph, + int bw, int bh, int w, int h, int bytesperpixel, + const uint16_t *scale, const uint8_t *step) +{ + const VP9Context *s = td->s; + if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width && + s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) { + mc_luma_unscaled(td, mc, dst, dst_stride, ref, ref_stride, ref_frame, + y, x, in_mv, bw, bh, w, h, bytesperpixel); + } else { +#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14) + int mx, my; + int refbw_m1, refbh_m1; + int th; + VP9mv mv; + + mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8); + mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8); + // BUG libvpx seems to scale the two components separately. This introduces + // rounding errors but we have to reproduce them to be exactly compatible + // with the output from libvpx... + mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0); + my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1); + + y = my >> 4; + x = mx >> 4; + ref += y * ref_stride + x * bytesperpixel; + mx &= 15; + my &= 15; + refbw_m1 = ((bw - 1) * step[0] + mx) >> 4; + refbh_m1 = ((bh - 1) * step[1] + my) >> 4; + // FIXME bilinear filter only needs 0/1 pixels, not 3/4 + // we use +7 because the last 7 pixels of each sbrow can be changed in + // the longest loopfilter of the next sbrow + th = (y + refbh_m1 + 4 + 7) >> 6; + ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); + // The arm/aarch64 _hv filters read one more row than what actually is + // needed, so switch to emulated edge one pixel sooner vertically + // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1). 
+ if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) { + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + ref - 3 * ref_stride - 3 * bytesperpixel, + 288, ref_stride, + refbw_m1 + 8, refbh_m1 + 8, + x - 3, y - 3, w, h); + ref = td->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel; + ref_stride = 288; + } + smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]); + } +} + +static av_always_inline void mc_chroma_scaled(VP9TileData *td, vp9_scaled_mc_func smc, + const vp9_mc_func (*mc)[2], + uint8_t *dst_u, uint8_t *dst_v, + ptrdiff_t dst_stride, + const uint8_t *ref_u, ptrdiff_t src_stride_u, + const uint8_t *ref_v, ptrdiff_t src_stride_v, + const ThreadFrame *ref_frame, + ptrdiff_t y, ptrdiff_t x, const VP9mv *in_mv, + int px, int py, int pw, int ph, + int bw, int bh, int w, int h, int bytesperpixel, + const uint16_t *scale, const uint8_t *step) +{ + const VP9Context *s = td->s; + if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width && + s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) { + mc_chroma_unscaled(td, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u, + ref_v, src_stride_v, ref_frame, + y, x, in_mv, bw, bh, w, h, bytesperpixel); + } else { + int mx, my; + int refbw_m1, refbh_m1; + int th; + VP9mv mv; + + if (s->ss_h) { + // BUG https://code.google.com/p/webm/issues/detail?id=820 + mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16); + mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15); + } else { + mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8); + mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0); + } + if (s->ss_v) { + // BUG https://code.google.com/p/webm/issues/detail?id=820 + mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16); + my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15); + } else { + mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows 
* 8 - y + py + 3) * 8); + my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1); + } +#undef scale_mv + y = my >> 4; + x = mx >> 4; + ref_u += y * src_stride_u + x * bytesperpixel; + ref_v += y * src_stride_v + x * bytesperpixel; + mx &= 15; + my &= 15; + refbw_m1 = ((bw - 1) * step[0] + mx) >> 4; + refbh_m1 = ((bh - 1) * step[1] + my) >> 4; + // FIXME bilinear filter only needs 0/1 pixels, not 3/4 + // we use +7 because the last 7 pixels of each sbrow can be changed in + // the longest loopfilter of the next sbrow + th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v); + ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0); + // The arm/aarch64 _hv filters read one more row than what actually is + // needed, so switch to emulated edge one pixel sooner vertically + // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1). + if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) { + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + ref_u - 3 * src_stride_u - 3 * bytesperpixel, + 288, src_stride_u, + refbw_m1 + 8, refbh_m1 + 8, + x - 3, y - 3, w, h); + ref_u = td->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel; + smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]); + + s->vdsp.emulated_edge_mc(td->edge_emu_buffer, + ref_v - 3 * src_stride_v - 3 * bytesperpixel, + 288, src_stride_v, + refbw_m1 + 8, refbh_m1 + 8, + x - 3, y - 3, w, h); + ref_v = td->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel; + smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]); + } else { + smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]); + smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]); + } + } +} + +#define mc_luma_dir(td, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \ + px, py, pw, ph, bw, bh, w, h, i) \ + mc_luma_scaled(td, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \ + mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \ + s->mvscale[b->ref[i]], s->mvstep[b->ref[i]]) +#define 
mc_chroma_dir(td, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \ + mc_chroma_scaled(td, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \ + row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \ + s->mvscale[b->ref[i]], s->mvstep[b->ref[i]]) +#define SCALED 1 +#define FN(x) x##_scaled_8bpp +#define BYTES_PER_PIXEL 1 +#include "vp9_mc_template.c" +#undef FN +#undef BYTES_PER_PIXEL +#define FN(x) x##_scaled_16bpp +#define BYTES_PER_PIXEL 2 +#include "vp9_mc_template.c" +#undef mc_luma_dir +#undef mc_chroma_dir +#undef FN +#undef BYTES_PER_PIXEL +#undef SCALED + +static av_always_inline void inter_recon(VP9TileData *td, int bytesperpixel) +{ + const VP9Context *s = td->s; + VP9Block *b = td->b; + int row = td->row, col = td->col; + + if (s->mvscale[b->ref[0]][0] == REF_INVALID_SCALE || + (b->comp && s->mvscale[b->ref[1]][0] == REF_INVALID_SCALE)) { + if (!s->td->error_info) { + s->td->error_info = AVERROR_INVALIDDATA; + av_log(NULL, AV_LOG_ERROR, "Bitstream not supported, " + "reference frame has invalid dimensions\n"); + } + return; + } + + if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) { + if (bytesperpixel == 1) { + inter_pred_scaled_8bpp(td); + } else { + inter_pred_scaled_16bpp(td); + } + } else { + if (bytesperpixel == 1) { + inter_pred_8bpp(td); + } else { + inter_pred_16bpp(td); + } + } + + if (!b->skip) { + /* mostly copied intra_recon() */ + + int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n; + int h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2); + int end_x = FFMIN(2 * (s->cols - col), w4); + int end_y = FFMIN(2 * (s->rows - row), h4); + int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless; + int uvstep1d = 1 << b->uvtx, p; + uint8_t *dst = td->dst[0]; + + // y itxfm add + for (n = 0, y = 0; y < end_y; y += step1d) { + uint8_t *ptr = dst; + for (x = 0; x < end_x; x += 
step1d, + ptr += 4 * step1d * bytesperpixel, n += step) { + int eob = b->tx > TX_8X8 ? AV_RN16A(&td->eob[n]) : td->eob[n]; + + if (eob) + s->dsp.itxfm_add[tx][DCT_DCT](ptr, td->y_stride, + td->block + 16 * n * bytesperpixel, eob); + } + dst += 4 * td->y_stride * step1d; + } + + // uv itxfm add + end_x >>= s->ss_h; + end_y >>= s->ss_v; + step = 1 << (b->uvtx * 2); + for (p = 0; p < 2; p++) { + dst = td->dst[p + 1]; + for (n = 0, y = 0; y < end_y; y += uvstep1d) { + uint8_t *ptr = dst; + for (x = 0; x < end_x; x += uvstep1d, + ptr += 4 * uvstep1d * bytesperpixel, n += step) { + int eob = b->uvtx > TX_8X8 ? AV_RN16A(&td->uveob[p][n]) : td->uveob[p][n]; + + if (eob) + s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, td->uv_stride, + td->uvblock[p] + 16 * n * bytesperpixel, eob); + } + dst += 4 * uvstep1d * td->uv_stride; + } + } + } +} + +void ff_vp9_inter_recon_8bpp(VP9TileData *td) +{ + inter_recon(td, 1); +} + +void ff_vp9_inter_recon_16bpp(VP9TileData *td) +{ + inter_recon(td, 2); +} diff --git a/media/ffvpx/libavcodec/vp9shared.h b/media/ffvpx/libavcodec/vp9shared.h new file mode 100644 index 0000000000..543a496df8 --- /dev/null +++ b/media/ffvpx/libavcodec/vp9shared.h @@ -0,0 +1,175 @@ +/* + * VP9 compatible video decoder + * + * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> + * Copyright (C) 2013 Clément Bœsch <u pkh me> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_VP9SHARED_H +#define AVCODEC_VP9SHARED_H + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/mem_internal.h" + +#include "vp9.h" +#include "threadframe.h" + +enum BlockPartition { + PARTITION_NONE, // [ ] <-. + PARTITION_H, // [-] | + PARTITION_V, // [|] | + PARTITION_SPLIT, // [+] --' +}; + +enum InterPredMode { + NEARESTMV = 10, + NEARMV = 11, + ZEROMV = 12, + NEWMV = 13, +}; + +enum CompPredMode { + PRED_SINGLEREF, + PRED_COMPREF, + PRED_SWITCHABLE, +}; + +typedef struct VP9mv { + DECLARE_ALIGNED(4, int16_t, x); + int16_t y; +} VP9mv; + +typedef struct VP9mvrefPair { + VP9mv mv[2]; + int8_t ref[2]; +} VP9mvrefPair; + +typedef struct VP9Frame { + ThreadFrame tf; + AVBufferRef *extradata; + uint8_t *segmentation_map; + VP9mvrefPair *mv; + int uses_2pass; + + AVBufferRef *hwaccel_priv_buf; + void *hwaccel_picture_private; +} VP9Frame; + +enum BlockLevel { + BL_64X64, + BL_32X32, + BL_16X16, + BL_8X8, +}; + +enum BlockSize { + BS_64x64, + BS_64x32, + BS_32x64, + BS_32x32, + BS_32x16, + BS_16x32, + BS_16x16, + BS_16x8, + BS_8x16, + BS_8x8, + BS_8x4, + BS_4x8, + BS_4x4, + N_BS_SIZES, +}; + +typedef struct VP9BitstreamHeader { + // bitstream header + uint8_t profile; + uint8_t bpp; + uint8_t keyframe; + uint8_t invisible; + uint8_t errorres; + uint8_t intraonly; + uint8_t resetctx; + uint8_t refreshrefmask; + uint8_t highprecisionmvs; + enum FilterMode filtermode; + uint8_t allowcompinter; + uint8_t refreshctx; + uint8_t parallelmode; + uint8_t framectxid; + uint8_t use_last_frame_mvs; + uint8_t refidx[3]; + uint8_t signbias[3]; + uint8_t fixcompref; + uint8_t varcompref[2]; + struct { + uint8_t level; + int8_t sharpness; + } filter; + struct { + uint8_t enabled; + uint8_t updated; + int8_t mode[2]; + int8_t 
ref[4]; + } lf_delta; + uint8_t yac_qi; + int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta; + uint8_t lossless; +#define MAX_SEGMENT 8 + struct { + uint8_t enabled; + uint8_t temporal; + uint8_t absolute_vals; + uint8_t update_map; + uint8_t prob[7]; + uint8_t pred_prob[3]; + struct { + uint8_t q_enabled; + uint8_t lf_enabled; + uint8_t ref_enabled; + uint8_t skip_enabled; + uint8_t ref_val; + int16_t q_val; + int8_t lf_val; + int16_t qmul[2][2]; + uint8_t lflvl[4][2]; + } feat[MAX_SEGMENT]; + } segmentation; + enum TxfmMode txfmmode; + enum CompPredMode comppredmode; + struct { + unsigned log2_tile_cols, log2_tile_rows; + unsigned tile_cols, tile_rows; + } tiling; + + int uncompressed_header_size; + int compressed_header_size; +} VP9BitstreamHeader; + +typedef struct VP9SharedContext { + VP9BitstreamHeader h; + + ThreadFrame refs[8]; +#define CUR_FRAME 0 +#define REF_FRAME_MVPAIR 1 +#define REF_FRAME_SEGMAP 2 + VP9Frame frames[3]; +} VP9SharedContext; + +#endif /* AVCODEC_VP9SHARED_H */ diff --git a/media/ffvpx/libavcodec/vpx_rac.c b/media/ffvpx/libavcodec/vpx_rac.c new file mode 100644 index 0000000000..cf02e9a19c --- /dev/null +++ b/media/ffvpx/libavcodec/vpx_rac.c @@ -0,0 +1,53 @@ +/* + * VP5/6/8 decoder + * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include "libavutil/error.h" +#include "bytestream.h" +#include "vpx_rac.h" + +const uint8_t ff_vpx_norm_shift[256]= { + 8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + +int ff_vpx_init_range_decoder(VPXRangeCoder *c, const uint8_t *buf, int buf_size) +{ + c->high = 255; + c->bits = -16; + c->buffer = buf; + c->end = buf + buf_size; + c->end_reached = 0; + if (buf_size < 1) + return AVERROR_INVALIDDATA; + c->code_word = bytestream_get_be24(&c->buffer); + return 0; +} diff --git a/media/ffvpx/libavcodec/vpx_rac.h b/media/ffvpx/libavcodec/vpx_rac.h new file mode 100644 index 0000000000..b158cc0754 --- /dev/null +++ b/media/ffvpx/libavcodec/vpx_rac.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Common VP5-VP9 range decoder stuff + */ + +#ifndef AVCODEC_VPX_RAC_H +#define AVCODEC_VPX_RAC_H + +#include <stdint.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "bytestream.h" + +typedef struct VPXRangeCoder { + int high; + int bits; /* stored negated (i.e. negative "bits" is a positive number of + bits left) in order to eliminate a negate in cache refilling */ + const uint8_t *buffer; + const uint8_t *end; + unsigned int code_word; + int end_reached; +} VPXRangeCoder; + +extern const uint8_t ff_vpx_norm_shift[256]; +int ff_vpx_init_range_decoder(VPXRangeCoder *c, const uint8_t *buf, int buf_size); + +/** + * returns 1 if the end of the stream has been reached, 0 otherwise. 
+ */ +static av_always_inline int vpx_rac_is_end(VPXRangeCoder *c) +{ + if (c->end <= c->buffer && c->bits >= 0) + c->end_reached ++; + return c->end_reached > 10; +} + +static av_always_inline unsigned int vpx_rac_renorm(VPXRangeCoder *c) +{ + int shift = ff_vpx_norm_shift[c->high]; + int bits = c->bits; + unsigned int code_word = c->code_word; + + c->high <<= shift; + code_word <<= shift; + bits += shift; + if(bits >= 0 && c->buffer < c->end) { + code_word |= bytestream_get_be16(&c->buffer) << bits; + bits -= 16; + } + c->bits = bits; + return code_word; +} + +#if ARCH_ARM +#include "arm/vpx_arith.h" +#elif ARCH_X86 +#include "x86/vpx_arith.h" +#endif + +#ifndef vpx_rac_get_prob +#define vpx_rac_get_prob vpx_rac_get_prob +static av_always_inline int vpx_rac_get_prob(VPXRangeCoder *c, uint8_t prob) +{ + unsigned int code_word = vpx_rac_renorm(c); + unsigned int low = 1 + (((c->high - 1) * prob) >> 8); + unsigned int low_shift = low << 16; + int bit = code_word >= low_shift; + + c->high = bit ? c->high - low : low; + c->code_word = bit ? 
code_word - low_shift : code_word; + + return bit; +} +#endif + +#ifndef vpx_rac_get_prob_branchy +// branchy variant, to be used where there's a branch based on the bit decoded +static av_always_inline int vpx_rac_get_prob_branchy(VPXRangeCoder *c, int prob) +{ + unsigned long code_word = vpx_rac_renorm(c); + unsigned low = 1 + (((c->high - 1) * prob) >> 8); + unsigned low_shift = low << 16; + + if (code_word >= low_shift) { + c->high -= low; + c->code_word = code_word - low_shift; + return 1; + } + + c->high = low; + c->code_word = code_word; + return 0; +} +#endif + +static av_always_inline int vpx_rac_get(VPXRangeCoder *c) +{ + unsigned int code_word = vpx_rac_renorm(c); + /* equiprobable */ + int low = (c->high + 1) >> 1; + unsigned int low_shift = low << 16; + int bit = code_word >= low_shift; + if (bit) { + c->high -= low; + code_word -= low_shift; + } else { + c->high = low; + } + + c->code_word = code_word; + return bit; +} + +#endif /* AVCODEC_VPX_RAC_H */ diff --git a/media/ffvpx/libavcodec/x86/constants.c b/media/ffvpx/libavcodec/x86/constants.c new file mode 100644 index 0000000000..bc7f2b17b8 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/constants.c @@ -0,0 +1,93 @@ +/* + * MMX/SSE/AVX constants used across x86 dsp optimizations. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mem_internal.h" +#include "libavutil/x86/asm.h" // for xmm_reg +#include "constants.h" + +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL, 0x0001000100010001ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL, + 0x0002000200020002ULL, 0x0002000200020002ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL }; +DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL, + 0x0004000400040004ULL, 0x0004000400040004ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; +DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; +DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; +DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; +DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; +DECLARE_ASM_ALIGNED(8, const 
uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL, + 0x0100010001000100ULL, 0x0100010001000100ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL, + 0x0200020002000200ULL, 0x0200020002000200ULL }; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1023) = { 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL, + 0x03ff03ff03ff03ffULL, 0x03ff03ff03ff03ffULL}; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL, + 0x0400040004000400ULL, 0x0400040004000400ULL}; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL, + 0x0800080008000800ULL, 0x0800080008000800ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL, + 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL, + 0x1000100010001000ULL, 0x1000100010001000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL, + 0x2000200020002000ULL, 0x2000200020002000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_m1) = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, + 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL }; + +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL, 0x0000000000000000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL, 0x0101010101010101ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 
0x0202020202020202ULL, + 0x0202020202020202ULL, 0x0202020202020202ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL, + 0x0303030303030303ULL, 0x0303030303030303ULL }; +DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL, + 0x8080808080808080ULL, 0x8080808080808080ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL, + 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; + +DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x8000000080000000ULL }; + +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL, + 0x0000000100000001ULL, 0x0000000100000001ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL, + 0x0000001000000010ULL, 0x0000001000000010ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL, + 0x0000002000000020ULL, 0x0000002000000020ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL, + 0x0000200000002000ULL, 0x0000200000002000ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL }; diff --git a/media/ffvpx/libavcodec/x86/constants.h b/media/ffvpx/libavcodec/x86/constants.h new file mode 100644 index 0000000000..85da38b7b9 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/constants.h @@ -0,0 +1,72 @@ +/* + * MMX/SSE constants used across x86 dsp optimizations. + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_CONSTANTS_H +#define AVCODEC_X86_CONSTANTS_H + +#include <stdint.h> + +#include "libavutil/x86/asm.h" + +extern const ymm_reg ff_pw_1; +extern const ymm_reg ff_pw_2; +extern const xmm_reg ff_pw_3; +extern const ymm_reg ff_pw_4; +extern const xmm_reg ff_pw_5; +extern const xmm_reg ff_pw_8; +extern const xmm_reg ff_pw_9; +extern const uint64_t ff_pw_15; +extern const xmm_reg ff_pw_16; +extern const xmm_reg ff_pw_18; +extern const xmm_reg ff_pw_20; +extern const xmm_reg ff_pw_32; +extern const uint64_t ff_pw_42; +extern const uint64_t ff_pw_53; +extern const xmm_reg ff_pw_64; +extern const uint64_t ff_pw_96; +extern const uint64_t ff_pw_128; +extern const ymm_reg ff_pw_255; +extern const ymm_reg ff_pw_256; +extern const ymm_reg ff_pw_512; +extern const ymm_reg ff_pw_1023; +extern const ymm_reg ff_pw_1024; +extern const ymm_reg ff_pw_2048; +extern const ymm_reg ff_pw_4095; +extern const ymm_reg ff_pw_4096; +extern const ymm_reg ff_pw_8192; +extern const ymm_reg ff_pw_m1; + +extern const ymm_reg ff_pb_0; +extern const ymm_reg ff_pb_1; +extern const ymm_reg ff_pb_2; +extern const ymm_reg ff_pb_3; +extern const ymm_reg ff_pb_80; +extern const ymm_reg ff_pb_FE; +extern const uint64_t ff_pb_FC; + +extern const xmm_reg 
ff_ps_neg; + +extern const ymm_reg ff_pd_1; +extern const ymm_reg ff_pd_16; +extern const ymm_reg ff_pd_32; +extern const ymm_reg ff_pd_8192; +extern const ymm_reg ff_pd_65535; + +#endif /* AVCODEC_X86_CONSTANTS_H */ diff --git a/media/ffvpx/libavcodec/x86/dct32.asm b/media/ffvpx/libavcodec/x86/dct32.asm new file mode 100644 index 0000000000..37fba51543 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/dct32.asm @@ -0,0 +1,481 @@ +;****************************************************************************** +;* 32 point SSE-optimized DCT transform +;* Copyright (c) 2010 Vitor Sessak +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 + +ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 + dd 0.553104, 0.582935, 0.622504, 0.674808 + dd -10.190008, -3.407609, -2.057781, -1.484165 + dd -1.169440, -0.972568, -0.839350, -0.744536 + dd 0.502419, 0.522499, 0.566944, 0.646822 + dd 0.788155, 1.060678, 1.722447, 5.101149 + dd 0.509796, 0.601345, 0.899976, 2.562916 + dd 0.509796, 0.601345, 0.899976, 2.562916 + dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 0.707107, 1.000000, -0.707107 + dd 1.000000, 0.707107, 1.000000, -0.707107 + dd 0.707107, 0.707107, 0.707107, 0.707107 + +%macro BUTTERFLY 4 + subps %4, %1, %2 + addps %2, %2, %1 + mulps %1, %4, %3 +%endmacro + +%macro BUTTERFLY0 5 +%if cpuflag(sse2) && notcpuflag(avx) + pshufd %4, %1, %5 + xorps %1, %2 + addps %1, %4 + mulps %1, %3 +%else + shufps %4, %1, %1, %5 + xorps %1, %1, %2 + addps %4, %4, %1 + mulps %1, %4, %3 +%endif +%endmacro + +%macro BUTTERFLY2 4 + BUTTERFLY0 %1, %2, %3, %4, 0x1b +%endmacro + +%macro BUTTERFLY3 4 + BUTTERFLY0 %1, %2, %3, %4, 0xb1 +%endmacro + +%macro BUTTERFLY3V 5 + movaps m%5, m%1 + addps m%1, m%2 + subps m%5, m%2 + SWAP %2, %5 + mulps m%2, [ps_cos_vec+192] + movaps m%5, m%3 + addps m%3, m%4 + subps m%4, m%5 + mulps m%4, [ps_cos_vec+192] +%endmacro + +%macro PASS6_AND_PERMUTE 0 + mov tmpd, [outq+4] + movss m7, [outq+72] + addss m7, [outq+76] + movss m3, [outq+56] + addss m3, [outq+60] + addss m4, m3 + movss m2, [outq+52] + addss m2, m3 + movss m3, [outq+104] + addss m3, [outq+108] + addss m1, m3 + addss m5, m4 + movss [outq+ 16], m1 + 
movss m1, [outq+100] + addss m1, m3 + movss m3, [outq+40] + movss [outq+ 48], m1 + addss m3, [outq+44] + movss m1, [outq+100] + addss m4, m3 + addss m3, m2 + addss m1, [outq+108] + movss [outq+ 40], m3 + addss m2, [outq+36] + movss m3, [outq+8] + movss [outq+ 56], m2 + addss m3, [outq+12] + movss [outq+ 32], m3 + movss m3, [outq+80] + movss [outq+ 8], m5 + movss [outq+ 80], m1 + movss m2, [outq+52] + movss m5, [outq+120] + addss m5, [outq+124] + movss m1, [outq+64] + addss m2, [outq+60] + addss m0, m5 + addss m5, [outq+116] + mov [outq+64], tmpd + addss m6, m0 + addss m1, m6 + mov tmpd, [outq+12] + mov [outq+ 96], tmpd + movss [outq+ 4], m1 + movss m1, [outq+24] + movss [outq+ 24], m4 + movss m4, [outq+88] + addss m4, [outq+92] + addss m3, m4 + addss m4, [outq+84] + mov tmpd, [outq+108] + addss m1, [outq+28] + addss m0, m1 + addss m1, m5 + addss m6, m3 + addss m3, m0 + addss m0, m7 + addss m5, [outq+20] + addss m7, m1 + movss [outq+ 12], m6 + mov [outq+112], tmpd + movss m6, [outq+28] + movss [outq+ 28], m0 + movss m0, [outq+36] + movss [outq+ 36], m7 + addss m1, m4 + movss m7, [outq+116] + addss m0, m2 + addss m7, [outq+124] + movss [outq+ 72], m0 + movss m0, [outq+44] + addss m2, m0 + movss [outq+ 44], m1 + movss [outq+ 88], m2 + addss m0, [outq+60] + mov tmpd, [outq+60] + mov [outq+120], tmpd + movss [outq+104], m0 + addss m4, m5 + addss m5, [outq+68] + movss [outq+52], m4 + movss [outq+60], m5 + movss m4, [outq+68] + movss m5, [outq+20] + movss [outq+ 20], m3 + addss m5, m7 + addss m7, m6 + addss m4, m5 + movss m2, [outq+84] + addss m2, [outq+92] + addss m5, m2 + movss [outq+ 68], m4 + addss m2, m7 + movss m4, [outq+76] + movss [outq+ 84], m2 + movss [outq+ 76], m5 + addss m7, m4 + addss m6, [outq+124] + addss m4, m6 + addss m6, [outq+92] + movss [outq+100], m4 + movss [outq+108], m6 + movss m6, [outq+92] + movss [outq+92], m7 + addss m6, [outq+124] + movss [outq+116], m6 +%endmacro + +INIT_YMM avx +SECTION .text +%if HAVE_AVX_EXTERNAL +; void 
ff_dct32_float_avx(FFTSample *out, const FFTSample *in) +cglobal dct32_float, 2,3,8, out, in, tmp + ; pass 1 + vmovaps m4, [inq+0] + vinsertf128 m5, m5, [inq+96], 1 + vinsertf128 m5, m5, [inq+112], 0 + vshufps m5, m5, m5, 0x1b + BUTTERFLY m4, m5, [ps_cos_vec], m6 + + vmovaps m2, [inq+64] + vinsertf128 m6, m6, [inq+32], 1 + vinsertf128 m6, m6, [inq+48], 0 + vshufps m6, m6, m6, 0x1b + BUTTERFLY m2, m6, [ps_cos_vec+32], m0 + + ; pass 2 + + BUTTERFLY m5, m6, [ps_cos_vec+64], m0 + BUTTERFLY m4, m2, [ps_cos_vec+64], m7 + + + ; pass 3 + vperm2f128 m3, m6, m4, 0x31 + vperm2f128 m1, m6, m4, 0x20 + vshufps m3, m3, m3, 0x1b + + BUTTERFLY m1, m3, [ps_cos_vec+96], m6 + + + vperm2f128 m4, m5, m2, 0x20 + vperm2f128 m5, m5, m2, 0x31 + vshufps m5, m5, m5, 0x1b + + BUTTERFLY m4, m5, [ps_cos_vec+96], m6 + + ; pass 4 + vmovaps m6, [ps_p1p1m1m1+0] + vmovaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m6, m2, m7 + BUTTERFLY2 m4, m6, m2, m7 + BUTTERFLY2 m1, m6, m2, m7 + BUTTERFLY2 m3, m6, m2, m7 + + + ; pass 5 + vshufps m6, m6, m6, 0xcc + vmovaps m2, [ps_cos_vec+160] + + BUTTERFLY3 m5, m6, m2, m7 + BUTTERFLY3 m4, m6, m2, m7 + BUTTERFLY3 m1, m6, m2, m7 + BUTTERFLY3 m3, m6, m2, m7 + + vperm2f128 m6, m3, m3, 0x31 + vmovaps [outq], m3 + + vextractf128 [outq+64], m5, 1 + vextractf128 [outq+32], m5, 0 + + vextractf128 [outq+80], m4, 1 + vextractf128 [outq+48], m4, 0 + + vperm2f128 m0, m1, m1, 0x31 + vmovaps [outq+96], m1 + + vzeroupper + + ; pass 6, no SIMD... 
+INIT_XMM + PASS6_AND_PERMUTE + RET +%endif + +%if ARCH_X86_64 +%define SPILL SWAP +%define UNSPILL SWAP + +%macro PASS5 0 + nop ; FIXME code alignment + SWAP 5, 8 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 13 + SWAP 0, 15 + PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 + TRANSPOSE4x4PS 8, 9, 10, 11, 0 + BUTTERFLY3V 8, 9, 10, 11, 0 + addps m10, m11 + TRANSPOSE4x4PS 12, 13, 14, 15, 0 + BUTTERFLY3V 12, 13, 14, 15, 0 + addps m14, m15 + addps m12, m14 + addps m14, m13 + addps m13, m15 +%endmacro + +%macro PASS6 0 + SWAP 9, 12 + SWAP 11, 14 + movss [outq+0x00], m8 + pshuflw m0, m8, 0xe + movss [outq+0x10], m9 + pshuflw m1, m9, 0xe + movss [outq+0x20], m10 + pshuflw m2, m10, 0xe + movss [outq+0x30], m11 + pshuflw m3, m11, 0xe + movss [outq+0x40], m12 + pshuflw m4, m12, 0xe + movss [outq+0x50], m13 + pshuflw m5, m13, 0xe + movss [outq+0x60], m14 + pshuflw m6, m14, 0xe + movaps [outq+0x70], m15 + pshuflw m7, m15, 0xe + addss m0, m1 + addss m1, m2 + movss [outq+0x08], m0 + addss m2, m3 + movss [outq+0x18], m1 + addss m3, m4 + movss [outq+0x28], m2 + addss m4, m5 + movss [outq+0x38], m3 + addss m5, m6 + movss [outq+0x48], m4 + addss m6, m7 + movss [outq+0x58], m5 + movss [outq+0x68], m6 + movss [outq+0x78], m7 + + PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 + movhlps m0, m1 + pshufd m1, m1, 3 + SWAP 0, 2, 4, 6, 8, 10, 12, 14 + SWAP 1, 3, 5, 7, 9, 11, 13, 15 +%rep 7 + movhlps m0, m1 + pshufd m1, m1, 3 + addss m15, m1 + SWAP 0, 2, 4, 6, 8, 10, 12, 14 + SWAP 1, 3, 5, 7, 9, 11, 13, 15 +%endrep +%assign i 4 +%rep 15 + addss m0, m1 + movss [outq+i], m0 + SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + %assign i i+8 +%endrep +%endmacro + +%else ; ARCH_X86_32 +%macro SPILL 2 ; xmm#, mempos + movaps [outq+(%2-8)*16], m%1 +%endmacro +%macro UNSPILL 2 + movaps m%1, [outq+(%2-8)*16] +%endmacro + +%define PASS6 PASS6_AND_PERMUTE +%macro PASS5 0 + movaps m2, [ps_cos_vec+160] + shufps m3, m3, 0xcc + + BUTTERFLY3 m5, m3, m2, m1 + SPILL 5, 8 + + UNSPILL 1, 9 
+ BUTTERFLY3 m1, m3, m2, m5 + SPILL 1, 14 + + BUTTERFLY3 m4, m3, m2, m5 + SPILL 4, 12 + + BUTTERFLY3 m7, m3, m2, m5 + SPILL 7, 13 + + UNSPILL 5, 10 + BUTTERFLY3 m5, m3, m2, m7 + SPILL 5, 10 + + UNSPILL 4, 11 + BUTTERFLY3 m4, m3, m2, m7 + SPILL 4, 11 + + BUTTERFLY3 m6, m3, m2, m7 + SPILL 6, 9 + + BUTTERFLY3 m0, m3, m2, m7 + SPILL 0, 15 +%endmacro +%endif + + +; void ff_dct32_float(FFTSample *out, const FFTSample *in) +%macro DCT32_FUNC 0 +cglobal dct32_float, 2, 3, 16, out, in, tmp + ; pass 1 + + movaps m0, [inq+0] + LOAD_INV m1, [inq+112] + BUTTERFLY m0, m1, [ps_cos_vec], m3 + + movaps m7, [inq+64] + LOAD_INV m4, [inq+48] + BUTTERFLY m7, m4, [ps_cos_vec+32], m3 + + ; pass 2 + movaps m2, [ps_cos_vec+64] + BUTTERFLY m1, m4, m2, m3 + SPILL 1, 11 + SPILL 4, 8 + + ; pass 1 + movaps m1, [inq+16] + LOAD_INV m6, [inq+96] + BUTTERFLY m1, m6, [ps_cos_vec+16], m3 + + movaps m4, [inq+80] + LOAD_INV m5, [inq+32] + BUTTERFLY m4, m5, [ps_cos_vec+48], m3 + + ; pass 2 + BUTTERFLY m0, m7, m2, m3 + + movaps m2, [ps_cos_vec+80] + BUTTERFLY m6, m5, m2, m3 + + BUTTERFLY m1, m4, m2, m3 + + ; pass 3 + movaps m2, [ps_cos_vec+96] + shufps m1, m1, 0x1b + BUTTERFLY m0, m1, m2, m3 + SPILL 0, 15 + SPILL 1, 14 + + UNSPILL 0, 8 + shufps m5, m5, 0x1b + BUTTERFLY m0, m5, m2, m3 + + UNSPILL 1, 11 + shufps m6, m6, 0x1b + BUTTERFLY m1, m6, m2, m3 + SPILL 1, 11 + + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, m2, m3 + + ; pass 4 + movaps m3, [ps_p1p1m1m1+0] + movaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m3, m2, m1 + + BUTTERFLY2 m0, m3, m2, m1 + SPILL 0, 9 + + BUTTERFLY2 m6, m3, m2, m1 + SPILL 6, 10 + + UNSPILL 0, 11 + BUTTERFLY2 m0, m3, m2, m1 + SPILL 0, 11 + + BUTTERFLY2 m4, m3, m2, m1 + + BUTTERFLY2 m7, m3, m2, m1 + + UNSPILL 6, 14 + BUTTERFLY2 m6, m3, m2, m1 + + UNSPILL 0, 15 + BUTTERFLY2 m0, m3, m2, m1 + + PASS5 + PASS6 + RET +%endmacro + +%macro LOAD_INV 2 + pshufd %1, %2, 0x1b +%endmacro + +INIT_XMM sse2 +DCT32_FUNC diff --git a/media/ffvpx/libavcodec/x86/dct_init.c 
b/media/ffvpx/libavcodec/x86/dct_init.c new file mode 100644 index 0000000000..d0e4b34dd3 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/dct_init.c @@ -0,0 +1,36 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/dct.h" + +void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in); +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); + +av_cold void ff_dct_init_x86(DCTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) + s->dct32 = ff_dct32_float_sse2; + if (EXTERNAL_AVX_FAST(cpu_flags)) + s->dct32 = ff_dct32_float_avx; +} diff --git a/media/ffvpx/libavcodec/x86/fdct.c b/media/ffvpx/libavcodec/x86/fdct.c new file mode 100644 index 0000000000..f4677ff4be --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fdct.c @@ -0,0 +1,378 @@ +/* + * SIMD-optimized forward DCT + * The gcc porting is Copyright (c) 2001 Fabrice Bellard. + * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. 
+ * + * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT + * + * Intel Application Note AP-922 - fast, precise implementation of DCT + * http://developer.intel.com/vtune/cbts/appnotes.htm + * + * Also of inspiration: + * a page about fdct at http://www.geocities.com/ssavekar/dct.htm + * Skal's fdct at http://skal.planet-d.net/coding/dct.html + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/macros.h" +#include "libavutil/mem_internal.h" +#include "libavutil/x86/asm.h" +#include "fdct.h" + +#if HAVE_SSE2_INLINE + +////////////////////////////////////////////////////////////////////// +// +// constants for the forward DCT +// ----------------------------- +// +// Be sure to check that your compiler is aligning all constants to QWORD +// (8-byte) memory boundaries! Otherwise the unaligned memory access will +// severely stall MMX execution. 
+// +////////////////////////////////////////////////////////////////////// + +#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) +#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) +//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) + +#define X8(x) x,x,x,x,x,x,x,x + +//concatenated table, for forward DCT transformation +DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { + X8(13036), // tg * (2<<16) + 0.5 + X8(27146), // tg * (2<<16) + 0.5 + X8(-21746) // tg * (2<<16) + 0.5 +}; + +DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { + X8(23170) //cos * (2<<15) + 0.5 +}; + +DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; + +static const struct +{ + DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; +} fdct_r_row_sse2 = +{{ + RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW +}}; +//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; + +static const struct +{ + DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; +} tab_frw_01234567_sse2 = +{{ +//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table +#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ + C4, C4, C5, C7, C2, C6, C3, -C7, \ + -C4, C4, C7, C3, C6, -C2, C7, -C5, \ + C4, -C4, C5, -C1, C2, -C6, C3, -C1, +// c1..c7 * cos(pi/4) * 2^15 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 
+#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 +}}; + +#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long + +#define FDCT_COL(cpu, mm, mov)\ +static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ +{\ + __asm__ volatile (\ + #mov" 16(%0), %%"#mm"0 \n\t" \ + #mov" 96(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"0, %%"#mm"2 \n\t" \ + #mov" 32(%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" 80(%0), %%"#mm"4 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ + #mov" (%0), %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ + "paddsw 112(%0), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ + #mov" %%"#mm"0, %%"#mm"6 \n\t" \ + "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ + #mov" 16(%1), %%"#mm"1 \n\t" \ + "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ + #mov" 48(%0), %%"#mm"7 \n\t" \ + "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ + "paddsw 64(%0), %%"#mm"7 \n\t" \ + "psllw 
$"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ + #mov" %%"#mm"5, %%"#mm"4 \n\t" \ + "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ + "por (%2), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ + "pmulhw 16(%1), %%"#mm"5 \n\t" \ + #mov" %%"#mm"4, %%"#mm"7 \n\t" \ + "psubsw 80(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" %%"#mm"1, 32(%3) \n\t" \ + "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" 48(%0), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ + "psubsw 64(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"2, %%"#mm"6 \n\t" \ + #mov" %%"#mm"4, 64(%3) \n\t" \ + "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ + "pmulhw (%4), %%"#mm"2 \n\t" \ + "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ + "pmulhw (%4), %%"#mm"6 \n\t" \ + "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ + "por (%2), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ + "por (%2), %%"#mm"2 \n\t" \ + #mov" %%"#mm"1, %%"#mm"4 \n\t" \ + #mov" (%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ + "psubsw 112(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" (%1), %%"#mm"0 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ + #mov" 32(%1), %%"#mm"6 \n\t" \ + "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" %%"#mm"7, (%3) \n\t" \ + "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ + #mov" %%"#mm"5, 96(%3) \n\t" \ + #mov" %%"#mm"3, %%"#mm"7 \n\t" \ + #mov" 32(%1), %%"#mm"5 \n\t" \ + "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ + "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ + "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "pmulhw (%1), %%"#mm"3 \n\t" \ + "por (%2), %%"#mm"0 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" %%"#mm"0, 16(%3) \n\t" \ + "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ + #mov" %%"#mm"7, 48(%3) \n\t" \ + "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ + #mov" 
%%"#mm"5, 80(%3) \n\t" \ + #mov" %%"#mm"3, 112(%3) \n\t" \ + : \ + : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ + "r" (out + offset), "r" (ocos_4_16)); \ +} + +FDCT_COL(sse2, xmm, movdqa) + +static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) +{ + __asm__ volatile( +#define FDCT_ROW_SSE2_H1(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" \ + "movdqa " #t "(%1), %%xmm4 \n\t" \ + "movdqa " #t "+16(%1), %%xmm5 \n\t" + +#define FDCT_ROW_SSE2_H2(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" + +#define FDCT_ROW_SSE2(i) \ + "movq %%xmm2, %%xmm1 \n\t" \ + "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ + "paddsw %%xmm0, %%xmm1 \n\t" \ + "psubsw %%xmm0, %%xmm2 \n\t" \ + "punpckldq %%xmm2, %%xmm1 \n\t" \ + "pshufd $78, %%xmm1, %%xmm2 \n\t" \ + "pmaddwd %%xmm2, %%xmm3 \n\t" \ + "pmaddwd %%xmm1, %%xmm7 \n\t" \ + "pmaddwd %%xmm5, %%xmm2 \n\t" \ + "pmaddwd %%xmm4, %%xmm1 \n\t" \ + "paddd %%xmm7, %%xmm3 \n\t" \ + "paddd %%xmm2, %%xmm1 \n\t" \ + "paddd %%xmm6, %%xmm3 \n\t" \ + "paddd %%xmm6, %%xmm1 \n\t" \ + "psrad %3, %%xmm3 \n\t" \ + "psrad %3, %%xmm1 \n\t" \ + "packssdw %%xmm3, %%xmm1 \n\t" \ + "movdqa %%xmm1, " #i "(%4) \n\t" + + "movdqa (%2), %%xmm6 \n\t" + FDCT_ROW_SSE2_H1(0,0) + FDCT_ROW_SSE2(0) + FDCT_ROW_SSE2_H2(64,0) + FDCT_ROW_SSE2(64) + + FDCT_ROW_SSE2_H1(16,64) + FDCT_ROW_SSE2(16) + FDCT_ROW_SSE2_H2(112,64) + FDCT_ROW_SSE2(112) + + FDCT_ROW_SSE2_H1(32,128) + FDCT_ROW_SSE2(32) + FDCT_ROW_SSE2_H2(96,128) + FDCT_ROW_SSE2(96) + + FDCT_ROW_SSE2_H1(48,192) + FDCT_ROW_SSE2(48) + FDCT_ROW_SSE2_H2(80,192) + FDCT_ROW_SSE2(80) + : + : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), + "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", + 
"%xmm4", "%xmm5", "%xmm6", "%xmm7") + ); +} + +void ff_fdct_sse2(int16_t *block) +{ + DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; + int16_t * const block1= (int16_t*)align_tmp; + + fdct_col_sse2(block, block1, 0); + fdct_row_sse2(block1, block); +} + +#endif /* HAVE_SSE2_INLINE */ diff --git a/media/ffvpx/libavcodec/x86/fdct.h b/media/ffvpx/libavcodec/x86/fdct.h new file mode 100644 index 0000000000..164d4fb30e --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fdct.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_FDCT_H +#define AVCODEC_X86_FDCT_H + +#include <stdint.h> + +void ff_fdct_sse2(int16_t *block); + +#endif /* AVCODEC_X86_FDCT_H */ diff --git a/media/ffvpx/libavcodec/x86/fdctdsp_init.c b/media/ffvpx/libavcodec/x86/fdctdsp_init.c new file mode 100644 index 0000000000..92a842433d --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fdctdsp_init.c @@ -0,0 +1,38 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/fdctdsp.h" +#include "fdct.h" + +av_cold void ff_fdctdsp_init_x86(FDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + const int dct_algo = avctx->dct_algo; + + if (!high_bit_depth) { + if ((dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) { + if (INLINE_SSE2(cpu_flags)) + c->fdct = ff_fdct_sse2; + } + } +} diff --git a/media/ffvpx/libavcodec/x86/fft.asm b/media/ffvpx/libavcodec/x86/fft.asm new file mode 100644 index 0000000000..34c3fc9a0f --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft.asm @@ -0,0 +1,838 @@ +;****************************************************************************** +;* FFT transform with SSE/AVX optimizations +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2011 Vitor Sessak +;* +;* This algorithm (though not any of the implementation details) is +;* based on libdjbfft by D. J. Bernstein. +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; These functions are not individually interchangeable with the C versions. +; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results +; in blocks as convenient to the vector size. +; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc FFTContext + .nbits: resd 1 + .reverse: resd 1 + .revtab: pointer 1 + .tmpbuf: pointer 1 + .mdctsize: resd 1 + .mdctbits: resd 1 + .tcos: pointer 1 + .tsin: pointer 1 + .fftperm: pointer 1 + .fftcalc: pointer 1 + .imdctcalc:pointer 1 + .imdcthalf:pointer 1 +endstruc + +SECTION_RODATA 32 + +%define M_SQRT1_2 0.70710678118654752440 +%define M_COS_PI_1_8 0.923879532511287 +%define M_COS_PI_3_8 0.38268343236509 + +ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 +ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 + +ps_root2: times 8 dd M_SQRT1_2 +ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 + +perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 +perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 +ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 +ps_m1m1p1m1p1m1m1m1: dd 1<<31, 
1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 +ps_m1p1: dd 1<<31, 0 + +cextern ps_neg + +%assign i 16 +%rep 14 +cextern cos_ %+ i +%assign i i<<1 +%endrep + +%if ARCH_X86_64 + %define pointer dq +%else + %define pointer dd +%endif + +%macro IF0 1+ +%endmacro +%macro IF1 1+ + %1 +%endmacro + +SECTION .text + +; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} +; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} +; %3, %4, %5 tmp +; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} +; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} +%macro T8_AVX 5 + vsubps %5, %1, %2 ; v = %1 - %2 + vaddps %3, %1, %2 ; w = %1 + %2 + vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 + vpermilps %2, %2, [perm1] + vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} + vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} + vsubps %4, %5, %1 ; s = r - q + vaddps %1, %5, %1 ; u = r + q + vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} + vshufps %5, %4, %1, 0xbb + vshufps %3, %4, %1, 0xee + vperm2f128 %3, %3, %5, 0x13 + vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1} + vshufps %2, %1, %4, 0xdd + vshufps %1, %1, %4, 0x88 + vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} + vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} + vsubps %5, %1, %3 + vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} + vsubps %2, %4, %1 ; %2 = v - w + vaddps %1, %4, %1 ; %1 = v + w +%endmacro + +; In SSE mode do one fft4 transform +; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} +; +; In AVX mode do two fft4 transforms +; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} +%macro T4_SSE 3 + subps %3, %1, %2 ; {t3,t4,-t8,t7} + addps %1, %1, %2 ; {t1,t2,t6,t5} + xorps %3, %3, [ps_p1p1m1p1] + shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} + shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} + subps %3, %1, %2 ; {r2,i2,r3,i3} + addps %1, %1, %2 ; {r0,i0,r1,i1} + shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} + shufps %1, %1, %3, 0x88 ; 
{r0,r1,r2,r3} +%endmacro + +; In SSE mode do one FFT8 +; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7} +; +; In AVX mode do two FFT8 +; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} +; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} +; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} +; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} +%macro T8_SSE 6 + addps %6, %3, %4 ; {t1,t2,t3,t4} + subps %3, %3, %4 ; {r5,i5,r7,i7} + shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} + mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} + mulps %4, %4, [ps_root2] + addps %3, %3, %4 ; {t8,t7,ta,t9} + shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} + shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} + subps %3, %6, %4 ; {t6,t5,tc,tb} + addps %6, %6, %4 ; {t1,t2,t9,ta} + shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} + shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} + subps %3, %1, %6 ; {r4,r5,r6,r7} + addps %1, %1, %6 ; {r0,r1,r2,r3} + subps %4, %2, %5 ; {i4,i5,i6,i7} + addps %2, %2, %5 ; {i0,i1,i2,i3} +%endmacro + +%macro INTERL 5 +%if cpuflag(avx) + vunpckhps %3, %2, %1 + vunpcklps %2, %2, %1 + vextractf128 %4(%5), %2, 0 + vextractf128 %4 %+ H(%5), %3, 0 + vextractf128 %4(%5 + 1), %2, 1 + vextractf128 %4 %+ H(%5 + 1), %3, 1 +%elif cpuflag(sse) + mova %3, %2 + unpcklps %2, %1 + unpckhps %3, %1 + mova %4(%5), %2 + mova %4(%5+1), %3 +%endif +%endmacro + +; scheduled for cpu-bound sizes +%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim +IF%1 mova m4, Z(4) +IF%1 mova m5, Z(5) + mova m0, %2 ; wre + mova m1, %3 ; wim + mulps m2, m4, m0 ; r2*wre +IF%1 mova m6, Z2(6) + mulps m3, m5, m1 ; i2*wim +IF%1 mova m7, Z2(7) + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m1, m1, m6 ; r3*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + 
subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 + mova Z2(6), m4 + mova Z(2), m3 + mova m2, Z(3) + addps m3, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 + mova Z2(7), m2 + mova Z(3), m1 + subps m4, m7, m3 ; i2 + addps m3, m3, m7 ; i0 + mova Z(5), m4 + mova Z(1), m3 +%endmacro + +; scheduled to avoid store->load aliasing +%macro PASS_BIG 1 ; (!interleave) + mova m4, Z(4) ; r2 + mova m5, Z(5) ; i2 + mova m0, [wq] ; wre + mova m1, [wq+o1q] ; wim + mulps m2, m4, m0 ; r2*wre + mova m6, Z2(6) ; r3 + mulps m3, m5, m1 ; i2*wim + mova m7, Z2(7) ; i3 + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + mulps m1, m1, m6 ; r3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 +IF%1 mova Z2(6), m4 +IF%1 mova Z(2), m3 + mova m2, Z(3) + addps m5, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 +IF%1 mova Z2(7), m2 +IF%1 mova Z(3), m1 + subps m6, m7, m5 ; i2 + addps m5, m5, m7 ; i0 +IF%1 mova Z(5), m6 +IF%1 mova Z(1), m5 +%if %1==0 + INTERL m1, m3, m7, Z, 2 + INTERL m2, m4, m0, Z2, 6 + + mova m1, Z(0) + mova m2, Z(4) + + INTERL m5, m1, m3, Z, 0 + INTERL m6, m2, m7, Z, 4 +%endif +%endmacro + +%define Z(x) [r0+mmsize*x] +%define Z2(x) [r0+mmsize*x] +%define ZH(x) [r0+mmsize*x+mmsize/2] + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +align 16 +fft8_avx: + 
mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m2, m3, m4 + mova Z(0), m0 + mova Z(1), m1 + ret + + +align 16 +fft16_avx: + mova m2, Z(2) + mova m3, Z(3) + T4_SSE m2, m3, m7 + + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m4, m5, m7 + + mova m4, [ps_cos16_1] + mova m5, [ps_cos16_2] + vmulps m6, m2, m4 + vmulps m7, m3, m5 + vaddps m7, m7, m6 + vmulps m2, m2, m5 + vmulps m3, m3, m4 + vsubps m3, m3, m2 + vblendps m2, m7, m3, 0xf0 + vperm2f128 m3, m7, m3, 0x21 + vaddps m4, m2, m3 + vsubps m2, m3, m2 + vperm2f128 m2, m2, m2, 0x01 + vsubps m3, m1, m2 + vaddps m1, m1, m2 + vsubps m5, m0, m4 + vaddps m0, m0, m4 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + vextractf128 Z(2), m5, 0 + vextractf128 ZH(2), m3, 0 + vextractf128 Z(3), m5, 1 + vextractf128 ZH(3), m3, 1 + ret + +align 16 +fft32_avx: + call fft16_avx + + mova m0, Z(4) + mova m1, Z(5) + + T4_SSE m0, m1, m4 + + mova m2, Z(6) + mova m3, Z(7) + + T8_SSE m0, m1, m2, m3, m4, m6 + ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} + ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} + + vperm2f128 m4, m0, m2, 0x20 + vperm2f128 m5, m1, m3, 0x20 + vperm2f128 m6, m0, m2, 0x31 + vperm2f128 m7, m1, m3, 0x31 + + PASS_SMALL 0, [cos_32], [cos_32+32] + + ret + +fft32_interleave_avx: + call fft32_avx + mov r2d, 32 +.deint_loop: + mova m2, Z(0) + mova m3, Z(1) + vunpcklps m0, m2, m3 + vunpckhps m1, m2, m3 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + add r0, mmsize*2 + sub r2d, mmsize/4 + jg .deint_loop + ret + +%endif + +INIT_XMM sse + +align 16 +fft4_avx: +fft4_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova Z(0), m0 + mova Z(1), m1 + ret + +align 16 +fft8_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + ret + 
+align 16 +fft16_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova m4, Z(4) + mova m5, Z(5) + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + T4_SSE m4, m5, m6 + mova m6, Z2(6) + mova m7, Z2(7) + T4_SSE m6, m7, m0 + PASS_SMALL 0, [cos_16], [cos_16+16] + ret + + +%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] +%define Z2(x) [zcq + o3q + mmsize*(x&1)] +%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] +%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] + +%macro DECL_PASS 2+ ; name, payload +align 16 +%1: +DEFINE_ARGS zc, w, n, o1, o3 + lea o3q, [nq*3] + lea o1q, [nq*8] + shl o3q, 4 +.loop: + %2 + add zcq, mmsize*2 + add wq, mmsize + sub nd, mmsize/8 + jg .loop + rep ret +%endmacro + +%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs + lea r2, [dispatch_tab%1] + mov r2, [r2 + (%2q-2)*gprsize] +%ifdef PIC + lea r3, [$$] + add r2, r3 +%endif + call r2 +%endmacro ; FFT_DISPATCH + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +DECL_PASS pass_avx, PASS_BIG 1 +DECL_PASS pass_interleave_avx, PASS_BIG 0 + +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + RET + +%endif + +INIT_XMM sse + +DECL_PASS pass_sse, PASS_BIG 1 +DECL_PASS pass_interleave_sse, PASS_BIG 0 + +INIT_XMM sse +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + PUSH r1 + PUSH r3 + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + POP rcx + POP r4 + cmp rcx, 3+(mmsize/16) + jg .end + mov r2, -1 + add rcx, 3 + shl r2, cl + sub r4, r2 +.loop: + movaps xmm0, [r4 + r2] + movaps xmm1, xmm0 + unpcklps xmm0, [r4 + r2 + 16] + unpckhps xmm1, [r4 + r2 + 16] + movaps [r4 + r2], xmm0 + movaps [r4 + r2 + 16], xmm1 + add r2, mmsize*2 + jl .loop +.end: + RET + +cglobal fft_permute, 2,7,1 + mov r4, [r0 + FFTContext.revtab] + mov r5, [r0 + FFTContext.tmpbuf] + mov ecx, [r0 + FFTContext.nbits] + mov r2, 1 + shl r2, cl + xor r0, r0 
+%if ARCH_X86_32 + mov r1, r1m +%endif +.loop: + movaps xmm0, [r1 + 8*r0] + movzx r6, word [r4 + 2*r0] + movzx r3, word [r4 + 2*r0 + 2] + movlps [r5 + 8*r6], xmm0 + movhps [r5 + 8*r3], xmm0 + add r0, 2 + cmp r0, r2 + jl .loop + shl r2, 3 + add r1, r2 + add r5, r2 + neg r2 +; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B +.loopcopy: + movaps xmm0, [r5 + r2] + movaps xmm1, [r5 + r2 + 16] + movaps [r1 + r2], xmm0 + movaps [r1 + r2 + 16], xmm1 + add r2, 32 + jl .loopcopy + RET + +INIT_XMM sse +cglobal imdct_calc, 3,5,3 + mov r3d, [r0 + FFTContext.mdctsize] + mov r4, [r0 + FFTContext.imdcthalf] + add r1, r3 + PUSH r3 + PUSH r1 +%if ARCH_X86_32 + push r2 + push r1 + push r0 +%else + sub rsp, 8+32*WIN64 ; allocate win64 shadow space +%endif + call r4 +%if ARCH_X86_32 + add esp, 12 +%else + add rsp, 8+32*WIN64 +%endif + POP r1 + POP r3 + lea r0, [r1 + 2*r3] + mov r2, r3 + sub r3, mmsize + neg r2 + mova m2, [ps_neg] +.loop: + mova m0, [r1 + r3] + mova m1, [r0 + r2] + shufps m0, m0, 0x1b + shufps m1, m1, 0x1b + xorps m0, m2 + mova [r0 + r3], m1 + mova [r1 + r2], m0 + sub r3, mmsize + add r2, mmsize + jl .loop + RET + +%ifdef PIC +%define SECTION_REL - $$ +%else +%define SECTION_REL +%endif + +%macro DECL_FFT 1-2 ; nbits, suffix +%ifidn %0, 1 +%xdefine fullsuffix SUFFIX +%else +%xdefine fullsuffix %2 %+ SUFFIX +%endif +%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL +%if %1>=5 +%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL +%endif +%if %1>=6 +%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL +%endif + +%assign n 1<<%1 +%rep 18-%1 +%assign n2 n/2 +%assign n4 n/4 +%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL + +align 16 +fft %+ n %+ fullsuffix: + call fft %+ n2 %+ SUFFIX + add r0, n*4 - (n&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + add r0, n*2 - (n2&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + sub r0, n*6 + (n2&(-2<<%1)) + lea r1, [cos_ %+ n] + mov r2d, n4/2 + jmp pass %+ fullsuffix + 
+%assign n n*2 +%endrep +%undef n + +align 8 +dispatch_tab %+ fullsuffix: pointer list_of_fft +%endmacro ; DECL_FFT + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +DECL_FFT 6 +DECL_FFT 6, _interleave +%endif +INIT_XMM sse +DECL_FFT 5 +DECL_FFT 5, _interleave + +INIT_XMM sse +%undef mulps +%undef addps +%undef subps +%undef unpcklps +%undef unpckhps + +%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 + movaps xmm0, [%3+%2*4] + movaps xmm1, [%3+%1*4-0x10] + movaps xmm2, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm1, xmm2, 0x77 + movlps xmm4, [%4+%2*2] + movlps xmm5, [%5+%2*2+0x0] + movhps xmm4, [%4+%1*2-0x8] + movhps xmm5, [%5+%1*2-0x8] + movaps xmm2, xmm0 + movaps xmm3, xmm1 + mulps xmm0, xmm5 + mulps xmm1, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + subps xmm1, xmm0 + addps xmm2, xmm3 + movaps xmm0, xmm1 + unpcklps xmm1, xmm2 + unpckhps xmm0, xmm2 +%endmacro + +%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 + mulps m6, %3, [%5+%1] + mulps m7, %2, [%5+%1] + mulps %2, %2, [%6+%1] + mulps %3, %3, [%6+%1] + subps %2, %2, m6 + addps %3, %3, m7 +%endmacro + +%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: +%if cpuflag(avx) + vmovaps ymm1, [%3+%1*2] + vmovaps ymm0, [%3+%1*2+0x20] + vmovaps ymm3, [%3+%2*2] + vmovaps ymm2, [%3+%2*2+0x20] + + CMUL %1, ymm0, ymm1, %3, %4, %5 + CMUL %2, ymm2, ymm3, %3, %4, %5 + vshufps ymm1, ymm1, ymm1, 0x1b + vshufps ymm3, ymm3, ymm3, 0x1b + vperm2f128 ymm1, ymm1, ymm1, 0x01 + vperm2f128 ymm3, ymm3, ymm3, 0x01 + vunpcklps ymm6, ymm2, ymm1 + vunpckhps ymm4, ymm2, ymm1 + vunpcklps ymm7, ymm0, ymm3 + vunpckhps ymm5, ymm0, ymm3 + + vextractf128 [%3+%1*2], ymm7, 0 + vextractf128 [%3+%1*2+0x10], ymm5, 0 + vextractf128 [%3+%1*2+0x20], ymm7, 1 + vextractf128 [%3+%1*2+0x30], ymm5, 1 + + vextractf128 [%3+%2*2], ymm6, 0 + vextractf128 [%3+%2*2+0x10], ymm4, 0 + vextractf128 [%3+%2*2+0x20], ymm6, 1 + vextractf128 [%3+%2*2+0x30], ymm4, 1 + sub %2, 0x20 + add %1, 0x20 + jl .post +%else + movaps xmm1, [%3+%1*2] + movaps xmm0, [%3+%1*2+0x10] + CMUL %1, 
xmm0, xmm1, %3, %4, %5 + movaps xmm5, [%3+%2*2] + movaps xmm4, [%3+%2*2+0x10] + CMUL %2, xmm4, xmm5, %3, %4, %5 + shufps xmm1, xmm1, 0x1b + shufps xmm5, xmm5, 0x1b + movaps xmm6, xmm4 + unpckhps xmm4, xmm1 + unpcklps xmm6, xmm1 + movaps xmm2, xmm0 + unpcklps xmm0, xmm5 + unpckhps xmm2, xmm5 + movaps [%3+%2*2], xmm6 + movaps [%3+%2*2+0x10], xmm4 + movaps [%3+%1*2], xmm0 + movaps [%3+%1*2+0x10], xmm2 + sub %2, 0x10 + add %1, 0x10 + jl .post +%endif +%endmacro + +%macro DECL_IMDCT 0 +cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input +%if ARCH_X86_64 +%define rrevtab r7 +%define rtcos r8 +%define rtsin r9 +%else +%define rrevtab r6 +%define rtsin r6 +%define rtcos r5 +%endif + mov r3d, [r0+FFTContext.mdctsize] + add r2, r3 + shr r3, 1 + mov rtcos, [r0+FFTContext.tcos] + mov rtsin, [r0+FFTContext.tsin] + add rtcos, r3 + add rtsin, r3 +%if ARCH_X86_64 == 0 + push rtcos + push rtsin +%endif + shr r3, 1 + mov rrevtab, [r0+FFTContext.revtab] + add rrevtab, r3 +%if ARCH_X86_64 == 0 + push rrevtab +%endif + + sub r3, 4 +%if ARCH_X86_64 + xor r4, r4 + sub r4, r3 +%endif +.pre: +%if ARCH_X86_64 == 0 +;unspill + xor r4, r4 + sub r4, r3 + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + + PREROTATER r4, r3, r2, rtcos, rtsin +%if ARCH_X86_64 + movzx r5, word [rrevtab+r4-4] + movzx r6, word [rrevtab+r4-2] + movzx r10, word [rrevtab+r3] + movzx r11, word [rrevtab+r3+2] + movlps [r1+r5 *8], xmm0 + movhps [r1+r6 *8], xmm0 + movlps [r1+r10*8], xmm1 + movhps [r1+r11*8], xmm1 + add r4, 4 +%else + mov r6, [esp] + movzx r5, word [r6+r4-4] + movzx r4, word [r6+r4-2] + movlps [r1+r5*8], xmm0 + movhps [r1+r4*8], xmm0 + movzx r5, word [r6+r3] + movzx r4, word [r6+r3+2] + movlps [r1+r5*8], xmm1 + movhps [r1+r4*8], xmm1 +%endif + sub r3, 4 + jns .pre + + mov r5, r0 + mov r6, r1 + mov r0, r1 + mov r1d, [r5+FFTContext.nbits] + + FFT_DISPATCH SUFFIX, r1 + + mov r0d, [r5+FFTContext.mdctsize] + add r6, r0 + shr r0, 1 +%if ARCH_X86_64 == 0 +%define rtcos r2 +%define 
rtsin r3 + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + neg r0 + mov r1, -mmsize + sub r1, r0 + POSROTATESHUF r0, r1, r6, rtcos, rtsin +%if ARCH_X86_64 == 0 + add esp, 12 +%endif + RET +%endmacro + +DECL_IMDCT + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +DECL_IMDCT +%endif diff --git a/media/ffvpx/libavcodec/x86/fft.h b/media/ffvpx/libavcodec/x86/fft.h new file mode 100644 index 0000000000..37418ec1f4 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft.h @@ -0,0 +1,32 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_FFT_H +#define AVCODEC_X86_FFT_H + +#include "libavcodec/fft.h" +/* x86 SIMD FFT/MDCT routines. ff_fft_init_x86() stores these in the FFTContext function pointers (fft_permute, fft_calc, imdct_calc, imdct_half) when the matching CPU flag is set. NOTE(review): presumably defined by the cglobal blocks of the x86 FFT asm -- confirm the name mangling. */ +void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); +void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); +void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); + +void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); + +#endif /* AVCODEC_X86_FFT_H */ diff --git a/media/ffvpx/libavcodec/x86/fft_init.c b/media/ffvpx/libavcodec/x86/fft_init.c new file mode 100644 index 0000000000..df79d57dc7 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft_init.c @@ -0,0 +1,47 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" + +#include "fft.h" +/* Install the x86 SIMD FFT/MDCT routines into s according to the runtime CPU flags. Checks run in order, so a later matching block overwrites the pointers set by an earlier one. */ +av_cold void ff_fft_init_x86(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (s->nbits > 16) + return; /* s is left untouched: no x86 routine is installed for nbits > 16 */ + + if (EXTERNAL_SSE(cpu_flags)) { /* SSE: sets both IMDCT routines, fft_permute, fft_calc and the SWAP_LSBS permutation */ + s->imdct_calc = ff_imdct_calc_sse; + s->imdct_half = ff_imdct_half_sse; + s->fft_permute = ff_fft_permute_sse; + s->fft_calc = ff_fft_calc_sse; + s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; + } + + if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) { /* AVX, only for nbits >= 5: overrides imdct_half, fft_calc and the permutation; imdct_calc and fft_permute are not assigned here */ + s->imdct_half = ff_imdct_half_avx; + s->fft_calc = ff_fft_calc_avx; + s->fft_permutation = FF_FFT_PERM_AVX; + } +} diff --git a/media/ffvpx/libavcodec/x86/flacdsp.asm b/media/ffvpx/libavcodec/x86/flacdsp.asm new file mode 100644 index 0000000000..44416e4dfd --- /dev/null +++ b/media/ffvpx/libavcodec/x86/flacdsp.asm @@ -0,0 +1,326 @@ +;****************************************************************************** +;* FLAC DSP SIMD optimizations +;* +;* Copyright (C) 2014 Loren Merritt +;* Copyright (C) 2014 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +vector: db 0,1,4,5,8,9,12,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,4,5,8,9,12,13, + +SECTION .text + +%macro PMACSDQL 5 +%if cpuflag(xop) + pmacsdql %1, %2, %3, %1 +%else + pmuldq %2, %3 + paddq %1, %2 +%endif +%endmacro + +%macro LPC_32 1 +INIT_XMM %1 +cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j + sub lend, pred_orderd + jle .ret + lea decodedq, [decodedq+pred_orderq*4-8] + lea coeffsq, [coeffsq+pred_orderq*4] + neg pred_orderq + movd m4, qlevelm +ALIGN 16 +.loop_sample: + movd m0, [decodedq+pred_orderq*4+8] + add decodedq, 8 + movd m1, [coeffsq+pred_orderq*4] + pxor m2, m2 + pxor m3, m3 + lea jq, [pred_orderq+1] + test jq, jq + jz .end_order +.loop_order: + PMACSDQL m2, m0, m1, m2, m0 + movd m0, [decodedq+jq*4] + PMACSDQL m3, m1, m0, m3, m1 + movd m1, [coeffsq+jq*4] + inc jq + jl .loop_order +.end_order: + PMACSDQL m2, m0, m1, m2, m0 + psrlq m2, m4 + movd m0, [decodedq] + paddd m0, m2 + movd [decodedq], m0 + sub lend, 2 + jl .ret + PMACSDQL m3, m1, m0, m3, m1 + psrlq m3, m4 + movd m1, [decodedq+4] + paddd m1, m3 + movd [decodedq+4], m1 + jg .loop_sample +.ret: + RET +%endmacro + +%if HAVE_XOP_EXTERNAL +LPC_32 xop +%endif +LPC_32 sse4 + +;---------------------------------------------------------------------------------- +;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels, +; int len, int shift); +;---------------------------------------------------------------------------------- +%macro FLAC_DECORRELATE_16 3-4 +cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len +%ifidn %1, indep2 + VBROADCASTI128 m2, [vector] +%endif +%if 
ARCH_X86_32 + mov lend, lenm +%endif + movd m3, r4m + shl lend, 2 + mov in1q, [in0q + gprsize] + mov in0q, [in0q] + mov outq, [outq] + add in1q, lenq + add in0q, lenq + add outq, lenq + neg lenq + +align 16 +.loop: + mova m0, [in0q + lenq] + mova m1, [in1q + lenq] +%ifidn %1, ms + psrad m2, m1, 1 + psubd m0, m2 +%endif +%ifnidn %1, indep2 + p%4d m2, m0, m1 + packssdw m%2, m%2 + packssdw m%3, m%3 + punpcklwd m%2, m%3 + psllw m%2, m3 +%else + pslld m%2, m3 + pslld m%3, m3 + pshufb m%2, m%2, m2 + pshufb m%3, m%3, m2 + punpcklwd m%2, m%3 +%endif + mova [outq + lenq], m%2 + add lenq, 16 + jl .loop + RET +%endmacro + +INIT_XMM sse2 +FLAC_DECORRELATE_16 ls, 0, 2, sub +FLAC_DECORRELATE_16 rs, 2, 1, add +FLAC_DECORRELATE_16 ms, 2, 0, add + +;---------------------------------------------------------------------------------- +;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels, +; int len, int shift); +;---------------------------------------------------------------------------------- +%macro FLAC_DECORRELATE_32 5 +cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len +%if ARCH_X86_32 + mov lend, lenm +%endif + movd m3, r4m + mov in1q, [in0q + gprsize] + mov in0q, [in0q] + mov outq, [outq] + sub in1q, in0q + +align 16 +.loop: + mova m0, [in0q] + mova m1, [in0q + in1q] +%ifidn %1, ms + psrad m2, m1, 1 + psubd m0, m2 +%endif + p%5d m2, m0, m1 + pslld m%2, m3 + pslld m%3, m3 + + SBUTTERFLY dq, %2, %3, %4 + + mova [outq ], m%2 + mova [outq + mmsize], m%3 + + add in0q, mmsize + add outq, mmsize*2 + sub lend, mmsize/4 + jg .loop + RET +%endmacro + +INIT_XMM sse2 +FLAC_DECORRELATE_32 ls, 0, 2, 1, sub +FLAC_DECORRELATE_32 rs, 2, 1, 0, add +FLAC_DECORRELATE_32 ms, 2, 0, 1, add + +;----------------------------------------------------------------------------------------- +;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels, +; int len, int shift); 
+;----------------------------------------------------------------------------------------- +;%1 = bps +;%2 = channels +;%3 = last xmm reg used +;%4 = word/dword (shift instruction) +%macro FLAC_DECORRELATE_INDEP 4 +%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels +cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7 +%if ARCH_X86_32 +%if %2 == 6 + DEFINE_ARGS out, in0, in1, in2, in3, in4, in5 + %define lend dword r3m +%else + mov lend, lenm +%endif +%endif + movd m%3, r4m + +%assign %%i 1 +%rep %2-1 + mov in %+ %%i %+ q, [in0q+%%i*gprsize] +%assign %%i %%i+1 +%endrep + + mov in0q, [in0q] + mov outq, [outq] + +%assign %%i 1 +%rep %2-1 + sub in %+ %%i %+ q, in0q +%assign %%i %%i+1 +%endrep + +align 16 +.loop: + mova m0, [in0q] + +%assign %%i 1 +%rep REPCOUNT-1 + mova m %+ %%i, [in0q + in %+ %%i %+ q] +%assign %%i %%i+1 +%endrep + +%if %1 == 32 + +%if %2 == 8 + TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8 +%elif %2 == 6 + SBUTTERFLY dq, 0, 1, 6 + SBUTTERFLY dq, 2, 3, 6 + SBUTTERFLY dq, 4, 5, 6 + + punpcklqdq m6, m0, m2 + punpckhqdq m2, m4 + shufps m4, m0, 0xe4 + punpcklqdq m0, m1, m3 + punpckhqdq m3, m5 + shufps m5, m1, 0xe4 + SWAP 0,6,1,4,5,3 +%elif %2 == 4 + TRANSPOSE4x4D 0, 1, 2, 3, 4 +%else ; %2 == 2 + SBUTTERFLY dq, 0, 1, 2 +%endif + +%else ; %1 == 16 + +%if %2 == 8 + packssdw m0, [in0q + in4q] + packssdw m1, [in0q + in5q] + packssdw m2, [in0q + in6q] + packssdw m3, [in0q + in7q] + TRANSPOSE2x4x4W 0, 1, 2, 3, 4 +%elif %2 == 6 + packssdw m0, [in0q + in3q] + packssdw m1, [in0q + in4q] + packssdw m2, [in0q + in5q] + pshufd m3, m0, q1032 + punpcklwd m0, m1 + punpckhwd m1, m2 + punpcklwd m2, m3 + + shufps m3, m0, m2, q2020 + shufps m0, m1, q2031 + shufps m2, m1, q3131 + shufps m1, m2, m3, q3120 + shufps m3, m0, q0220 + shufps m0, m2, q3113 + SWAP 2, 0, 3 +%else ; %2 == 4 + packssdw m0, [in0q + in2q] + packssdw m1, [in0q + in3q] + SBUTTERFLY wd, 0, 1, 2 + SBUTTERFLY dq, 0, 1, 2 +%endif + +%endif + 
+%assign %%i 0 +%rep REPCOUNT + psll%4 m %+ %%i, m%3 +%assign %%i %%i+1 +%endrep + +%assign %%i 0 +%rep REPCOUNT + mova [outq + %%i*mmsize], m %+ %%i +%assign %%i %%i+1 +%endrep + + add in0q, mmsize + add outq, mmsize*REPCOUNT + sub lend, mmsize/4 + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro +FLAC_DECORRELATE_INDEP 32, 2, 3, d +FLAC_DECORRELATE_INDEP 16, 4, 3, w +FLAC_DECORRELATE_INDEP 32, 4, 5, d +FLAC_DECORRELATE_INDEP 16, 6, 4, w +FLAC_DECORRELATE_INDEP 32, 6, 7, d +%if ARCH_X86_64 +FLAC_DECORRELATE_INDEP 16, 8, 5, w +FLAC_DECORRELATE_INDEP 32, 8, 9, d +%endif + +INIT_XMM avx +FLAC_DECORRELATE_INDEP 32, 4, 5, d +FLAC_DECORRELATE_INDEP 32, 6, 7, d +%if ARCH_X86_64 +FLAC_DECORRELATE_INDEP 16, 8, 5, w +FLAC_DECORRELATE_INDEP 32, 8, 9, d +%endif diff --git a/media/ffvpx/libavcodec/x86/flacdsp_init.c b/media/ffvpx/libavcodec/x86/flacdsp_init.c new file mode 100644 index 0000000000..87daed7005 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/flacdsp_init.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2014 James Almer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/flacdsp.h" +#include "libavutil/x86/cpu.h" +#include "config.h" + +void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); +void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + +#define DECORRELATE_FUNCS(fmt, opt) \ +void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_rs_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_ms_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift) + +#define DECORRELATE_IFUNCS(fmt, opt) \ +void ff_flac_decorrelate_indep2_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_indep4_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_indep6_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift); \ +void ff_flac_decorrelate_indep8_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \ + int len, int shift) + +DECORRELATE_FUNCS(16, sse2); +DECORRELATE_FUNCS(16, avx); +DECORRELATE_FUNCS(32, sse2); +DECORRELATE_FUNCS(32, avx); +DECORRELATE_IFUNCS(16, ssse3); +DECORRELATE_IFUNCS(16, avx); +DECORRELATE_IFUNCS(32, ssse3); +DECORRELATE_IFUNCS(32, avx); /* NOTE(review): the avx ls/rs/ms prototypes and several avx indep prototypes declared above are not referenced by the init function below -- confirm they are intentional */ +/* Install x86 SIMD FLAC routines into c for sample format fmt and the given channel count, based on runtime CPU flags. decorrelate[0] receives an indepN routine chosen by channel count; decorrelate[1..3] receive the ls/rs/ms routines. Checks run in order, so a later matching instruction set overwrites an earlier assignment. Does nothing when HAVE_X86ASM is 0. */ +av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int channels) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { /* slots 1-3: ls/rs/ms variants for S16 or S32; other formats are left alone */ + if (fmt == AV_SAMPLE_FMT_S16) { + c->decorrelate[1] = ff_flac_decorrelate_ls_16_sse2; + c->decorrelate[2] = 
ff_flac_decorrelate_rs_16_sse2; + c->decorrelate[3] = ff_flac_decorrelate_ms_16_sse2; + } else if (fmt == AV_SAMPLE_FMT_S32) { + c->decorrelate[1] = ff_flac_decorrelate_ls_32_sse2; + c->decorrelate[2] = ff_flac_decorrelate_rs_32_sse2; + c->decorrelate[3] = ff_flac_decorrelate_ms_32_sse2; + } + } + if (EXTERNAL_SSSE3(cpu_flags)) { /* slot 0: indep routine for 2, 4 or 6 channels; 8 channels only when ARCH_X86_64 */ + if (fmt == AV_SAMPLE_FMT_S16) { + if (channels == 2) + c->decorrelate[0] = ff_flac_decorrelate_indep2_16_ssse3; + else if (channels == 4) + c->decorrelate[0] = ff_flac_decorrelate_indep4_16_ssse3; + else if (channels == 6) + c->decorrelate[0] = ff_flac_decorrelate_indep6_16_ssse3; + else if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_16_ssse3; + } else if (fmt == AV_SAMPLE_FMT_S32) { + if (channels == 2) + c->decorrelate[0] = ff_flac_decorrelate_indep2_32_ssse3; + else if (channels == 4) + c->decorrelate[0] = ff_flac_decorrelate_indep4_32_ssse3; + else if (channels == 6) + c->decorrelate[0] = ff_flac_decorrelate_indep6_32_ssse3; + else if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_32_ssse3; + } + } + if (EXTERNAL_SSE4(cpu_flags)) { + c->lpc32 = ff_flac_lpc_32_sse4; + } + if (EXTERNAL_AVX(cpu_flags)) { /* overrides slot 0 only for the cases listed here; every other case keeps the assignment made above */ + if (fmt == AV_SAMPLE_FMT_S16) { + if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_16_avx; + } else if (fmt == AV_SAMPLE_FMT_S32) { + if (channels == 4) + c->decorrelate[0] = ff_flac_decorrelate_indep4_32_avx; + else if (channels == 6) + c->decorrelate[0] = ff_flac_decorrelate_indep6_32_avx; + else if (ARCH_X86_64 && channels == 8) + c->decorrelate[0] = ff_flac_decorrelate_indep8_32_avx; + } + } + if (EXTERNAL_XOP(cpu_flags)) { /* checked last, so it replaces the SSE4 lpc32 when both flags are set */ + c->lpc32 = ff_flac_lpc_32_xop; + } +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred.asm b/media/ffvpx/libavcodec/x86/h264_intrapred.asm new file mode 100644 index 0000000000..8a38ba2bb5 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/h264_intrapred.asm @@ -0,0 +1,2044 @@ 
+;****************************************************************************** +;* H.264 intra prediction asm optimizations +;* Copyright (c) 2010 Fiona Glaser +;* Copyright (c) 2010 Holger Lubitz +;* Copyright (c) 2010 Loren Merritt +;* Copyright (c) 2010 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +tm_shuf: times 8 db 0x03, 0x80 +pw_ff00: times 8 dw 0xff00 +plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 + db 1, 2, 3, 4, 5, 6, 7, 8 +plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 + db 1, 2, 3, 4, 0, 0, 0, 0 +pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 +pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 +pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 +pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 + +SECTION .text + +cextern pb_1 +cextern pb_3 +cextern pw_4 +cextern pw_8 + +;----------------------------------------------------------------------------- +; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_XMM sse +cglobal pred16x16_vertical_8, 2,3 + sub r0, r1 + mov r2, 4 + movaps xmm0, [r0] +.loop: + movaps [r0+r1*1], xmm0 + movaps 
[r0+r1*2], xmm0 + lea r0, [r0+r1*2] + movaps [r0+r1*1], xmm0 + movaps [r0+r1*2], xmm0 + lea r0, [r0+r1*2] + dec r2 + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED16x16_H 0 +cglobal pred16x16_horizontal_8, 2,3 + mov r2, 8 +%if cpuflag(ssse3) + mova m2, [pb_3] +%endif +.loop: + movd m0, [r0+r1*0-4] + movd m1, [r0+r1*1-4] + +%if cpuflag(ssse3) + pshufb m0, m2 + pshufb m1, m2 +%else + punpcklbw m0, m0 + punpcklbw m1, m1 + SPLATW m0, m0, 3 + SPLATW m1, m1, 3 + mova [r0+r1*0+8], m0 + mova [r0+r1*1+8], m1 +%endif + + mova [r0+r1*0], m0 + mova [r0+r1*1], m1 + lea r0, [r0+r1*2] + dec r2 + jg .loop + RET +%endmacro + +INIT_MMX mmxext +PRED16x16_H +INIT_XMM ssse3 +PRED16x16_H + +;----------------------------------------------------------------------------- +; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +%macro PRED16x16_DC 0 +cglobal pred16x16_dc_8, 2,7 + mov r4, r0 + sub r0, r1 + pxor mm0, mm0 + pxor mm1, mm1 + psadbw mm0, [r0+0] + psadbw mm1, [r0+8] + dec r0 + movzx r5d, byte [r0+r1*1] + paddw mm0, mm1 + movd r6d, mm0 + lea r0, [r0+r1*2] +%rep 7 + movzx r2d, byte [r0+r1*0] + movzx r3d, byte [r0+r1*1] + add r5d, r2d + add r6d, r3d + lea r0, [r0+r1*2] +%endrep + movzx r2d, byte [r0+r1*0] + add r5d, r6d + lea r2d, [r2+r5+16] + shr r2d, 5 +%if cpuflag(ssse3) + pxor m1, m1 +%endif + SPLATB_REG m0, r2, m1 + + mov r3d, 4 +.loop: + mova [r4+r1*0], m0 + mova [r4+r1*1], m0 + lea r4, [r4+r1*2] + mova [r4+r1*0], m0 + mova [r4+r1*1], m0 + lea r4, [r4+r1*2] + dec r3d + jg .loop + RET +%endmacro + +INIT_XMM sse2 +PRED16x16_DC +INIT_XMM ssse3 +PRED16x16_DC + +;----------------------------------------------------------------------------- +; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t 
stride)
+;-----------------------------------------------------------------------------
+
+; TrueMotion prediction, 16x16: every output row is
+; clip_u8(top[x] + left[y] - topleft). Two rows are produced per loop pass.
+INIT_XMM sse2
+cglobal pred16x16_tm_vp8_8, 2,6,6
+    sub r0, r1
+    pxor xmm2, xmm2
+    movdqa xmm0, [r0]
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm2
+    punpckhbw xmm1, xmm2
+    movzx r4d, byte [r0-1]
+    mov r5d, 8
+.loop:
+    movzx r2d, byte [r0+r1*1-1]
+    movzx r3d, byte [r0+r1*2-1]
+    sub r2d, r4d
+    sub r3d, r4d
+    movd xmm2, r2d
+    movd xmm4, r3d
+    pshuflw xmm2, xmm2, 0
+    pshuflw xmm4, xmm4, 0
+    punpcklqdq xmm2, xmm2
+    punpcklqdq xmm4, xmm4
+    movdqa xmm3, xmm2
+    movdqa xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm3, xmm1
+    paddw xmm4, xmm0
+    paddw xmm5, xmm1
+    packuswb xmm2, xmm3
+    packuswb xmm4, xmm5
+    movdqa [r0+r1*1], xmm2
+    movdqa [r0+r1*2], xmm4
+    lea r0, [r0+r1*2]
+    dec r5d
+    jg .loop
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+; AVX2 variant: m0 holds (top - topleft) as words; each pass adds the
+; broadcast left pixel of four rows and stores them with unsigned saturation.
+; NOTE(review): [r0-1] below uses the positional name while the rest of the
+; function uses dstq; dst is the first named argument, so both should refer
+; to the same register - confirm against x86inc and consider [dstq-1].
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+    sub dstq, strideq
+    pmovzxbw m0, [dstq]
+    vpbroadcastb xm1, [r0-1]
+    pmovzxbw m1, xm1
+    psubw m0, m1
+    mov iterationd, 4
+    lea stride3q, [strideq*3]
+.loop:
+    vpbroadcastb xm1, [dstq+strideq*1-1]
+    vpbroadcastb xm2, [dstq+strideq*2-1]
+    vpbroadcastb xm3, [dstq+stride3q-1]
+    vpbroadcastb xm4, [dstq+strideq*4-1]
+    pmovzxbw m1, xm1
+    pmovzxbw m2, xm2
+    pmovzxbw m3, xm3
+    pmovzxbw m4, xm4
+    paddw m1, m0
+    paddw m2, m0
+    paddw m3, m0
+    paddw m4, m0
+    vpackuswb m1, m1, m2
+    vpackuswb m3, m3, m4
+    vpermq m1, m1, q3120
+    vpermq m3, m3, q3120
+    movdqa [dstq+strideq*1], xm1
+    vextracti128 [dstq+strideq*2], m1, 1
+    movdqa [dstq+stride3q*1], xm3
+    vextracti128 [dstq+strideq*4], m3, 1
+    lea dstq, [dstq+strideq*4]
+    dec iterationd
+    jg .loop
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; %1 is h264, rv40 or svq3 and only selects how the horizontal (H) and
+; vertical (V) gradient sums are scaled (see the %ifidn branches). For svq3
+; the H and V vectors are additionally exchanged (SWAP 0, 1).
+; On x86-32 r0 doubles as the scratch register e_reg and is reloaded from
+; the stack argument (r0m) once the V sum is complete.
+%macro H264_PRED16x16_PLANE 1
+cglobal pred16x16_plane_%1_8, 2,9,7
+    mov r2, r1 ; +stride
+    neg r1 ; -stride
+
+    movh m0, [r0+r1 -1]
+%if cpuflag(ssse3)
+    movhps m0, [r0+r1 +8]
+    pmaddubsw m0, [plane_shuf] ; H coefficients
+%else ; sse2
+    pxor m2, m2
+    movh m1, [r0+r1 +8]
+    punpcklbw m0, m2
+    punpcklbw m1, m2
+    pmullw m0, [pw_m8tom1]
+    pmullw m1, [pw_1to8]
+    paddw m0, m1
+%endif
+    movhlps m1, m0
+    paddw m0, m1
+    PSHUFLW m1, m0, 0xE
+    paddw m0, m1
+    PSHUFLW m1, m0, 0x1
+    paddw m0, m1 ; sum of H coefficients
+
+    lea r4, [r0+r2*8-1]
+    lea r3, [r0+r2*4-1]
+    add r4, r2
+
+%if ARCH_X86_64
+%define e_reg r8
+%else
+%define e_reg r0
+%endif
+
+    movzx e_reg, byte [r3+r2*2 ]
+    movzx r5, byte [r4+r1 ]
+    sub r5, e_reg
+
+    movzx e_reg, byte [r3+r2 ]
+    movzx r6, byte [r4 ]
+    sub r6, e_reg
+    lea r5, [r5+r6*2]
+
+    movzx e_reg, byte [r3+r1 ]
+    movzx r6, byte [r4+r2*2 ]
+    sub r6, e_reg
+    lea r5, [r5+r6*4]
+
+    movzx e_reg, byte [r3 ]
+%if ARCH_X86_64
+    movzx r7, byte [r4+r2 ]
+    sub r7, e_reg
+%else
+    movzx r6, byte [r4+r2 ]
+    sub r6, e_reg
+    lea r5, [r5+r6*4]
+    sub r5, r6
+%endif
+
+    lea e_reg, [r3+r1*4]
+    lea r3, [r4+r2*4]
+
+    movzx r4, byte [e_reg+r2 ]
+    movzx r6, byte [r3 ]
+    sub r6, r4
+%if ARCH_X86_64
+    lea r6, [r7+r6*2]
+    lea r5, [r5+r6*2]
+    add r5, r6
+%else
+    lea r5, [r5+r6*4]
+    lea r5, [r5+r6*2]
+%endif
+
+    movzx r4, byte [e_reg ]
+%if ARCH_X86_64
+    movzx r7, byte [r3 +r2 ]
+    sub r7, r4
+    sub r5, r7
+%else
+    movzx r6, byte [r3 +r2 ]
+    sub r6, r4
+    lea r5, [r5+r6*8]
+    sub r5, r6
+%endif
+
+    movzx r4, byte [e_reg+r1 ]
+    movzx r6, byte [r3 +r2*2]
+    sub r6, r4
+%if ARCH_X86_64
+    add r6, r7
+%endif
+    lea r5, [r5+r6*8]
+
+    movzx r4, byte [e_reg+r2*2]
+    movzx r6, byte [r3 +r1 ]
+    sub r6, r4
+    lea r5, [r5+r6*4]
+    add r5, r6 ; sum of V coefficients
+
+%if ARCH_X86_64 == 0
+    mov r0, r0m
+%endif
+
+%ifidn %1, h264
+    lea r5, [r5*5+32]
+    sar r5, 6
+%elifidn %1, rv40
+    lea r5, [r5*5]
+    sar r5, 6
+%elifidn %1, svq3
+    test r5, r5
+    lea r6, [r5+3]
+    cmovs r5, r6
+    sar r5, 2 ; V/4
+    lea r5, [r5*5] ; 5*(V/4)
+    test r5, r5
+    lea r6, [r5+15]
+    cmovs r5, r6
+    sar r5, 4 ; (5*(V/4))/16
+%endif
+
+    movzx r4, byte [r0+r1 +15]
+    movzx r3, byte [r3+r2*2 ]
+    lea r3, [r3+r4+1]
+    shl r3, 4
+
+    movd r1d, m0
+    movsx r1d, r1w
+%ifnidn %1, svq3
+%ifidn %1, h264
+    lea r1d, [r1d*5+32]
+%else ; rv40
+    lea r1d, [r1d*5]
+%endif
+    sar r1d, 6
+%else ; svq3
+    test r1d, r1d
+    lea r4d, [r1d+3]
+    cmovs r1d, r4d
+    sar r1d, 2 ; H/4
+    lea r1d, [r1d*5] ; 5*(H/4)
+    test r1d, r1d
+    lea r4d, [r1d+15]
+    cmovs r1d, r4d
+    sar r1d, 4 ; (5*(H/4))/16
+%endif
+    movd m0, r1d
+
+    add r1d, r5d
+    add r3d, r1d
+    shl r1d, 3
+    sub r3d, r1d ; a
+
+    movd m1, r5d
+    movd m3, r3d
+    SPLATW m0, m0, 0 ; H
+    SPLATW m1, m1, 0 ; V
+    SPLATW m3, m3, 0 ; a
+%ifidn %1, svq3
+    SWAP 0, 1
+%endif
+    mova m2, m0
+    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
+    psllw m2, 3
+    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
+    paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
+
+    ; two rows per pass: shift the running sums right by 5, pack to bytes
+    ; with unsigned saturation, then step both halves by V
+    mov r4, 8
+.loop:
+    mova m3, m0 ; b[0..7]
+    mova m4, m2 ; b[8..15]
+    psraw m3, 5
+    psraw m4, 5
+    packuswb m3, m4
+    mova [r0], m3
+    paddw m0, m1
+    paddw m2, m1
+
+    mova m3, m0 ; b[0..7]
+    mova m4, m2 ; b[8..15]
+    psraw m3, 5
+    psraw m4, 5
+    packuswb m3, m4
+    mova [r0+r2], m3
+    paddw m0, m1
+    paddw m2, m1
+
+    lea r0, [r0+r2*2]
+    dec r4
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+INIT_XMM ssse3
+H264_PRED16x16_PLANE h264
+H264_PRED16x16_PLANE rv40
+H264_PRED16x16_PLANE svq3
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; 8x8 plane prediction. Both gradient sums are scaled as (17*x + 16) >> 5:
+; V through the two lea instructions (x*9+16, then +x*8), H through imul/add.
+%macro H264_PRED8x8_PLANE 0
+cglobal pred8x8_plane_8, 2,9,7
+    mov r2, r1 ; +stride
+    neg r1 ; -stride
+
+    movd m0, [r0+r1 -1]
+%if cpuflag(ssse3)
+    movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
+    pmaddubsw m0, [plane8_shuf] ; H coefficients
+%else ; sse2
+    pxor m2, m2
+    movd m1, [r0+r1 +4]
+    punpckldq m0, m1
+    punpcklbw m0, m2
+    pmullw m0, [pw_m4to4]
+%endif
+    movhlps m1, m0
+    paddw m0, m1
+
+%if notcpuflag(ssse3)
+    PSHUFLW m1, m0, 0xE
+    paddw m0, m1
+%endif ; !ssse3
+
+    PSHUFLW m1, m0, 0x1
+    paddw m0, m1 ; sum of H coefficients
+
+    lea r4, [r0+r2*4-1]
+    lea r3, [r0 -1]
+    add r4, r2
+
+%if ARCH_X86_64
+%define e_reg r8
+%else
+%define e_reg r0
+%endif
+
+    movzx e_reg, byte [r3+r2*2 ]
+    movzx r5, byte [r4+r1 ]
+    sub r5, e_reg
+
+    movzx e_reg, byte [r3 ]
+%if ARCH_X86_64
+    movzx r7, byte [r4+r2 ]
+    sub r7, e_reg
+    sub r5, r7
+%else
+    movzx r6, byte [r4+r2 ]
+    sub r6, e_reg
+    lea r5, [r5+r6*4]
+    sub r5, r6
+%endif
+
+    movzx e_reg, byte [r3+r1 ]
+    movzx r6, byte [r4+r2*2 ]
+    sub r6, e_reg
+%if ARCH_X86_64
+    add r6, r7
+%endif
+    lea r5, [r5+r6*4]
+
+    movzx e_reg, byte [r3+r2 ]
+    movzx r6, byte [r4 ]
+    sub r6, e_reg
+    lea r6, [r5+r6*2]
+
+    lea r5, [r6*9+16]
+    lea r5, [r5+r6*8]
+    sar r5, 5
+
+%if ARCH_X86_64 == 0
+    mov r0, r0m
+%endif
+
+    movzx r3, byte [r4+r2*2 ]
+    movzx r4, byte [r0+r1 +7]
+    lea r3, [r3+r4+1]
+    shl r3, 4
+    movd r1d, m0
+    movsx r1d, r1w
+    imul r1d, 17
+    add r1d, 16
+    sar r1d, 5
+    movd m0, r1d
+    add r1d, r5d
+    sub r3d, r1d
+    add r1d, r1d
+    sub r3d, r1d ; a
+
+    movd m1, r5d
+    movd m3, r3d
+    SPLATW m0, m0, 0 ; H
+    SPLATW m1, m1, 0 ; V
+    SPLATW m3, m3, 0 ; a
+    pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
+    paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
+
+    mov r4, 4
+ALIGN 16
+.loop:
+    mova m3, m0 ; b[0..7]
+    paddw m0, m1
+    psraw m3, 5
+    mova m4, m0 ; V+b[0..7]
+    paddw m0, m1
+    psraw m4, 5
+    packuswb m3, m4
+    movh [r0], m3
+    movhps [r0+r2], m3
+
+    lea r0, [r0+r2*2]
+    dec r4
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+H264_PRED8x8_PLANE
+INIT_XMM ssse3
+H264_PRED8x8_PLANE
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; Copies the 8 bytes of the row above the block into each of the 8 rows.
+INIT_MMX mmx
+cglobal pred8x8_vertical_8, 2,2
+    sub r0, r1
+    movq mm0, [r0]
+%rep 3
+    movq [r0+r1*1], mm0
+    movq [r0+r1*2], mm0
+    lea r0, [r0+r1*2]
+%endrep
+    movq [r0+r1*1], mm0
+    movq [r0+r1*2], mm0
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; Two rows per pass, four passes. SPLATB_LOAD is defined outside this chunk;
+; presumably it broadcasts the byte at the given address (the pixel left of
+; the row) across the register - TODO confirm. m2 is only initialised for
+; the ssse3 build.
+%macro PRED8x8_H 0
+cglobal pred8x8_horizontal_8, 2,3
+    mov r2, 4
+%if cpuflag(ssse3)
+    mova m2, [pb_3]
+%endif
+.loop:
+    SPLATB_LOAD m0, r0+r1*0-1, m2
+    SPLATB_LOAD m1, r0+r1*1-1, m2
+    mova [r0+r1*0], m0
+    mova [r0+r1*1], m1
+    lea r0, [r0+r1*2]
+    dec r2
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8_H
+INIT_MMX ssse3
+PRED8x8_H
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+; The left and right 4-pixel halves of the top row are averaged separately;
+; every output row is four bytes of dc0 followed by four bytes of dc1.
+; NOTE(review): r4 is written by "lea r4, [r3+r1*2]" but never read in this
+; function (the last two rows are addressed through r0 instead).
+INIT_MMX mmxext
+cglobal pred8x8_top_dc_8, 2,5
+    sub r0, r1
+    movq mm0, [r0]
+    pxor mm1, mm1
+    pxor mm2, mm2
+    lea r2, [r0+r1*2]
+    punpckhbw mm1, mm0
+    punpcklbw mm0, mm2
+    psadbw mm1, mm2 ; s1
+    lea r3, [r2+r1*2]
+    psadbw mm0, mm2 ; s0
+    psrlw mm1, 1
+    psrlw mm0, 1
+    pavgw mm1, mm2
+    lea r4, [r3+r1*2]
+    pavgw mm0, mm2
+    pshufw mm1, mm1, 0
+    pshufw mm0, mm0, 0 ; dc0 (w)
+    packuswb mm0, mm1 ; dc0,dc1 (b)
+    movq [r0+r1*1], mm0
+    movq [r0+r1*2], mm0
+    lea r0, [r3+r1*2]
+    movq [r2+r1*1], mm0
+    movq [r2+r1*2], mm0
+    movq [r3+r1*1], mm0
+    movq [r3+r1*2], mm0
+    movq [r0+r1*1], mm0
+    movq [r0+r1*2], mm0
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; s0/s1 are the sums of the left/right four top pixels, s2/s3 the sums of
+; the left pixels of rows 0-3 / 4-7. Rows 0-3 are written from m0 and
+; rows 4-7 from m1.
+INIT_MMX mmxext
+cglobal pred8x8_dc_8, 2,5
+    sub r0, r1
+    pxor m7, m7
+    movd m0, [r0+0]
+    movd m1, [r0+4]
+    psadbw m0, m7 ; s0
+    mov r4, r0
+    psadbw m1, m7 ; s1
+
+    movzx r2d, byte [r0+r1*1-1]
+    movzx r3d, byte [r0+r1*2-1]
+    lea r0, [r0+r1*2]
+    add r2d, r3d
+    movzx r3d, byte [r0+r1*1-1]
+    add r2d, r3d
+    movzx r3d, byte [r0+r1*2-1]
+    add r2d, r3d
+    lea r0, [r0+r1*2]
+    movd m2, r2d ; s2
+    movzx r2d, byte [r0+r1*1-1]
+    movzx r3d, byte [r0+r1*2-1]
+    lea r0, [r0+r1*2]
+    add r2d, r3d
+    movzx r3d, byte [r0+r1*1-1]
+    add r2d, r3d
+    movzx r3d, byte [r0+r1*2-1]
+    add r2d, r3d
+    movd m3, r2d ; s3
+
+    punpcklwd m0, m1
+    mov r0, r4
+    punpcklwd m2, m3
+    punpckldq m0, m2 ; s0, s1, s2, s3
+    pshufw m3, m0, 11110110b ; s2, s1, s3, s3
+    lea r2, [r0+r1*2]
+    pshufw m0, m0, 01110100b ; s0, s1, s3, s1
+    paddw m0, m3
+    lea r3, [r2+r1*2]
+    psrlw m0, 2
+    pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
+    lea r4, [r3+r1*2]
+    packuswb m0, m0
+    punpcklbw m0, m0
+    movq m1, m0
+    punpcklbw m0, m0
+    punpckhbw m1, m1
+    movq [r0+r1*1], m0
+    movq [r0+r1*2], m0
+    movq [r2+r1*1], m0
+    movq [r2+r1*2], m0
+    movq [r3+r1*1], m1
+    movq [r3+r1*2], m1
+    movq [r4+r1*1], m1
+    movq [r4+r1*2], m1
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; Single DC for the whole block: (sum of the 8 top pixels + sum of the
+; 8 left pixels + 8) >> 4, replicated into all 8 rows.
+INIT_MMX mmxext
+cglobal pred8x8_dc_rv40_8, 2,7
+    mov r4, r0
+    sub r0, r1
+    pxor mm0, mm0
+    psadbw mm0, [r0]
+    dec r0
+    movzx r5d, byte [r0+r1*1]
+    movd r6d, mm0
+    lea r0, [r0+r1*2]
+%rep 3
+    movzx r2d, byte [r0+r1*0]
+    movzx r3d, byte [r0+r1*1]
+    add r5d, r2d
+    add r6d, r3d
+    lea r0, [r0+r1*2]
+%endrep
+    movzx r2d, byte [r0+r1*0]
+    add r5d, r6d
+    lea r2d, [r2+r5+8]
+    shr r2d, 4
+    movd mm0, r2d
+    punpcklbw mm0, mm0
+    pshufw mm0, mm0, 0
+    mov r3d, 4
+.loop:
+    movq [r4+r1*0], mm0
+    movq [r4+r1*1], mm0
+    lea r4, [r4+r1*2]
+    dec r3d
+    jg .loop
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; TrueMotion prediction, 8x8: clip_u8(top[x] + left[y] - topleft), two rows
+; per pass (low and high halves of xmm2 after packuswb).
+INIT_XMM sse2
+cglobal pred8x8_tm_vp8_8, 2,6,4
+    sub r0, r1
+    pxor xmm1, xmm1
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm1
+    movzx r4d, byte [r0-1]
+    mov r5d, 4
+.loop:
+    movzx r2d, byte [r0+r1*1-1]
+    movzx r3d, byte [r0+r1*2-1]
+    sub r2d, r4d
+    sub r3d, r4d
+    movd xmm2, r2d
+    movd xmm3, r3d
+    pshuflw xmm2, xmm2, 0
+    pshuflw xmm3, xmm3, 0
+    punpcklqdq xmm2, xmm2
+    punpcklqdq xmm3, xmm3
+    paddw xmm2, xmm0
+    paddw xmm3, xmm0
+    packuswb xmm2, xmm3
+    movq [r0+r1*1], xmm2
+    movhps [r0+r1*2], xmm2
+    lea r0, [r0+r1*2]
+    dec r5d
+    jg .loop
+    RET
+
+; SSSE3 variant: the left/top-left pixels are loaded as the last byte of a
+; dword ending at x = -1 and expanded with pshufb. tm_shuf is defined outside
+; this chunk; presumably it replicates that byte into every word - TODO confirm.
+INIT_XMM ssse3
+cglobal pred8x8_tm_vp8_8, 2,3,6
+    sub r0, r1
+    movdqa xmm4, [tm_shuf]
+    pxor xmm1, xmm1
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm1
+    movd xmm5, [r0-4]
+    pshufb xmm5, xmm4
+    mov r2d, 4
+.loop:
+    movd xmm2, [r0+r1*1-4]
+    movd xmm3, [r0+r1*2-4]
+    pshufb xmm2, xmm4
+    pshufb xmm3, xmm4
+    psubw xmm2, xmm5
+    psubw xmm3, xmm5
+    paddw xmm2, xmm0
+    paddw xmm3, xmm0
+    packuswb xmm2, xmm3
+    movq [r0+r1*1], xmm2
+    movhps [r0+r1*2], xmm2
+    lea r0, [r0+r1*2]
+    dec r2d
+    jg .loop
+    RET
+
+; dest, left, right, src, tmp
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
+; Implemented as avg(src, avg(left, right) - ((left ^ right) & 1)), where the
+; subtraction undoes pavgb's round-up (assuming pb_1 is all bytes = 1).
+; Besides %1, the macro overwrites %2, %3 and %5.
+%macro PRED4x4_LOWPASS 5
+    mova %5, %2
+    pavgb %2, %3
+    pxor %3, %5
+    mova %1, %4
+    pand %3, [pb_1]
+    psubusb %2, %3
+    pavgb %1, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+; .fix_lt_2 replaces the top-left neighbour byte with top[0] when
+; has_topleft == 0; .fix_tr_1 replaces the top-right neighbour byte with
+; top[7] when has_topright == 0. The low-pass filtered top row is then summed
+; and the DC (sum + 4) >> 3 is written to all 8 rows.
+%macro PRED8x8L_TOP_DC 0
+cglobal pred8x8l_top_dc_8, 4,4
+    sub r0, r3
+    pxor mm7, mm7
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d ; top_left
+    jz .fix_lt_2
+    test r2d, r2d ; top_right
+    jz .fix_tr_1
+    jmp .body
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d ; top_right
+    jnz .body
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+.body:
+    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
+    psadbw mm7, mm0
+    paddw mm7, [pw_4]
+    psrlw mm7, 3
+    pshufw mm7, mm7, 0
+    packuswb mm7, mm7
+%rep 3
+    movq [r0+r3*1], mm7
+    movq [r0+r3*2], mm7
+    lea r0, [r0+r3*2]
+%endrep
+    movq [r0+r3*1], mm7
+    movq [r0+r3*2], mm7
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_TOP_DC
+INIT_MMX ssse3
+PRED8x8L_TOP_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; The left column is gathered from the byte at x = -1 of each row (the top
+; byte of the qword at [row-8]) and low-pass filtered into mm7; the top row
+; is filtered into mm6. DC = (sum(mm7) + sum(mm6) + 8) >> 4, stored to all
+; 8 rows. r1/r2 are reused as row pointers once the flags have been tested.
+%macro PRED8x8L_DC 0
+cglobal pred8x8l_dc_8, 4,5
+    sub r0, r3
+    lea r4, [r0+r3*2]
+    movq mm0, [r0+r3*1-8]
+    punpckhbw mm0, [r0+r3*0-8]
+    movq mm1, [r4+r3*1-8]
+    punpckhbw mm1, [r0+r3*2-8]
+    mov r4, r0
+    punpckhwd mm1, mm0
+    lea r0, [r0+r3*4]
+    movq mm2, [r0+r3*1-8]
+    punpckhbw mm2, [r0+r3*0-8]
+    lea r0, [r0+r3*2]
+    movq mm3, [r0+r3*1-8]
+    punpckhbw mm3, [r0+r3*0-8]
+    punpckhwd mm3, mm2
+    punpckhdq mm3, mm1
+    lea r0, [r0+r3*2]
+    movq mm0, [r0+r3*0-8]
+    movq mm1, [r4]
+    mov r0, r4
+    movq mm4, mm3
+    movq mm2, mm3
+    PALIGNR mm4, mm0, 7, mm0
+    PALIGNR mm1, mm2, 1, mm2
+    test r1d, r1d
+    jnz .do_left
+.fix_lt_1:
+    movq mm5, mm3
+    pxor mm5, mm4
+    psrlq mm5, 56
+    psllq mm5, 48
+    pxor mm1, mm5
+    jmp .do_left
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d
+    jnz .body
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+    jmp .body
+.do_left:
+    movq mm0, mm4
+    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+    movq mm4, mm0
+    movq mm7, mm2
+    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+    psllq mm1, 56
+    PALIGNR mm7, mm1, 7, mm3
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d
+    jz .fix_lt_2
+    test r2d, r2d
+    jz .fix_tr_1
+.body:
+    lea r1, [r0+r3*2]
+    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+    pxor mm0, mm0
+    pxor mm1, mm1
+    lea r2, [r1+r3*2]
+    psadbw mm0, mm7
+    psadbw mm1, mm6
+    paddw mm0, [pw_8]
+    paddw mm0, mm1
+    lea r4, [r2+r3*2]
+    psrlw mm0, 4
+    pshufw mm0, mm0, 0
+    packuswb mm0, mm0
+    movq [r0+r3*1], mm0
+    movq [r0+r3*2], mm0
+    movq [r1+r3*1], mm0
+    movq [r1+r3*2], mm0
+    movq [r2+r3*1], mm0
+    movq [r2+r3*2], mm0
+    movq [r4+r3*1], mm0
+    movq [r4+r3*2], mm0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_DC
+INIT_MMX ssse3
+PRED8x8L_DC
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; Each output row is one low-pass filtered left pixel broadcast to 8 bytes.
+; r1 selects the row supplying the "top-left" sample: the row above the block
+; when has_topleft != 0 (cmovnz), otherwise row 0 itself. lea leaves the
+; flags from "test" intact for the cmovnz.
+%macro PRED8x8L_HORIZONTAL 0
+cglobal pred8x8l_horizontal_8, 4,4
+    sub r0, r3
+    lea r2, [r0+r3*2]
+    movq mm0, [r0+r3*1-8]
+    test r1d, r1d
+    lea r1, [r0+r3]
+    cmovnz r1, r0
+    punpckhbw mm0, [r1+r3*0-8]
+    movq mm1, [r2+r3*1-8]
+    punpckhbw mm1, [r0+r3*2-8]
+    mov r2, r0
+    punpckhwd mm1, mm0
+    lea r0, [r0+r3*4]
+    movq mm2, [r0+r3*1-8]
+    punpckhbw mm2, [r0+r3*0-8]
+    lea r0, [r0+r3*2]
+    movq mm3, [r0+r3*1-8]
+    punpckhbw mm3, [r0+r3*0-8]
+    punpckhwd mm3, mm2
+    punpckhdq mm3, mm1
+    lea r0, [r0+r3*2]
+    movq mm0, [r0+r3*0-8]
+    movq mm1, [r1+r3*0-8]
+    mov r0, r2
+    movq mm4, mm3
+    movq mm2, mm3
+    PALIGNR mm4, mm0, 7, mm0
+    PALIGNR mm1, mm2, 1, mm2
+    movq mm0, mm4
+    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+    movq mm4, mm0
+    movq mm7, mm2
+    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+    psllq mm1, 56
+    PALIGNR mm7, mm1, 7, mm3
+    movq mm3, mm7
+    lea r1, [r0+r3*2]
+    movq mm7, mm3
+    punpckhbw mm3, mm3
+    punpcklbw mm7, mm7
+    pshufw mm0, mm3, 0xff
+    pshufw mm1, mm3, 0xaa
+    lea r2, [r1+r3*2]
+    pshufw mm2, mm3, 0x55
+    pshufw mm3, mm3, 0x00
+    pshufw mm4, mm7, 0xff
+    pshufw mm5, mm7, 0xaa
+    pshufw mm6, mm7, 0x55
+    pshufw mm7, mm7, 0x00
+    movq [r0+r3*1], mm0
+    movq [r0+r3*2], mm1
+    movq [r1+r3*1], mm2
+    movq [r1+r3*2], mm3
+    movq [r2+r3*1], mm4
+    movq [r2+r3*2], mm5
+    lea r0, [r2+r3*2]
+    movq [r0+r3*1], mm6
+    movq [r0+r3*2], mm7
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_HORIZONTAL
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; The low-pass filtered top row (with the same missing-neighbour fix-ups as
+; pred8x8l_top_dc) is copied into all 8 rows.
+%macro PRED8x8L_VERTICAL 0
+cglobal pred8x8l_vertical_8, 4,4
+    sub r0, r3
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d ; top_left
+    jz .fix_lt_2
+    test r2d, r2d ; top_right
+    jz .fix_tr_1
+    jmp .body
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d ; top_right
+    jnz .body
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+.body:
+    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
+%rep 3
+    movq [r0+r3*1], mm0
+    movq [r0+r3*2], mm0
+    lea r0, [r0+r3*2]
+%endrep
+    movq [r0+r3*1], mm0
+    movq [r0+r3*2], mm0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_VERTICAL
+INIT_MMX ssse3
+PRED8x8L_VERTICAL
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; The filtered top row (low half) and top-right row (high half) are merged
+; into xmm3 and filtered once more; each successive output row is the result
+; shifted right by one more byte. When has_topright == 0, .fix_tr_2
+; substitutes a broadcast of the last top pixel for the top-right row.
+; The macro is instantiated under INIT_MMX; "INIT_XMM cpuname" switches mode
+; in the middle, presumably so that mova inside PRED4x4_LOWPASS assembles as
+; a 16-byte move - confirm against x86inc.
+%macro PRED8x8L_DOWN_LEFT 0
+cglobal pred8x8l_down_left_8, 4,4
+    sub r0, r3
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d ; top_left
+    jz .fix_lt_2
+    test r2d, r2d ; top_right
+    jz .fix_tr_1
+    jmp .do_top
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d ; top_right
+    jnz .do_top
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+    jmp .do_top
+.fix_tr_2:
+    punpckhbw mm3, mm3
+    pshufw mm1, mm3, 0xFF
+    jmp .do_topright
+.do_top:
+    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+    movq2dq xmm3, mm4
+    test r2d, r2d ; top_right
+    jz .fix_tr_2
+    movq mm0, [r0+8]
+    movq mm5, mm0
+    movq mm2, mm0
+    movq mm4, mm0
+    psrlq mm5, 56
+    PALIGNR mm2, mm3, 7, mm3
+    PALIGNR mm5, mm4, 1, mm4
+    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+    movq2dq xmm4, mm1
+    psrlq mm1, 56
+    movq2dq xmm5, mm1
+    lea r1, [r0+r3*2]
+    pslldq xmm4, 8
+    por xmm3, xmm4
+    movdqa xmm2, xmm3
+    psrldq xmm2, 1
+    pslldq xmm5, 15
+    por xmm2, xmm5
+    lea r2, [r1+r3*2]
+    movdqa xmm1, xmm3
+    pslldq xmm1, 1
+INIT_XMM cpuname
+    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
+    psrldq xmm0, 1
+    movq [r0+r3*1], xmm0
+    psrldq xmm0, 1
+    movq [r0+r3*2], xmm0
+    psrldq xmm0, 1
+    lea r0, [r2+r3*2]
+    movq [r1+r3*1], xmm0
+    psrldq xmm0, 1
+    movq [r1+r3*2], xmm0
+    psrldq xmm0, 1
+    movq [r2+r3*1], xmm0
+    psrldq xmm0, 1
+    movq [r2+r3*2], xmm0
+    psrldq xmm0, 1
+    movq [r0+r3*1], xmm0
+    psrldq xmm0, 1
+    movq [r0+r3*2], xmm0
+    RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_DOWN_LEFT
+INIT_MMX ssse3
+PRED8x8L_DOWN_LEFT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_down_right_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; Filtered left column and filtered top row are combined into 16-byte edge
+; vectors and filtered again; rows are stored from the bottom pair upwards,
+; shifting the two result registers by two bytes between row pairs.
+; NOTE(review): "jz .fix_lt_1 / jmp .do_left" directly precedes .fix_lt_1 and
+; is equivalent to the single "jnz .do_left" used by the sibling macros.
+%macro PRED8x8L_DOWN_RIGHT 0
+cglobal pred8x8l_down_right_8, 4,5
+    sub r0, r3
+    lea r4, [r0+r3*2]
+    movq mm0, [r0+r3*1-8]
+    punpckhbw mm0, [r0+r3*0-8]
+    movq mm1, [r4+r3*1-8]
+    punpckhbw mm1, [r0+r3*2-8]
+    mov r4, r0
+    punpckhwd mm1, mm0
+    lea r0, [r0+r3*4]
+    movq mm2, [r0+r3*1-8]
+    punpckhbw mm2, [r0+r3*0-8]
+    lea r0, [r0+r3*2]
+    movq mm3, [r0+r3*1-8]
+    punpckhbw mm3, [r0+r3*0-8]
+    punpckhwd mm3, mm2
+    punpckhdq mm3, mm1
+    lea r0, [r0+r3*2]
+    movq mm0, [r0+r3*0-8]
+    movq mm1, [r4]
+    mov r0, r4
+    movq mm4, mm3
+    movq mm2, mm3
+    PALIGNR mm4, mm0, 7, mm0
+    PALIGNR mm1, mm2, 1, mm2
+    test r1d, r1d
+    jz .fix_lt_1
+    jmp .do_left
+.fix_lt_1:
+    movq mm5, mm3
+    pxor mm5, mm4
+    psrlq mm5, 56
+    psllq mm5, 48
+    pxor mm1, mm5
+    jmp .do_left
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d
+    jnz .do_top
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+    jmp .do_top
+.do_left:
+    movq mm0, mm4
+    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+    movq mm4, mm0
+    movq mm7, mm2
+    movq2dq xmm3, mm2
+    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+    psllq mm1, 56
+    PALIGNR mm7, mm1, 7, mm3
+    movq2dq xmm1, mm7
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d
+    jz .fix_lt_2
+    test r2d, r2d
+    jz .fix_tr_1
+.do_top:
+    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+    movq2dq xmm4, mm4
+    lea r1, [r0+r3*2]
+    movdqa xmm0, xmm3
+    pslldq xmm4, 8
+    por xmm3, xmm4
+    lea r2, [r1+r3*2]
+    pslldq xmm4, 1
+    por xmm1, xmm4
+    psrldq xmm0, 7
+    pslldq xmm0, 15
+    psrldq xmm0, 7
+    por xmm1, xmm0
+    lea r0, [r2+r3*2]
+    movdqa xmm2, xmm3
+    psrldq xmm2, 1
+INIT_XMM cpuname
+    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
+    movdqa xmm1, xmm0
+    psrldq xmm1, 1
+    movq [r0+r3*2], xmm0
+    movq [r0+r3*1], xmm1
+    psrldq xmm0, 2
+    psrldq xmm1, 2
+    movq [r2+r3*2], xmm0
+    movq [r2+r3*1], xmm1
+    psrldq xmm0, 2
+    psrldq xmm1, 2
+    movq [r1+r3*2], xmm0
+    movq [r1+r3*1], xmm1
+    psrldq xmm0, 2
+    psrldq xmm1, 2
+    movq [r4+r3*2], xmm0
+    movq [r4+r3*1], xmm1
+    RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_DOWN_RIGHT
+INIT_MMX ssse3
+PRED8x8L_DOWN_RIGHT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; xmm0 collects the filtered left column (low half) and filtered top row
+; (high half). Rows written through the r3*1 slot come from the pavgb result
+; (xmm2), rows written through the r3*2 slot from the low-pass result (xmm5).
+; xmm6 is used, hence the explicit 7-register spill below.
+%macro PRED8x8L_VERTICAL_RIGHT 0
+cglobal pred8x8l_vertical_right_8, 4,5,7
+    ; manually spill XMM registers for Win64 because
+    ; the code here is initialized with INIT_MMX
+    WIN64_SPILL_XMM 7
+    sub r0, r3
+    lea r4, [r0+r3*2]
+    movq mm0, [r0+r3*1-8]
+    punpckhbw mm0, [r0+r3*0-8]
+    movq mm1, [r4+r3*1-8]
+    punpckhbw mm1, [r0+r3*2-8]
+    mov r4, r0
+    punpckhwd mm1, mm0
+    lea r0, [r0+r3*4]
+    movq mm2, [r0+r3*1-8]
+    punpckhbw mm2, [r0+r3*0-8]
+    lea r0, [r0+r3*2]
+    movq mm3, [r0+r3*1-8]
+    punpckhbw mm3, [r0+r3*0-8]
+    punpckhwd mm3, mm2
+    punpckhdq mm3, mm1
+    lea r0, [r0+r3*2]
+    movq mm0, [r0+r3*0-8]
+    movq mm1, [r4]
+    mov r0, r4
+    movq mm4, mm3
+    movq mm2, mm3
+    PALIGNR mm4, mm0, 7, mm0
+    PALIGNR mm1, mm2, 1, mm2
+    test r1d, r1d
+    jnz .do_left
+.fix_lt_1:
+    movq mm5, mm3
+    pxor mm5, mm4
+    psrlq mm5, 56
+    psllq mm5, 48
+    pxor mm1, mm5
+    jmp .do_left
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d
+    jnz .do_top
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+    jmp .do_top
+.do_left:
+    movq mm0, mm4
+    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+    movq2dq xmm0, mm2
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d
+    jz .fix_lt_2
+    test r2d, r2d
+    jz .fix_tr_1
+.do_top:
+    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
+    lea r1, [r0+r3*2]
+    movq2dq xmm4, mm6
+    pslldq xmm4, 8
+    por xmm0, xmm4
+    movdqa xmm6, [pw_ff00]
+    movdqa xmm1, xmm0
+    lea r2, [r1+r3*2]
+    movdqa xmm2, xmm0
+    movdqa xmm3, xmm0
+    pslldq xmm0, 1
+    pslldq xmm1, 2
+    pavgb xmm2, xmm0
+INIT_XMM cpuname
+    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
+    pandn xmm6, xmm4
+    movdqa xmm5, xmm4
+    psrlw xmm4, 8
+    packuswb xmm6, xmm4
+    movhlps xmm4, xmm6
+    movhps [r0+r3*2], xmm5
+    movhps [r0+r3*1], xmm2
+    psrldq xmm5, 4
+    movss xmm5, xmm6
+    psrldq xmm2, 4
+    movss xmm2, xmm4
+    lea r0, [r2+r3*2]
+    psrldq xmm5, 1
+    psrldq xmm2, 1
+    movq [r0+r3*2], xmm5
+    movq [r0+r3*1], xmm2
+    psrldq xmm5, 1
+    psrldq xmm2, 1
+    movq [r2+r3*2], xmm5
+    movq [r2+r3*1], xmm2
+    psrldq xmm5, 1
+    psrldq xmm2, 1
+    movq [r1+r3*2], xmm5
+    movq [r1+r3*1], xmm2
+    RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_VERTICAL_RIGHT
+INIT_MMX ssse3
+PRED8x8L_VERTICAL_RIGHT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; xmm4 holds the filtered top row followed by the filtered top-right row.
+; Rows stored through the r3*1 slot come from the pavgb of neighbouring edge
+; samples (xmm3), rows stored through the r3*2 slot from the low-pass result
+; (xmm0); both advance by one byte per row pair.
+%macro PRED8x8L_VERTICAL_LEFT 0
+cglobal pred8x8l_vertical_left_8, 4,4
+    sub r0, r3
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d
+    jz .fix_lt_2
+    test r2d, r2d
+    jz .fix_tr_1
+    jmp .do_top
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d
+    jnz .do_top
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+    jmp .do_top
+.fix_tr_2:
+    punpckhbw mm3, mm3
+    pshufw mm1, mm3, 0xFF
+    jmp .do_topright
+.do_top:
+    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+    movq2dq xmm4, mm4
+    test r2d, r2d
+    jz .fix_tr_2
+    movq mm0, [r0+8]
+    movq mm5, mm0
+    movq mm2, mm0
+    movq mm4, mm0
+    psrlq mm5, 56
+    PALIGNR mm2, mm3, 7, mm3
+    PALIGNR mm5, mm4, 1, mm4
+    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+    movq2dq xmm3, mm1
+    lea r1, [r0+r3*2]
+    pslldq xmm3, 8
+    por xmm4, xmm3
+    movdqa xmm2, xmm4
+    movdqa xmm1, xmm4
+    movdqa xmm3, xmm4
+    psrldq xmm2, 1
+    pslldq xmm1, 1
+    pavgb xmm3, xmm2
+    lea r2, [r1+r3*2]
+INIT_XMM cpuname
+    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
+    psrldq xmm0, 1
+    movq [r0+r3*1], xmm3
+    movq [r0+r3*2], xmm0
+    lea r0, [r2+r3*2]
+    psrldq xmm3, 1
+    psrldq xmm0, 1
+    movq [r1+r3*1], xmm3
+    movq [r1+r3*2], xmm0
+    psrldq xmm3, 1
+    psrldq xmm0, 1
+    movq [r2+r3*1], xmm3
+    movq [r2+r3*2], xmm0
+    psrldq xmm3, 1
+    psrldq xmm0, 1
+    movq [r0+r3*1], xmm3
+    movq [r0+r3*2], xmm0
+    RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_VERTICAL_LEFT
+INIT_MMX ssse3
+PRED8x8L_VERTICAL_LEFT
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; The left column is gathered, low-pass filtered and byte-reversed
+; (l7 .. l0 per the comments below). Averaged (pavgb) and low-pass values are
+; interleaved into p1..p8, and successive rows are 2-byte-shifted windows of
+; that sequence padded with the last sample.
+; r1 selects the row supplying the "top-left" sample: the row above when
+; has_topleft != 0 (cmovnz), otherwise row 0.
+%macro PRED8x8L_HORIZONTAL_UP 0
+cglobal pred8x8l_horizontal_up_8, 4,4
+    sub r0, r3
+    lea r2, [r0+r3*2]
+    movq mm0, [r0+r3*1-8]
+    test r1d, r1d
+    lea r1, [r0+r3]
+    cmovnz r1, r0
+    punpckhbw mm0, [r1+r3*0-8]
+    movq mm1, [r2+r3*1-8]
+    punpckhbw mm1, [r0+r3*2-8]
+    mov r2, r0
+    punpckhwd mm1, mm0
+    lea r0, [r0+r3*4]
+    movq mm2, [r0+r3*1-8]
+    punpckhbw mm2, [r0+r3*0-8]
+    lea r0, [r0+r3*2]
+    movq mm3, [r0+r3*1-8]
+    punpckhbw mm3, [r0+r3*0-8]
+    punpckhwd mm3, mm2
+    punpckhdq mm3, mm1
+    lea r0, [r0+r3*2]
+    movq mm0, [r0+r3*0-8]
+    movq mm1, [r1+r3*0-8]
+    mov r0, r2
+    movq mm4, mm3
+    movq mm2, mm3
+    PALIGNR mm4, mm0, 7, mm0
+    PALIGNR mm1, mm2, 1, mm2
+    movq mm0, mm4
+    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+    movq mm4, mm0
+    movq mm7, mm2
+    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+    psllq mm1, 56
+    PALIGNR mm7, mm1, 7, mm3
+    lea r1, [r0+r3*2]
+    pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
+    psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
+    movq mm2, mm0
+    psllw mm0, 8
+    psrlw mm2, 8
+    por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
+    movq mm3, mm2
+    movq mm4, mm2
+    movq mm5, mm2
+    psrlq mm2, 8
+    psrlq mm3, 16
+    lea r2, [r1+r3*2]
+    por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
+    punpckhbw mm7, mm7
+    por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
+    pavgb mm4, mm2
+    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
+    movq mm5, mm4
+    punpcklbw mm4, mm1 ; p4 p3 p2 p1
+    punpckhbw mm5, mm1 ; p8 p7 p6 p5
+    movq mm6, mm5
+    movq mm7, mm5
+    movq mm0, mm5
+    PALIGNR mm5, mm4, 2, mm1
+    pshufw mm1, mm6, 11111001b
+    PALIGNR mm6, mm4, 4, mm2
+    pshufw mm2, mm7, 11111110b
+    PALIGNR mm7, mm4, 6, mm3
+    pshufw mm3, mm0, 11111111b
+    movq [r0+r3*1], mm4
+    movq [r0+r3*2], mm5
+    lea r0, [r2+r3*2]
+    movq [r1+r3*1], mm6
+    movq [r1+r3*2], mm7
+    movq [r2+r3*1], mm0
+    movq [r2+r3*2], mm1
+    movq [r0+r3*1], mm2
+    movq [r0+r3*2], mm3
+    RET
+%endmacro
+
+INIT_MMX mmxext
+PRED8x8L_HORIZONTAL_UP
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL_UP
+
+;-----------------------------------------------------------------------------
+; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
+; int has_topright, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; xmm0 receives the filtered left column, xmm1 the filtered top and top-right
+; rows. Averaged (pavgb) and low-pass values are interleaved with punpcklbw,
+; and the rows are stored as 2-byte-shifted windows of the interleaved result
+; (xmm4) and of its upper half (xmm0).
+; The macro is instantiated under INIT_MMX; "INIT_XMM cpuname" switches mode
+; before the XMM PALIGNR / PRED4x4_LOWPASS uses, presumably so those macros
+; expand to their 16-byte forms - confirm against x86inc.
+%macro PRED8x8L_HORIZONTAL_DOWN 0
+cglobal pred8x8l_horizontal_down_8, 4,5
+    sub r0, r3
+    lea r4, [r0+r3*2]
+    movq mm0, [r0+r3*1-8]
+    punpckhbw mm0, [r0+r3*0-8]
+    movq mm1, [r4+r3*1-8]
+    punpckhbw mm1, [r0+r3*2-8]
+    mov r4, r0
+    punpckhwd mm1, mm0
+    lea r0, [r0+r3*4]
+    movq mm2, [r0+r3*1-8]
+    punpckhbw mm2, [r0+r3*0-8]
+    lea r0, [r0+r3*2]
+    movq mm3, [r0+r3*1-8]
+    punpckhbw mm3, [r0+r3*0-8]
+    punpckhwd mm3, mm2
+    punpckhdq mm3, mm1
+    lea r0, [r0+r3*2]
+    movq mm0, [r0+r3*0-8]
+    movq mm1, [r4]
+    mov r0, r4
+    movq mm4, mm3
+    movq mm2, mm3
+    PALIGNR mm4, mm0, 7, mm0
+    PALIGNR mm1, mm2, 1, mm2
+    test r1d, r1d
+    jnz .do_left
+.fix_lt_1:
+    movq mm5, mm3
+    pxor mm5, mm4
+    psrlq mm5, 56
+    psllq mm5, 48
+    pxor mm1, mm5
+    jmp .do_left
+.fix_lt_2:
+    movq mm5, mm3
+    pxor mm5, mm2
+    psllq mm5, 56
+    psrlq mm5, 56
+    pxor mm2, mm5
+    test r2d, r2d
+    jnz .do_top
+.fix_tr_1:
+    movq mm5, mm3
+    pxor mm5, mm1
+    psrlq mm5, 56
+    psllq mm5, 56
+    pxor mm1, mm5
+    jmp .do_top
+.fix_tr_2:
+    punpckhbw mm3, mm3
+    pshufw mm1, mm3, 0xFF
+    jmp .do_topright
+.do_left:
+    movq mm0, mm4
+    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
+    movq2dq xmm0, mm2
+    pslldq xmm0, 8
+    movq mm4, mm0
+    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
+    movq2dq xmm2, mm1
+    pslldq xmm2, 15
+    psrldq xmm2, 8
+    por xmm0, xmm2
+    movq mm0, [r0-8]
+    movq mm3, [r0]
+    movq mm1, [r0+8]
+    movq mm2, mm3
+    movq mm4, mm3
+    PALIGNR mm2, mm0, 7, mm0
+    PALIGNR mm1, mm4, 1, mm4
+    test r1d, r1d
+    jz .fix_lt_2
+    test r2d, r2d
+    jz .fix_tr_1
+.do_top:
+    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+    movq2dq xmm1, mm4
+    test r2d, r2d
+    jz .fix_tr_2
+    movq mm0, [r0+8]
+    movq mm5, mm0
+    movq mm2, mm0
+    movq mm4, mm0
+    psrlq mm5, 56
+    PALIGNR mm2, mm3, 7, mm3
+    PALIGNR mm5, mm4, 1, mm4
+    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+    movq2dq xmm5, mm1
+    pslldq xmm5, 8
+    por xmm1, xmm5
+INIT_XMM cpuname
+    lea r2, [r4+r3*2]
+    movdqa xmm2, xmm1
+    movdqa xmm3, xmm1
+    PALIGNR xmm1, xmm0, 7, xmm4
+    PALIGNR xmm2, xmm0, 9, xmm5
+    lea r1, [r2+r3*2]
+    PALIGNR xmm3, xmm0, 8, xmm0
+    movdqa xmm4, xmm1
+    pavgb xmm4, xmm3
+    lea r0, [r1+r3*2]
+    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
+    punpcklbw xmm4, xmm0
+    movhlps xmm0, xmm4
+    movq [r0+r3*2], xmm4
+    movq [r2+r3*2], xmm0
+    psrldq xmm4, 2
+    psrldq xmm0, 2
+    movq [r0+r3*1], xmm4
+    movq [r2+r3*1], xmm0
+    psrldq xmm4, 2
+    psrldq xmm0, 2
+    movq [r1+r3*2], xmm4
+    movq [r4+r3*2], xmm0
+    psrldq xmm4, 2
+    psrldq xmm0, 2
+    movq [r1+r3*1], xmm4
+    movq [r4+r3*1], xmm0
+    RET
+%endmacro
+
+INIT_MMX sse2
+PRED8x8L_HORIZONTAL_DOWN
+INIT_MMX ssse3
+PRED8x8L_HORIZONTAL_DOWN
+
+;-------------------------------------------------------------------------------
+; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-------------------------------------------------------------------------------
+
+; DC = (sum of the 4 top pixels + sum of the 4 left pixels + 4) >> 3,
+; replicated to four bytes with the 0x01010101 multiply and stored to the
+; 4 rows. The topright argument (r1) is not read; r1d serves as scratch.
+INIT_MMX mmxext
+cglobal pred4x4_dc_8, 3,5
+    pxor mm7, mm7
+    mov r4, r0
+    sub r0, r2
+    movd mm0, [r0]
+    psadbw mm0, mm7
+    movzx r1d, byte [r0+r2*1-1]
+    movd r3d, mm0
+    add r3d, r1d
+    movzx r1d, byte [r0+r2*2-1]
+    lea r0, [r0+r2*2]
+    add r3d, r1d
+    movzx r1d, byte [r0+r2*1-1]
+    add r3d, r1d
+    movzx r1d, byte [r0+r2*2-1]
+    add r3d, r1d
+    add r3d, 4
+    shr r3d, 3
+    imul r3d, 0x01010101
+    mov [r4+r2*0], r3d
+    mov [r0+r2*0], r3d
+    mov [r0+r2*1], r3d
+    mov [r0+r2*2], r3d
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; TrueMotion prediction, 4x4: clip_u8(top[x] + left[y] - topleft), two rows
+; per pass. The topright argument is not read; r1d serves as scratch.
+INIT_MMX mmxext
+cglobal pred4x4_tm_vp8_8, 3,6
+    sub r0, r2
+    pxor mm7, mm7
+    movd mm0, [r0]
+    punpcklbw mm0, mm7
+    movzx r4d, byte [r0-1]
+    mov r5d, 2
+.loop:
+    movzx r1d, byte [r0+r2*1-1]
+    movzx r3d, byte [r0+r2*2-1]
+    sub r1d, r4d
+    sub r3d, r4d
+    movd mm2, r1d
+    movd mm4, r3d
+    pshufw mm2, mm2, 0
+    pshufw mm4, mm4, 0
+    paddw mm2, mm0
+    paddw mm4, mm0
+    packuswb mm2, mm2
+    packuswb mm4, mm4
+    movd [r0+r2*1], mm2
+    movd [r0+r2*2], mm4
+    lea r0, [r0+r2*2]
+    dec r5d
+    jg .loop
+    RET
+
+; SSSE3 variant, fully unrolled: mm0 = top - topleft, then the shuffled left
+; pixel of each of the four rows is added and packed with saturation.
+; tm_shuf is defined outside this chunk; presumably it replicates the last
+; byte of the loaded dword into every word - TODO confirm.
+; NOTE(review): declared under INIT_XMM although the body only uses MMX
+; registers (mm0-mm7); confirm this is intentional.
+INIT_XMM ssse3
+cglobal pred4x4_tm_vp8_8, 3,3
+    sub r0, r2
+    movq mm6, [tm_shuf]
+    pxor mm1, mm1
+    movd mm0, [r0]
+    punpcklbw mm0, mm1
+    movd mm7, [r0-4]
+    pshufb mm7, mm6
+    lea r1, [r0+r2*2]
+    movd mm2, [r0+r2*1-4]
+    movd mm3, [r0+r2*2-4]
+    movd mm4, [r1+r2*1-4]
+    movd mm5, [r1+r2*2-4]
+    pshufb mm2, mm6
+    pshufb mm3, mm6
+    pshufb mm4, mm6
+    pshufb mm5, mm6
+    psubw mm0, mm7
+    paddw mm2, mm0
+    paddw mm3, mm0
+    paddw mm4, mm0
+    paddw mm5, mm0
+    packuswb mm2, mm2
+    packuswb mm3, mm3
+    packuswb mm4, mm4
+    packuswb mm5, mm5
+    movd [r0+r2*1], mm2
+    movd [r0+r2*2], mm3
+    movd [r1+r2*1], mm4
+    movd [r1+r2*2], mm5
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; The top row is low-pass filtered using its left neighbour (loaded from
+; [r0-1]) and its right neighbour (t1..t4, the last taken from topright),
+; then copied into all 4 rows. r1 is reused as a row pointer after the
+; topright load.
+INIT_MMX mmxext
+cglobal pred4x4_vertical_vp8_8, 3,3
+    sub r0, r2
+    movd m1, [r0-1]
+    movd m0, [r0]
+    mova m2, m0 ;t0 t1 t2 t3
+    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
+    lea r1, [r0+r2*2]
+    psrlq m0, 8 ;t1 t2 t3 t4
+    PRED4x4_LOWPASS m3, m1, m0, m2, m4
+    movd [r0+r2*1], m3
+    movd [r0+r2*2], m3
+    movd [r1+r2*1], m3
+    movd [r1+r2*2], m3
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+; Top row and topright are joined into one 8-byte edge and low-pass
+; filtered; each successive row is the result shifted right by one more byte.
+INIT_MMX mmxext
+cglobal pred4x4_down_left_8, 3,3
+    sub r0, r2
+    movq m1, [r0]
+    punpckldq m1, [r1]
+    movq m2, m1
+    movq m3, m1
+    psllq m1, 8
+    pxor m2, m1
+    psrlq m2, 8
+    pxor m2, m3
+    PRED4x4_LOWPASS m0, m1, m2, m3, m4
+    lea r1, [r0+r2*2]
+    psrlq m0, 8
+    movd [r0+r2*1], m0
+    psrlq m0, 8
+    movd [r0+r2*2], m0
+    psrlq m0, 8
+    movd [r1+r2*1], m0
+    psrlq m0, 8
+    movd [r1+r2*2], m0
+    RET
+
+;------------------------------------------------------------------------------
+; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;------------------------------------------------------------------------------
+
+; Rows 0 and 2 come from the pavgb of adjacent edge samples (m4), rows 1 and
+; 3 from the low-pass result (m0); rows 2/3 are shifted by one byte.
+INIT_MMX mmxext
+cglobal pred4x4_vertical_left_8, 3,3
+    sub r0, r2
+    movq m1, [r0]
+    punpckldq m1, [r1]
+    movq m3, m1
+    movq m2, m1
+    psrlq m3, 8
+    psrlq m2, 16
+    movq m4, m3
+    pavgb m4, m1
+    PRED4x4_LOWPASS m0, m1, m2, m3, m5
+    lea r1, [r0+r2*2]
+    movh [r0+r2*1], m4
+    movh [r0+r2*2], m0
+    psrlq m4, 8
+    psrlq m0, 8
+    movh [r1+r2*1], m4
+    movh [r1+r2*2], m0
+    RET
+
+;------------------------------------------------------------------------------
+; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;------------------------------------------------------------------------------
+
+; Uses only the four left pixels, padded with copies of the last one. The
+; topright argument is not read; r1 is reused as a row pointer. The last row
+; is written from m1, the broadcast of the last left pixel.
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_up_8, 3,3
+    sub r0, r2
+    lea r1, [r0+r2*2]
+    movd m0, [r0+r2*1-4]
+    punpcklbw m0, [r0+r2*2-4]
+    movd m1, [r1+r2*1-4]
+    punpcklbw m1, [r1+r2*2-4]
+    punpckhwd m0, m1
+    movq m1, m0
+    punpckhbw m1, m1
+    pshufw m1, m1, 0xFF
+    punpckhdq m0, m1
+    movq m2, m0
+    movq m3, m0
+    movq m7, m0
+    psrlq m2, 16
+    psrlq m3, 8
+    pavgb m7, m3
+    PRED4x4_LOWPASS m4, m0, m2, m3, m5
+    punpcklbw m7, m4
+    movd [r0+r2*1], m7
+    psrlq m7, 16
+    movd [r0+r2*2], m7
+    psrlq m7, 16
+    movd [r1+r2*1], m7
+    movd [r1+r2*2], m1
+    RET
+
+;------------------------------------------------------------------------------
+; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
+; const uint8_t *topright,
+; ptrdiff_t stride)
+;------------------------------------------------------------------------------
+
+; Builds the edge "t2 t1 t0 lt l0 l1 l2 l3" (see the lane comments), then
+; interleaves averaged and low-pass values; rows are written bottom-up as
+; 2-byte-shifted windows. The topright argument is not read.
+INIT_MMX mmxext
+cglobal pred4x4_horizontal_down_8, 3,3
+    sub r0, r2
+    lea r1, [r0+r2*2]
+    movh m0, [r0-4] ; lt ..
+    punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
+    psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
+    movd m1, [r1+r2*2-4] ; l3
+    punpcklbw m1, [r1+r2*1-4] ; l2 l3
+    movd m2, [r0+r2*2-4] ; l1
+    punpcklbw m2, [r0+r2*1-4] ; l0 l1
+    punpckhwd m1, m2 ; l0 l1 l2 l3
+    punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
+    movq m0, m1
+    movq m2, m1
+    movq m5, m1
+    psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
+    psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
+    pavgb m5, m2
+    PRED4x4_LOWPASS m3, m1, m0, m2, m4
+    punpcklbw m5, m3
+    psrlq m3, 32
+    PALIGNR m3, m5, 6, m4
+    movh [r1+r2*2], m5
+    psrlq m5, 16
+    movh [r1+r2*1], m5
+    psrlq m5, 16
+    movh [r0+r2*2], m5
+    movh [r0+r2*1], m3
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
+; const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; The edge "t3 t2 t1 t0 lt l0 l1 l2" is assembled one byte at a time with
+; PALIGNR. Row 0 is the pavgb of the top row with its left-shifted copy,
+; row 1 the low-pass result; rows 2 and 3 prepend one further filtered
+; left sample to rows 0 and 1. The topright argument is not read.
+INIT_MMX mmxext
+cglobal pred4x4_vertical_right_8, 3,3
+    sub r0, r2
+    lea r1, [r0+r2*2]
+    movh m0, [r0] ; ........t3t2t1t0
+    movq m5, m0
+    PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
+    pavgb m5, m0
+    PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
+    movq m1, m0
+    PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
+    movq m2, m0
+    PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
+    PRED4x4_LOWPASS m3, m1, m0, m2, m4
+    movq m1, m3
+    psrlq m3, 16
+    psllq m1, 48
+    movh [r0+r2*1], m5
+    movh [r0+r2*2], m3
+    PALIGNR m5, m1, 7, m2
+    psllq m1, 8
+    movh [r1+r2*1], m5
+    PALIGNR m3, m1, 7, m1
+    movh [r1+r2*2], m3
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
+; ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+
+; Left column, top-left and top row are combined into one edge vector and
+; low-pass filtered; rows are written bottom-up, each shifted right by one
+; more byte. The topright argument is not read.
+INIT_MMX mmxext
+cglobal pred4x4_down_right_8, 3,3
+    sub r0, r2
+    lea r1, [r0+r2*2]
+    movq m1, [r1-8]
+    movq m2, [r0+r2*1-8]
+    punpckhbw m2, [r0-8]
+    movh m3, [r0]
+    punpckhwd m1, m2
+    PALIGNR m3, m1, 5, m1
+    movq m1, m3
+    PALIGNR m3, [r1+r2*1-8], 7, m4
+    movq m2, m3
+    PALIGNR m3, [r1+r2*2-8], 7, m4
+    PRED4x4_LOWPASS m0, m3, m1, m2, m4
+    movh [r1+r2*2], m0
+    psrlq m0, 8
+    movh [r1+r2*1], m0
+    psrlq m0, 8
+    movh [r0+r2*2], m0
+    psrlq m0, 8
+    movh [r0+r2*1], m0
+    RET
diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm new file
mode 100644 index 0000000000..2f30807332 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/h264_intrapred_10bit.asm @@ -0,0 +1,1119 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Daniel Kang <daniel.d.kang@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pw_1023 +%define pw_pixel_max pw_1023 +cextern pw_512 +cextern pw_16 +cextern pw_8 +cextern pw_4 +cextern pw_2 +cextern pw_1 +cextern pd_16 + +pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 +pw_m3: times 8 dw -3 +pd_17: times 4 dd 17 + +SECTION .text + +; dest, left, right, src +; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 +%macro PRED4x4_LOWPASS 4 + paddw %2, %3 + psrlw %2, 1 + pavgw %1, %4, %2 +%endmacro + +;----------------------------------------------------------------------------- +; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED4x4_DR 0 +cglobal pred4x4_down_right_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movhps m1, [r1-8] + movhps m2, [r0+r2*1-8] + movhps m4, [r0-8] + punpckhwd m2, m4 + movq m3, [r0] + punpckhdq m1, m2 + PALIGNR m3, m1, 10, m1 + movhps m4, [r1+r2*1-8] + PALIGNR m0, m3, m4, 14, m4 + movhps m4, [r1+r2*2-8] + PALIGNR m2, m0, m4, 14, m4 + PRED4x4_LOWPASS m0, m2, m3, m0 + movq [r1+r2*2], m0 + psrldq m0, 2 + movq [r1+r2*1], m0 + psrldq m0, 2 + movq [r0+r2*2], m0 + psrldq m0, 2 + movq [r0+r2*1], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_DR +INIT_XMM ssse3 +PRED4x4_DR +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_DR +%endif + +;------------------------------------------------------------------------------ +; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------ +%macro PRED4x4_VR 0 +cglobal pred4x4_vertical_right_10, 3, 3, 6 + sub 
r0, r2 + lea r1, [r0+r2*2] + movq m5, [r0] ; ........t3t2t1t0 + movhps m1, [r0-8] + PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt + pavgw m5, m0 + movhps m1, [r0+r2*1-8] + PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 + movhps m2, [r0+r2*2-8] + PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 + movhps m3, [r1+r2*1-8] + PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2 + PRED4x4_LOWPASS m1, m0, m2, m1 + pslldq m0, m1, 12 + psrldq m1, 4 + movq [r0+r2*1], m5 + movq [r0+r2*2], m1 + PALIGNR m5, m0, 14, m2 + pslldq m0, 2 + movq [r1+r2*1], m5 + PALIGNR m1, m0, 14, m0 + movq [r1+r2*2], m1 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_VR +INIT_XMM ssse3 +PRED4x4_VR +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_VR +%endif + +;------------------------------------------------------------------------------- +; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------- +%macro PRED4x4_HD 0 +cglobal pred4x4_horizontal_down_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movq m0, [r0-8] ; lt .. + movhps m0, [r0] + pslldq m0, 2 ; t2 t1 t0 lt .. .. .. .. + movq m1, [r1+r2*2-8] ; l3 + movq m3, [r1+r2*1-8] + punpcklwd m1, m3 ; l2 l3 + movq m2, [r0+r2*2-8] ; l1 + movq m3, [r0+r2*1-8] + punpcklwd m2, m3 ; l0 l1 + punpckhdq m1, m2 ; l0 l1 l2 l3 + punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 + psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 + psrldq m3, m1, 2 ; .. 
t2 t1 t0 lt l0 l1 l2 + pavgw m5, m1, m3 + PRED4x4_LOWPASS m3, m1, m0, m3 + punpcklwd m5, m3 + psrldq m3, 8 + PALIGNR m3, m5, 12, m4 + movq [r1+r2*2], m5 + movhps [r0+r2*2], m5 + psrldq m5, 4 + movq [r1+r2*1], m5 + movq [r0+r2*1], m3 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_HD +INIT_XMM ssse3 +PRED4x4_HD +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_HD +%endif + +;----------------------------------------------------------------------------- +; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- + +INIT_MMX mmxext +cglobal pred4x4_dc_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movq m2, [r0+r2*1-8] + paddw m2, [r0+r2*2-8] + paddw m2, [r1+r2*1-8] + paddw m2, [r1+r2*2-8] + psrlq m2, 48 + movq m0, [r0] + HADDW m0, m1 + paddw m0, [pw_4] + paddw m0, m2 + psrlw m0, 3 + SPLATW m0, m0, 0 + movq [r0+r2*1], m0 + movq [r0+r2*2], m0 + movq [r1+r2*1], m0 + movq [r1+r2*2], m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED4x4_DL 0 +cglobal pred4x4_down_left_10, 3, 3 + sub r0, r2 + movq m0, [r0] + movhps m0, [r1] + psrldq m2, m0, 2 + pslldq m3, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m3, m2, m0 + lea r1, [r0+r2*2] + movhps [r1+r2*2], m0 + psrldq m0, 2 + movq [r0+r2*1], m0 + psrldq m0, 2 + movq [r0+r2*2], m0 + psrldq m0, 2 + movq [r1+r2*1], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_DL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_DL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED4x4_VL 0 +cglobal pred4x4_vertical_left_10, 
3, 3 + sub r0, r2 + movu m1, [r0] + movhps m1, [r1] + psrldq m0, m1, 2 + psrldq m2, m1, 4 + pavgw m4, m0, m1 + PRED4x4_LOWPASS m0, m1, m2, m0 + lea r1, [r0+r2*2] + movq [r0+r2*1], m4 + movq [r0+r2*2], m0 + psrldq m4, 2 + psrldq m0, 2 + movq [r1+r2*1], m4 + movq [r1+r2*2], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED4x4_VL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED4x4_VL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_MMX mmxext +cglobal pred4x4_horizontal_up_10, 3, 3 + sub r0, r2 + lea r1, [r0+r2*2] + movq m0, [r0+r2*1-8] + punpckhwd m0, [r0+r2*2-8] + movq m1, [r1+r2*1-8] + punpckhwd m1, [r1+r2*2-8] + punpckhdq m0, m1 + pshufw m1, m1, 0xFF + movq [r1+r2*2], m1 + movd [r1+r2*1+4], m1 + pshufw m2, m0, 11111001b + movq m1, m2 + pavgw m2, m0 + + pshufw m5, m0, 11111110b + PRED4x4_LOWPASS m1, m0, m5, m1 + movq m6, m2 + punpcklwd m6, m1 + movq [r0+r2*1], m6 + psrlq m2, 16 + psrlq m1, 16 + punpcklwd m2, m1 + movq [r0+r2*2], m2 + psrlq m2, 32 + movd [r1+r2*1], m2 + RET + + + +;----------------------------------------------------------------------------- +; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_vertical_10, 2, 2 + sub r0, r1 + mova m0, [r0] +%rep 3 + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + lea r0, [r0+r1*2] +%endrep + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_horizontal_10, 2, 3 + mov r2d, 4 +.loop: + movq m0, [r0+r1*0-8] + movq m1, [r0+r1*1-8] + pshuflw m0, m0, 0xff + 
pshuflw m1, m1, 0xff + punpcklqdq m0, m0 + punpcklqdq m1, m1 + mova [r0+r1*0], m0 + mova [r0+r1*1], m1 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro MOV8 2-3 +; sort of a hack, but it works + movdqa [%1], %2 +%endmacro + +%macro PRED8x8_DC 1 +cglobal pred8x8_dc_10, 2, 6 + sub r0, r1 + pxor m4, m4 + movq m0, [r0+0] + movq m1, [r0+8] + punpcklwd m0, m1 + movhlps m1, m0 + paddw m0, m1 + %1 m2, m0, 00001110b + paddw m0, m2 + + lea r5, [r1*3] + lea r4, [r0+r1*4] + movzx r2d, word [r0+r1*1-2] + movzx r3d, word [r0+r1*2-2] + add r2d, r3d + movzx r3d, word [r0+r5*1-2] + add r2d, r3d + movzx r3d, word [r4-2] + add r2d, r3d + movd m2, r2d ; s2 + + movzx r2d, word [r4+r1*1-2] + movzx r3d, word [r4+r1*2-2] + add r2d, r3d + movzx r3d, word [r4+r5*1-2] + add r2d, r3d + movzx r3d, word [r4+r1*4-2] + add r2d, r3d + movd m3, r2d ; s3 + + punpcklwd m2, m3 + punpckldq m0, m2 ; s0, s1, s2, s3 + %1 m3, m0, 11110110b ; s2, s1, s3, s3 + %1 m0, m0, 01110100b ; s0, s1, s3, s1 + paddw m0, m3 + psrlw m0, 2 + pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 + punpcklwd m0, m0 + pshufd m3, m0, 11111010b + punpckldq m0, m0 + SWAP 0,1 + MOV8 r0+r1*1, m1, m2 + MOV8 r0+r1*2, m1, m2 + MOV8 r0+r5*1, m1, m2 + MOV8 r0+r1*4, m1, m2 + MOV8 r4+r1*1, m3, m4 + MOV8 r4+r1*2, m3, m4 + MOV8 r4+r5*1, m3, m4 + MOV8 r4+r1*4, m3, m4 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8_DC pshuflw + +;----------------------------------------------------------------------------- +; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_top_dc_10, 2, 4 + sub r0, r1 + mova m0, [r0] + pshuflw m1, m0, 0x4e + pshufhw m1, m1, 0x4e + paddw m0, m1 + pshuflw m1, m0, 0xb1 + pshufhw m1, m1, 0xb1 + paddw m0, m1 + 
lea r2, [r1*3] + lea r3, [r0+r1*4] + paddw m0, [pw_2] + psrlw m0, 2 + mova [r0+r1*1], m0 + mova [r0+r1*2], m0 + mova [r0+r2*1], m0 + mova [r0+r1*4], m0 + mova [r3+r1*1], m0 + mova [r3+r1*2], m0 + mova [r3+r2*1], m0 + mova [r3+r1*4], m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8_plane_10, 2, 7, 7 + sub r0, r1 + lea r2, [r1*3] + lea r3, [r0+r1*4] + mova m2, [r0] + pmaddwd m2, [pw_m32101234] + HADDD m2, m1 + movd m0, [r0-4] + psrld m0, 14 + psubw m2, m0 ; H + movd m0, [r3+r1*4-4] + movd m1, [r0+12] + paddw m0, m1 + psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7]) + movzx r4d, word [r3+r1*1-2] ; src[4*stride-1] + movzx r5d, word [r0+r2*1-2] ; src[2*stride-1] + sub r4d, r5d + movzx r6d, word [r3+r1*2-2] ; src[5*stride-1] + movzx r5d, word [r0+r1*2-2] ; src[1*stride-1] + sub r6d, r5d + lea r4d, [r4+r6*2] + movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] + movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] + sub r5d, r6d + lea r5d, [r5*3] + add r4d, r5d + movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] + movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] + sub r6d, r5d + lea r4d, [r4+r6*4] + movd m3, r4d ; V + punpckldq m2, m3 + pmaddwd m2, [pd_17] + paddd m2, [pd_16] + psrad m2, 5 ; b, c + + mova m3, [pw_pixel_max] + pxor m1, m1 + SPLATW m0, m0, 1 + SPLATW m4, m2, 2 + SPLATW m2, m2, 0 + pmullw m2, [pw_m32101234] ; b + pmullw m5, m4, [pw_m3] ; c + paddw m5, [pw_16] + mov r2d, 8 + add r0, r1 +.loop: + paddsw m6, m2, m5 + paddsw m6, m0 + psraw m6, 5 + CLIPW m6, m1, m3 + mova [r0], m6 + paddw m5, m4 + add r0, r1 + dec r2d + jg .loop + RET + + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) 
+;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred8x8l_128_dc_10, 4, 4 + mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) + lea r1, [r3*3] + lea r2, [r0+r3*4] + MOV8 r0+r3*0, m0, m0 + MOV8 r0+r3*1, m0, m0 + MOV8 r0+r3*2, m0, m0 + MOV8 r0+r1*1, m0, m0 + MOV8 r2+r3*0, m0, m0 + MOV8 r2+r3*1, m0, m0 + MOV8 r2+r3*2, m0, m0 + MOV8 r2+r1*1, m0, m0 + RET + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_TOP_DC 0 +cglobal pred8x8l_top_dc_10, 4, 4, 6 + sub r0, r3 + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] + lea r2, [r0+r3*4] + PRED4x4_LOWPASS m0, m2, m1, m0 + HADDW m0, m1 + paddw m0, [pw_4] + psrlw m0, 3 + SPLATW m0, m0, 0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r1*1], m0 + mova [r0+r3*4], m0 + mova [r2+r3*1], m0 + mova [r2+r3*2], m0 + mova [r2+r1*1], m0 + mova [r2+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_TOP_DC +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_TOP_DC +%endif + +;------------------------------------------------------------------------------- +; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;------------------------------------------------------------------------------- +;TODO: see if scalar is faster +%macro PRED8x8L_DC 0 +cglobal pred8x8l_dc_10, 4, 6, 6 + sub r0, r3 + lea r4, [r0+r3*4] + lea r5, [r3*3] + mova m0, [r0+r3*2-16] + punpckhwd m0, [r0+r3*1-16] + mova m1, [r4+r3*0-16] + punpckhwd m1, [r0+r5*1-16] + punpckhdq m1, m0 + mova m2, [r4+r3*2-16] + punpckhwd m2, [r4+r3*1-16] + mova m3, [r4+r3*4-16] + punpckhwd m3, [r4+r5*1-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + mova m0, [r0] + shr r1d, 14 + shr 
r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + not r1 + and r1, r3 + pslldq m4, m3, 2 + psrldq m5, m3, 2 + pshuflw m4, m4, 11100101b + pinsrw m5, [r0+r1-2], 7 + PRED4x4_LOWPASS m3, m4, m5, m3 + PRED4x4_LOWPASS m0, m2, m1, m0 + paddw m0, m3 + HADDW m0, m1 + paddw m0, [pw_8] + psrlw m0, 4 + SPLATW m0, m0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r5*1], m0 + mova [r0+r3*4], m0 + mova [r4+r3*1], m0 + mova [r4+r3*2], m0 + mova [r4+r5*1], m0 + mova [r4+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_DC +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_DC +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_VERTICAL 0 +cglobal pred8x8l_vertical_10, 4, 4, 6 + sub r0, r3 + mova m0, [r0] + shr r1d, 14 + shr r2d, 13 + neg r1 + pslldq m1, m0, 2 + psrldq m2, m0, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + lea r1, [r3*3] + lea r2, [r0+r3*4] + PRED4x4_LOWPASS m0, m2, m1, m0 + mova [r0+r3*1], m0 + mova [r0+r3*2], m0 + mova [r0+r1*1], m0 + mova [r0+r3*4], m0 + mova [r2+r3*1], m0 + mova [r2+r3*2], m0 + mova [r2+r1*1], m0 + mova [r2+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_VERTICAL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_VERTICAL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_HORIZONTAL 0 +cglobal pred8x8l_horizontal_10, 4, 4, 5 + mova m0, [r0-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + punpckhwd m0, [r0+r1-16] + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r3*1-16] + lea r2, [r0+r3*4] + lea r1, [r3*3] 
+ punpckhdq m1, m0 + mova m2, [r2+r3*0-16] + punpckhwd m2, [r0+r1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r3*1-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + PALIGNR m4, m3, [r2+r1-16], 14, m0 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m4, m3, m0, m4 + punpckhwd m3, m4, m4 + punpcklwd m4, m4 + pshufd m0, m3, 0xff + pshufd m1, m3, 0xaa + pshufd m2, m3, 0x55 + pshufd m3, m3, 0x00 + mova [r0+r3*0], m0 + mova [r0+r3*1], m1 + mova [r0+r3*2], m2 + mova [r0+r1*1], m3 + pshufd m0, m4, 0xff + pshufd m1, m4, 0xaa + pshufd m2, m4, 0x55 + pshufd m3, m4, 0x00 + mova [r2+r3*0], m0 + mova [r2+r3*1], m1 + mova [r2+r3*2], m2 + mova [r2+r1*1], m3 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_HORIZONTAL +INIT_XMM ssse3 +PRED8x8L_HORIZONTAL +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_HORIZONTAL +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright, +; ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_DOWN_LEFT 0 +cglobal pred8x8l_down_left_10, 4, 4, 7 + sub r0, r3 + mova m3, [r0] + shr r1d, 14 + neg r1 + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0+r1], 0 + pinsrw m2, [r0+r2+14], 7 + PRED4x4_LOWPASS m6, m2, m1, m3 + jz .fix_tr ; flags from shr r2d + mova m1, [r0+16] + psrldq m5, m1, 2 + PALIGNR m2, m1, m3, 14, m3 + pshufhw m5, m5, 10100100b + PRED4x4_LOWPASS m1, m2, m5, m1 +.do_topright: + lea r1, [r3*3] + psrldq m5, m1, 14 + lea r2, [r0+r3*4] + PALIGNR m2, m1, m6, 2, m0 + PALIGNR m3, m1, m6, 14, m0 + PALIGNR m5, m1, 2, m0 + pslldq m4, m6, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m1, m3, m5, m1 + mova [r2+r3*4], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r2+r1*1], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r2+r3*2], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r2+r3*1], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova 
[r0+r3*4], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r0+r1*1], m1 + PALIGNR m1, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*2], m1 + PALIGNR m1, m6, 14, m6 + mova [r0+r3*1], m1 + RET +.fix_tr: + punpckhwd m3, m3 + pshufd m1, m3, 0xFF + jmp .do_topright +%endmacro + +INIT_XMM sse2 +PRED8x8L_DOWN_LEFT +INIT_XMM ssse3 +PRED8x8L_DOWN_LEFT +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_DOWN_LEFT +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_DOWN_RIGHT 0 +; standard forbids this when has_topleft is false +; no need to check +cglobal pred8x8l_down_right_10, 4, 5, 8 + sub r0, r3 + lea r4, [r0+r3*4] + lea r1, [r3*3] + mova m0, [r0+r3*1-16] + punpckhwd m0, [r0+r3*0-16] + mova m1, [r0+r1*1-16] + punpckhwd m1, [r0+r3*2-16] + punpckhdq m1, m0 + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + pslldq m0, m4, 2 + pshuflw m0, m0, 11100101b + PRED4x4_LOWPASS m6, m1, m4, m3 + PRED4x4_LOWPASS m4, m3, m0, m4 + mova m3, [r0] + shr r2d, 13 + pslldq m1, m3, 2 + psrldq m2, m3, 2 + pinsrw m1, [r0-2], 0 + pinsrw m2, [r0+r2+14], 7 + PRED4x4_LOWPASS m3, m2, m1, m3 + PALIGNR m2, m3, m6, 2, m0 + PALIGNR m5, m3, m6, 14, m0 + psrldq m7, m3, 2 + PRED4x4_LOWPASS m6, m4, m2, m6 + PRED4x4_LOWPASS m3, m5, m7, m3 + mova [r4+r3*4], m6 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*2], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r1*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r0+r3*4], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + mova [r4+r3*1], m3 + PALIGNR m3, m6, 14, m2 + pslldq m6, 2 + 
mova [r4+r3*2], m3 + PALIGNR m3, m6, 14, m6 + mova [r4+r1*1], m3 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_DOWN_RIGHT +INIT_XMM ssse3 +PRED8x8L_DOWN_RIGHT +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_DOWN_RIGHT +%endif + +;----------------------------------------------------------------------------- +; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_VERTICAL_RIGHT 0 +; likewise with 8x8l_down_right +cglobal pred8x8l_vertical_right_10, 4, 5, 7 + sub r0, r3 + lea r4, [r0+r3*4] + lea r1, [r3*3] + mova m0, [r0+r3*1-16] + punpckhwd m0, [r0+r3*0-16] + mova m1, [r0+r1*1-16] + punpckhwd m1, [r0+r3*2-16] + punpckhdq m1, m0 + mova m2, [r4+r3*1-16] + punpckhwd m2, [r4+r3*0-16] + mova m3, [r4+r1*1-16] + punpckhwd m3, [r4+r3*2-16] + punpckhdq m3, m2 + punpckhqdq m3, m1 + mova m0, [r4+r3*4-16] + mova m1, [r0] + PALIGNR m4, m3, m0, 14, m0 + PALIGNR m1, m3, 2, m2 + PRED4x4_LOWPASS m3, m1, m4, m3 + mova m2, [r0] + shr r2d, 13 + pslldq m1, m2, 2 + psrldq m5, m2, 2 + pinsrw m1, [r0-2], 0 + pinsrw m5, [r0+r2+14], 7 + PRED4x4_LOWPASS m2, m5, m1, m2 + PALIGNR m6, m2, m3, 12, m1 + PALIGNR m5, m2, m3, 14, m0 + PRED4x4_LOWPASS m0, m6, m2, m5 + pavgw m2, m5 + mova [r0+r3*2], m0 + mova [r0+r3*1], m2 + pslldq m6, m3, 4 + pslldq m1, m3, 2 + PRED4x4_LOWPASS m1, m3, m6, m1 + PALIGNR m2, m1, 14, m4 + mova [r0+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r0+r3*4], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r3*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m3 + mova [r4+r3*2], m0 + pslldq m1, 2 + PALIGNR m2, m1, 14, m4 + mova [r4+r1*1], m2 + pslldq m1, 2 + PALIGNR m0, m1, 14, m1 + mova [r4+r3*4], m0 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_VERTICAL_RIGHT +INIT_XMM ssse3 +PRED8x8L_VERTICAL_RIGHT +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_VERTICAL_RIGHT +%endif + 
+;----------------------------------------------------------------------------- +; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft, +; int has_topright, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro PRED8x8L_HORIZONTAL_UP 0 +cglobal pred8x8l_horizontal_up_10, 4, 4, 6 + mova m0, [r0+r3*0-16] + punpckhwd m0, [r0+r3*1-16] + shr r1d, 14 + dec r1 + and r1, r3 + sub r1, r3 + mova m4, [r0+r1*1-16] + lea r1, [r3*3] + lea r2, [r0+r3*4] + mova m1, [r0+r3*2-16] + punpckhwd m1, [r0+r1*1-16] + punpckhdq m0, m1 + mova m2, [r2+r3*0-16] + punpckhwd m2, [r2+r3*1-16] + mova m3, [r2+r3*2-16] + punpckhwd m3, [r2+r1*1-16] + punpckhdq m2, m3 + punpckhqdq m0, m2 + PALIGNR m1, m0, m4, 14, m4 + psrldq m2, m0, 2 + pshufhw m2, m2, 10100100b + PRED4x4_LOWPASS m0, m1, m2, m0 + psrldq m1, m0, 2 + psrldq m2, m0, 4 + pshufhw m1, m1, 10100100b + pshufhw m2, m2, 01010100b + pavgw m4, m0, m1 + PRED4x4_LOWPASS m1, m2, m0, m1 + punpckhwd m5, m4, m1 + punpcklwd m4, m1 + mova [r2+r3*0], m5 + mova [r0+r3*0], m4 + pshufd m0, m5, 11111001b + pshufd m1, m5, 11111110b + pshufd m2, m5, 11111111b + mova [r2+r3*1], m0 + mova [r2+r3*2], m1 + mova [r2+r1*1], m2 + PALIGNR m2, m5, m4, 4, m0 + PALIGNR m3, m5, m4, 8, m1 + PALIGNR m5, m5, m4, 12, m4 + mova [r0+r3*1], m2 + mova [r0+r3*2], m3 + mova [r0+r1*1], m5 + RET +%endmacro + +INIT_XMM sse2 +PRED8x8L_HORIZONTAL_UP +INIT_XMM ssse3 +PRED8x8L_HORIZONTAL_UP +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +PRED8x8L_HORIZONTAL_UP +%endif + + +;----------------------------------------------------------------------------- +; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +%macro MOV16 3-5 + mova [%1+ 0], %2 + mova [%1+mmsize], %3 +%endmacro + +INIT_XMM sse2 +cglobal pred16x16_vertical_10, 2, 3 + sub r0, r1 + mov r2d, 8 + mova m0, [r0+ 0] + mova m1, [r0+mmsize] +.loop: + MOV16 r0+r1*1, m0, m1, m2, m3 + MOV16 r0+r1*2, 
m0, m1, m2, m3 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_horizontal_10, 2, 3 + mov r2d, 8 +.vloop: + movd m0, [r0+r1*0-4] + movd m1, [r0+r1*1-4] + SPLATW m0, m0, 1 + SPLATW m1, m1, 1 + MOV16 r0+r1*0, m0, m0, m0, m0 + MOV16 r0+r1*1, m1, m1, m1, m1 + lea r0, [r0+r1*2] + dec r2d + jg .vloop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_dc_10, 2, 6 + mov r5, r0 + sub r0, r1 + mova m0, [r0+0] + paddw m0, [r0+mmsize] + HADDW m0, m2 + + lea r0, [r0+r1-2] + movzx r3d, word [r0] + movzx r4d, word [r0+r1] +%rep 7 + lea r0, [r0+r1*2] + movzx r2d, word [r0] + add r3d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d +%endrep + lea r3d, [r3+r4+16] + + movd m1, r3d + paddw m0, m1 + psrlw m0, 5 + SPLATW m0, m0 + mov r3d, 8 +.loop: + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] + dec r3d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_top_dc_10, 2, 3 + sub r0, r1 + mova m0, [r0+0] + paddw m0, [r0+mmsize] + HADDW m0, m2 + + SPLATW m0, m0 + paddw m0, [pw_8] + psrlw m0, 4 + mov r2d, 8 +.loop: + MOV16 r0+r1*1, m0, m0, m0, m0 + MOV16 r0+r1*2, m0, m0, m0, m0 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride) 
+;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_left_dc_10, 2, 6 + mov r5, r0 + + sub r0, 2 + movzx r3d, word [r0] + movzx r4d, word [r0+r1] +%rep 7 + lea r0, [r0+r1*2] + movzx r2d, word [r0] + add r3d, r2d + movzx r2d, word [r0+r1] + add r4d, r2d +%endrep + lea r3d, [r3+r4+8] + shr r3d, 4 + + movd m0, r3d + SPLATW m0, m0 + mov r3d, 8 +.loop: + MOV16 r5+r1*0, m0, m0, m0, m0 + MOV16 r5+r1*1, m0, m0, m0, m0 + lea r5, [r5+r1*2] + dec r3d + jg .loop + RET + +;----------------------------------------------------------------------------- +; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pred16x16_128_dc_10, 2,3 + mova m0, [pw_512] + mov r2d, 8 +.loop: + MOV16 r0+r1*0, m0, m0, m0, m0 + MOV16 r0+r1*1, m0, m0, m0, m0 + lea r0, [r0+r1*2] + dec r2d + jg .loop + RET diff --git a/media/ffvpx/libavcodec/x86/h264_intrapred_init.c b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c new file mode 100644 index 0000000000..ee46927a24 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/h264_intrapred_init.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/codec_id.h" +#include "libavcodec/h264pred.h" + /* PRED4x4(TYPE, DEPTH, OPT) declares the prototype ff_pred4x4_<TYPE>_<DEPTH>_<OPT>(src, topright, stride); the function bodies are not defined in this file. */ +#define PRED4x4(TYPE, DEPTH, OPT) \ +void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + const uint8_t *topright, \ + ptrdiff_t stride); + /* 10-bit declarations (DEPTH = 10). */ +PRED4x4(dc, 10, mmxext) +PRED4x4(down_left, 10, sse2) +PRED4x4(down_left, 10, avx) +PRED4x4(down_right, 10, sse2) +PRED4x4(down_right, 10, ssse3) +PRED4x4(down_right, 10, avx) +PRED4x4(vertical_left, 10, sse2) +PRED4x4(vertical_left, 10, avx) +PRED4x4(vertical_right, 10, sse2) +PRED4x4(vertical_right, 10, ssse3) +PRED4x4(vertical_right, 10, avx) +PRED4x4(horizontal_up, 10, mmxext) +PRED4x4(horizontal_down, 10, sse2) +PRED4x4(horizontal_down, 10, ssse3) +PRED4x4(horizontal_down, 10, avx) + /* PRED8x8(TYPE, DEPTH, OPT) declares ff_pred8x8_<TYPE>_<DEPTH>_<OPT>(src, stride). */ +#define PRED8x8(TYPE, DEPTH, OPT) \ +void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + ptrdiff_t stride); + +PRED8x8(dc, 10, sse2) +PRED8x8(top_dc, 10, sse2) +PRED8x8(plane, 10, sse2) +PRED8x8(vertical, 10, sse2) +PRED8x8(horizontal, 10, sse2) + /* PRED8x8L(TYPE, DEPTH, OPT) declares ff_pred8x8l_<TYPE>_<DEPTH>_<OPT>(src, has_topleft, has_topright, stride); unlike the other macros, these prototypes take the two has_* flags. */ +#define PRED8x8L(TYPE, DEPTH, OPT)\ +void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + int has_topleft, \ + int has_topright, \ + ptrdiff_t stride); + +PRED8x8L(dc, 10, sse2) +PRED8x8L(dc, 10, avx) +PRED8x8L(128_dc, 10, sse2) +PRED8x8L(top_dc, 10, sse2) +PRED8x8L(top_dc, 10, avx) +PRED8x8L(vertical, 10, sse2) +PRED8x8L(vertical, 10, avx) +PRED8x8L(horizontal, 10, sse2) +PRED8x8L(horizontal, 10, ssse3) +PRED8x8L(horizontal, 10, avx) +PRED8x8L(down_left, 10, sse2) +PRED8x8L(down_left, 10, ssse3) +PRED8x8L(down_left, 10, avx) +PRED8x8L(down_right, 10, sse2) +PRED8x8L(down_right, 10, ssse3) 
+PRED8x8L(down_right, 10, avx) +PRED8x8L(vertical_right, 10, sse2) +PRED8x8L(vertical_right, 10, ssse3) +PRED8x8L(vertical_right, 10, avx) +PRED8x8L(horizontal_up, 10, sse2) +PRED8x8L(horizontal_up, 10, ssse3) +PRED8x8L(horizontal_up, 10, avx) + /* PRED16x16(TYPE, DEPTH, OPT) declares ff_pred16x16_<TYPE>_<DEPTH>_<OPT>(src, stride). */ +#define PRED16x16(TYPE, DEPTH, OPT)\ +void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ + ptrdiff_t stride); + +PRED16x16(dc, 10, sse2) +PRED16x16(top_dc, 10, sse2) +PRED16x16(128_dc, 10, sse2) +PRED16x16(left_dc, 10, sse2) +PRED16x16(vertical, 10, sse2) +PRED16x16(horizontal, 10, sse2) + +/* 8-bit versions: the same four prototype macros, instantiated with DEPTH = 8 */ +PRED16x16(vertical, 8, sse) +PRED16x16(horizontal, 8, mmxext) +PRED16x16(horizontal, 8, ssse3) +PRED16x16(dc, 8, sse2) +PRED16x16(dc, 8, ssse3) +PRED16x16(plane_h264, 8, sse2) +PRED16x16(plane_h264, 8, ssse3) +PRED16x16(plane_rv40, 8, sse2) +PRED16x16(plane_rv40, 8, ssse3) +PRED16x16(plane_svq3, 8, sse2) +PRED16x16(plane_svq3, 8, ssse3) +PRED16x16(tm_vp8, 8, sse2) +PRED16x16(tm_vp8, 8, avx2) + +PRED8x8(top_dc, 8, mmxext) +PRED8x8(dc_rv40, 8, mmxext) +PRED8x8(dc, 8, mmxext) +PRED8x8(vertical, 8, mmx) +PRED8x8(horizontal, 8, mmxext) +PRED8x8(horizontal, 8, ssse3) +PRED8x8(plane, 8, sse2) +PRED8x8(plane, 8, ssse3) +PRED8x8(tm_vp8, 8, sse2) +PRED8x8(tm_vp8, 8, ssse3) + +PRED8x8L(top_dc, 8, mmxext) +PRED8x8L(top_dc, 8, ssse3) +PRED8x8L(dc, 8, mmxext) +PRED8x8L(dc, 8, ssse3) +PRED8x8L(horizontal, 8, mmxext) +PRED8x8L(horizontal, 8, ssse3) +PRED8x8L(vertical, 8, mmxext) +PRED8x8L(vertical, 8, ssse3) +PRED8x8L(down_left, 8, sse2) +PRED8x8L(down_left, 8, ssse3) +PRED8x8L(down_right, 8, sse2) +PRED8x8L(down_right, 8, ssse3) +PRED8x8L(vertical_right, 8, sse2) +PRED8x8L(vertical_right, 8, ssse3) +PRED8x8L(vertical_left, 8, sse2) +PRED8x8L(vertical_left, 8, ssse3) +PRED8x8L(horizontal_up, 8, mmxext) +PRED8x8L(horizontal_up, 8, ssse3) +PRED8x8L(horizontal_down, 8, sse2) +PRED8x8L(horizontal_down, 8, ssse3) + +PRED4x4(dc, 8, mmxext) +PRED4x4(down_left, 8, mmxext) +PRED4x4(down_right, 8, mmxext) 
+PRED4x4(vertical_left, 8, mmxext) +PRED4x4(vertical_right, 8, mmxext) +PRED4x4(horizontal_up, 8, mmxext) +PRED4x4(horizontal_down, 8, mmxext) +PRED4x4(tm_vp8, 8, mmxext) +PRED4x4(tm_vp8, 8, ssse3) +PRED4x4(vertical_vp8, 8, mmxext) + /* Install x86 SIMD intra predictors into the pred4x4 / pred8x8 / pred8x8l / pred16x16 tables of h. Only bit_depth 8 and 10 are handled; any other value leaves h unchanged. Within each depth the EXTERNAL_*(cpu_flags) checks run from older to newer instruction sets, so an entry assigned under a later check replaces what an earlier check stored in the same slot. codec_id selects codec-specific variants in the 8-bit branch, and several pred8x8 entries are installed only when chroma_format_idc <= 1. */ +av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, + const int bit_depth, + const int chroma_format_idc) +{ + int cpu_flags = av_get_cpu_flags(); + + if (bit_depth == 8) { + if (EXTERNAL_MMX(cpu_flags)) { + if (chroma_format_idc <= 1) { + h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx; + } + } + /* MMXEXT: some 4x4 / 8x8 entries below are restricted to specific codec_id values. */ + if (EXTERNAL_MMXEXT(cpu_flags)) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext; + if (chroma_format_idc <= 1) + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext; + h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext; + h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext; + h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext; + h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext; + h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext; + h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext; + h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext; + h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext; + h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext; + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8 || + codec_id == AV_CODEC_ID_H264) { + h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext; + } + if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { + h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext; + } + if (codec_id != AV_CODEC_ID_RV40) { + h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext; + } + if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { + if (chroma_format_idc <= 1) { + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext; + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext; + } + } + if (codec_id == AV_CODEC_ID_VP7 || 
codec_id == AV_CODEC_ID_VP8) { + h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext; + h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext; + } + } + + if (EXTERNAL_SSE(cpu_flags)) { + h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse; + } + /* SSE2: for VP7/VP8 the PLANE_PRED8x8 slots receive the tm_vp8 predictors; every other codec gets a plane predictor, with the 16x16 variant chosen by codec_id (SVQ3, RV40, otherwise the h264 one). */ + if (EXTERNAL_SSE2(cpu_flags)) { + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2; + h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2; + h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2; + h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2; + h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2; + h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2; + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2; + h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2; + } else { + if (chroma_format_idc <= 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2; + if (codec_id == AV_CODEC_ID_SVQ3) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2; + } else if (codec_id == AV_CODEC_ID_RV40) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2; + } else { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2; + } + } + } + /* SSSE3: same codec_id split as the SSE2 block; entries set here replace the MMXEXT/SSE2 ones stored in the same slots. */ + if (EXTERNAL_SSSE3(cpu_flags)) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3; + if (chroma_format_idc <= 1) + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3; + h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3; + h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3; + h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3; + h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3; + h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3; + h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3; + h->pred8x8l [VERT_RIGHT_PRED ] = 
ff_pred8x8l_vertical_right_8_ssse3; + h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3; + h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3; + h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3; + if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) { + h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3; + } else { + if (chroma_format_idc <= 1) + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3; + if (codec_id == AV_CODEC_ID_SVQ3) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3; + } else if (codec_id == AV_CODEC_ID_RV40) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3; + } else { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3; + } + } + } + /* AVX2 replaces only the 16x16 PLANE_PRED8x8 slot, and only for VP8 (VP7 is not included in this check). */ + if(EXTERNAL_AVX2(cpu_flags)){ + if (codec_id == AV_CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2; + } + } + } else if (bit_depth == 10) { + /* The 10-bit branch never looks at codec_id. */ if (EXTERNAL_MMXEXT(cpu_flags)) { + h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; + h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags)) { + h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2; + h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2; + h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2; + h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2; + h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2; + + if (chroma_format_idc <= 1) { + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; + h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2; + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; + } + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; + h->pred8x8l[DC_PRED ] = 
ff_pred8x8l_dc_10_sse2; + h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2; + /* 16x16 slots: SSE2 is the only 10-bit variant installed; the SSSE3 and AVX blocks below do not touch pred16x16. */ + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2; + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2; + h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2; + h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2; + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2; + } + if (EXTERNAL_SSSE3(cpu_flags)) { + h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3; + h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; + h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; + + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; + h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; + } + if (EXTERNAL_AVX(cpu_flags)) { + h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; + h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; + h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; + h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; + h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; + + h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx; + h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx; + h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx; + h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx; + 
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx; + h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx; + h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx; + h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx; + } + } +} diff --git a/media/ffvpx/libavcodec/x86/idctdsp.asm b/media/ffvpx/libavcodec/x86/idctdsp.asm new file mode 100644 index 0000000000..1cfdb5419d --- /dev/null +++ b/media/ffvpx/libavcodec/x86/idctdsp.asm @@ -0,0 +1,112 @@ +;****************************************************************************** +;* SIMD-optimized IDCT-related routines +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2003-2013 Michael Niedermayer +;* Copyright (c) 2013 Daniel Kang +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pb_80 + +SECTION .text + +;-------------------------------------------------------------------------- +;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size) +;-------------------------------------------------------------------------- + +%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1 + mova m1, [blockq+mmsize*0+%1] + mova m2, [blockq+mmsize*2+%1] + packsswb m1, [blockq+mmsize*1+%1] + packsswb m2, [blockq+mmsize*3+%1] + paddb m1, m0 + paddb m2, m0 + movq [pixelsq+lsizeq*0], m1 + movhps [pixelsq+lsizeq*1], m1 + movq [pixelsq+lsizeq*2], m2 + movhps [pixelsq+lsize3q ], m2 +%endmacro + +INIT_XMM sse2 +cglobal put_signed_pixels_clamped, 3, 4, 3, block, pixels, lsize, lsize3 + mova m0, [pb_80] + lea lsize3q, [lsizeq*3] + PUT_SIGNED_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_SIGNED_PIXELS_CLAMPED_HALF 64 + RET + +;-------------------------------------------------------------------------- +; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block offset +%macro PUT_PIXELS_CLAMPED_HALF 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*2+%1] + packuswb m0, [blockq+mmsize*1+%1] + packuswb m1, [blockq+mmsize*3+%1] + movq [pixelsq], m0 + movhps [lsizeq+pixelsq], m0 + movq [2*lsizeq+pixelsq], m1 + movhps [lsize3q+pixelsq], m1 +%endmacro + +INIT_XMM sse2 +cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3 + lea lsize3q, [lsizeq*3] + PUT_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + 
PUT_PIXELS_CLAMPED_HALF 64 + RET + +;-------------------------------------------------------------------------- +; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels, +; ptrdiff_t line_size); +;-------------------------------------------------------------------------- +; %1 = block offset +%macro ADD_PIXELS_CLAMPED 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*1+%1] + movq m2, [pixelsq] + movq m3, [pixelsq+lsizeq] + punpcklbw m2, m4 + punpcklbw m3, m4 + paddsw m0, m2 + paddsw m1, m3 + packuswb m0, m1 + movq [pixelsq], m0 + movhps [pixelsq+lsizeq], m0 +%endmacro + +INIT_XMM sse2 +cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize + pxor m4, m4 + ADD_PIXELS_CLAMPED 0 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 32 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 64 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 96 + RET diff --git a/media/ffvpx/libavcodec/x86/idctdsp.h b/media/ffvpx/libavcodec/x86/idctdsp.h new file mode 100644 index 0000000000..738e4e36e4 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/idctdsp.h @@ -0,0 +1,33 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_IDCTDSP_H +#define AVCODEC_X86_IDCTDSP_H + +#include <stddef.h> +#include <stdint.h> + /* Prototypes for the SSE2 routines built from idctdsp.asm (cglobal add_pixels_clamped / put_pixels_clamped / put_signed_pixels_clamped under INIT_XMM sse2). Each reads 64 int16_t values from block and writes 8 rows of 8 bytes to pixels, with line_size as the byte distance between rows. add: adds block to the existing pixels (paddsw) and packs with unsigned saturation (packuswb). put: packs block with unsigned saturation and stores it. put_signed: packs with signed saturation (packsswb), then adds the external byte constant pb_80 (presumably 0x80 in every byte - defined outside this file, confirm). */ +void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); +void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); +void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); + + +#endif /* AVCODEC_X86_IDCTDSP_H */ diff --git a/media/ffvpx/libavcodec/x86/idctdsp_init.c b/media/ffvpx/libavcodec/x86/idctdsp_init.c new file mode 100644 index 0000000000..f28a1ad744 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/idctdsp_init.c @@ -0,0 +1,159 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idctdsp.h" +#include "simple_idct.h" + +/* Input permutation for the simple_idct_mmx; copied entry by entry into idct_permutation for FF_IDCT_PERM_SIMPLE */ +static const uint8_t simple_mmx_permutation[64] = { + 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, + 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, + 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, + 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, + 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, + 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, + 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, + 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, +}; + /* Reordering of the low three index bits for FF_IDCT_PERM_SSE2; OR-ed with the untouched bits (i & 0x38) in ff_init_scantable_permutation_x86(). */ +static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + /* Fill idct_permutation[0..63] for the two permutation types handled here (FF_IDCT_PERM_SIMPLE and FF_IDCT_PERM_SSE2). Returns 1 when perm_type was handled and the table was written; returns 0 and leaves the table untouched for any other perm_type. */ +av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation, + enum idct_permutation_type perm_type) +{ + int i; + + switch (perm_type) { + case FF_IDCT_PERM_SIMPLE: + for (i = 0; i < 64; i++) + idct_permutation[i] = simple_mmx_permutation[i]; + return 1; + case FF_IDCT_PERM_SSE2: + for (i = 0; i < 64; i++) + idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7]; + return 1; + } + + return 0; +} + /* Install x86 implementations into the function pointers of c. With SSE2 the three *_pixels_clamped pointers are always replaced; idct, idct_put, idct_add and perm_type are replaced only when high_bit_depth / bits_per_raw_sample, avctx->lowres and avctx->idct_algo satisfy the conditions below. The blocks are evaluated in order, so a later matching block overrides the fields set by an earlier one. */ +av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + /* ARCH_X86_32 only: MMX simple IDCT when high_bit_depth is 0 and lowres is 0; only c->idct is set here. */ +#if ARCH_X86_32 + if (EXTERNAL_MMX(cpu_flags)) { + if (!high_bit_depth && + avctx->lowres == 0 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + c->idct = ff_simple_idct_mmx; + } + } +#endif + + if (EXTERNAL_SSE2(cpu_flags)) { + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2; +
c->put_pixels_clamped = ff_put_pixels_clamped_sse2; + c->add_pixels_clamped = ff_add_pixels_clamped_sse2; + /* ARCH_X86_32 only: SSE2 idct_put / idct_add with the FF_IDCT_PERM_SIMPLE permutation; c->idct is not changed in this block. */ +#if ARCH_X86_32 + if (!high_bit_depth && + avctx->lowres == 0 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + c->idct_put = ff_simple_idct_put_sse2; + c->idct_add = ff_simple_idct_add_sse2; + c->perm_type = FF_IDCT_PERM_SIMPLE; + } +#endif + /* ARCH_X86_64: SSE2 simple_idct8 set; unlike the 32-bit conditions above this one also accepts FF_IDCT_SIMPLE. */ + if (ARCH_X86_64 && + !high_bit_depth && + avctx->lowres == 0 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX || + avctx->idct_algo == FF_IDCT_SIMPLE)) { + c->idct = ff_simple_idct8_sse2; + c->idct_put = ff_simple_idct8_put_sse2; + c->idct_add = ff_simple_idct8_add_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } + /* ARCH_X86_64 with lowres == 0: AVX variant for !high_bit_depth, then 10- and 12-bit variants keyed on avctx->bits_per_raw_sample. */ + if (ARCH_X86_64 && avctx->lowres == 0) { + if (EXTERNAL_AVX(cpu_flags) && + !high_bit_depth && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX || + avctx->idct_algo == FF_IDCT_SIMPLE)) { + c->idct = ff_simple_idct8_avx; + c->idct_put = ff_simple_idct8_put_avx; + c->idct_add = ff_simple_idct8_add_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + /* 10-bit, any codec except MPEG-4: idct_add is set to NULL; when both checks pass, the AVX assignments override the SSE2 ones made just before. */ + if (avctx->bits_per_raw_sample == 10 && + avctx->codec_id != AV_CODEC_ID_MPEG4 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLE)) { + if (EXTERNAL_SSE2(cpu_flags)) { + c->idct_put = ff_simple_idct10_put_sse2; + c->idct_add = NULL; + c->idct = ff_simple_idct10_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + + } + if (EXTERNAL_AVX(cpu_flags)) { + c->idct_put = ff_simple_idct10_put_avx; + c->idct_add = NULL; + c->idct = ff_simple_idct10_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } + /* 12-bit: only FF_IDCT_AUTO and FF_IDCT_SIMPLEMMX are accepted by this condition; idct_add is again set to NULL. */ + if (avctx->bits_per_raw_sample == 12 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + if (EXTERNAL_SSE2(cpu_flags)) { +
c->idct_put = ff_simple_idct12_put_sse2; + c->idct_add = NULL; + c->idct = ff_simple_idct12_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + if (EXTERNAL_AVX(cpu_flags)) { + c->idct_put = ff_simple_idct12_put_avx; + c->idct_add = NULL; + c->idct = ff_simple_idct12_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } + } +} diff --git a/media/ffvpx/libavcodec/x86/imdct36.asm b/media/ffvpx/libavcodec/x86/imdct36.asm new file mode 100644 index 0000000000..888c6bf4d6 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/imdct36.asm @@ -0,0 +1,736 @@ +;****************************************************************************** +;* 36 point SSE-optimized IMDCT transform +;* Copyright (c) 2011 Vitor Sessak +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +ps_mask: dd 0, ~0, ~0, ~0 +ps_mask2: dd 0, ~0, 0, ~0 +ps_mask3: dd 0, 0, 0, ~0 +ps_mask4: dd 0, ~0, 0, 0 + +ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038 +ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038 +ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433 +ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038 +ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530 +ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097 +ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097 + +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 +ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 + +ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461 + dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349 + dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896 + dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991 + dd 1.0, 0.70710678118654752439, 0.0, 0.0 + +ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 + dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349 + dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896 + dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 + dd 1.0, -0.70710678118654752439, 0.0, 0.0 + +costabs: times 4 dd 0.98480773 + times 4 dd 0.93969262 + times 4 dd 0.86602539 + times 4 dd -0.76604444 + times 4 dd -0.64278764 + times 4 dd 0.50000000 + times 4 dd -0.50000000 + times 4 dd -0.34202015 + times 4 dd -0.17364818 + times 4 dd 0.50190992 + times 4 dd 0.51763808 + times 4 dd 0.55168896 + times 4 dd 0.61038726 + times 4 dd 0.70710677 + times 4 dd 0.87172341 + times 4 dd 1.18310082 
+ times 4 dd 1.93185163 + times 4 dd 5.73685646 + +%define SBLIMIT 32 +SECTION .text + +%macro PSHUFD 3 +%if cpuflag(sse2) && notcpuflag(avx) + pshufd %1, %2, %3 +%else + shufps %1, %2, %2, %3 +%endif +%endmacro + +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x3,x4,y1,y2} +%macro BUILDINVHIGHLOW 3 +%if cpuflag(avx) + shufps %1, %2, %3, 0x4e +%else + movlhps %1, %3 + movhlps %1, %2 +%endif +%endmacro + +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x4,y1,y2,y3} +%macro ROTLEFT 3 +%if cpuflag(ssse3) + palignr %1, %3, %2, 12 +%else + BUILDINVHIGHLOW %1, %2, %3 + shufps %1, %1, %3, 0x99 +%endif +%endmacro + +%macro INVERTHL 2 +%if cpuflag(sse2) + PSHUFD %1, %2, 0x4e +%else + movhlps %1, %2 + movlhps %1, %2 +%endif +%endmacro + +%macro BUTTERF 3 + INVERTHL %2, %1 + xorps %1, [ps_p1p1m1m1] + addps %1, %2 +%if cpuflag(sse3) + mulps %1, %1, [ps_cosh_sse3 + %3] + PSHUFD %2, %1, 0xb1 + addsubps %1, %1, %2 +%else + mulps %1, [ps_cosh + %3] + PSHUFD %2, %1, 0xb1 + xorps %1, [ps_p1m1p1m1] + addps %1, %2 +%endif +%endmacro + +%macro BUTTERF2 3 +%if cpuflag(sse3) + mulps %1, %1, [ps_cosh_sse3 + %3] + PSHUFD %2, %1, 0xe1 + addsubps %1, %1, %2 +%else + mulps %1, [ps_cosh + %3] + PSHUFD %2, %1, 0xe1 + xorps %1, [ps_p1m1p1m1] + addps %1, %2 +%endif +%endmacro + +%macro STORE 4 +%if cpuflag(sse4) + movss [%3 ], %1 + extractps dword [%3 + %4], %1, 1 + extractps dword [%3 + 2*%4], %1, 2 + extractps dword [%3 + 3*%4], %1, 3 +%else + movhlps %2, %1 + movss [%3 ], %1 + movss [%3 + 2*%4], %2 + shufps %1, %1, 0xb1 + movss [%3 + %4], %1 + movhlps %2, %1 + movss [%3 + 3*%4], %2 +%endif +%endmacro + +%macro LOAD 4 + movlps %1, [%3 ] + movhps %1, [%3 + %4] + movlps %2, [%3 + 2*%4] + movhps %2, [%3 + 3*%4] + shufps %1, %2, 0x88 +%endmacro + +%macro LOADA64 2 +%if cpuflag(avx) + movu %1, [%2] +%else + movlps %1, [%2] + movhps %1, [%2 + 8] +%endif +%endmacro + +%macro DEFINE_IMDCT 0 +cglobal imdct36_float, 4,4,9, out, buf, in, win + + ; for(i=17;i>=1;i--) in[i] += in[i-1]; + 
LOADA64 m0, inq + LOADA64 m1, inq + 16 + + ROTLEFT m5, m0, m1 + + PSHUFD m6, m0, 0x93 + andps m6, m6, [ps_mask] + addps m0, m0, m6 + + LOADA64 m2, inq + 32 + + ROTLEFT m7, m1, m2 + + addps m1, m1, m5 + LOADA64 m3, inq + 48 + + ROTLEFT m5, m2, m3 + + xorps m4, m4, m4 + movlps m4, [inq+64] + BUILDINVHIGHLOW m6, m3, m4 + shufps m6, m6, m4, 0xa9 + + addps m4, m4, m6 + addps m2, m2, m7 + addps m3, m3, m5 + + ; for(i=17;i>=3;i-=2) in[i] += in[i-2]; + movlhps m5, m5, m0 + andps m5, m5, [ps_mask3] + + BUILDINVHIGHLOW m7, m0, m1 + andps m7, m7, [ps_mask2] + + addps m0, m0, m5 + + BUILDINVHIGHLOW m6, m1, m2 + andps m6, m6, [ps_mask2] + + addps m1, m1, m7 + + BUILDINVHIGHLOW m7, m2, m3 + andps m7, m7, [ps_mask2] + + addps m2, m2, m6 + + movhlps m6, m6, m3 + andps m6, m6, [ps_mask4] + + addps m3, m3, m7 + addps m4, m4, m6 + + ; Populate tmp[] + movlhps m6, m1, m5 ; zero out high values + subps m6, m6, m4 + + subps m5, m0, m3 + +%if ARCH_X86_64 + SWAP m5, m8 +%endif + + mulps m7, m2, [ps_val1] + +%if ARCH_X86_64 + mulps m5, m8, [ps_val2] +%else + mulps m5, m5, [ps_val2] +%endif + addps m7, m7, m5 + + mulps m5, m6, [ps_val1] + subps m7, m7, m5 + +%if ARCH_X86_64 + SWAP m5, m8 +%else + subps m5, m0, m3 +%endif + + subps m5, m5, m6 + addps m5, m5, m2 + + shufps m6, m4, m3, 0xe4 + subps m6, m6, m2 + mulps m6, m6, [ps_val3] + + addps m4, m4, m1 + mulps m4, m4, [ps_val4] + + shufps m1, m1, m0, 0xe4 + addps m1, m1, m2 + mulps m1, m1, [ps_val5] + + mulps m3, m3, [ps_val6] + mulps m0, m0, [ps_val7] + addps m0, m0, m3 + + xorps m2, m1, [ps_p1p1m1m1] + subps m2, m2, m4 + addps m2, m2, m0 + + addps m3, m4, m0 + subps m3, m3, m6 + xorps m3, m3, [ps_p1p1m1m1] + + shufps m0, m0, m4, 0xe4 + subps m0, m0, m1 + addps m0, m0, m6 + + BUILDINVHIGHLOW m4, m2, m3 + shufps m3, m3, m2, 0x4e + + ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5} + + BUTTERF m0, m1, 0 + BUTTERF m7, m2, 16 + BUTTERF m3, m6, 32 + BUTTERF m4, m1, 48 + BUTTERF2 m5, m1, 64 + + ; permutates: + ; m0 0 1 2 3 => 2 6 10 14 m1 + 
; m7 4 5 6 7 => 3 7 11 15 m2 + ; m3 8 9 10 11 => 17 13 9 5 m3 + ; m4 12 13 14 15 => 16 12 8 4 m5 + ; m5 16 17 xx xx => 0 1 xx xx m0 + + unpckhps m1, m0, m7 + unpckhps m6, m3, m4 + movhlps m2, m6, m1 + movlhps m1, m1, m6 + + unpcklps m5, m5, m4 + unpcklps m3, m3, m7 + movhlps m4, m3, m5 + movlhps m5, m5, m3 + SWAP m4, m3 + ; permutation done + + PSHUFD m6, m2, 0xb1 + movss m4, [bufq + 4*68] + movss m7, [bufq + 4*64] + unpcklps m7, m7, m4 + mulps m6, m6, [winq + 16*4] + addps m6, m6, m7 + movss [outq + 64*SBLIMIT], m6 + shufps m6, m6, m6, 0xb1 + movss [outq + 68*SBLIMIT], m6 + + mulps m6, m3, [winq + 4*4] + LOAD m4, m7, bufq + 4*16, 16 + addps m6, m6, m4 + STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT + + shufps m4, m0, m3, 0xb5 + mulps m4, m4, [winq + 8*4] + LOAD m7, m6, bufq + 4*32, 16 + addps m4, m4, m7 + STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT + + shufps m3, m3, m2, 0xb1 + mulps m3, m3, [winq + 12*4] + LOAD m7, m6, bufq + 4*48, 16 + addps m3, m3, m7 + STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT + + mulps m2, m2, [winq] + LOAD m6, m7, bufq, 16 + addps m2, m2, m6 + STORE m2, m7, outq, 4*SBLIMIT + + mulps m4, m1, [winq + 20*4] + STORE m4, m7, bufq, 16 + + mulps m3, m5, [winq + 24*4] + STORE m3, m7, bufq + 4*16, 16 + + shufps m0, m0, m5, 0xb0 + mulps m0, m0, [winq + 28*4] + STORE m0, m7, bufq + 4*32, 16 + + shufps m5, m5, m1, 0xb1 + mulps m5, m5, [winq + 32*4] + STORE m5, m7, bufq + 4*48, 16 + + shufps m1, m1, m1, 0xb1 + mulps m1, m1, [winq + 36*4] + movss [bufq + 4*64], m1 + shufps m1, m1, 0xb1 + movss [bufq + 4*68], m1 + RET +%endmacro + +INIT_XMM sse2 +DEFINE_IMDCT + +INIT_XMM sse3 +DEFINE_IMDCT + +INIT_XMM ssse3 +DEFINE_IMDCT + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +DEFINE_IMDCT +%endif + +INIT_XMM sse + +%if ARCH_X86_64 +%define SPILL SWAP +%define UNSPILL SWAP +%define SPILLED(x) m %+ x +%else +%define SPILLED(x) [tmpq+(x-8)*16 + 32*4] +%macro SPILL 2 ; xmm#, mempos + movaps SPILLED(%2), m%1 +%endmacro +%macro UNSPILL 2 + movaps m%1, SPILLED(%2) +%endmacro 
+%endif + +%macro DEFINE_FOUR_IMDCT 0 +cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp + movlps m0, [inq+64] + movhps m0, [inq+64 + 72] + movlps m3, [inq+64 + 2*72] + movhps m3, [inq+64 + 3*72] + + shufps m5, m0, m3, 0xdd + shufps m0, m0, m3, 0x88 + + mova m1, [inq+48] + movu m6, [inq+48 + 72] + mova m7, [inq+48 + 2*72] + movu m3, [inq+48 + 3*72] + + TRANSPOSE4x4PS 1, 6, 7, 3, 4 + + addps m4, m6, m7 + mova [tmpq+4*28], m4 + + addps m7, m3 + addps m6, m1 + addps m3, m0 + addps m0, m5 + addps m0, m7 + addps m7, m6 + mova [tmpq+4*12], m7 + SPILL 3, 12 + + mova m4, [inq+32] + movu m5, [inq+32 + 72] + mova m2, [inq+32 + 2*72] + movu m7, [inq+32 + 3*72] + + TRANSPOSE4x4PS 4, 5, 2, 7, 3 + + addps m1, m7 + SPILL 1, 11 + + addps m3, m5, m2 + SPILL 3, 13 + + addps m7, m2 + addps m5, m4 + addps m6, m7 + mova [tmpq], m6 + addps m7, m5 + mova [tmpq+4*16], m7 + + mova m2, [inq+16] + movu m7, [inq+16 + 72] + mova m1, [inq+16 + 2*72] + movu m6, [inq+16 + 3*72] + + TRANSPOSE4x4PS 2, 7, 1, 6, 3 + + addps m4, m6 + addps m6, m1 + addps m1, m7 + addps m7, m2 + addps m5, m6 + SPILL 5, 15 + addps m6, m7 + mulps m6, [costabs + 16*2] + mova [tmpq+4*8], m6 + SPILL 1, 10 + SPILL 0, 14 + + mova m1, [inq] + movu m6, [inq + 72] + mova m3, [inq + 2*72] + movu m5, [inq + 3*72] + + TRANSPOSE4x4PS 1, 6, 3, 5, 0 + + addps m2, m5 + addps m5, m3 + addps m7, m5 + addps m3, m6 + addps m6, m1 + SPILL 7, 8 + addps m5, m6 + SPILL 6, 9 + addps m6, m4, SPILLED(12) + subps m6, m2 + UNSPILL 7, 11 + SPILL 5, 11 + subps m5, m1, m7 + mulps m7, [costabs + 16*5] + addps m7, m1 + mulps m0, m6, [costabs + 16*6] + addps m0, m5 + mova [tmpq+4*24], m0 + addps m6, m5 + mova [tmpq+4*4], m6 + addps m6, m4, m2 + mulps m6, [costabs + 16*1] + subps m4, SPILLED(12) + mulps m4, [costabs + 16*8] + addps m2, SPILLED(12) + mulps m2, [costabs + 16*3] + subps m5, m7, m6 + subps m5, m2 + addps m6, m7 + addps m6, m4 + addps m7, m2 + subps m7, m4 + mova [tmpq+4*20], m7 + mova m2, [tmpq+4*28] + mova [tmpq+4*28], m5 + UNSPILL 
7, 13 + subps m5, m7, m2 + mulps m5, [costabs + 16*7] + UNSPILL 1, 10 + mulps m1, [costabs + 16*2] + addps m4, m3, m2 + mulps m4, [costabs + 16*4] + addps m2, m7 + addps m7, m3 + mulps m7, [costabs] + subps m3, m2 + mulps m3, [costabs + 16*2] + addps m2, m7, m5 + addps m2, m1 + SPILL 2, 10 + addps m7, m4 + subps m7, m1 + SPILL 7, 12 + subps m5, m4 + subps m5, m1 + UNSPILL 0, 14 + SPILL 5, 13 + addps m1, m0, SPILLED(15) + subps m1, SPILLED(8) + mova m4, [costabs + 16*5] + mulps m4, [tmpq] + UNSPILL 2, 9 + addps m4, m2 + subps m2, [tmpq] + mulps m5, m1, [costabs + 16*6] + addps m5, m2 + SPILL 5, 9 + addps m2, m1 + SPILL 2, 14 + UNSPILL 5, 15 + subps m7, m5, m0 + addps m5, SPILLED(8) + mulps m5, [costabs + 16*1] + mulps m7, [costabs + 16*8] + addps m0, SPILLED(8) + mulps m0, [costabs + 16*3] + subps m2, m4, m5 + subps m2, m0 + SPILL 2, 15 + addps m5, m4 + addps m5, m7 + addps m4, m0 + subps m4, m7 + SPILL 4, 8 + mova m7, [tmpq+4*16] + mova m2, [tmpq+4*12] + addps m0, m7, m2 + subps m0, SPILLED(11) + mulps m0, [costabs + 16*2] + addps m4, m7, SPILLED(11) + mulps m4, [costabs] + subps m7, m2 + mulps m7, [costabs + 16*7] + addps m2, SPILLED(11) + mulps m2, [costabs + 16*4] + addps m1, m7, [tmpq+4*8] + addps m1, m4 + addps m4, m2 + subps m4, [tmpq+4*8] + SPILL 4, 11 + subps m7, m2 + subps m7, [tmpq+4*8] + addps m4, m6, SPILLED(10) + subps m6, SPILLED(10) + addps m2, m5, m1 + mulps m2, [costabs + 16*9] + subps m5, m1 + mulps m5, [costabs + 16*17] + subps m1, m4, m2 + addps m4, m2 + mulps m2, m1, [winq+4*36] + addps m2, [bufq+4*36] + mova [outq+1152], m2 + mulps m1, [winq+4*32] + addps m1, [bufq+4*32] + mova [outq+1024], m1 + mulps m1, m4, [winq+4*116] + mova [bufq+4*36], m1 + mulps m4, [winq+4*112] + mova [bufq+4*32], m4 + addps m2, m6, m5 + subps m6, m5 + mulps m1, m6, [winq+4*68] + addps m1, [bufq+4*68] + mova [outq+2176], m1 + mulps m6, [winq] + addps m6, [bufq] + mova [outq], m6 + mulps m1, m2, [winq+4*148] + mova [bufq+4*68], m1 + mulps m2, [winq+4*80] + mova [bufq], 
m2 + addps m5, m3, [tmpq+4*24] + mova m2, [tmpq+4*24] + subps m2, m3 + mova m1, SPILLED(9) + subps m1, m0 + mulps m1, [costabs + 16*10] + addps m0, SPILLED(9) + mulps m0, [costabs + 16*16] + addps m6, m5, m1 + subps m5, m1 + mulps m3, m5, [winq+4*40] + addps m3, [bufq+4*40] + mova [outq+1280], m3 + mulps m5, [winq+4*28] + addps m5, [bufq+4*28] + mova [outq+896], m5 + mulps m1, m6, [winq+4*120] + mova [bufq+4*40], m1 + mulps m6, [winq+4*108] + mova [bufq+4*28], m6 + addps m1, m2, m0 + subps m2, m0 + mulps m5, m2, [winq+4*64] + addps m5, [bufq+4*64] + mova [outq+2048], m5 + mulps m2, [winq+4*4] + addps m2, [bufq+4*4] + mova [outq+128], m2 + mulps m0, m1, [winq+4*144] + mova [bufq+4*64], m0 + mulps m1, [winq+4*84] + mova [bufq+4*4], m1 + mova m1, [tmpq+4*28] + mova m5, m1 + addps m1, SPILLED(13) + subps m5, SPILLED(13) + UNSPILL 3, 15 + addps m2, m7, m3 + mulps m2, [costabs + 16*11] + subps m3, m7 + mulps m3, [costabs + 16*15] + addps m0, m2, m1 + subps m1, m2 + SWAP m0, m2 + mulps m6, m1, [winq+4*44] + addps m6, [bufq+4*44] + mova [outq+1408], m6 + mulps m1, [winq+4*24] + addps m1, [bufq+4*24] + mova [outq+768], m1 + mulps m0, m2, [winq+4*124] + mova [bufq+4*44], m0 + mulps m2, [winq+4*104] + mova [bufq+4*24], m2 + addps m0, m5, m3 + subps m5, m3 + mulps m1, m5, [winq+4*60] + addps m1, [bufq+4*60] + mova [outq+1920], m1 + mulps m5, [winq+4*8] + addps m5, [bufq+4*8] + mova [outq+256], m5 + mulps m1, m0, [winq+4*140] + mova [bufq+4*60], m1 + mulps m0, [winq+4*88] + mova [bufq+4*8], m0 + mova m1, [tmpq+4*20] + addps m1, SPILLED(12) + mova m2, [tmpq+4*20] + subps m2, SPILLED(12) + UNSPILL 7, 8 + subps m0, m7, SPILLED(11) + addps m7, SPILLED(11) + mulps m4, m7, [costabs + 16*12] + mulps m0, [costabs + 16*14] + addps m5, m1, m4 + subps m1, m4 + mulps m7, m1, [winq+4*48] + addps m7, [bufq+4*48] + mova [outq+1536], m7 + mulps m1, [winq+4*20] + addps m1, [bufq+4*20] + mova [outq+640], m1 + mulps m1, m5, [winq+4*128] + mova [bufq+4*48], m1 + mulps m5, [winq+4*100] + mova 
[bufq+4*20], m5 + addps m6, m2, m0 + subps m2, m0 + mulps m1, m2, [winq+4*56] + addps m1, [bufq+4*56] + mova [outq+1792], m1 + mulps m2, [winq+4*12] + addps m2, [bufq+4*12] + mova [outq+384], m2 + mulps m0, m6, [winq+4*136] + mova [bufq+4*56], m0 + mulps m6, [winq+4*92] + mova [bufq+4*12], m6 + UNSPILL 0, 14 + mulps m0, [costabs + 16*13] + mova m3, [tmpq+4*4] + addps m2, m0, m3 + subps m3, m0 + mulps m0, m3, [winq+4*52] + addps m0, [bufq+4*52] + mova [outq+1664], m0 + mulps m3, [winq+4*16] + addps m3, [bufq+4*16] + mova [outq+512], m3 + mulps m0, m2, [winq+4*132] + mova [bufq+4*52], m0 + mulps m2, [winq+4*96] + mova [bufq+4*16], m2 + RET +%endmacro + +INIT_XMM sse +DEFINE_FOUR_IMDCT + +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +DEFINE_FOUR_IMDCT +%endif diff --git a/media/ffvpx/libavcodec/x86/mathops.h b/media/ffvpx/libavcodec/x86/mathops.h new file mode 100644 index 0000000000..ca7e2dffc1 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/mathops.h @@ -0,0 +1,153 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
#ifndef AVCODEC_X86_MATHOPS_H
#define AVCODEC_X86_MATHOPS_H

#include "config.h"

#include "libavutil/common.h"
#include "libavutil/x86/asm.h"

/*
 * x86 inline-assembly versions of small integer helpers.
 *
 * Each helper is preceded by "#define NAME NAME".
 * NOTE(review): presumably this lets generic code detect with #ifdef/#ifndef
 * that an architecture-specific version exists -- confirm against the
 * generic mathops header, which is not visible here.
 */
#if HAVE_INLINE_ASM

#if ARCH_X86_32

/**
 * Multiply two 32-bit integers and return the 64-bit product shifted right
 * by @p shift, truncated to 32 bits.
 *
 * One-operand imull leaves the product in EDX:EAX; shrdl then shifts EAX
 * right while feeding in bits from EDX. A compile-time constant shift is
 * emitted as an immediate (masked to 5 bits); otherwise the count is passed
 * in CL.
 */
#define MULL MULL
static av_always_inline av_const int MULL(int a, int b, unsigned shift)
{
    int rt, dummy;
    if (__builtin_constant_p(shift))
    __asm__ (
        "imull %3 \n\t"
        "shrdl %4, %%edx, %%eax \n\t"
        :"=a"(rt), "=d"(dummy)
        :"a"(a), "rm"(b), "i"(shift & 0x1F)
    );
    else
    __asm__ (
        "imull %3 \n\t"
        "shrdl %4, %%edx, %%eax \n\t"
        :"=a"(rt), "=d"(dummy)
        :"a"(a), "rm"(b), "c"((uint8_t)shift)
    );
    return rt;
}

/**
 * Return the high 32 bits of the signed 64-bit product a * b.
 * The result is read from EDX; EAX (the low half) is discarded via dummy.
 */
#define MULH MULH
static av_always_inline av_const int MULH(int a, int b)
{
    int rt, dummy;
    __asm__ (
        "imull %3"
        :"=d"(rt), "=a"(dummy)
        :"a"(a), "rm"(b)
    );
    return rt;
}

/**
 * Return the full signed 64-bit product a * b.
 * The "=A" constraint binds the 64-bit result to the EDX:EAX register pair.
 */
#define MUL64 MUL64
static av_always_inline av_const int64_t MUL64(int a, int b)
{
    int64_t rt;
    __asm__ (
        "imull %2"
        :"=A"(rt)
        :"a"(a), "rm"(b)
    );
    return rt;
}

#endif /* ARCH_X86_32 */

#if HAVE_I686
/* median of 3 */
/*
 * Branch-free median using conditional moves:
 *   1. after the first compare, i = max(a, b) and a = min(a, b);
 *   2. a = max(a, c);
 *   3. i = min(i, a), which is the median of the three inputs.
 */
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
    int i=b;
    __asm__ (
        "cmp %2, %1 \n\t"
        "cmovg %1, %0 \n\t"
        "cmovg %2, %1 \n\t"
        "cmp %3, %1 \n\t"
        "cmovl %3, %1 \n\t"
        "cmp %1, %0 \n\t"
        "cmovg %1, %0 \n\t"
        :"+&r"(i), "+&r"(a)
        :"r"(b), "r"(c)
    );
    return i;
}

#if HAVE_6REGS
/*
 * If y < x (signed compare), perform three conditional copies:
 * x = y, a = b and c = d. Otherwise x, a and c are left unchanged.
 * All six operands are held in registers, hence the HAVE_6REGS guard.
 */
#define COPY3_IF_LT(x, y, a, b, c, d)\
__asm__ volatile(\
    "cmpl %0, %3 \n\t"\
    "cmovl %3, %0 \n\t"\
    "cmovl %4, %1 \n\t"\
    "cmovl %5, %2 \n\t"\
    : "+&r" (x), "+&r" (a), "+r" (c)\
    : "r" (y), "r" (b), "r" (d)\
);
#endif /* HAVE_6REGS */

#endif /* HAVE_I686 */

/*
 * Compute the sign mask and absolute value of level in place.
 * cdq sign-extends EAX (level) into EDX (mask), so mask becomes 0 or -1;
 * (level ^ mask) - mask then yields the absolute value of level.
 */
#define MASK_ABS(mask, level) \
    __asm__ ("cdq \n\t" \
             "xorl %1, %0 \n\t" \
             "subl %1, %0 \n\t" \
             : "+a"(level), "=&d"(mask))

// avoid +32 for shift optimization (gcc should do that ...)
/*
 * Arithmetic right shift of a by the negated count: a constant s is emitted
 * as the immediate (-s & 0x1F); a variable s is passed as (uint8_t)(-s) in CL.
 * NOTE(review): for s in 1..31 this amounts to a >> (32 - s), relying on the
 * CPU using only the low 5 bits of CL -- confirm the intended range of s.
 */
#define NEG_SSR32 NEG_SSR32
static inline int32_t NEG_SSR32( int32_t a, int8_t s){
    if (__builtin_constant_p(s))
    __asm__ ("sarl %1, %0\n\t"
         : "+r" (a)
         : "i" (-s & 0x1F)
    );
    else
    __asm__ ("sarl %1, %0\n\t"
         : "+r" (a)
         : "c" ((uint8_t)(-s))
    );
    return a;
}

/*
 * Unsigned counterpart of NEG_SSR32: the same negated-count shift, using a
 * logical (shrl) instead of an arithmetic right shift.
 */
#define NEG_USR32 NEG_USR32
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
    if (__builtin_constant_p(s))
    __asm__ ("shrl %1, %0\n\t"
         : "+r" (a)
         : "i" (-s & 0x1F)
    );
    else
    __asm__ ("shrl %1, %0\n\t"
         : "+r" (a)
         : "c" ((uint8_t)(-s))
    );
    return a;
}

#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */
# x86 sources of the vendored libavcodec that are built for every x86 target
# (both 32-bit and 64-bit).
# NOTE(review): mozbuild is believed to reject unsorted SOURCES lists --
# confirm before reordering or inserting entries out of alphabetical order.
SOURCES += [
    'constants.c',
    'dct32.asm',
    'dct_init.c',
    'fdct.c',
    'fdctdsp_init.c',
    'flacdsp.asm',
    'flacdsp_init.c',
    'h264_intrapred.asm',
    'h264_intrapred_10bit.asm',
    'h264_intrapred_init.c',
    'idctdsp.asm',
    'idctdsp_init.c',
    'imdct36.asm',
    'mpegaudiodsp.c',
    'videodsp.asm',
    'videodsp_init.c',
    'vp8dsp.asm',
    'vp8dsp_init.c',
    'vp8dsp_loopfilter.asm',
    'vp9dsp_init.c',
    'vp9dsp_init_10bpp.c',
    'vp9dsp_init_12bpp.c',
    'vp9dsp_init_16bpp.c',
    'vp9intrapred.asm',
    'vp9intrapred_16bpp.asm',
    'vp9itxfm.asm',
    'vp9itxfm_16bpp.asm',
    'vp9lpf.asm',
    'vp9lpf_16bpp.asm',
    'vp9mc.asm',
    'vp9mc_16bpp.asm',
]

# The simple IDCT implementation differs by CPU_ARCH value: one file is
# added for 'x86' and a different one for 'x86_64'.
if CONFIG['CPU_ARCH'] == 'x86':
    SOURCES += [ 'simple_idct.asm' ]

if CONFIG['CPU_ARCH'] == 'x86_64':
    SOURCES += [ 'simple_idct10.asm' ]

# The FFT sources are only compiled when MOZ_LIBAV_FFT is set.
if CONFIG['MOZ_LIBAV_FFT']:
    SOURCES += [
        'fft.asm',
        'fft_init.c',
    ]

# Objects from this directory are linked into the 'mozavcodec' library.
FINAL_LIBRARY = 'mozavcodec'

# Pull in the build settings shared by the ffvpx directories.
include('/media/ffvpx/ffvpxcommon.mozbuild')
#include <stddef.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

/*
 * For one CPU flavour, forward-declare the static C wrapper
 * imdct36_blocks_<CPU> (defined below by DECL_IMDCT_BLOCKS) and the external
 * single-block routine ff_imdct36_float_<CPU>.
 */
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if HAVE_X86ASM
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */

/* External routines that process four IMDCT36 blocks per call. */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

/*
 * Interleaved window tables for the four-block routines, filled by
 * ff_mpadsp_init_x86_tabs(). Indexed as [use window 0 for the first two
 * lanes][block type][4 * coefficient + lane].
 */
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_6REGS && HAVE_SSE_INLINE

/* Multiply-accumulate and multiply-subtract primitives used with SUM8. */
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

/*
 * Apply op to sum with eight (window, sample) pairs taken at a stride of
 * 64 floats from w and p.
 */
#define SUM8(op, sum, w, p) \
{ \
    op(sum, (w)[0 * 64], (p)[0 * 64]); \
    op(sum, (w)[1 * 64], (p)[1 * 64]); \
    op(sum, (w)[2 * 64], (p)[2 * 64]); \
    op(sum, (w)[3 * 64], (p)[3 * 64]); \
    op(sum, (w)[4 * 64], (p)[4 * 64]); \
    op(sum, (w)[5 * 64], (p)[5 * 64]); \
    op(sum, (w)[6 * 64], (p)[6 * 64]); \
    op(sum, (w)[7 * 64], (p)[7 * 64]); \
}

/**
 * Compute two windowed, negated dot-product rows with SSE, four outputs per
 * loop iteration.
 *
 * For every output index, eight taps of buf (64 floats apart) are multiplied
 * by eight taps of win1 (64 floats apart) and subtracted from sum1's
 * accumulator, and the same buf taps are multiplied by eight taps of win2
 * (16 floats apart) and subtracted from sum2's accumulator.
 *
 * All pointers are advanced by len and indexed with a negative byte offset
 * that counts up to zero, so the loop needs no separate end compare.
 * movaps is used throughout, so every accessed address must be 16-byte
 * aligned.
 * NOTE(review): the loop always runs at least once and steps 4 floats at a
 * time, so len is presumably a positive multiple of 4 -- both callers in
 * this file pass 16.
 */
static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;


/* One tap: a is the byte offset into win1/buf, b the byte offset into win2. */
#define MULT(a, b) \
    "movaps " #a "(%1,%0), %%xmm1 \n\t" \
    "movaps " #a "(%3,%0), %%xmm2 \n\t" \
    "mulps %%xmm2, %%xmm1 \n\t" \
    "subps %%xmm1, %%xmm0 \n\t" \
    "mulps " #b "(%2,%0), %%xmm2 \n\t" \
    "subps %%xmm2, %%xmm4 \n\t" \

    __asm__ volatile(
        "1: \n\t"
        "xorps %%xmm0, %%xmm0 \n\t"
        "xorps %%xmm4, %%xmm4 \n\t"

        MULT( 0, 0)
        MULT( 256, 64)
        MULT( 512, 128)
        MULT( 768, 192)
        MULT(1024, 256)
        MULT(1280, 320)
        MULT(1536, 384)
        MULT(1792, 448)

        "movaps %%xmm0, (%4,%0) \n\t"
        "movaps %%xmm4, (%5,%0) \n\t"
        "add $16, %0 \n\t"
        "jl 1b \n\t"
        :"+&r"(count)
        :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
    );

#undef MULT
}

/**
 * SSE implementation installed as MPADSPContext.apply_window_float.
 *
 * Steps, as performed below:
 *  - copy the first 32 floats of in to in + 512 so later reads do not wrap;
 *  - compute four partial-sum rows (suma, sumb, sumc, sumd) with
 *    apply_window(), then patch suma[0] and zero sumc[0], sumb[16], sumd[16];
 *  - combine the rows into 32 outputs: either with SSE when incr == 1, or
 *    with a scalar loop writing forwards through out and backwards through
 *    out2 for any other stride;
 *  - write the final middle sample, computed with SUM8(MLSS, ...).
 *
 * @param in     sample buffer; it is written to at in + 512 .. in + 543
 * @param win    window coefficients
 * @param unused not referenced by this implementation
 * @param out    destination samples
 * @param incr   distance between consecutive output samples, in floats
 */
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             ptrdiff_t incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    __asm__ volatile(
        "movaps 0(%0), %%xmm0 \n\t" \
        "movaps 16(%0), %%xmm1 \n\t" \
        "movaps 32(%0), %%xmm2 \n\t" \
        "movaps 48(%0), %%xmm3 \n\t" \
        "movaps %%xmm0, 0(%1) \n\t" \
        "movaps %%xmm1, 16(%1) \n\t" \
        "movaps %%xmm2, 32(%1) \n\t" \
        "movaps %%xmm3, 48(%1) \n\t" \
        "movaps 64(%0), %%xmm0 \n\t" \
        "movaps 80(%0), %%xmm1 \n\t" \
        "movaps 96(%0), %%xmm2 \n\t" \
        "movaps 112(%0), %%xmm3 \n\t" \
        "movaps %%xmm0, 64(%1) \n\t" \
        "movaps %%xmm1, 80(%1) \n\t" \
        "movaps %%xmm2, 96(%1) \n\t" \
        "movaps %%xmm3, 112(%1) \n\t"
        ::"r"(in), "r"(in+512)
        :"memory"
    );

    apply_window(in + 16, win , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    SUM8(MACS, suma[0], win + 32, in + 48);

    /* Elements the combination step reads but apply_window() did not produce
     * in the required form. */
    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

/*
 * Produce 8 outputs: load 4 floats of sumd / sumc (unaligned), reverse their
 * order with shufps $0x1b, then subtract suma / add sumb and store to out.
 * All arguments are byte offsets.
 */
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
    "movups " #sumd "(%4), %%xmm0 \n\t" \
    "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
    "subps " #suma "(%1), %%xmm0 \n\t" \
    "movaps %%xmm0," #out1 "(%0) \n\t" \
\
    "movups " #sumc "(%3), %%xmm0 \n\t" \
    "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
    "addps " #sumb "(%2), %%xmm0 \n\t" \
    "movaps %%xmm0," #out2 "(%0) \n\t"

    if (incr == 1) {
        /* Contiguous output: vectorised combination, out must be aligned. */
        __asm__ volatile(
            SUMS( 0, 48, 4, 52, 0, 112)
            SUMS(16, 32, 20, 36, 16, 96)
            SUMS(32, 16, 36, 20, 32, 80)
            SUMS(48, 0, 52, 4, 48, 64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        out += 16*incr;
    } else {
        /* Strided output: out walks up from sample 0, out2 walks down from
         * sample 32; both end next to sample 16, which is written last. */
        int j;
        float *out2 = out + 32 * incr;
        out[0 ] = -suma[ 0];
        out += incr;
        out2 -= incr;
        for(j=1;j<16;j++) {
            *out = -suma[ j] + sumd[16-j];
            *out2 = sumb[16-j] + sumc[ j];
            out += incr;
            out2 -= incr;
        }
    }

    /* Middle sample (index 16). */
    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_X86ASM
/*
 * Define imdct36_blocks_<CPU1>(): run IMDCT36 over count blocks.
 *
 * Blocks are first processed four at a time with
 * ff_four_imdct36_float_<CPU2>, using the interleaved mdct_win_sse table
 * (first index is 1 only when switch_point is set and j < 4); in and buf
 * advance by 4*18 floats and out by 4 per group. The remaining
 * (count & 3) blocks are processed one at a time with
 * ff_imdct36_float_<CPU1>, using ff_mdct_win_float[win_idx + 4] for odd j
 * and ff_mdct_win_float[win_idx] for even j; there in advances by 18 floats
 * and buf and out by one float per block.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
                               int count, int switch_point, int block_type) \
{ \
    int align_end = count - (count & 3); \
    int j; \
    for (j = 0; j < align_end; j+= 4) { \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
        float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
        /* apply window & overlap with previous buffer */ \
 \
        /* select window */ \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
        in += 4*18; \
        buf += 4*18; \
        out += 4; \
    } \
    for (; j < count; j++) { \
        /* apply window & overlap with previous buffer */ \
 \
        /* select window */ \
        int win_idx = (switch_point && j < 2) ? 0 : block_type; \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
 \
        ff_imdct36_float_ ## CPU1(out, buf, in, win); \
 \
        in += 18; \
        buf++; \
        out++; \
    } \
}

/* The SSE2/SSE3/SSSE3 wrappers all share the SSE four-block routine. */
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */

/**
 * Fill mdct_win_sse from ff_mdct_win_float.
 *
 * Each coefficient i is expanded to four lanes:
 *  - mdct_win_sse[0][j]: windows j, j + 4, j, j + 4;
 *  - mdct_win_sse[1][j]: windows 0, 4, j, j + 4, i.e. the first two lanes
 *    use window 0 instead of window j.
 */
av_cold void ff_mpadsp_init_x86_tabs(void)
{
    int i, j;
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i ++) {
            mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }
}

/**
 * Install the x86-optimised function pointers supported by the running CPU.
 *
 * The checks run from the oldest to the newest instruction set and each one
 * overwrites the previous assignment, so the last matching check wins.
 *
 * @param s DSP context whose function pointers are updated
 */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    av_unused int cpu_flags = av_get_cpu_flags();

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags)) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_X86ASM
#if HAVE_SSE
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif
#endif /* HAVE_X86ASM */
}
+; +; Conversion from gcc syntax to x264asm syntax with minimal modifications +; by James Darnley <jdarnley@obe.tv>. +; +; This file is part of FFmpeg. +; +; FFmpeg is free software; you can redistribute it and/or +; modify it under the terms of the GNU Lesser General Public +; License as published by the Free Software Foundation; either +; version 2.1 of the License, or (at your option) any later version. +; +; FFmpeg is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Lesser General Public License for more details. +; +; You should have received a copy of the GNU Lesser General Public +; License along with FFmpeg; if not, write to the Free Software +; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;/ + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +%if ARCH_X86_32 +cextern pb_80 + +wm1010: dw 0, 0xffff, 0, 0xffff +d40000: dd 4 << 16, 0 + +; 23170.475006 +; 22725.260826 +; 21406.727617 +; 19265.545870 +; 16384.000000 +; 12872.826198 +; 8866.956905 +; 4520.335430 + +%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 +%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C6 8867 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +%define C7 4520 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +%define ROW_SHIFT 11 +%define COL_SHIFT 20 ; 6 + +coeffs: + dw 1 << (ROW_SHIFT - 1), 0 + dw 1 << (ROW_SHIFT - 1), 0 + dw 1 << (ROW_SHIFT - 1), 1 + dw 1 << (ROW_SHIFT - 1), 0 + + dw C4, C4, C4, C4 + dw C4, -C4, C4, -C4 + + dw C2, C6, C2, C6 + dw C6, -C2, C6, -C2 + + dw C1, C3, C1, C3 + dw C5, C7, C5, C7 + + dw C3, -C7, C3, -C7 + dw -C1, -C5, -C1, -C5 + + dw C5, -C1, C5, -C1 + dw 
C7, C3, C7, C3 + + dw C7, -C5, C7, -C5 + dw C3, -C1, C3, -C1 + +SECTION .text + +%macro DC_COND_IDCT 7 + movq mm0, [blockq + %1] ; R4 R0 r4 r0 + movq mm1, [blockq + %2] ; R6 R2 r6 r2 + movq mm2, [blockq + %3] ; R3 R1 r3 r1 + movq mm3, [blockq + %4] ; R7 R5 r7 r5 + movq mm4, [wm1010] + pand mm4, mm0 + por mm4, mm1 + por mm4, mm2 + por mm4, mm3 + packssdw mm4, mm4 + movd t0d, mm4 + or t0d, t0d + jz %%1 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, [coeffs + 8] + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5 + paddd mm0, [coeffs + 8] + paddd mm1, mm0 ; A1 a1 + paddd mm0, mm0 + psubd mm0, mm1 ; A2 a2 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm5 ; B0 b0 + movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm5, mm2 ; B1 b1 + psrad mm7, %7 + psrad mm4, %7 + movq mm2, mm1 ; A1 a1 + paddd mm1, mm5 ; A1+B1 a1+b1 + psubd mm2, mm5 ; A1-B1 a1-b1 + psrad mm1, %7 + psrad mm2, %7 + packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0 + packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 + movq [%5], mm7 + movq mm1, [blockq + %3] ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + movq [24 + %5], mm2 + pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm0 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 
-C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm0, mm4 ; a2-B2 a2-b2 + psrad mm2, %7 + psrad mm0, %7 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm1 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %7 + packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2 + movq [8 + %5], mm2 + psrad mm4, %7 + packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3 + movq [16 + %5], mm4 + jmp %%2 +%%1: + pslld mm0, 16 + paddd mm0, [d40000] + psrad mm0, 13 + packssdw mm0, mm0 + movq [%5], mm0 + movq [8 + %5], mm0 + movq [16 + %5], mm0 + movq [24 + %5], mm0 +%%2: +%endmacro + +%macro Z_COND_IDCT 8 + movq mm0, [blockq + %1] ; R4 R0 r4 r0 + movq mm1, [blockq + %2] ; R6 R2 r6 r2 + movq mm2, [blockq + %3] ; R3 R1 r3 r1 + movq mm3, [blockq + %4] ; R7 R5 r7 r5 + movq mm4, mm0 + por mm4, mm1 + por mm4, mm2 + por mm4, mm3 + packssdw mm4, mm4 + movd t0d, mm4 + or t0d, t0d + jz %8 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, [coeffs] + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5 + paddd mm0, [coeffs] + paddd mm1, mm0 ; A1 a1 + paddd mm0, mm0 + psubd mm0, mm1 ; A2 a2 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm5 ; B0 b0 + movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm5, mm2 ; B1 b1 + psrad mm7, %7 + psrad mm4, %7 + movq mm2, mm1 ; A1 a1 + paddd mm1, mm5 ; A1+B1 a1+b1 + psubd mm2, mm5 ; A1-B1 a1-b1 + psrad 
mm1, %7 + psrad mm2, %7 + packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0 + packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 + movq [%5], mm7 + movq mm1, [blockq + %3] ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + movq [24 + %5], mm2 + pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm0 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm0, mm4 ; a2-B2 a2-b2 + psrad mm2, %7 + psrad mm0, %7 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm1 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %7 + packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2 + movq [8 + %5], mm2 + psrad mm4, %7 + packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3 + movq [16 + %5], mm4 +%endmacro + +%macro IDCT1 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm1, %2 ; R6 R2 r6 r2 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm1 ; B0 b0 + movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm1, mm2 ; B1 b1 + psrad mm7, 
%6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm1 ; A1+B1 a1+b1 + psubd mm2, mm1 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm0, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm0 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm4, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm4, mm4 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT2 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm1, %2 ; R6 R2 r6 r2 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm1, mm4 ; A0+B0 a0+b0 + 
paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm1 ; A0-B0 a0-b0 + psrad mm1, %6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm7 ; A1+B1 a1+b1 + psubd mm2, mm7 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm1, mm1 ; A0+B0 a0+b0 + movd [%5], mm1 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm1, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm2, mm1 ; A2+B2 a2+b2 + psubd mm5, mm1 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm1, mm6 ; A3 a3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm1, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm1, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm1, mm1 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm1 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT3 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm1, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm1 ; A0-B0 a0-b0 + psrad mm1, %6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm7 ; A1+B1 a1+b1 + psubd mm2, mm7 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm1, mm1 ; A0+B0 a0+b0 + movd [%5], mm1 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm1, [coeffs 
+ 88] ; C3 C7 C3 C7 + pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5 + movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm2, mm1 ; A2+B2 a2+b2 + psubd mm5, mm1 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm1, mm6 ; A3 a3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm1, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm1, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm1, mm1 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm1 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT4 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm3, %4 ; R7 R5 r7 r5 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm1, [coeffs + 56] ; C7 C5 C7 C5 + pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5 + pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm1 ; B0 b0 + movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1 + pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + paddd mm1, mm2 ; B1 b1 + psrad mm7, %6 + psrad mm4, %6 + movq mm2, mm0 ; A1 a1 + paddd mm0, mm1 ; A1+B1 a1+b1 + psubd mm2, mm1 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm2, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm2, mm2 ; A1-B1 a1-b1 + movd [96 + %5], mm2 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm0, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1 + movq mm7, [coeffs + 88] ; C3 C7 C3 C7 + pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5 + 
movq mm2, mm5 ; A2 a2 + pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5 + paddd mm4, mm7 ; B2 b2 + paddd mm2, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm2, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm3, mm0 ; B3 b3 + paddd mm6, mm3 ; A3+B3 a3+b3 + psubd mm4, mm3 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm4, %6 + packssdw mm2, mm2 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm2 + packssdw mm4, mm4 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT5 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm3, [coeffs + 64] + pmaddwd mm3, mm2 ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + psrad mm7, %6 + psrad mm4, %6 + movq mm1, mm0 ; A1 a1 + paddd mm0, mm3 ; A1+B1 a1+b1 + psubd mm1, mm3 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm1, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm1, mm1 ; A1-B1 a1-b1 + movd [96 + %5], mm1 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1 + pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm1, mm5 ; A2 a2 + paddd mm1, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm1, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm6, mm2 ; A3+B3 a3+b3 + psubd mm4, mm2 ; a3-B3 a3-b3 + psrad mm6, %6 + psrad mm4, %6 + packssdw mm1, mm1 ; A2+B2 a2+b2 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [32 + %5], mm1 + packssdw mm4, mm4 ; A3-B3 a3-b3 
+ packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [48 + %5], mm6 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT6 6 + movq mm0, [%1] ; R4 R0 r4 r0 + movq mm1, [%2] ; R6 R2 r6 r2 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm2, [8 + %1] ; R4 R0 r4 r0 + movq mm3, [8 + %2] ; R6 R2 r6 r2 + movq mm1, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm7, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm7, mm3 ; C6R6+C2R2 C6r6+C2r2 + pmaddwd mm3, [coeffs + 40] ; -C2R6+C6R2 -C2r6+C6r2 + paddd mm7, mm1 ; A0 a0 + paddd mm1, mm1 ; 2C0 2c0 + psubd mm1, mm7 ; A3 a3 + paddd mm3, mm2 ; A1 a1 + paddd mm2, mm2 ; 2C1 2c1 + psubd mm2, mm3 ; A2 a2 + psrad mm4, %6 + psrad mm7, %6 + psrad mm3, %6 + packssdw mm4, mm7 ; A0 a0 + movq [%5], mm4 + psrad mm0, %6 + packssdw mm0, mm3 ; A1 a1 + movq [16 + %5], mm0 + movq [96 + %5], mm0 + movq [112 + %5], mm4 + psrad mm5, %6 + psrad mm6, %6 + psrad mm2, %6 + packssdw mm5, mm2 ; A2-B2 a2-b2 + movq [32 + %5], mm5 + psrad mm1, %6 + packssdw mm6, mm1 ; A3+B3 a3+b3 + movq [48 + %5], mm6 + movq [64 + %5], mm6 + movq [80 + %5], mm5 +%endmacro + +%macro IDCT7 6 + movq mm0, %1 ; R4 R0 r4 r0 + movq mm1, %2 ; R6 R2 r6 r2 + movq mm2, %3 ; R3 R1 r3 r1 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm5, [coeffs + 32] ; C6 C2 C6 C2 + pmaddwd mm5, mm1 ; 
C6R6+C2R2 C6r6+C2r2 + movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6 + pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2 + movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 48] ; C3 C1 C3 C1 + pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1 + paddd mm4, mm5 ; A0 a0 + psubd mm6, mm5 ; A3 a3 + movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0 + paddd mm0, mm1 ; A1 a1 + psubd mm5, mm1 ; A2 a2 + movq mm1, [coeffs + 64] + pmaddwd mm1, mm2 ; -C7R3+C3R1 -C7r3+C3r1 + paddd mm7, mm4 ; A0+B0 a0+b0 + paddd mm4, mm4 ; 2A0 2a0 + psubd mm4, mm7 ; A0-B0 a0-b0 + psrad mm7, %6 + psrad mm4, %6 + movq mm3, mm0 ; A1 a1 + paddd mm0, mm1 ; A1+B1 a1+b1 + psubd mm3, mm1 ; A1-B1 a1-b1 + psrad mm0, %6 + psrad mm3, %6 + packssdw mm7, mm7 ; A0+B0 a0+b0 + movd [%5], mm7 + packssdw mm0, mm0 ; A1+B1 a1+b1 + movd [16 + %5], mm0 + packssdw mm3, mm3 ; A1-B1 a1-b1 + movd [96 + %5], mm3 + packssdw mm4, mm4 ; A0-B0 a0-b0 + movd [112 + %5], mm4 + movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5 + pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1 + pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm3, mm5 ; A2 a2 + paddd mm3, mm4 ; A2+B2 a2+b2 + psubd mm5, mm4 ; a2-B2 a2-b2 + psrad mm3, %6 + psrad mm5, %6 + movq mm4, mm6 ; A3 a3 + paddd mm6, mm2 ; A3+B3 a3+b3 + psubd mm4, mm2 ; a3-B3 a3-b3 + psrad mm6, %6 + packssdw mm3, mm3 ; A2+B2 a2+b2 + movd [32 + %5], mm3 + psrad mm4, %6 + packssdw mm6, mm6 ; A3+B3 a3+b3 + movd [48 + %5], mm6 + packssdw mm4, mm4 ; A3-B3 a3-b3 + packssdw mm5, mm5 ; A2-B2 a2-b2 + movd [64 + %5], mm4 + movd [80 + %5], mm5 +%endmacro + +%macro IDCT8 6 + movq mm0, [%1] ; R4 R0 r4 r0 + movq mm4, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0 + movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0 + psrad mm4, %6 + psrad mm0, %6 + movq mm2, [8 + %1] ; R4 R0 r4 r0 + movq mm1, [coeffs + 16] ; C4 C4 C4 C4 + pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0 + movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4 + pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0 + movq mm7, [coeffs + 32] ; C6 C2 C6 C2 + psrad 
mm1, %6 + packssdw mm4, mm1 ; A0 a0 + movq [%5], mm4 + psrad mm2, %6 + packssdw mm0, mm2 ; A1 a1 + movq [16 + %5], mm0 + movq [96 + %5], mm0 + movq [112 + %5], mm4 + movq [32 + %5], mm0 + movq [48 + %5], mm4 + movq [64 + %5], mm4 + movq [80 + %5], mm0 +%endmacro + +%macro IDCT 0 + DC_COND_IDCT 0, 8, 16, 24, rsp + 0, null, 11 + Z_COND_IDCT 32, 40, 48, 56, rsp + 32, null, 11, %%4 + Z_COND_IDCT 64, 72, 80, 88, rsp + 64, null, 11, %%2 + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%1 + + IDCT1 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT1 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%4: + Z_COND_IDCT 64, 72, 80, 88, rsp + 64, null, 11, %%6 + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5 + + IDCT2 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT2 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%6: + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7 + + IDCT3 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT3 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%2: + Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3 + + IDCT4 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT4 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%3: + + IDCT5 
[rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT5 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%5: + + IDCT6 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20 + IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20 + jmp %%9 + + ALIGN 16 + %%1: + + IDCT7 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20 + IDCT7 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20 + IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20 + IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20 + jmp %%9 + + ALIGN 16 + %%7: + + IDCT8 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20 + IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20 + + %%9: +%endmacro + +%macro PUT_PIXELS_CLAMPED_HALF 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*2+%1] +%if mmsize == 8 + mova m2, [blockq+mmsize*4+%1] + mova m3, [blockq+mmsize*6+%1] +%endif + packuswb m0, [blockq+mmsize*1+%1] + packuswb m1, [blockq+mmsize*3+%1] +%if mmsize == 8 + packuswb m2, [blockq+mmsize*5+%1] + packuswb m3, [blockq+mmsize*7+%1] + movq [pixelsq], m0 + movq [lsizeq+pixelsq], m1 + movq [2*lsizeq+pixelsq], m2 + movq [lsize3q+pixelsq], m3 +%else + movq [pixelsq], m0 + movhps [lsizeq+pixelsq], m0 + movq [2*lsizeq+pixelsq], m1 + movhps [lsize3q+pixelsq], m1 +%endif +%endmacro + +%macro ADD_PIXELS_CLAMPED 1 + mova m0, [blockq+mmsize*0+%1] + mova m1, [blockq+mmsize*1+%1] +%if mmsize == 8 + mova m5, [blockq+mmsize*2+%1] + mova m6, [blockq+mmsize*3+%1] +%endif + movq m2, [pixelsq] + movq m3, [pixelsq+lsizeq] +%if mmsize == 8 + mova m7, m2 + punpcklbw m2, m4 + punpckhbw m7, m4 + paddsw m0, m2 + paddsw m1, m7 + mova m7, m3 + punpcklbw m3, m4 + punpckhbw m7, m4 + paddsw m5, m3 + paddsw m6, m7 +%else + punpcklbw m2, m4 + punpcklbw m3, m4 + paddsw 
m0, m2 + paddsw m1, m3 +%endif + packuswb m0, m1 +%if mmsize == 8 + packuswb m5, m6 + movq [pixelsq], m0 + movq [pixelsq+lsizeq], m5 +%else + movq [pixelsq], m0 + movhps [pixelsq+lsizeq], m0 +%endif +%endmacro + +INIT_MMX mmx + +cglobal simple_idct, 1, 2, 8, 128, block, t0 + IDCT +RET + +INIT_XMM sse2 + +cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0 + IDCT + lea lsize3q, [lsizeq*3] + PUT_PIXELS_CLAMPED_HALF 0 + lea pixelsq, [pixelsq+lsizeq*4] + PUT_PIXELS_CLAMPED_HALF 64 +RET + +cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0 + IDCT + pxor m4, m4 + ADD_PIXELS_CLAMPED 0 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 32 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 64 + lea pixelsq, [pixelsq+lsizeq*2] + ADD_PIXELS_CLAMPED 96 +RET +%endif diff --git a/media/ffvpx/libavcodec/x86/simple_idct.h b/media/ffvpx/libavcodec/x86/simple_idct.h new file mode 100644 index 0000000000..9b64cfe9bc --- /dev/null +++ b/media/ffvpx/libavcodec/x86/simple_idct.h @@ -0,0 +1,53 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_SIMPLE_IDCT_H +#define AVCODEC_X86_SIMPLE_IDCT_H + +#include <stddef.h> +#include <stdint.h> + +void ff_simple_idct_mmx(int16_t *block); +void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct8_sse2(int16_t *block); +void ff_simple_idct8_avx(int16_t *block); + +void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct10_sse2(int16_t *block); +void ff_simple_idct10_avx(int16_t *block); + +void ff_simple_idct10_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct10_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +void ff_simple_idct12_sse2(int16_t *block); +void ff_simple_idct12_avx(int16_t *block); + +void ff_simple_idct12_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct12_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +#endif /* AVCODEC_X86_SIMPLE_IDCT_H */ diff --git a/media/ffvpx/libavcodec/x86/simple_idct10.asm b/media/ffvpx/libavcodec/x86/simple_idct10.asm new file mode 100644 index 0000000000..069bb61378 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/simple_idct10.asm @@ -0,0 +1,205 @@ 
+;****************************************************************************** +;* x86-SIMD-optimized IDCT for prores +;* this is identical to "simple" IDCT written by Michael Niedermayer +;* except for the clip range +;* +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* Copyright (c) 2015 Christophe Gisquet +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 + +SECTION_RODATA + +cextern pw_2 +cextern pw_16 +cextern pw_32 +cextern pw_1023 +cextern pw_4095 +pd_round_11: times 4 dd 1<<(11-1) +pd_round_12: times 4 dd 1<<(12-1) +pd_round_15: times 4 dd 1<<(15-1) +pd_round_19: times 4 dd 1<<(19-1) +pd_round_20: times 4 dd 1<<(20-1) + +%macro CONST_DEC 3 +const %1 +times 4 dw %2, %3 +%endmacro + +%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1 +%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1 +%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2 +%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1 +%define W3sh2_lo 19266 +%define W4sh2_lo 16383 +%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1 +%define W6sh2 8867 ; W6 = 35468 = 8867<<2 +%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1 + +CONST_DEC w4_plus_w2_hi, W4sh2, 
+W2sh2 +CONST_DEC w4_min_w2_hi, W4sh2, -W2sh2 +CONST_DEC w4_plus_w6_hi, W4sh2, +W6sh2 +CONST_DEC w4_min_w6_hi, W4sh2, -W6sh2 +CONST_DEC w1_plus_w3_hi, W1sh2, +W3sh2 +CONST_DEC w3_min_w1_hi, W3sh2, -W1sh2 +CONST_DEC w7_plus_w3_hi, W7sh2, +W3sh2 +CONST_DEC w3_min_w7_hi, W3sh2, -W7sh2 +CONST_DEC w1_plus_w5, W1sh2, +W5sh2 +CONST_DEC w5_min_w1, W5sh2, -W1sh2 +CONST_DEC w5_plus_w7, W5sh2, +W7sh2 +CONST_DEC w7_min_w5, W7sh2, -W5sh2 +CONST_DEC w4_plus_w2_lo, W4sh2_lo, +W2sh2 +CONST_DEC w4_min_w2_lo, W4sh2_lo, -W2sh2 +CONST_DEC w4_plus_w6_lo, W4sh2_lo, +W6sh2 +CONST_DEC w4_min_w6_lo, W4sh2_lo, -W6sh2 +CONST_DEC w1_plus_w3_lo, W1sh2, +W3sh2_lo +CONST_DEC w3_min_w1_lo, W3sh2_lo, -W1sh2 +CONST_DEC w7_plus_w3_lo, W7sh2, +W3sh2_lo +CONST_DEC w3_min_w7_lo, W3sh2_lo, -W7sh2 + +%include "libavcodec/x86/simple_idct10_template.asm" + +SECTION .text + +%macro STORE_HI_LO 12 + movq %1, %9 + movq %3, %10 + movq %5, %11 + movq %7, %12 + movhps %2, %9 + movhps %4, %10 + movhps %6, %11 + movhps %8, %12 +%endmacro + +%macro LOAD_ZXBW_8 16 + pmovzxbw %1, %9 + pmovzxbw %2, %10 + pmovzxbw %3, %11 + pmovzxbw %4, %12 + pmovzxbw %5, %13 + pmovzxbw %6, %14 + pmovzxbw %7, %15 + pmovzxbw %8, %16 +%endmacro + +%macro LOAD_ZXBW_4 9 + movh %1, %5 + movh %2, %6 + movh %3, %7 + movh %4, %8 + punpcklbw %1, %9 + punpcklbw %2, %9 + punpcklbw %3, %9 + punpcklbw %4, %9 +%endmacro + +%define PASS4ROWS(base, stride, stride3) \ + [base], [base + stride], [base + 2*stride], [base + stride3] + +%macro idct_fn 0 + +define_constants _lo + +cglobal simple_idct8, 1, 1, 16, 32, block + IDCT_FN "", 11, pw_32, 20, "store" +RET + +cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block + IDCT_FN "", 11, pw_32, 20 + lea r3, [3*lsizeq] + lea r2, [pixelsq + r3] + packuswb m8, m0 + packuswb m1, m2 + packuswb m4, m11 + packuswb m9, m10 + STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9 +RET + +cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block + IDCT_FN "", 11, pw_32, 20 + lea r2, [3*lsizeq] + %if 
cpuflag(sse4) + lea r3, [pixelsq + r2] + LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2) + paddsw m8, m3 + paddsw m0, m5 + paddsw m1, m6 + paddsw m2, m7 + paddsw m4, m12 + paddsw m11, m13 + paddsw m9, m14 + paddsw m10, m15 + %else + pxor m12, m12 + LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12 + paddsw m8, m3 + paddsw m0, m5 + paddsw m1, m6 + paddsw m2, m7 + lea r3, [pixelsq + 4*lsizeq] + LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12 + paddsw m4, m3 + paddsw m11, m5 + paddsw m9, m6 + paddsw m10, m7 + lea r3, [pixelsq + r2] + %endif + packuswb m8, m0 + packuswb m1, m2 + packuswb m4, m11 + packuswb m9, m10 + STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9 +RET + +define_constants _hi + +cglobal simple_idct10, 1, 1, 16, block + IDCT_FN "", 12, "", 19, "store" + RET + +cglobal simple_idct10_put, 3, 3, 16, pixels, lsize, block + IDCT_FN "", 12, "", 19, "put", 0, pw_1023 + RET + +cglobal simple_idct12, 1, 1, 16, block + ; coeffs are already 15bits, adding the offset would cause + ; overflow in the input + IDCT_FN "", 15, pw_2, 16, "store" + RET + +cglobal simple_idct12_put, 3, 3, 16, pixels, lsize, block + ; range isn't known, so the C simple_idct range is used + ; Also, using a bias on input overflows, so use the bias + ; on output of the first butterfly instead + IDCT_FN "", 15, pw_2, 16, "put", 0, pw_4095 + RET +%endmacro + +INIT_XMM sse2 +idct_fn +%if HAVE_AVX_EXTERNAL +INIT_XMM avx +idct_fn +%endif + +%endif diff --git a/media/ffvpx/libavcodec/x86/simple_idct10_template.asm b/media/ffvpx/libavcodec/x86/simple_idct10_template.asm new file mode 100644 index 0000000000..0d04a9818a --- /dev/null +++ b/media/ffvpx/libavcodec/x86/simple_idct10_template.asm @@ -0,0 +1,369 @@ +;****************************************************************************** +;* x86-SIMD-optimized IDCT for prores +;* this is identical to "simple" IDCT written by Michael Niedermayer +;* except for the clip range 
+;* +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; add SECTION_RODATA and proper include before including this file! + +%if ARCH_X86_64 + +%macro define_constants 1 + %undef w4_plus_w2 + %undef w4_min_w2 + %undef w4_plus_w6 + %undef w4_min_w6 + %undef w1_plus_w3 + %undef w3_min_w1 + %undef w7_plus_w3 + %undef w3_min_w7 + %define w4_plus_w2 w4_plus_w2%1 + %define w4_min_w2 w4_min_w2%1 + %define w4_plus_w6 w4_plus_w6%1 + %define w4_min_w6 w4_min_w6%1 + %define w1_plus_w3 w1_plus_w3%1 + %define w3_min_w1 w3_min_w1%1 + %define w7_plus_w3 w7_plus_w3%1 + %define w3_min_w7 w3_min_w7%1 +%endmacro + +; interleave data while maintaining source +; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave +%macro SBUTTERFLY3 5 + punpckl%1 m%2, m%4, m%5 + punpckh%1 m%3, m%4, m%5 +%endmacro + +; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift +; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6 +; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3) +%macro SUMSUB_SHPK 7 + psubd %3, %1, %5 ; { a0 - b0 }[0-3] + psubd %4, %2, %6 ; { a0 - b0 }[4-7] + paddd %1, %5 ; { a0 + b0 }[0-3] + paddd %2, %6 ; { a0 + b0 }[4-7] + psrad %1, %7 + psrad %2, 
%7 + psrad %3, %7 + psrad %4, %7 + packssdw %1, %2 ; row[0] + packssdw %3, %4 ; row[7] +%endmacro + +; %1 = initial bias ("" if nop) +; %2 = number of bits to shift at the end +; %3 = qmat (for prores) +%macro IDCT_1D 2-3 + ; a0 = (W4 * row[0]) + (1 << (15 - 1)); + ; a1 = a0; + ; a2 = a0; + ; a3 = a0; + ; a0 += W2 * row[2]; + ; a1 += W6 * row[2]; + ; a2 -= W6 * row[2]; + ; a3 -= W2 * row[2]; +%ifstr %1 + mova m15, [pd_round_ %+ %2] +%else + paddw m10, [%1] +%endif + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] + pmaddwd m2, m0, [w4_plus_w6] + pmaddwd m3, m1, [w4_plus_w6] + pmaddwd m4, m0, [w4_min_w6] + pmaddwd m5, m1, [w4_min_w6] + pmaddwd m6, m0, [w4_min_w2] + pmaddwd m7, m1, [w4_min_w2] + pmaddwd m0, [w4_plus_w2] + pmaddwd m1, [w4_plus_w2] +%ifstr %1 + ; Adding 1<<(%2-1) for >=15 bits values + paddd m2, m15 + paddd m3, m15 + paddd m4, m15 + paddd m5, m15 + paddd m6, m15 + paddd m7, m15 + paddd m0, m15 + paddd m1, m15 +%endif + + ; a0: -1*row[0]-1*row[2] + ; a1: -1*row[0] + ; a2: -1*row[0] + ; a3: -1*row[0]+1*row[2] + + ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] + ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] + ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] + ; a3 += W4*row[4] - W6*row[6]; i.e. 
-1*row[4] + SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] + pmaddwd m10, m8, [w4_plus_w6] + pmaddwd m11, m9, [w4_plus_w6] + paddd m0, m10 ; a0[0-3] + paddd m1, m11 ; a0[4-7] + pmaddwd m10, m8, [w4_min_w6] + pmaddwd m11, m9, [w4_min_w6] + paddd m6, m10 ; a3[0-3] + paddd m7, m11 ; a3[4-7] + pmaddwd m10, m8, [w4_min_w2] + pmaddwd m11, m9, [w4_min_w2] + pmaddwd m8, [w4_plus_w2] + pmaddwd m9, [w4_plus_w2] + psubd m4, m10 ; a2[0-3] intermediate + psubd m5, m11 ; a2[4-7] intermediate + psubd m2, m8 ; a1[0-3] intermediate + psubd m3, m9 ; a1[4-7] intermediate + + ; load/store + mova [blockq+ 0], m0 + mova [blockq+ 32], m2 + mova [blockq+ 64], m4 + mova [blockq+ 96], m6 + mova m10,[blockq+ 16] ; { row[1] }[0-7] + mova m8, [blockq+ 48] ; { row[3] }[0-7] + mova m13,[blockq+ 80] ; { row[5] }[0-7] + mova m14,[blockq+112] ; { row[7] }[0-7] + mova [blockq+ 16], m1 + mova [blockq+ 48], m3 + mova [blockq+ 80], m5 + mova [blockq+112], m7 +%if %0 == 3 + pmullw m10,[%3+ 16] + pmullw m8, [%3+ 48] + pmullw m13,[%3+ 80] + pmullw m14,[%3+112] +%endif + + ; b0 = MUL(W1, row[1]); + ; MAC(b0, W3, row[3]); + ; b1 = MUL(W3, row[1]); + ; MAC(b1, -W7, row[3]); + ; b2 = MUL(W5, row[1]); + ; MAC(b2, -W1, row[3]); + ; b3 = MUL(W7, row[1]); + ; MAC(b3, -W5, row[3]); + SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] + pmaddwd m2, m0, [w3_min_w7] + pmaddwd m3, m1, [w3_min_w7] + pmaddwd m4, m0, [w5_min_w1] + pmaddwd m5, m1, [w5_min_w1] + pmaddwd m6, m0, [w7_min_w5] + pmaddwd m7, m1, [w7_min_w5] + pmaddwd m0, [w1_plus_w3] + pmaddwd m1, [w1_plus_w3] + + ; b0: +1*row[1]+2*row[3] + ; b1: +2*row[1]-1*row[3] + ; b2: -1*row[1]-1*row[3] + ; b3: +1*row[1]+1*row[3] + + ; MAC(b0, W5, row[5]); + ; MAC(b0, W7, row[7]); + ; MAC(b1, -W1, row[5]); + ; MAC(b1, -W5, row[7]); + ; MAC(b2, W7, row[5]); + ; MAC(b2, W3, row[7]); + ; MAC(b3, W3, row[5]); + ; MAC(b3, -W1, row[7]); + SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] + + ; b0: -1*row[5]+1*row[7] + ; b1: -1*row[5]+1*row[7] + 
; b2: +1*row[5]+2*row[7] + ; b3: +2*row[5]-1*row[7] + + pmaddwd m10, m8, [w1_plus_w5] + pmaddwd m11, m9, [w1_plus_w5] + pmaddwd m12, m8, [w5_plus_w7] + pmaddwd m13, m9, [w5_plus_w7] + psubd m2, m10 ; b1[0-3] + psubd m3, m11 ; b1[4-7] + paddd m0, m12 ; b0[0-3] + paddd m1, m13 ; b0[4-7] + pmaddwd m12, m8, [w7_plus_w3] + pmaddwd m13, m9, [w7_plus_w3] + pmaddwd m8, [w3_min_w1] + pmaddwd m9, [w3_min_w1] + paddd m4, m12 ; b2[0-3] + paddd m5, m13 ; b2[4-7] + paddd m6, m8 ; b3[0-3] + paddd m7, m9 ; b3[4-7] + + ; row[0] = (a0 + b0) >> 15; + ; row[7] = (a0 - b0) >> 15; + ; row[1] = (a1 + b1) >> 15; + ; row[6] = (a1 - b1) >> 15; + ; row[2] = (a2 + b2) >> 15; + ; row[5] = (a2 - b2) >> 15; + ; row[3] = (a3 + b3) >> 15; + ; row[4] = (a3 - b3) >> 15; + mova m8, [blockq+ 0] ; a0[0-3] + mova m9, [blockq+16] ; a0[4-7] + SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2 + mova m0, [blockq+32] ; a1[0-3] + mova m1, [blockq+48] ; a1[4-7] + SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2 + mova m1, [blockq+64] ; a2[0-3] + mova m2, [blockq+80] ; a2[4-7] + SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2 + mova m2, [blockq+96] ; a3[0-3] + mova m3, [blockq+112] ; a3[4-7] + SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2 +%endmacro + +; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride, +; int16_t *block, const int16_t *qmat); + +; %1 = row bias ("" if nop), forwarded as IDCT_1D's first argument +; %2 = row shift (number of bits), forwarded as IDCT_1D's second argument +; %3 = column bias ("" if nop) +; %4 = column shift (number of bits) +; %5 = final action (nothing, "store", "put", "add") +; %6 = min pixel value +; %7 = max pixel value +; %8 = qmat (for prores) + +%macro IDCT_FN 4-8 + ; for (i = 0; i < 8; i++) + ; idctRowCondDC(block + i*8); + mova m10,[blockq+ 0] ; { row[0] }[0-7] + mova m8, [blockq+32] ; { row[2] }[0-7] + mova m13,[blockq+64] ; { row[4] }[0-7] + mova m12,[blockq+96] ; { row[6] }[0-7] + +%if %0 == 8 + pmullw m10,[%8+ 0] + pmullw m8, [%8+32] + pmullw m13,[%8+64] + pmullw m12,[%8+96] + + IDCT_1D %1, %2, %8 +%elif %2 == 11 + ; This copies the DC-only shortcut. 
When there is only a DC coefficient the + ; C shifts the value and splats it to all coeffs rather than multiplying and + ; doing the full IDCT. This causes a difference on 8-bit because the + ; coefficient is 16383 rather than 16384 (which you can get with shifting). + por m1, m8, m13 + por m1, m12 + por m1, [blockq+ 16] ; { row[1] }[0-7] + por m1, [blockq+ 48] ; { row[3] }[0-7] + por m1, [blockq+ 80] ; { row[5] }[0-7] + por m1, [blockq+112] ; { row[7] }[0-7] + pxor m2, m2 + pcmpeqw m1, m2 + psllw m2, m10, 3 + pand m2, m1 + pcmpeqb m3, m3 + pxor m1, m3 + mova [rsp], m1 + mova [rsp+16], m2 + + IDCT_1D %1, %2 + + mova m5, [rsp] + mova m6, [rsp+16] + pand m8, m5 + por m8, m6 + pand m0, m5 + por m0, m6 + pand m1, m5 + por m1, m6 + pand m2, m5 + por m2, m6 + pand m4, m5 + por m4, m6 + pand m11, m5 + por m11, m6 + pand m9, m5 + por m9, m6 + pand m10, m5 + por m10, m6 +%else + IDCT_1D %1, %2 +%endif + + ; transpose for second part of IDCT + TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 + mova [blockq+ 16], m0 + mova [blockq+ 48], m2 + mova [blockq+ 80], m11 + mova [blockq+112], m10 + SWAP 8, 10 + SWAP 1, 8 + SWAP 4, 13 + SWAP 9, 12 + + ; for (i = 0; i < 8; i++) + ; idctSparseColAdd(dest + i, line_size, block + i); + IDCT_1D %3, %4 + + ; clip/store +%if %0 >= 5 +%ifidn %5,"store" + ; No clamping, means pure idct + mova [blockq+ 0], m8 + mova [blockq+ 16], m0 + mova [blockq+ 32], m1 + mova [blockq+ 48], m2 + mova [blockq+ 64], m4 + mova [blockq+ 80], m11 + mova [blockq+ 96], m9 + mova [blockq+112], m10 +%elifidn %5,"put" +%ifidn %6, 0 + pxor m3, m3 +%else + mova m3, [%6] +%endif ; ifidn %6, 0 + mova m5, [%7] + pmaxsw m8, m3 + pmaxsw m0, m3 + pmaxsw m1, m3 + pmaxsw m2, m3 + pmaxsw m4, m3 + pmaxsw m11, m3 + pmaxsw m9, m3 + pmaxsw m10, m3 + pminsw m8, m5 + pminsw m0, m5 + pminsw m1, m5 + pminsw m2, m5 + pminsw m4, m5 + pminsw m11, m5 + pminsw m9, m5 + pminsw m10, m5 + + lea r2, [r1*3] + mova [r0 ], m8 + mova [r0+r1 ], m0 + mova [r0+r1*2], m1 + mova [r0+r2 ], m2 + lea r0, [r0+r1*4] 
+ mova [r0 ], m4 + mova [r0+r1 ], m11 + mova [r0+r1*2], m9 + mova [r0+r2 ], m10 +%endif ; %5 action +%endif; if %0 >= 5 +%endmacro + +%endif diff --git a/media/ffvpx/libavcodec/x86/videodsp.asm b/media/ffvpx/libavcodec/x86/videodsp.asm new file mode 100644 index 0000000000..3cc07878d3 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/videodsp.asm @@ -0,0 +1,436 @@ +;****************************************************************************** +;* Core video DSP functions +;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +; slow vertical extension loop function. 
Works with variable-width, and +; does per-line reading/writing of source data + +%macro V_COPY_ROW 2 ; type (top/body/bottom), h +.%1_y_loop: ; do { + mov wq, r7mp ; initialize w (r7mp = wmp) +.%1_x_loop: ; do { + movu m0, [srcq+wq] ; m0 = read($mmsize) + movu [dstq+wq], m0 ; write(m0, $mmsize) + add wq, mmsize ; w -= $mmsize + cmp wq, -mmsize ; } while (w > $mmsize); + jl .%1_x_loop + movu m0, [srcq-mmsize] ; m0 = read($mmsize) + movu [dstq-mmsize], m0 ; write(m0, $mmsize) +%ifidn %1, body ; if ($type == body) { + add srcq, src_strideq ; src += src_stride +%endif ; } + add dstq, dst_strideq ; dst += dst_stride + dec %2 ; } while (--$h); + jnz .%1_y_loop +%endmacro + +; .----. <- zero +; | | <- top is copied from first line in body of source +; |----| <- start_y +; | | <- body is copied verbatim (line-by-line) from source +; |----| <- end_y +; | | <- bottom is copied from last line in body of source +; '----' <- bh +INIT_XMM sse +%if ARCH_X86_64 +cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ + start_y, end_y, bh, w +%else ; x86-32 +cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w +%define src_strideq r3mp +%define dst_strideq r1mp + mov srcq, r2mp + mov start_yq, r4mp + mov end_yq, r5mp + mov bhq, r6mp +%endif + sub bhq, end_yq ; bh -= end_q + sub end_yq, start_yq ; end_q -= start_q + add srcq, r7mp ; (r7mp = wmp) + add dstq, r7mp ; (r7mp = wmp) + neg r7mp ; (r7mp = wmp) + test start_yq, start_yq ; if (start_q) { + jz .body + V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) +.body: ; } + V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) + test bhq, bhq ; if (bh) { + jz .end + sub srcq, src_strideq ; src -= src_stride + V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) +.end: ; } + RET + +%macro hvar_fn 0 +cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w + lea dstq, [dstq+n_wordsq*2] + neg n_wordsq + lea start_xq, [start_xq+n_wordsq*2] +.y_loop: ; do { +%if cpuflag(avx2) + vpbroadcastb m0, 
[dstq+start_xq] + mov wq, n_wordsq ; initialize w +%else + movzx wd, byte [dstq+start_xq] ; w = read(1) + imul wd, 0x01010101 ; w *= 0x01010101 + movd m0, wd + mov wq, n_wordsq ; initialize w + pshufd m0, m0, q0000 ; splat +%endif ; avx2 +.x_loop: ; do { + movu [dstq+wq*2], m0 ; write($reg, $mmsize) + add wq, mmsize/2 ; w -= $mmsize/2 + cmp wq, -(mmsize/2) ; } while (w > $mmsize/2) + jl .x_loop + movu [dstq-mmsize], m0 ; write($reg, $mmsize) + add dstq, dst_strideq ; dst += dst_stride + dec hq ; } while (h--) + jnz .y_loop + RET +%endmacro + +INIT_XMM sse2 +hvar_fn + +%if HAVE_AVX2_EXTERNAL +INIT_XMM avx2 +hvar_fn +%endif + +; macro to read/write a horizontal number of pixels (%2) to/from registers +; on sse, - fills xmm0-15 for consecutive sets of 16 pixels +; - if (%2 & 8) fills 8 bytes into xmm$next +; - if (%2 & 4) fills 4 bytes into xmm$next +; - if (%2 & 3) fills 1, 2 or 4 bytes in eax +; on mmx, - fills mm0-7 for consecutive sets of 8 pixels +; - if (%2 & 4) fills 4 bytes into mm$next +; - if (%2 & 3) fills 1, 2 or 4 bytes in eax +; writing data out is in the same way +%macro READ_NUM_BYTES 2 +%assign %%off 0 ; offset in source buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index + +%rep %2/mmsize +%if mmsize == 16 + movu xmm %+ %%xmm_idx, [srcq+%%off] +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 +%endif +%assign %%off %%off+mmsize +%endrep ; %2/mmsize + +%if mmsize == 16 +%if (%2-%%off) >= 8 +%if %2 > 16 && (%2-%%off) > 8 + movu xmm %+ %%xmm_idx, [srcq+%2-16] +%assign %%xmm_idx %%xmm_idx+1 +%assign %%off %2 +%else + movq mm %+ %%mmx_idx, [srcq+%%off] +%assign %%mmx_idx %%mmx_idx+1 +%assign %%off %%off+8 +%endif +%endif ; (%2-%%off) >= 8 +%endif + +%if (%2-%%off) >= 4 +%if %2 > 8 && (%2-%%off) > 4 + movq mm %+ %%mmx_idx, [srcq+%2-8] +%assign %%off %2 +%else + movd mm %+ %%mmx_idx, [srcq+%%off] +%assign %%off %%off+4 +%endif +%assign %%mmx_idx %%mmx_idx+1 
+%endif ; (%2-%%off) >= 4 + +%if (%2-%%off) >= 1 +%if %2 >= 4 + movd mm %+ %%mmx_idx, [srcq+%2-4] +%elif (%2-%%off) == 1 + mov valb, [srcq+%2-1] +%elif (%2-%%off) == 2 + mov valw, [srcq+%2-2] +%else + mov valb, [srcq+%2-1] + ror vald, 16 + mov valw, [srcq+%2-3] +%endif +%endif ; (%2-%%off) >= 1 +%endmacro ; READ_NUM_BYTES + +%macro WRITE_NUM_BYTES 2 +%assign %%off 0 ; offset in destination buffer +%assign %%mmx_idx 0 ; mmx register index +%assign %%xmm_idx 0 ; xmm register index + +%rep %2/mmsize +%if mmsize == 16 + movu [dstq+%%off], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 +%else ; mmx + movu [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 +%endif +%assign %%off %%off+mmsize +%endrep ; %2/mmsize + +%if mmsize == 16 +%if (%2-%%off) >= 8 +%if %2 > 16 && (%2-%%off) > 8 + movu [dstq+%2-16], xmm %+ %%xmm_idx +%assign %%xmm_idx %%xmm_idx+1 +%assign %%off %2 +%else + movq [dstq+%%off], mm %+ %%mmx_idx +%assign %%mmx_idx %%mmx_idx+1 +%assign %%off %%off+8 +%endif +%endif ; (%2-%%off) >= 8 +%endif + +%if (%2-%%off) >= 4 +%if %2 > 8 && (%2-%%off) > 4 + movq [dstq+%2-8], mm %+ %%mmx_idx +%assign %%off %2 +%else + movd [dstq+%%off], mm %+ %%mmx_idx +%assign %%off %%off+4 +%endif +%assign %%mmx_idx %%mmx_idx+1 +%endif ; (%2-%%off) >= 4 + +%if (%2-%%off) >= 1 +%if %2 >= 4 + movd [dstq+%2-4], mm %+ %%mmx_idx +%elif (%2-%%off) == 1 + mov [dstq+%2-1], valb +%elif (%2-%%off) == 2 + mov [dstq+%2-2], valw +%else + mov [dstq+%2-3], valw + ror vald, 16 + mov [dstq+%2-1], valb +%ifnidn %1, body + ror vald, 16 +%endif +%endif +%endif ; (%2-%%off) >= 1 +%endmacro ; WRITE_NUM_BYTES + +; vertical top/bottom extend and body copy fast loops +; these are function pointers to set-width line copy functions, i.e. 
; VERTICAL_EXTEND first_n, last_n
;
; Emits one function emu_edge_vfix<n> for every width n in [%1, %2]. Each
; function handles rows that are exactly n bytes wide and performs three steps:
;   1. replicate the first source row start_y times          (top padding)
;   2. copy (end_y - start_y) rows from src to dst           (body)
;   3. replicate the last body row (bh - end_y) times        (bottom padding)
; The row is held in registers by READ_NUM_BYTES and stored by WRITE_NUM_BYTES.
;
; C-side signature (from the emu_edge_vfix_func typedef in videodsp_init.c):
;   void fn(uint8_t *dst, x86_reg dst_stride, const uint8_t *src,
;           x86_reg src_stride, x86_reg start_y, x86_reg end_y, x86_reg bh);
;
; The body loop is a do/while and always runs at least once, so the caller
; must guarantee start_y < end_y (the C wrapper asserts this).
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
; Widths 1-3 are moved through the general-purpose register "val", so one
; extra GPR is declared and the 7th argument (bh) is fetched by hand.
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
; No arguments are auto-loaded here: val is given the first register and the
; arguments are loaded explicitly. The two strides are used straight from
; their stack slots.
; NOTE(review): val is presumably placed first so that valb is encodable on
; x86-32 - confirm against x86inc's register order.
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
; Widths >= 4 live entirely in vector registers; no "val" GPR is needed.
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
    ; Turn the absolute row indices into the three loop counters:
    ; start_y = top rows, end_y = body rows, bh = bottom rows.
    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels
.body_loop:                                     ; do {
    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels
    ; (bhq now holds the number of bottom rows, i.e. block_h - end_y)
    test            bhq, bhq                    ; if (block_h) {
    jz .end
    ; srcq is one row past the body; step back onto the last body row
    sub            srcq, src_strideq            ;   src -= linesize
    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND
; void ff_prefetch_mmxext(const uint8_t *buf, ptrdiff_t stride, int h);
;
; Issues one prefetcht0 for the first byte of each of h rows, stepping buf by
; stride between rows. No data is read or written.
;
; The loop is a do/while: one prefetch is always issued, even for h <= 0,
; because the counter is only tested after the decrement (signed "jg").
INIT_MMX mmxext
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    prefetcht0 [bufq]           ; hint only: pull the row start into cache
    add      bufq, strideq      ; buf += stride
    dec        hd               ; } while (--h > 0)
    jg .loop
    RET
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/avassert.h" +#include "libavutil/common.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/videodsp.h" + +#if HAVE_X86ASM +typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, + x86_reg start_y, x86_reg end_y, x86_reg bh); +typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, + const uint8_t *src, x86_reg src_stride, + x86_reg start_y, x86_reg end_y, x86_reg bh, + x86_reg w); + +extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx; +extern emu_edge_vfix_func ff_emu_edge_vfix16_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix17_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix18_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; +extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; +static emu_edge_vfix_func * const vfixtbl_sse[22] = { + 
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, + ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, + ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, + ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx, + ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx, + ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse, + ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse, + ff_emu_edge_vfix22_sse +}; +extern emu_edge_vvar_func ff_emu_edge_vvar_sse; + +typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride, + x86_reg start_x, x86_reg bh); +typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, + x86_reg start_x, x86_reg n_words, x86_reg bh); + +extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx; +extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; +static emu_edge_hfix_func * const hfixtbl_sse2[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, + ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, + ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 +}; +extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; +#if HAVE_AVX2_EXTERNAL +extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2; +extern 
emu_edge_hfix_func ff_emu_edge_hfix16_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2; +extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2; +static emu_edge_hfix_func * const hfixtbl_avx2[11] = { + ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, + ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2, + ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2, + ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2 +}; +extern emu_edge_hvar_func ff_emu_edge_hvar_avx2; +#endif + +static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride, + ptrdiff_t src_stride, + x86_reg block_w, x86_reg block_h, + x86_reg src_x, x86_reg src_y, + x86_reg w, x86_reg h, + emu_edge_vfix_func * const *vfix_tbl, + emu_edge_vvar_func *v_extend_var, + emu_edge_hfix_func * const *hfix_tbl, + emu_edge_hvar_func *h_extend_var) +{ + x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; + + if (!w || !h) + return; + + av_assert2(block_w <= FFABS(dst_stride)); + + if (src_y >= h) { + src -= src_y*src_stride; + src_y_add = h - 1; + src_y = h - 1; + } else if (src_y <= -block_h) { + src -= src_y*src_stride; + src_y_add = 1 - block_h; + src_y = 1 - block_h; + } + if (src_x >= w) { + src += w - 1 - src_x; + src_x = w - 1; + } else if (src_x <= -block_w) { + src += 1 - block_w - src_x; + src_x = 1 - block_w; + } + + start_y = FFMAX(0, -src_y); + start_x = FFMAX(0, -src_x); + end_y = FFMIN(block_h, h-src_y); + end_x = FFMIN(block_w, w-src_x); + av_assert2(start_x < end_x && block_w > 0); + av_assert2(start_y < end_y && block_h > 0); + + // fill in the to-be-copied part plus all above/below + src += (src_y_add + start_y) * src_stride + start_x; + w = end_x - start_x; + if (w <= 22) { + vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, + start_y, end_y, block_h); + } else { + v_extend_var(dst + start_x, dst_stride, src, src_stride, + 
start_y, end_y, block_h, w); + } + + // fill left + if (start_x) { + if (start_x <= 22) { + hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); + } else { + h_extend_var(dst, dst_stride, + start_x, (start_x + 1) >> 1, block_h); + } + } + + // fill right + p = block_w - end_x; + if (p) { + if (p <= 22) { + hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride, + -!(p & 1), block_h); + } else { + h_extend_var(dst + end_x - (p & 1), dst_stride, + -!(p & 1), (p + 1) >> 1, block_h); + } + } +} + +static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, + int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_sse2, &ff_emu_edge_hvar_sse2); +} + +#if HAVE_AVX2_EXTERNAL +static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src, + ptrdiff_t buf_stride, + ptrdiff_t src_stride, + int block_w, int block_h, + int src_x, int src_y, int w, + int h) +{ + emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, + src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, + hfixtbl_avx2, &ff_emu_edge_hvar_avx2); +} +#endif /* HAVE_AVX2_EXTERNAL */ +#endif /* HAVE_X86ASM */ + +void ff_prefetch_mmxext(const uint8_t *buf, ptrdiff_t stride, int h); + +av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMXEXT(cpu_flags)) { + ctx->prefetch = ff_prefetch_mmxext; + } + if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_sse2; + } +#if HAVE_AVX2_EXTERNAL + if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) { + ctx->emulated_edge_mc = emulated_edge_mc_avx2; + } +#endif +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp56_arith.h b/media/ffvpx/libavcodec/x86/vp56_arith.h new file mode 100644 
/**
 * x86 inline-asm version of the range-coder bit read: a compare followed by
 * two conditional moves, with no branch.
 *
 * With low = 1 + (((c->high - 1) * prob) >> 8), the net effect is:
 *   if (code_word >= (low << 16)) { c->high -= low;
 *                                   c->code_word = code_word - (low << 16);
 *                                   return 1; }
 *   else                          { c->high  = low;
 *                                   c->code_word = code_word;
 *                                   return 0; }
 *
 * @param c     range coder state; high and code_word are updated
 * @param prob  8-bit probability used to split the current range
 * @return the decoded bit (0 or 1)
 */
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
    /* value returned by the renormalisation step; a copy stays in a local so
     * the asm can restore it when the subtraction has to be undone */
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int bit = 0; /* must be 0 on entry: setae writes the low byte only */
    c->code_word = code_word;

    /* Operands: %0 bit, %1 c->high, %2 c->code_word,
     *           %3 low_shift, %4 low, %5 code_word (saved copy).
     * AT&T order (src, dst). The flags set by the second subl are consumed
     * by setae and by both cmovb; none of those three modify the flags. */
    __asm__(
        "subl %4, %1 \n\t" /* high -= low (speculative) */
        "subl %3, %2 \n\t" /* code_word -= low_shift; CF = borrow */
        "setae %b0 \n\t" /* bit = !CF, i.e. code_word >= low_shift */
        "cmovb %4, %1 \n\t" /* on borrow: high = low */
        "cmovb %5, %2 \n\t" /* on borrow: undo the code_word subtraction */
        : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
        : "r"(low_shift), "r"(low), "r"(code_word)
    );

    return bit;
}
b/media/ffvpx/libavcodec/x86/vp8dsp.asm @@ -0,0 +1,1116 @@ +;****************************************************************************** +;* VP8 MMXEXT optimizations +;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> +;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +fourtap_filter_hw_m: times 4 dw -6, 123 + times 4 dw 12, -1 + times 4 dw -9, 93 + times 4 dw 50, -6 + times 4 dw -6, 50 + times 4 dw 93, -9 + times 4 dw -1, 12 + times 4 dw 123, -6 + +sixtap_filter_hw_m: times 4 dw 2, -11 + times 4 dw 108, 36 + times 4 dw -8, 1 + times 4 dw 3, -16 + times 4 dw 77, 77 + times 4 dw -16, 3 + times 4 dw 1, -8 + times 4 dw 36, 108 + times 4 dw -11, 2 + +fourtap_filter_hb_m: times 8 db -6, 123 + times 8 db 12, -1 + times 8 db -9, 93 + times 8 db 50, -6 + times 8 db -6, 50 + times 8 db 93, -9 + times 8 db -1, 12 + times 8 db 123, -6 + +sixtap_filter_hb_m: times 8 db 2, 1 + times 8 db -11, 108 + times 8 db 36, -8 + times 8 db 3, 3 + times 8 db -16, 77 + times 8 db 77, -16 + times 8 db 1, 2 + times 8 db -8, 36 + times 8 db 108, -11 + +fourtap_filter_v_m: times 8 dw 
-6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 dw -6 + +sixtap_filter_v_m: times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + +bilinear_filter_vw_m: times 8 dw 1 + times 8 dw 2 + times 8 dw 3 + times 8 dw 4 + times 8 dw 5 + times 8 dw 6 + times 8 dw 7 + +bilinear_filter_vb_m: times 8 db 7, 1 + times 8 db 6, 2 + times 8 db 5, 3 + times 8 db 4, 4 + times 8 db 3, 5 + times 8 db 2, 6 + times 8 db 1, 7 + +%ifdef PIC +%define fourtap_filter_hw picregq +%define sixtap_filter_hw picregq +%define fourtap_filter_hb picregq +%define sixtap_filter_hb picregq +%define fourtap_filter_v picregq +%define sixtap_filter_v picregq +%define bilinear_filter_vw picregq +%define bilinear_filter_vb picregq +%define npicregs 1 +%else +%define fourtap_filter_hw fourtap_filter_hw_m +%define sixtap_filter_hw sixtap_filter_hw_m +%define fourtap_filter_hb fourtap_filter_hb_m +%define sixtap_filter_hb sixtap_filter_hb_m +%define fourtap_filter_v fourtap_filter_v_m +%define sixtap_filter_v sixtap_filter_v_m +%define bilinear_filter_vw bilinear_filter_vw_m +%define bilinear_filter_vb bilinear_filter_vb_m +%define npicregs 0 +%endif + +filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 + +filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 +filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 + +pw_20091: times 4 dw 20091 +pw_17734: times 4 dw 17734 + +cextern pw_3 +cextern pw_4 +cextern pw_64 
+cextern pw_256 + +SECTION .text + +;------------------------------------------------------------------------------- +; subpel MC functions: +; +; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride, +; const uint8_t *src, ptrdiff_t srcstride, +; int height, int mx, int my); +;------------------------------------------------------------------------------- + +%macro FILTER_SSSE3 1 +cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] + mova m3, [filter_h6_shuf2] + mova m4, [filter_h6_shuf3] +%ifdef PIC + lea picregq, [sixtap_filter_hb_m] +%endif + mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes + mova m6, [sixtap_filter_hb+mxq*8-32] + mova m7, [sixtap_filter_hb+mxq*8-16] + +.nextrow: + movu m0, [srcq-2] + mova m1, m0 + mova m2, m0 +%if mmsize == 8 +; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the +; shuffle with a memory operand + punpcklbw m0, [srcq+3] +%else + pshufb m0, [filter_h6_shuf1] +%endif + pshufb m1, m3 + pshufb m2, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + paddsw m0, m1 + paddsw m0, m2 + pmulhrsw m0, [pw_256] + packuswb m0, m0 + movh [dstq], m0 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 + mova m2, [pw_256] + mova m3, [filter_h2_shuf] + mova m4, [filter_h4_shuf] +%ifdef PIC + lea picregq, [fourtap_filter_hb_m] +%endif + mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes + mova m6, [fourtap_filter_hb+mxq] + +.nextrow: + movu m0, [srcq-1] + mova m1, m0 + pshufb m0, m3 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m6 + paddsw m0, m1 + pmulhrsw m0, m2 + packuswb m0, m0 + movh [dstq], m0 ; store + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg 
.nextrow + RET + +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 +%ifdef PIC + lea picregq, [fourtap_filter_hb_m] +%endif + mova m5, [fourtap_filter_hb+myq-16] + mova m6, [fourtap_filter_hb+myq] + mova m7, [pw_256] + + ; read 3 lines + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq + +.nextrow: + movh m3, [srcq+2*srcstrideq] ; read new row + mova m4, m0 + mova m0, m1 + punpcklbw m4, m1 + mova m1, m2 + punpcklbw m2, m3 + pmaddubsw m4, m5 + pmaddubsw m2, m6 + paddsw m4, m2 + mova m2, m3 + pmulhrsw m4, m7 + packuswb m4, m4 + movh [dstq], m4 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + lea myd, [myq*3] +%ifdef PIC + lea picregq, [sixtap_filter_hb_m] +%endif + lea myq, [sixtap_filter_hb+myq*8] + + ; read 5 lines + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] + +.nextrow: + movh m5, [srcq+2*srcstrideq] ; read new row + mova m6, m0 + punpcklbw m6, m5 + mova m0, m1 + punpcklbw m1, m2 + mova m7, m3 + punpcklbw m7, m4 + pmaddubsw m6, [myq-48] + pmaddubsw m1, [myq-32] + pmaddubsw m7, [myq-16] + paddsw m6, m1 + paddsw m6, m7 + mova m1, m2 + mova m2, m3 + pmulhrsw m6, [pw_256] + mova m3, m4 + packuswb m6, m6 + mova m4, m5 + movh [dstq], m6 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET +%endmacro + +INIT_MMX ssse3 +FILTER_SSSE3 4 +INIT_XMM ssse3 +FILTER_SSSE3 8 + +; 4x4 block, H-only 4-tap filter +INIT_MMX mmxext +cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 +%ifdef PIC + lea picregq, 
; 4-pixel-wide block, H-only 6-tap filter (MMXEXT)
;
; Arguments follow the prototype given at the top of the subpel MC section:
;   (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src,
;    ptrdiff_t srcstride, int height, int mx)
;
; Each output pixel is a 6-tap sum over src[x-2 .. x+3], plus 64, shifted
; right by 7 and clipped to 8 bits. Two pixels are computed per pmaddwd
; group, so every row is processed as two sets of two pixels.
;
; Coefficient lookup: mx*24 - 48 selects three consecutive 16-byte rows of
; sixtap_filter_hw_m (tap pairs F0/F1, F2/F3, F4/F5); movq uses the first
; 8 bytes of each row. The table holds three such triples.
; NOTE(review): that matches mx in {2, 4, 6} only - confirm callers never
; pass an odd mx here.
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]                     ; rounding constant
    pxor      mm3, mm3                         ; zero, for byte->word unpacks

.nextrow:
    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                     ; byte ABCD..
    punpcklbw mm1, mm3                     ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                ; byte CDEF..
    punpckhbw mm2, mm3                     ; byte->word EFGH
    punpcklbw mm0, mm3                     ; byte->word CDEF
    pshufw    mm1, mm1, 0x94               ; word ABBC
    pshufw    mm2, mm2, 0x94               ; word EFFG
    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94               ; word CDDE (mm3 is no longer zero)
    movq      mm0, mm3                     ; backup for second set of pixels
    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
    paddd     mm1, mm3                     ; add to 1st 2px cache
    movq      mm3, mm2                     ; backup for second set of pixels
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm1, mm2                     ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                     ; add to 2nd 2px cache
    pxor      mm3, mm3                     ; restore the zero register
    punpcklbw mm2, mm3                     ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9               ; word GHHI
    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
    paddd     mm0, mm2                     ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                     ; merge dword->word (4px)
    paddsw    mm1, mm7                     ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                     ; clip and word->bytes
    movd   [dstq], mm1                     ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                          ; next row
    jg .nextrow
    RET
; 8-pixel-wide block, H-only 6-tap filter (SSE2)
;
; Arguments follow the prototype given at the top of the subpel MC section:
;   (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src,
;    ptrdiff_t srcstride, int height, int mx)
;
; For every row, six 8-pixel windows at src-2 .. src+3 are widened to words,
; each is multiplied by its (broadcast) tap, the products are summed with
; saturation, 64 is added, and the result is shifted right by 7 and packed
; to bytes.
;
; Coefficient lookup: mx*48 - 96 selects six consecutive 16-byte rows of
; sixtap_filter_v_m (one broadcast tap per row).
; NOTE(review): the table holds three 6-row sets, which matches mx in
; {2, 4, 6} - confirm callers never pass an odd mx here.
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4                        ; mx *= 48
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96] ; mxq -> first of the six tap rows
    pxor      m7, m7                       ; zero, for byte->word unpacks
    mova      m6, [pw_64]                  ; rounding constant
%ifdef m8
    ; enough vector registers: keep all six taps resident
    ; (presumably x86-64 only - m8 is defined by x86inc, not in this file)
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    ; taps are read from memory on every row instead
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    ; Taps 1 and 4 are the negative ones in sixtap_filter_v_m. They are summed
    ; first, apparently so the saturating adds see them before the large
    ; positive centre taps.
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6                       ; rounding
    psraw     m0, 7
    packuswb  m0, m7                       ; clip and word->bytes
    movh  [dstq], m0                       ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec heightd                            ; next row
    jg .nextrow
    RET
mova m3, m4 + pmullw m0, [myq+0] + pmullw m4, m5 + paddsw m4, m0 + + ; then calculate positive taps + mova m0, m1 + pmullw m1, [myq+16] + paddsw m4, m1 + mova m1, m2 + pmullw m2, [myq+32] + paddsw m4, m2 + mova m2, m3 + + ; round/clip/store + paddsw m4, m6 + psraw m4, 7 + packuswb m4, m7 + movh [dstq], m4 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET + + +; 4x4 block, V-only 6-tap filter +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 + lea myq, [myq*3] +%ifdef PIC + lea picregq, [sixtap_filter_v_m] +%endif + lea myq, [sixtap_filter_v+myq-96] + pxor m7, m7 + + ; read 5 lines + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + +.nextrow: + ; first calculate negative taps (to prevent losing positive overflows) + mova m5, m1 + pmullw m5, [myq+16] + mova m6, m4 + pmullw m6, [myq+64] + paddsw m6, m5 + + ; then calculate positive taps + movh m5, [srcq+2*srcstrideq] ; read new row + punpcklbw m5, m7 + pmullw m0, [myq+0] + paddsw m6, m0 + mova m0, m1 + mova m1, m2 + pmullw m2, [myq+32] + paddsw m6, m2 + mova m2, m3 + pmullw m3, [myq+48] + paddsw m6, m3 + mova m3, m4 + mova m4, m5 + pmullw m5, [myq+80] + paddsw m6, m5 + + ; round/clip/store + paddsw m6, [pw_64] + psraw m6, 7 + packuswb m6, m7 + movh [dstq], m6 + + ; go to next line + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row + jg .nextrow + RET +%endmacro + +INIT_MMX mmxext +FILTER_V 4 +INIT_XMM sse2 +FILTER_V 8 + +%macro FILTER_BILINEAR 1 +%if cpuflag(ssse3) +cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 +%ifdef PIC + lea picregq, [bilinear_filter_vb_m] 
+%endif + pxor m4, m4 + mova m3, [bilinear_filter_vb+myq-16] +.nextrow: + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m2, [srcq+srcstrideq*2] + punpcklbw m0, m1 + punpcklbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + psraw m0, 2 + psraw m1, 2 + pavgw m0, m4 + pavgw m1, m4 +%if mmsize==8 + packuswb m0, m0 + packuswb m1, m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 +%else + packuswb m0, m1 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 +%endif +%else ; cpuflag(ssse3) +cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 +%ifdef PIC + lea picregq, [bilinear_filter_vw_m] +%endif + pxor m6, m6 + mova m5, [bilinear_filter_vw+myq-1*16] + neg myq + mova m4, [bilinear_filter_vw+myq+7*16] +.nextrow: + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m3, [srcq+srcstrideq*2] + punpcklbw m0, m6 + punpcklbw m1, m6 + punpcklbw m3, m6 + mova m2, m1 + pmullw m0, m4 + pmullw m1, m5 + pmullw m2, m4 + pmullw m3, m5 + paddsw m0, m1 + paddsw m2, m3 + psraw m0, 2 + psraw m2, 2 + pavgw m0, m6 + pavgw m2, m6 +%if mmsize == 8 + packuswb m0, m0 + packuswb m2, m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 +%else + packuswb m0, m2 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 +%endif +%endif ; cpuflag(ssse3) + + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 + jg .nextrow + RET + +%if cpuflag(ssse3) +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 +%ifdef PIC + lea picregq, [bilinear_filter_vb_m] +%endif + pxor m4, m4 + mova m2, [filter_h2_shuf] + mova m3, [bilinear_filter_vb+mxq-16] +.nextrow: + movu m0, [srcq+srcstrideq*0] + movu m1, [srcq+srcstrideq*1] + pshufb m0, m2 + pshufb m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + psraw m0, 2 + psraw m1, 2 + pavgw m0, m4 + pavgw m1, m4 +%if mmsize==8 + packuswb m0, m0 + packuswb m1, m1 + 
; Plain 16-byte-wide block copy, two rows per iteration (SSE).
; Arguments: dst, dststride, src, srcstride, height.
;
; Loads are unaligned (movups) but stores are aligned (movaps), so every
; destination row must be 16-byte aligned; an unaligned dst or dststride
; faults. The loop is a do/while that consumes two rows per pass and copies
; at least two rows.
; NOTE(review): height is presumably always even and positive - confirm
; against callers.
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]       ; src += 2 * srcstride
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]       ; dst += 2 * dststride
    sub heightd, 2                         ; } while ((height -= 2) > 0)
    jg .nextrow
    RET
+ +;----------------------------------------------------------------------------- +; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +%macro ADD_DC 4 + %4 m2, [dst1q+%3] + %4 m3, [dst1q+strideq+%3] + %4 m4, [dst2q+%3] + %4 m5, [dst2q+strideq+%3] + paddusb m2, %1 + paddusb m3, %1 + paddusb m4, %1 + paddusb m5, %1 + psubusb m2, %2 + psubusb m3, %2 + psubusb m4, %2 + psubusb m5, %2 + %4 [dst1q+%3], m2 + %4 [dst1q+strideq+%3], m3 + %4 [dst2q+%3], m4 + %4 [dst2q+strideq+%3], m5 +%endmacro + +%macro VP8_IDCT_DC_ADD 0 +cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride + ; load data + movd m0, [blockq] + pxor m1, m1 + + ; calculate DC + paddw m0, [pw_4] + movd [blockq], m1 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + movd m2, [dst1q] + movd m3, [dst1q+strideq] + movd m4, [dst2q] + movd m5, [dst2q+strideq] + psraw m0, 3 + pshuflw m0, m0, 0 + punpcklqdq m0, m0 + punpckldq m2, m3 + punpckldq m4, m5 + punpcklbw m2, m1 + punpcklbw m4, m1 + paddw m2, m0 + paddw m4, m0 + packuswb m2, m4 + movd [dst1q], m2 +%if cpuflag(sse4) + pextrd [dst1q+strideq], m2, 1 + pextrd [dst2q], m2, 2 + pextrd [dst2q+strideq], m2, 3 +%else + psrldq m2, 4 + movd [dst1q+strideq], m2 + psrldq m2, 4 + movd [dst2q], m2 + psrldq m2, 4 + movd [dst2q+strideq], m2 +%endif + RET +%endmacro + +INIT_XMM sse2 +VP8_IDCT_DC_ADD +INIT_XMM sse4 +VP8_IDCT_DC_ADD + +;----------------------------------------------------------------------------- +; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +INIT_XMM sse2 +cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride + ; load data + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D + punpckldq m0, m1 ; A B C D + pxor m1, m1 + + ; calculate DC 
+ paddw m0, [pw_4] + movd [blockq+32*0], m1 + movd [blockq+32*1], m1 + movd [blockq+32*2], m1 + movd [blockq+32*3], m1 + psraw m0, 3 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 + punpcklbw m0, m0 + punpcklbw m1, m1 + punpcklbw m0, m0 + punpcklbw m1, m1 + + ; add DC + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + ADD_DC m0, m1, 0, mova + RET + +;----------------------------------------------------------------------------- +; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +INIT_MMX mmx +cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride + ; load data + movd m0, [blockq+32*0] ; A + movd m1, [blockq+32*2] ; C + punpcklwd m0, [blockq+32*1] ; A B + punpcklwd m1, [blockq+32*3] ; C D + punpckldq m0, m1 ; A B C D + pxor m6, m6 + + ; calculate DC + paddw m0, [pw_4] + movd [blockq+32*0], m6 + movd [blockq+32*1], m6 + movd [blockq+32*2], m6 + movd [blockq+32*3], m6 + psraw m0, 3 + psubw m6, m0 + packuswb m0, m0 + packuswb m6, m6 + punpcklbw m0, m0 ; AABBCCDD + punpcklbw m6, m6 ; AABBCCDD + movq m1, m0 + movq m7, m6 + punpcklbw m0, m0 ; AAAABBBB + punpckhbw m1, m1 ; CCCCDDDD + punpcklbw m6, m6 ; AAAABBBB + punpckhbw m7, m7 ; CCCCDDDD + + ; add DC + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+strideq*2] + ADD_DC m0, m6, 0, mova + lea dst1q, [dst1q+strideq*4] + lea dst2q, [dst2q+strideq*4] + ADD_DC m1, m7, 0, mova + RET + +;----------------------------------------------------------------------------- +; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +;----------------------------------------------------------------------------- + +; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) +; this macro assumes that m6/m7 have words for 20091/17734 loaded +%macro VP8_MULTIPLY_SUMSUB 4 + mova %3, %1 + mova %4, %2 + pmulhw %3, m6 ;20091(1) + pmulhw %4, m6 ;20091(2) + paddw %3, 
%1 + paddw %4, %2 + paddw %1, %1 + paddw %2, %2 + pmulhw %1, m7 ;35468(1) + pmulhw %2, m7 ;35468(2) + psubw %1, %4 + paddw %2, %3 +%endmacro + +; calculate x0=%1+%3; x1=%1-%3 +; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) +; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) +; %5/%6 are temporary registers +; we assume m6/m7 have constant words 20091/17734 loaded in them +%macro VP8_IDCT_TRANSFORM4x4_1D 6 + SUMSUB_BA w, %3, %1, %5 ;t0, t1 + VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 + SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3 + SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2 + SWAP %4, %1 + SWAP %4, %3 +%endmacro + +INIT_MMX sse +cglobal vp8_idct_add, 3, 3, 0, dst, block, stride + ; load block data + movq m0, [blockq+ 0] + movq m1, [blockq+ 8] + movq m2, [blockq+16] + movq m3, [blockq+24] + movq m6, [pw_20091] + movq m7, [pw_17734] + xorps xmm0, xmm0 + movaps [blockq+ 0], xmm0 + movaps [blockq+16], xmm0 + + ; actual IDCT + VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + paddw m0, [pw_4] + VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + + ; store + pxor m4, m4 + DEFINE_ARGS dst1, dst2, stride + lea dst2q, [dst1q+2*strideq] + STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq + STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq + + RET + +;----------------------------------------------------------------------------- +; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16]) +;----------------------------------------------------------------------------- + +%macro SCATTER_WHT 3 + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(0+%3)], dc1w + mov [blockq+2*16*(1+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 + psrlq m%1, 32 + psrlq m%2, 32 + mov [blockq+2*16*(4+%3)], dc1w + mov [blockq+2*16*(5+%3)], dc2w + movd dc1d, m%1 + movd dc2d, m%2 + mov [blockq+2*16*(8+%3)], dc1w + mov [blockq+2*16*(9+%3)], dc2w + shr dc1d, 16 + shr dc2d, 16 + mov [blockq+2*16*(12+%3)], dc1w + mov 
[blockq+2*16*(13+%3)], dc2w +%endmacro + +%macro HADAMARD4_1D 4 + SUMSUB_BADC w, %2, %1, %4, %3 + SUMSUB_BADC w, %4, %2, %3, %1 + SWAP %1, %4, %3 +%endmacro + +INIT_MMX sse +cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2 + movq m0, [dc1q] + movq m1, [dc1q+8] + movq m2, [dc1q+16] + movq m3, [dc1q+24] + xorps xmm0, xmm0 + movaps [dc1q+ 0], xmm0 + movaps [dc1q+16], xmm0 + HADAMARD4_1D 0, 1, 2, 3 + TRANSPOSE4x4W 0, 1, 2, 3, 4 + paddw m0, [pw_3] + HADAMARD4_1D 0, 1, 2, 3 + psraw m0, 3 + psraw m1, 3 + psraw m2, 3 + psraw m3, 3 + SCATTER_WHT 0, 1, 0 + SCATTER_WHT 2, 3, 2 + RET diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_init.c b/media/ffvpx/libavcodec/x86/vp8dsp_init.c new file mode 100644 index 0000000000..bd20da1fc9 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp8dsp_init.c @@ -0,0 +1,383 @@ +/* + * VP8 DSP functions x86-optimized + * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> + * Copyright (c) 2010 Fiona Glaser <fiona@x264.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem_internal.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp8dsp.h" + +#if HAVE_X86ASM + +/* + * MC functions + */ +void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); 
+void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); +void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + + +void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, 
int mx, int my); +void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int height, int mx, int my); + +/* TAP_W16: define a static 16-pixel-wide MC function that calls the matching 8-pixel-wide function twice, the second time with dst and src advanced by 8. */ +#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \ +static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ + dst, dststride, src, srcstride, height, mx, my); \ + ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ + dst + 8, dststride, src + 8, srcstride, height, mx, my); \ +} +/* TAP_W8: same construction for 8-pixel width, built from two 4-pixel-wide calls offset by 4. src is const-qualified so the generated signature matches TAP_W16 and the ff_put_vp8_* prototypes. */ +#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \ +static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ + dst, dststride, src, srcstride, height, mx, my); \ + ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ + dst + 4, dststride, src + 4, srcstride, height, mx, my); \ +} + +TAP_W16(sse2, epel, h6) +TAP_W16(sse2, epel, v6) +TAP_W16(sse2, bilinear, h) +TAP_W16(sse2, bilinear, v) + +TAP_W16(ssse3, epel, h6) +TAP_W16(ssse3, epel, v6) +TAP_W16(ssse3, bilinear, h) +TAP_W16(ssse3, bilinear, v) + +/* HVTAP: define a static 2-D epel function. The horizontal filter writes height + TAPNUMY - 1 rows into the aligned scratch buffer tmp (row pitch SIZE), starting TAPNUMY / 2 - 1 rows above src; the vertical filter then reads from tmpptr, which skips those leading rows, and writes height rows to dst. */ +#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ +static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \ + uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \ + src -= srcstride * (TAPNUMY / 2 - 1); \ + ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \ + tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \ + ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \ + dst, dststride, tmpptr, SIZE, height, mx, my); \ +} + +#define 
HVTAPMMX(x, y) \ +HVTAP(mmxext, 8, x, y, 4, 8) + +HVTAPMMX(4, 4) +HVTAPMMX(4, 6) +HVTAPMMX(6, 4) +HVTAPMMX(6, 6) + +#define HVTAPSSE2(x, y, w) \ +HVTAP(sse2, 16, x, y, w, 16) \ +HVTAP(ssse3, 16, x, y, w, 16) + +HVTAPSSE2(4, 4, 8) +HVTAPSSE2(4, 6, 8) +HVTAPSSE2(6, 4, 8) +HVTAPSSE2(6, 6, 8) +HVTAPSSE2(6, 6, 16) + +HVTAP(ssse3, 16, 4, 4, 4, 8) +HVTAP(ssse3, 16, 4, 6, 4, 8) +HVTAP(ssse3, 16, 6, 4, 4, 8) +HVTAP(ssse3, 16, 6, 6, 4, 8) + +#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \ +static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ + uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \ + ptrdiff_t srcstride, int height, int mx, int my) \ +{ \ + LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \ + ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \ + tmp, SIZE, src, srcstride, height + 1, mx, my); \ + ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \ + dst, dststride, tmp, SIZE, height, mx, my); \ +} + +HVBILIN(mmxext, 8, 4, 8) +HVBILIN(sse2, 8, 8, 16) +HVBILIN(sse2, 8, 16, 16) +HVBILIN(ssse3, 8, 4, 8) +HVBILIN(ssse3, 8, 8, 16) +HVBILIN(ssse3, 8, 16, 16) + +void ff_vp8_idct_dc_add_sse2(uint8_t *dst, int16_t block[16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], + ptrdiff_t stride); +void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16], + ptrdiff_t stride); +void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]); +void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride); + +#define DECLARE_LOOP_FILTER(NAME) \ +void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim); \ +void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim); \ +void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ + ptrdiff_t 
stride, \ + int e, int i, int hvt); \ +void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); \ +void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ + ptrdiff_t stride, \ + int e, int i, int hvt); \ +void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); \ +void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t s, \ + int e, int i, int hvt); + +DECLARE_LOOP_FILTER(sse2) +DECLARE_LOOP_FILTER(ssse3) +DECLARE_LOOP_FILTER(sse4) + +#endif /* HAVE_X86ASM */ + +#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ + c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT + +#define VP8_MC_FUNC(IDX, SIZE, OPT) \ + c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \ + c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \ + VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) + +#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \ + c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; 
\ + c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ + c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT + + +av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + c->put_vp8_epel_pixels_tab[1][0][0] = + c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; + } + + /* note that 4-tap width=16 functions are missing because w=16 + * is only used for luma, and luma is always a copy or sixtap. */ + if (EXTERNAL_MMXEXT(cpu_flags)) { + VP8_MC_FUNC(2, 4, mmxext); + VP8_BILINEAR_MC_FUNC(2, 4, mmxext); + } + + if (EXTERNAL_SSE(cpu_flags)) { + c->put_vp8_epel_pixels_tab[0][0][0] = + c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; + } + + if (EXTERNAL_SSE2_SLOW(cpu_flags)) { + VP8_LUMA_MC_FUNC(0, 16, sse2); + VP8_MC_FUNC(1, 8, sse2); + VP8_BILINEAR_MC_FUNC(0, 16, sse2); + VP8_BILINEAR_MC_FUNC(1, 8, sse2); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + VP8_LUMA_MC_FUNC(0, 16, ssse3); + VP8_MC_FUNC(1, 8, ssse3); + VP8_MC_FUNC(2, 4, ssse3); + VP8_BILINEAR_MC_FUNC(0, 16, ssse3); + VP8_BILINEAR_MC_FUNC(1, 8, ssse3); + VP8_BILINEAR_MC_FUNC(2, 4, ssse3); + } +#endif /* HAVE_X86ASM */ +} + +av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; + } + + if (EXTERNAL_SSE(cpu_flags)) { + c->vp8_idct_add = ff_vp8_idct_add_sse; + c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; + } + + if (EXTERNAL_SSE2_SLOW(cpu_flags)) { + c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; + + 
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; + + c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; + c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2; + c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; + + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; + + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; + + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; + + c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; + c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; + c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3; + + c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3; + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; + c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; + } + + if (EXTERNAL_SSE4(cpu_flags)) { + c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; + + c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; + c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; + c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; + } +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm new file mode 100644 index 0000000000..ef397efd3e --- /dev/null +++ 
b/media/ffvpx/libavcodec/x86/vp8dsp_loopfilter.asm @@ -0,0 +1,1234 @@ +;****************************************************************************** +;* VP8 MMXEXT optimizations +;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> +;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_27: times 8 dw 27 +pw_63: times 8 dw 63 + +pb_4: times 16 db 4 +pb_F8: times 16 db 0xF8 +pb_FE: times 16 db 0xFE +pb_27_63: times 8 db 27, 63 +pb_18_63: times 8 db 18, 63 +pb_9_63: times 8 db 9, 63 + +cextern pb_1 +cextern pb_3 +cextern pw_9 +cextern pw_18 +cextern pb_80 + +SECTION .text + +;----------------------------------------------------------------------------- +; void ff_vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, ptrdiff_t stride, int flim); +;----------------------------------------------------------------------------- + +; macro called with 7 mm register indexes as argument, and 5 regular registers +; first 11 mean the same as READ_8x4_TRANSPOSED above +; fifth regular register is scratchspace to reach the bottom 8 rows, it +; will be set to second regular 
register + 8*stride at the end +; argument summary: %1-%7 = mm register indexes, %8/%9 = first/second row pointers, +; %10 = -stride, %11 = +stride, %12 = scratch pointer (clobbered) +%macro READ_16x4_INTERLEAVED 12 + ; transpose 16 (A-P) rows of 4 pixels each + ; scratch = first row pointer + 8*stride; expressed through the macro arguments + ; rather than fixed r0/r2 so it does not depend on the caller's register allocation + lea %12, [%8+8*%11] + + ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M + movd m%1, [%8+%10*4] ; A0-3 + movd m%3, [%12+%10*4] ; I0-3 + movd m%2, [%8+%10*2] ; C0-3 + movd m%4, [%12+%10*2] ; K0-3 + movd m%6, [%8+%10] ; D0-3 + movd m%5, [%12+%10] ; L0-3 + movd m%7, [%12] ; M0-3 + add %12, %11 + punpcklbw m%1, m%3 ; A/I + movd m%3, [%8] ; E0-3 + punpcklbw m%2, m%4 ; C/K + punpcklbw m%6, m%5 ; D/L + punpcklbw m%3, m%7 ; E/M + punpcklbw m%2, m%6 ; C/D/K/L interleaved + + ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P + movd m%5, [%9+%10*4] ; B0-3 + movd m%4, [%12+%10*4] ; J0-3 + movd m%7, [%9] ; F0-3 + movd m%6, [%12] ; N0-3 + punpcklbw m%5, m%4 ; B/J + punpcklbw m%7, m%6 ; F/N + punpcklbw m%1, m%5 ; A/B/I/J interleaved + punpcklbw m%3, m%7 ; E/F/M/N interleaved + movd m%4, [%9+%11] ; G0-3 + movd m%6, [%12+%11] ; O0-3 + movd m%5, [%9+%11*2] ; H0-3 + movd m%7, [%12+%11*2] ; P0-3 + punpcklbw m%4, m%6 ; G/O + punpcklbw m%5, m%7 ; H/P + punpcklbw m%4, m%5 ; G/H/O/P interleaved +%endmacro + +; write 4 xmm registers of 4 dwords each +; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular +; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride +; we add 1*stride to the third regular register in the process +; the 10th argument is 16 if it's a Y filter (i.e. 
all regular registers cover the +; same memory region), or 8 if they cover two separate buffers (third one points to +; a different memory region than the first two), allowing for more optimal code for +; the 16-width case +%macro WRITE_4x4D 10 + ; write out (4 dwords per register), start with dwords zero + movd [%5+%8*4], m%1 + movd [%5], m%2 + movd [%7+%8*4], m%3 + movd [%7], m%4 + + ; store dwords 1 + psrldq m%1, 4 + psrldq m%2, 4 + psrldq m%3, 4 + psrldq m%4, 4 + movd [%6+%8*4], m%1 + movd [%6], m%2 +%if %10 == 16 + movd [%6+%9*4], m%3 +%endif + movd [%7+%9], m%4 + + ; write dwords 2 + psrldq m%1, 4 + psrldq m%2, 4 +%if %10 == 8 + movd [%5+%8*2], m%1 + movd %5d, m%3 +%endif + psrldq m%3, 4 + psrldq m%4, 4 +%if %10 == 16 + movd [%5+%8*2], m%1 +%endif + movd [%6+%9], m%2 + movd [%7+%8*2], m%3 + movd [%7+%9*2], m%4 + add %7, %9 + + ; store dwords 3 + psrldq m%1, 4 + psrldq m%2, 4 + psrldq m%3, 4 + psrldq m%4, 4 +%if %10 == 8 + mov [%7+%8*4], %5d + movd [%6+%8*2], m%1 +%else + movd [%5+%8], m%1 +%endif + movd [%6+%9*2], m%2 + movd [%7+%8*2], m%3 + movd [%7+%9*2], m%4 +%endmacro + +%macro WRITE_8W 5 +%if cpuflag(sse4) + pextrw [%3+%4*4], %1, 0 + pextrw [%2+%4*4], %1, 1 + pextrw [%3+%4*2], %1, 2 + pextrw [%3+%4 ], %1, 3 + pextrw [%3 ], %1, 4 + pextrw [%2 ], %1, 5 + pextrw [%2+%5 ], %1, 6 + pextrw [%2+%5*2], %1, 7 +%else + movd %2d, %1 + psrldq %1, 4 + mov [%3+%4*4], %2w + shr %2, 16 + add %3, %5 + mov [%3+%4*4], %2w + + movd %2d, %1 + psrldq %1, 4 + add %3, %4 + mov [%3+%4*2], %2w + shr %2, 16 + mov [%3+%4 ], %2w + + movd %2d, %1 + psrldq %1, 4 + mov [%3 ], %2w + shr %2, 16 + mov [%3+%5 ], %2w + + movd %2d, %1 + add %3, %5 + mov [%3+%5 ], %2w + shr %2, 16 + mov [%3+%5*2], %2w +%endif +%endmacro + +%macro SIMPLE_LOOPFILTER 2 +cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, flim, m0 ; splat "flim" into register + + ; set up indexes to address 4 rows + DEFINE_ARGS dst1, mstride, stride, 
dst3, dst2 + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+4*strideq-2] +%endif + +%ifidn %1, v + ; read 4 half/full rows of pixels + mova m0, [dst1q+mstrideq*2] ; p1 + mova m1, [dst1q+mstrideq] ; p0 + mova m2, [dst1q] ; q0 + mova m3, [dst1q+ strideq] ; q1 +%else ; h + lea dst2q, [dst1q+ strideq] + + READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif + + ; simple_limit + mova m5, m2 ; m5=backup of q0 + mova m6, m1 ; m6=backup of p0 + psubusb m1, m2 ; p0-q0 + psubusb m2, m6 ; q0-p0 + por m1, m2 ; FFABS(p0-q0) + paddusb m1, m1 ; m1=FFABS(p0-q0)*2 + + mova m4, m3 + mova m2, m0 + psubusb m3, m0 ; q1-p1 + psubusb m0, m4 ; p1-q1 + por m3, m0 ; FFABS(p1-q1) + mova m0, [pb_80] + pxor m2, m0 + pxor m4, m0 + psubsb m2, m4 ; m2=p1-q1 (signed) backup for below + pand m3, [pb_FE] + psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed + paddusb m3, m1 + psubusb m3, m7 + pxor m1, m1 + pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) + + ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) + mova m4, m5 + pxor m5, m0 + pxor m0, m6 + psubsb m5, m0 ; q0-p0 (signed) + paddsb m2, m5 + paddsb m2, m5 + paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) + pand m2, m3 ; apply filter mask (m3) + + mova m3, [pb_F8] + mova m1, m2 + paddsb m2, [pb_4] ; f1<<3=a+4 + paddsb m1, [pb_3] ; f2<<3=a+3 + pand m2, m3 + pand m1, m3 ; cache f2<<3 + + pxor m0, m0 + pxor m3, m3 + pcmpgtb m0, m2 ; which values are <0? + psubb m3, m2 ; -f1<<3 + psrlq m2, 3 ; +f1 + psrlq m3, 3 ; -f1 + pand m3, m0 + pandn m0, m2 + psubusb m4, m0 + paddusb m4, m3 ; q0-f1 + + pxor m0, m0 + pxor m3, m3 + pcmpgtb m0, m1 ; which values are <0? 
;-----------------------------------------------------------------------------
; void ff_vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] ptrdiff_t stride,
;                                               int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------

; INNER_LOOPFILTER <dir>, <size>
;   %1  v or h; selects the row-load/store path (v) or the transposing
;       load/store path (h) via the "%ifidn %1, v" blocks below.
;   %2  16 emits vp8_%1_loop_filter16y_inner (luma, one dst pointer);
;        8 emits vp8_%1_loop_filter8uv_inner (chroma, dst + dst8 pointers).
;
; Storage of the splatted E / I / hev_thr values and of the temporaries depends
; on whether m8 is defined: without it they are spilled to [rsp+mmsize*n]
; (stack_size reserves 4 slots for v, 5 for h); with it they stay in m8-m12.
; NOTE(review): m8 being defined presumably means an x86-64 build, as the
; "x86-64 && sse2" remark further down suggests -- confirm against x86inc.
;
; SWAP only renames registers at assembly time, so every "mova mA, mB" followed
; by "SWAP A, B" below is a copy whose two names are then exchanged.
%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ;               [3]=hev() result
%define stack_size mmsize * -4
%else ; h    ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif

%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor            m7, m7
%endif

%ifndef m8
    ; splat function arguments
    SPLATB_REG      m0, flimEq, m7 ; E
    SPLATB_REG      m1, flimIq, m7 ; I
    SPLATB_REG      m2, hevthrq, m7 ; hev_thresh

; m_maskres and m_p0backup deliberately alias the same slot ([rsp+mmsize*3])
%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]

    mova            m_flimE, m0
    mova            m_flimI, m1
    mova            m_hevthr, m2
%else
; m_maskres and m_p0backup deliberately alias the same register (m12)
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_p0backup m12
%define m_q0backup m8

    ; splat function arguments
    SPLATB_REG      m_flimE, flimEq, m7 ; E
    SPLATB_REG      m_flimI, flimIq, m7 ; I
    SPLATB_REG      m_hevthr, hevthrq, m7 ; hev_thresh
%endif

    ; re-label the argument registers: the incoming stride register becomes
    ; mstride (negated below) and a copy of the positive value goes to stride
%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov             strideq, mstrideq
    neg             mstrideq
%ifidn %1, h
    lea             dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea             dst8q, [dst8q+strideq*4-4]
%endif
%endif

    ; read
    lea             dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow          m0, [dst1q+mstrideq*4] ; p3
    movrow          m1, [dst2q+mstrideq*4] ; p2
    movrow          m2, [dst1q+mstrideq*2] ; p1
    movrow          m5, [dst2q] ; q1
    movrow          m6, [dst2q+ strideq*1] ; q2
    movrow          m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    ; chroma: the rows of the second plane (dst8) fill the high register halves
    movhps          m0, [dst8q+mstrideq*4]
    movhps          m2, [dst8q+mstrideq*2]
    add             dst8q, strideq
    movhps          m1, [dst8q+mstrideq*4]
    movhps          m5, [dst8q]
    movhps          m6, [dst8q+ strideq ]
    movhps          m7, [dst8q+ strideq*2]
    add             dst8q, mstrideq
%endif
%else ; h
%if %2 == 16
    lea             dst8q, [dst1q+ strideq*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh            m0, [dst1q+mstrideq*4]
    movh            m1, [dst8q+mstrideq*4]
    movh            m2, [dst1q+mstrideq*2]
    movh            m5, [dst8q+mstrideq*2]
    movh            m3, [dst1q+mstrideq ]
    movh            m6, [dst8q+mstrideq ]
    movh            m4, [dst1q]
    movh            m7, [dst8q]
    punpcklbw       m0, m1 ; A/I
    punpcklbw       m2, m5 ; C/K
    punpcklbw       m3, m6 ; D/L
    punpcklbw       m4, m7 ; E/M

    add             dst8q, strideq
    movh            m1, [dst2q+mstrideq*4]
    movh            m6, [dst8q+mstrideq*4]
    movh            m5, [dst2q]
    movh            m7, [dst8q]
    punpcklbw       m1, m6 ; B/J
    punpcklbw       m5, m7 ; F/N
    movh            m6, [dst2q+ strideq ]
    movh            m7, [dst8q+ strideq ]
    punpcklbw       m6, m7 ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B   0, 1, 2, 3, 7
%ifdef m8
    SWAP            1, 8
%else
    mova            m_q0backup, m1
%endif
    movh            m7, [dst2q+ strideq*2]
    movh            m1, [dst8q+ strideq*2]
    punpcklbw       m7, m1 ; H/P
    TRANSPOSE4x4B   4, 5, 6, 7, 1
    SBUTTERFLY      dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY      dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY      dq, 3, 7, 1 ; q2/q3
%ifdef m8
    SWAP            1, 8
    SWAP            2, 8
%else
    mova            m1, m_q0backup
    mova            m_q0backup, m2 ; store q0
%endif
    SBUTTERFLY      dq, 1, 5, 2 ; p1/p0
%ifdef m12
    SWAP            5, 12
%else
    mova            m_p0backup, m5 ; store p0
%endif
    SWAP            1, 4
    SWAP            2, 4
    SWAP            6, 3
    SWAP            5, 3
%endif

    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova            m4, m1
    SWAP            4, 1
    psubusb         m4, m0 ; p2-p3
    psubusb         m0, m1 ; p3-p2
    por             m0, m4 ; abs(p3-p2)

    mova            m4, m2
    SWAP            4, 2
    psubusb         m4, m1 ; p1-p2
    psubusb         m1, m2 ; p2-p1
    por             m1, m4 ; abs(p2-p1)

    mova            m4, m6
    SWAP            4, 6
    psubusb         m4, m7 ; q2-q3
    psubusb         m7, m6 ; q3-q2
    por             m7, m4 ; abs(q3-q2)

    mova            m4, m5
    SWAP            4, 5
    psubusb         m4, m6 ; q1-q2
    psubusb         m6, m5 ; q2-q1
    por             m6, m4 ; abs(q2-q1)

    pmaxub          m0, m1
    pmaxub          m6, m7
    pmaxub          m0, m6

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP            7, 3 ; now m7 is zero
%ifidn %1, v
    movrow          m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
    movhps          m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP            3, 12
%else
    mova            m3, m_p0backup
%endif

    mova            m1, m2
    SWAP            1, 2
    mova            m6, m3
    SWAP            3, 6
    psubusb         m1, m3 ; p1-p0
    psubusb         m6, m2 ; p0-p1
    por             m1, m6 ; abs(p1-p0)
    pmaxub          m0, m1 ; max_I
    SWAP            1, 4 ; max_hev_thresh

    SWAP            6, 4 ; now m6 is I
%ifidn %1, v
    movrow          m4, [dst1q] ; q0
%if mmsize == 16 && %2 == 8
    movhps          m4, [dst8q]
%endif
%elifdef m8
    SWAP            4, 8
%else
    mova            m4, m_q0backup
%endif
    mova            m1, m4
    SWAP            1, 4
    mova            m7, m5
    SWAP            7, 5
    psubusb         m1, m5 ; q0-q1
    psubusb         m7, m4 ; q1-q0
    por             m1, m7 ; abs(q1-q0)
    pxor            m7, m7
    pmaxub          m0, m1
    pmaxub          m6, m1
    ; saturating subtract + compare against zero yields an all-ones byte
    ; wherever the maximum did not exceed the threshold
    psubusb         m0, m_flimI
    psubusb         m6, m_hevthr
    pcmpeqb         m0, m7 ; max(abs(..)) <= I
    pcmpeqb         m6, m7 ; !(max(abs..) > thresh)
%ifdef m12
    SWAP            6, 12
%else
    mova            m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova            m1, m3
    SWAP            1, 3
    mova            m6, m4 ; keep copies of p0/q0 around for later use
    SWAP            6, 4
    psubusb         m1, m4 ; p0-q0
    psubusb         m6, m3 ; q0-p0
    por             m1, m6 ; abs(q0-p0)
    paddusb         m1, m1 ; m1=2*abs(q0-p0)

    mova            m7, m2
    SWAP            7, 2
    mova            m6, m5
    SWAP            6, 5
    psubusb         m7, m5 ; p1-q1
    psubusb         m6, m2 ; q1-p1
    por             m7, m6 ; abs(q1-p1)
    pxor            m6, m6
    ; clear each byte's low bit first so the 64-bit shift cannot carry a bit
    ; into the neighbouring byte
    pand            m7, [pb_FE]
    psrlq           m7, 1 ; abs(q1-p1)/2
    paddusb         m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb         m7, m_flimE
    pcmpeqb         m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand            m0, m7 ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova            m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova            m1, m4
    mova            m7, m3
    pxor            m1, m_pb_80
    pxor            m7, m_pb_80
    psubsb          m1, m7 ; (signed) q0-p0
    mova            m6, m2
    mova            m7, m5
    pxor            m6, m_pb_80
    pxor            m7, m_pb_80
    psubsb          m6, m7 ; (signed) p1-q1
    mova            m7, m_maskres
    pandn           m7, m6
    paddsb          m7, m1
    paddsb          m7, m1
    paddsb          m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)

    pand            m7, m0
    mova            m1, [pb_F8]
    mova            m6, m7
    paddsb          m7, [pb_3]
    paddsb          m6, [pb_4]
    pand            m7, m1
    pand            m6, m1

    pxor            m1, m1
    pxor            m0, m0
    pcmpgtb         m1, m7
    psubb           m0, m7
    psrlq           m7, 3 ; +f2
    psrlq           m0, 3 ; -f2
    pand            m0, m1
    pandn           m1, m7
    psubusb         m3, m0
    paddusb         m3, m1 ; p0+f2

    pxor            m1, m1
    pxor            m0, m0
    pcmpgtb         m0, m6
    psubb           m1, m6
    psrlq           m6, 3 ; +f1
    psrlq           m1, 3 ; -f1
    pand            m1, m0
    pandn           m0, m6
    psubusb         m4, m0
    paddusb         m4, m1 ; q0-f1

%ifdef m12
    SWAP            6, 12
%else
    mova            m6, m_maskres
%endif
    pxor            m7, m7
    pand            m0, m6
    pand            m1, m6
    psubusb         m1, [pb_1]
    pavgb           m0, m7 ; a
    pavgb           m1, m7 ; -a
    psubusb         m5, m0
    psubusb         m2, m1
    paddusb         m5, m1 ; q1-a
    paddusb         m2, m0 ; p1+a

    ; store
%ifidn %1, v
    movrow          [dst1q+mstrideq*2], m2
    movrow          [dst1q+mstrideq ], m3
    movrow          [dst1q], m4
    movrow          [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
    movhps          [dst8q+mstrideq*2], m2
    movhps          [dst8q+mstrideq ], m3
    movhps          [dst8q], m4
    movhps          [dst8q+ strideq ], m5
%endif
%else ; h
    add             dst1q, 2
    add             dst2q, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B   2, 3, 4, 5, 6

    lea             dst8q, [dst8q+mstrideq +2]
    WRITE_4x4D      2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif

    RET
%endmacro

; instantiate the v/h luma (16) and chroma (8) variants for sse2 and ssse3
INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
+%define m_q2backup [rsp+mmsize*6] +%if mmsize == 16 +%define m_limsign [rsp] +%else +%define m_limsign [rsp+mmsize*7] +%endif + + mova m_flimE, m0 + mova m_flimI, m1 + mova m_hevthr, m2 +%else ; sse2 on x86-64 +%define m_flimE m9 +%define m_flimI m10 +%define m_hevthr m11 +%define m_maskres m12 +%define m_limres m8 +%define m_p0backup m12 +%define m_q0backup m8 +%define m_p2backup m13 +%define m_q2backup m14 +%define m_limsign m9 + + ; splat function arguments + SPLATB_REG m_flimE, flimEq, m7 ; E + SPLATB_REG m_flimI, flimIq, m7 ; I + SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh +%endif + +%if %2 == 8 ; chroma + DEFINE_ARGS dst1, dst8, mstride, stride, dst2 +%else + DEFINE_ARGS dst1, mstride, stride, dst2, dst8 +%endif + mov strideq, mstrideq + neg mstrideq +%ifidn %1, h + lea dst1q, [dst1q+strideq*4-4] +%if %2 == 8 ; chroma + lea dst8q, [dst8q+strideq*4-4] +%endif +%endif + + ; read + lea dst2q, [dst1q+ strideq ] +%ifidn %1, v +%if %2 == 8 && mmsize == 16 +%define movrow movh +%else +%define movrow mova +%endif + movrow m0, [dst1q+mstrideq*4] ; p3 + movrow m1, [dst2q+mstrideq*4] ; p2 + movrow m2, [dst1q+mstrideq*2] ; p1 + movrow m5, [dst2q] ; q1 + movrow m6, [dst2q+ strideq ] ; q2 + movrow m7, [dst2q+ strideq*2] ; q3 +%if mmsize == 16 && %2 == 8 + movhps m0, [dst8q+mstrideq*4] + movhps m2, [dst8q+mstrideq*2] + add dst8q, strideq + movhps m1, [dst8q+mstrideq*4] + movhps m5, [dst8q] + movhps m6, [dst8q+ strideq ] + movhps m7, [dst8q+ strideq*2] + add dst8q, mstrideq +%endif +%else ; h +%if %2 == 16 + lea dst8q, [dst1q+ strideq*8 ] +%endif + + ; read 16 rows of 8px each, interleave + movh m0, [dst1q+mstrideq*4] + movh m1, [dst8q+mstrideq*4] + movh m2, [dst1q+mstrideq*2] + movh m5, [dst8q+mstrideq*2] + movh m3, [dst1q+mstrideq ] + movh m6, [dst8q+mstrideq ] + movh m4, [dst1q] + movh m7, [dst8q] + punpcklbw m0, m1 ; A/I + punpcklbw m2, m5 ; C/K + punpcklbw m3, m6 ; D/L + punpcklbw m4, m7 ; E/M + + add dst8q, strideq + movh m1, [dst2q+mstrideq*4] + movh m6, 
[dst8q+mstrideq*4] + movh m5, [dst2q] + movh m7, [dst8q] + punpcklbw m1, m6 ; B/J + punpcklbw m5, m7 ; F/N + movh m6, [dst2q+ strideq ] + movh m7, [dst8q+ strideq ] + punpcklbw m6, m7 ; G/O + + ; 8x16 transpose + TRANSPOSE4x4B 0, 1, 2, 3, 7 +%ifdef m8 + SWAP 1, 8 +%else + mova m_q0backup, m1 +%endif + movh m7, [dst2q+ strideq*2] + movh m1, [dst8q+ strideq*2] + punpcklbw m7, m1 ; H/P + TRANSPOSE4x4B 4, 5, 6, 7, 1 + SBUTTERFLY dq, 0, 4, 1 ; p3/p2 + SBUTTERFLY dq, 2, 6, 1 ; q0/q1 + SBUTTERFLY dq, 3, 7, 1 ; q2/q3 +%ifdef m8 + SWAP 1, 8 + SWAP 2, 8 +%else + mova m1, m_q0backup + mova m_q0backup, m2 ; store q0 +%endif + SBUTTERFLY dq, 1, 5, 2 ; p1/p0 +%ifdef m12 + SWAP 5, 12 +%else + mova m_p0backup, m5 ; store p0 +%endif + SWAP 1, 4 + SWAP 2, 4 + SWAP 6, 3 + SWAP 5, 3 +%endif + + ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 + mova m4, m1 + SWAP 4, 1 + psubusb m4, m0 ; p2-p3 + psubusb m0, m1 ; p3-p2 + por m0, m4 ; abs(p3-p2) + + mova m4, m2 + SWAP 4, 2 + psubusb m4, m1 ; p1-p2 + mova m_p2backup, m1 + psubusb m1, m2 ; p2-p1 + por m1, m4 ; abs(p2-p1) + + mova m4, m6 + SWAP 4, 6 + psubusb m4, m7 ; q2-q3 + psubusb m7, m6 ; q3-q2 + por m7, m4 ; abs(q3-q2) + + mova m4, m5 + SWAP 4, 5 + psubusb m4, m6 ; q1-q2 + mova m_q2backup, m6 + psubusb m6, m5 ; q2-q1 + por m6, m4 ; abs(q2-q1) + + pmaxub m0, m1 + pmaxub m6, m7 + pmaxub m0, m6 + + ; normal_limit and high_edge_variance for p1-p0, q1-q0 + SWAP 7, 3 ; now m7 is zero +%ifidn %1, v + movrow m3, [dst1q+mstrideq ] ; p0 +%if mmsize == 16 && %2 == 8 + movhps m3, [dst8q+mstrideq ] +%endif +%elifdef m12 + SWAP 3, 12 +%else + mova m3, m_p0backup +%endif + + mova m1, m2 + SWAP 1, 2 + mova m6, m3 + SWAP 3, 6 + psubusb m1, m3 ; p1-p0 + psubusb m6, m2 ; p0-p1 + por m1, m6 ; abs(p1-p0) + pmaxub m0, m1 ; max_I + SWAP 1, 4 ; max_hev_thresh + + SWAP 6, 4 ; now m6 is I +%ifidn %1, v + movrow m4, [dst1q] ; q0 +%if mmsize == 16 && %2 == 8 + movhps m4, [dst8q] +%endif +%elifdef m8 + SWAP 4, 8 +%else + mova m4, m_q0backup +%endif + mova m1, m4 + 
SWAP 1, 4 + mova m7, m5 + SWAP 7, 5 + psubusb m1, m5 ; q0-q1 + psubusb m7, m4 ; q1-q0 + por m1, m7 ; abs(q1-q0) + pxor m7, m7 + pmaxub m0, m1 + pmaxub m6, m1 + psubusb m0, m_flimI + psubusb m6, m_hevthr + pcmpeqb m0, m7 ; max(abs(..)) <= I + pcmpeqb m6, m7 ; !(max(abs..) > thresh) +%ifdef m12 + SWAP 6, 12 +%else + mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) +%endif + + ; simple_limit + mova m1, m3 + SWAP 1, 3 + mova m6, m4 ; keep copies of p0/q0 around for later use + SWAP 6, 4 + psubusb m1, m4 ; p0-q0 + psubusb m6, m3 ; q0-p0 + por m1, m6 ; abs(q0-p0) + paddusb m1, m1 ; m1=2*abs(q0-p0) + + mova m7, m2 + SWAP 7, 2 + mova m6, m5 + SWAP 6, 5 + psubusb m7, m5 ; p1-q1 + psubusb m6, m2 ; q1-p1 + por m7, m6 ; abs(q1-p1) + pxor m6, m6 + pand m7, [pb_FE] + psrlq m7, 1 ; abs(q1-p1)/2 + paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 + psubusb m7, m_flimE + pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E + pand m0, m7 ; normal_limit result + + ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask +%ifdef m8 ; x86-64 && sse2 + mova m8, [pb_80] +%define m_pb_80 m8 +%else ; x86-32 or mmx/mmxext +%define m_pb_80 [pb_80] +%endif + mova m1, m4 + mova m7, m3 + pxor m1, m_pb_80 + pxor m7, m_pb_80 + psubsb m1, m7 ; (signed) q0-p0 + mova m6, m2 + mova m7, m5 + pxor m6, m_pb_80 + pxor m7, m_pb_80 + psubsb m6, m7 ; (signed) p1-q1 + mova m7, m_maskres + paddsb m6, m1 + paddsb m6, m1 + paddsb m6, m1 + pand m6, m0 +%ifdef m8 + mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge + pand m_limres, m7 +%else + mova m0, m6 + pand m0, m7 + mova m_limres, m0 +%endif + pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common + + mova m1, [pb_F8] + mova m6, m7 + paddsb m7, [pb_3] + paddsb m6, [pb_4] + pand m7, m1 + pand m6, m1 + + pxor m1, m1 + pxor m0, m0 + pcmpgtb m1, m7 + psubb m0, m7 + psrlq m7, 3 ; +f2 + psrlq m0, 3 ; -f2 + pand m0, m1 + pandn m1, m7 + psubusb m3, m0 + paddusb m3, m1 ; p0+f2 + + pxor m1, m1 + pxor m0, m0 + pcmpgtb m0, m6 + psubb m1, m6 + 
psrlq m6, 3 ; +f1 + psrlq m1, 3 ; -f1 + pand m1, m0 + pandn m0, m6 + psubusb m4, m0 + paddusb m4, m1 ; q0-f1 + + ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) +%if cpuflag(ssse3) + mova m7, [pb_1] +%else + mova m7, [pw_63] +%endif +%ifdef m8 + SWAP 1, 8 +%else + mova m1, m_limres +%endif + pxor m0, m0 + mova m6, m1 + pcmpgtb m0, m1 ; which are negative +%if cpuflag(ssse3) + punpcklbw m6, m7 ; interleave with "1" for rounding + punpckhbw m1, m7 +%else + punpcklbw m6, m0 ; signed byte->word + punpckhbw m1, m0 +%endif + mova m_limsign, m0 +%if cpuflag(ssse3) + mova m7, [pb_27_63] +%ifndef m8 + mova m_limres, m1 +%endif +%ifdef m10 + SWAP 0, 10 ; don't lose lim_sign copy +%endif + mova m0, m7 + pmaddubsw m7, m6 + SWAP 6, 7 + pmaddubsw m0, m1 + SWAP 1, 0 +%ifdef m10 + SWAP 0, 10 +%else + mova m0, m_limsign +%endif +%else + mova m_maskres, m6 ; backup for later in filter + mova m_limres, m1 + pmullw m6, [pw_27] + pmullw m1, [pw_27] + paddw m6, m7 + paddw m1, m7 +%endif + psraw m6, 7 + psraw m1, 7 + packsswb m6, m1 ; a0 + pxor m1, m1 + psubb m1, m6 + pand m1, m0 ; -a0 + pandn m0, m6 ; +a0 +%if cpuflag(ssse3) + mova m6, [pb_18_63] ; pipelining +%endif + psubusb m3, m1 + paddusb m4, m1 + paddusb m3, m0 ; p0+a0 + psubusb m4, m0 ; q0-a0 + +%if cpuflag(ssse3) + SWAP 6, 7 +%ifdef m10 + SWAP 1, 10 +%else + mova m1, m_limres +%endif + mova m0, m7 + pmaddubsw m7, m6 + SWAP 6, 7 + pmaddubsw m0, m1 + SWAP 1, 0 +%ifdef m10 + SWAP 0, 10 +%endif + mova m0, m_limsign +%else + mova m6, m_maskres + mova m1, m_limres + pmullw m6, [pw_18] + pmullw m1, [pw_18] + paddw m6, m7 + paddw m1, m7 +%endif + mova m0, m_limsign + psraw m6, 7 + psraw m1, 7 + packsswb m6, m1 ; a1 + pxor m1, m1 + psubb m1, m6 + pand m1, m0 ; -a1 + pandn m0, m6 ; +a1 +%if cpuflag(ssse3) + mova m6, [pb_9_63] +%endif + psubusb m2, m1 + paddusb m5, m1 + paddusb m2, m0 ; p1+a1 + psubusb m5, m0 ; q1-a1 + +%if cpuflag(ssse3) + SWAP 6, 7 +%ifdef m10 + SWAP 1, 10 +%else + mova m1, m_limres +%endif + mova m0, m7 + pmaddubsw 
m7, m6 + SWAP 6, 7 + pmaddubsw m0, m1 + SWAP 1, 0 +%else +%ifdef m8 + SWAP 6, 12 + SWAP 1, 8 +%else + mova m6, m_maskres + mova m1, m_limres +%endif + pmullw m6, [pw_9] + pmullw m1, [pw_9] + paddw m6, m7 + paddw m1, m7 +%endif +%ifdef m9 + SWAP 7, 9 +%else + mova m7, m_limsign +%endif + psraw m6, 7 + psraw m1, 7 + packsswb m6, m1 ; a1 + pxor m0, m0 + psubb m0, m6 + pand m0, m7 ; -a1 + pandn m7, m6 ; +a1 +%ifdef m8 + SWAP 1, 13 + SWAP 6, 14 +%else + mova m1, m_p2backup + mova m6, m_q2backup +%endif + psubusb m1, m0 + paddusb m6, m0 + paddusb m1, m7 ; p1+a1 + psubusb m6, m7 ; q1-a1 + + ; store +%ifidn %1, v + movrow [dst2q+mstrideq*4], m1 + movrow [dst1q+mstrideq*2], m2 + movrow [dst1q+mstrideq ], m3 + movrow [dst1q], m4 + movrow [dst2q], m5 + movrow [dst2q+ strideq ], m6 +%if mmsize == 16 && %2 == 8 + add dst8q, mstrideq + movhps [dst8q+mstrideq*2], m1 + movhps [dst8q+mstrideq ], m2 + movhps [dst8q], m3 + add dst8q, strideq + movhps [dst8q], m4 + movhps [dst8q+ strideq ], m5 + movhps [dst8q+ strideq*2], m6 +%endif +%else ; h + inc dst1q + inc dst2q + + ; 4x8/16 transpose + TRANSPOSE4x4B 1, 2, 3, 4, 0 + SBUTTERFLY bw, 5, 6, 0 + + lea dst8q, [dst8q+mstrideq+1] + WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 + lea dst1q, [dst2q+mstrideq+4] + lea dst8q, [dst8q+mstrideq+4] +%if cpuflag(sse4) + add dst2q, 4 +%endif + WRITE_8W m5, dst2q, dst1q, mstrideq, strideq +%if cpuflag(sse4) + lea dst2q, [dst8q+ strideq ] +%endif + WRITE_8W m6, dst2q, dst8q, mstrideq, strideq +%endif + + RET +%endmacro + +INIT_XMM sse2 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 + +INIT_XMM ssse3 +MBEDGE_LOOPFILTER v, 16 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER v, 8 +MBEDGE_LOOPFILTER h, 8 + +INIT_XMM sse4 +MBEDGE_LOOPFILTER h, 16 +MBEDGE_LOOPFILTER h, 8 diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init.c b/media/ffvpx/libavcodec/x86/vp9dsp_init.c new file mode 100644 index 0000000000..8d11dbc348 --- /dev/null +++ 
b/media/ffvpx/libavcodec/x86/vp9dsp_init.c @@ -0,0 +1,415 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp9dsp.h" +#include "libavcodec/x86/vp9dsp_init.h" + +#if HAVE_X86ASM + +decl_fpel_func(put, 4, , mmx); +decl_fpel_func(put, 8, , mmx); +decl_fpel_func(put, 16, , sse); +decl_fpel_func(put, 32, , sse); +decl_fpel_func(put, 64, , sse); +decl_fpel_func(avg, 4, _8, mmxext); +decl_fpel_func(avg, 8, _8, mmxext); +decl_fpel_func(avg, 16, _8, sse2); +decl_fpel_func(avg, 32, _8, sse2); +decl_fpel_func(avg, 64, _8, sse2); +decl_fpel_func(put, 32, , avx); +decl_fpel_func(put, 64, , avx); +decl_fpel_func(avg, 32, _8, avx2); +decl_fpel_func(avg, 64, _8, avx2); + +decl_mc_funcs(4, mmxext, int16_t, 8, 8); +decl_mc_funcs(8, sse2, int16_t, 8, 8); +decl_mc_funcs(4, ssse3, int8_t, 32, 8); +decl_mc_funcs(8, ssse3, int8_t, 32, 8); +#if ARCH_X86_64 +decl_mc_funcs(16, ssse3, int8_t, 32, 8); +decl_mc_funcs(32, avx2, int8_t, 32, 8); +#endif + +mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) +#if ARCH_X86_32 +mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8) 
+#endif +mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8) +mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8) +mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8) +mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) +#endif + +extern const int8_t ff_filters_ssse3[3][15][4][32]; +extern const int16_t ff_filters_sse2[3][15][8][8]; + +filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) +filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) +filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) +filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) +filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) +filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) +filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) +#endif + +filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) +filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) +filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) +filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) +filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) +filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3) +filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3) +#endif + +#define itxfm_func(typea, typeb, size, opt) \ +void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ + int16_t *block, int eob) +#define itxfm_funcs(size, opt) \ +itxfm_func(idct, idct, size, opt); \ +itxfm_func(iadst, idct, size, opt); \ +itxfm_func(idct, iadst, size, opt); \ +itxfm_func(iadst, iadst, size, opt) + +itxfm_func(idct, idct, 4, mmxext); +itxfm_func(idct, iadst, 4, sse2); +itxfm_func(iadst, idct, 4, sse2); +itxfm_func(iadst, iadst, 4, sse2); +itxfm_funcs(4, ssse3); +itxfm_funcs(8, sse2); +itxfm_funcs(8, ssse3); +itxfm_funcs(8, avx); +itxfm_funcs(16, sse2); +itxfm_funcs(16, ssse3); +itxfm_funcs(16, avx); +itxfm_func(idct, 
idct, 32, sse2); +itxfm_func(idct, idct, 32, ssse3); +itxfm_func(idct, idct, 32, avx); +itxfm_func(iwht, iwht, 4, mmx); +itxfm_funcs(16, avx2); +itxfm_func(idct, idct, 32, avx2); + +#undef itxfm_func +#undef itxfm_funcs + +#define lpf_funcs(size1, size2, opt) \ +void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H); \ +void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +lpf_funcs(4, 8, mmxext); +lpf_funcs(8, 8, mmxext); +lpf_funcs(16, 16, sse2); +lpf_funcs(16, 16, ssse3); +lpf_funcs(16, 16, avx); +lpf_funcs(44, 16, sse2); +lpf_funcs(44, 16, ssse3); +lpf_funcs(44, 16, avx); +lpf_funcs(84, 16, sse2); +lpf_funcs(84, 16, ssse3); +lpf_funcs(84, 16, avx); +lpf_funcs(48, 16, sse2); +lpf_funcs(48, 16, ssse3); +lpf_funcs(48, 16, avx); +lpf_funcs(88, 16, sse2); +lpf_funcs(88, 16, ssse3); +lpf_funcs(88, 16, avx); + +#undef lpf_funcs + +#define ipred_func(size, type, opt) \ +void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *l, const uint8_t *a) + +ipred_func(8, v, mmx); + +#define ipred_dc_funcs(size, opt) \ +ipred_func(size, dc, opt); \ +ipred_func(size, dc_left, opt); \ +ipred_func(size, dc_top, opt) + +ipred_dc_funcs(4, mmxext); +ipred_dc_funcs(8, mmxext); + +#define ipred_dir_tm_funcs(size, opt) \ +ipred_func(size, tm, opt); \ +ipred_func(size, dl, opt); \ +ipred_func(size, dr, opt); \ +ipred_func(size, hd, opt); \ +ipred_func(size, hu, opt); \ +ipred_func(size, vl, opt); \ +ipred_func(size, vr, opt) + +ipred_dir_tm_funcs(4, mmxext); + +ipred_func(16, v, sse); +ipred_func(32, v, sse); + +ipred_dc_funcs(16, sse2); +ipred_dc_funcs(32, sse2); + +#define ipred_dir_tm_h_funcs(size, opt) \ +ipred_dir_tm_funcs(size, opt); \ +ipred_func(size, h, opt) + +ipred_dir_tm_h_funcs(8, sse2); +ipred_dir_tm_h_funcs(16, sse2); +ipred_dir_tm_h_funcs(32, sse2); + +ipred_func(4, h, sse2); + +#define ipred_all_funcs(size, opt) \ 
+ipred_dc_funcs(size, opt); \ +ipred_dir_tm_h_funcs(size, opt) + +// FIXME hd/vl_4x4_ssse3 does not exist +ipred_all_funcs(4, ssse3); +ipred_all_funcs(8, ssse3); +ipred_all_funcs(16, ssse3); +ipred_all_funcs(32, ssse3); + +ipred_dir_tm_h_funcs(8, avx); +ipred_dir_tm_h_funcs(16, avx); +ipred_dir_tm_h_funcs(32, avx); + +ipred_func(32, v, avx); + +ipred_dc_funcs(32, avx2); +ipred_func(32, h, avx2); +ipred_func(32, tm, avx2); + +#undef ipred_func +#undef ipred_dir_tm_h_funcs +#undef ipred_dir_tm_funcs +#undef ipred_dc_funcs + +#endif /* HAVE_X86ASM */ + +av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) +{ +#if HAVE_X86ASM + int cpu_flags; + + if (bpp == 10) { + ff_vp9dsp_init_10bpp_x86(dsp, bitexact); + return; + } else if (bpp == 12) { + ff_vp9dsp_init_12bpp_x86(dsp, bitexact); + return; + } + + cpu_flags = av_get_cpu_flags(); + +#define init_lpf(opt) do { \ + dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ + dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ + dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ + dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ + dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ + dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ + dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ + dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ + dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ + dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ +} while (0) + +#define init_ipred(sz, opt, t, e) \ + dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt + +#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext +#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext +#define init_dir_tm_ipred(sz, opt) do { \ + init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ + init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ + 
init_ipred(sz, opt, hd, HOR_DOWN); \ + init_ipred(sz, opt, vl, VERT_LEFT); \ + init_ipred(sz, opt, hu, HOR_UP); \ + init_ipred(sz, opt, tm, TM_VP8); \ + init_ipred(sz, opt, vr, VERT_RIGHT); \ +} while (0) +#define init_dir_tm_h_ipred(sz, opt) do { \ + init_dir_tm_ipred(sz, opt); \ + init_ipred(sz, opt, h, HOR); \ +} while (0) +#define init_dc_ipred(sz, opt) do { \ + init_ipred(sz, opt, dc, DC); \ + init_ipred(sz, opt, dc_left, LEFT_DC); \ + init_ipred(sz, opt, dc_top, TOP_DC); \ +} while (0) +#define init_all_ipred(sz, opt) do { \ + init_dc_ipred(sz, opt); \ + init_dir_tm_h_ipred(sz, opt); \ +} while (0) + + if (EXTERNAL_MMX(cpu_flags)) { + init_fpel_func(4, 0, 4, put, , mmx); + init_fpel_func(3, 0, 8, put, , mmx); + if (!bitexact) { + dsp->itxfm_add[4 /* lossless */][DCT_DCT] = + dsp->itxfm_add[4 /* lossless */][ADST_DCT] = + dsp->itxfm_add[4 /* lossless */][DCT_ADST] = + dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; + } + init_ipred(8, mmx, v, VERT); + } + + if (EXTERNAL_MMXEXT(cpu_flags)) { + dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext; + dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; + dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; + dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; + init_subpel2(4, 0, 4, put, 8, mmxext); + init_subpel2(4, 1, 4, avg, 8, mmxext); + init_fpel_func(4, 1, 4, avg, _8, mmxext); + init_fpel_func(3, 1, 8, avg, _8, mmxext); + dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; + init_dc_ipred(4, mmxext); + init_dc_ipred(8, mmxext); + init_dir_tm_ipred(4, mmxext); + } + + if (EXTERNAL_SSE(cpu_flags)) { + init_fpel_func(2, 0, 16, put, , sse); + init_fpel_func(1, 0, 32, put, , sse); + init_fpel_func(0, 0, 64, put, , sse); + init_ipred(16, sse, v, VERT); + init_ipred(32, sse, v, VERT); + } + + if (EXTERNAL_SSE2(cpu_flags)) { + init_subpel3_8to64(0, put, 8, sse2); + init_subpel3_8to64(1, avg, 8, sse2); + init_fpel_func(2, 1, 16, avg, _8, sse2); + 
init_fpel_func(1, 1, 32, avg, _8, sse2); + init_fpel_func(0, 1, 64, avg, _8, sse2); + init_lpf(sse2); + dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; + dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; + dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; + init_dc_ipred(16, sse2); + init_dc_ipred(32, sse2); + init_dir_tm_h_ipred(8, sse2); + init_dir_tm_h_ipred(16, sse2); + init_dir_tm_h_ipred(32, sse2); + init_ipred(4, sse2, h, HOR); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + init_subpel3(0, put, 8, ssse3); + init_subpel3(1, avg, 8, ssse3); + dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; + dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; + 
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; + init_lpf(ssse3); + init_all_ipred(4, ssse3); + init_all_ipred(8, ssse3); + init_all_ipred(16, ssse3); + init_all_ipred(32, ssse3); + } + + if (EXTERNAL_AVX(cpu_flags)) { + dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx; + dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx; + dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx; + dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx; + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx; + dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; + init_lpf(avx); + init_dir_tm_h_ipred(8, avx); + init_dir_tm_h_ipred(16, avx); + init_dir_tm_h_ipred(32, avx); + } + if (EXTERNAL_AVX_FAST(cpu_flags)) { + init_fpel_func(1, 0, 32, put, , avx); + init_fpel_func(0, 0, 64, put, , avx); + init_ipred(32, avx, v, VERT); + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + init_fpel_func(1, 1, 32, avg, _8, avx2); + init_fpel_func(0, 1, 64, avg, _8, avx2); + if (ARCH_X86_64) { +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL + dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][DCT_ADST] = 
ff_vp9_iadst_idct_16x16_add_avx2; + dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; + dsp->itxfm_add[TX_32X32][ADST_ADST] = + dsp->itxfm_add[TX_32X32][ADST_DCT] = + dsp->itxfm_add[TX_32X32][DCT_ADST] = + dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; + init_subpel3_32_64(0, put, 8, avx2); + init_subpel3_32_64(1, avg, 8, avx2); +#endif + } + init_dc_ipred(32, avx2); + init_ipred(32, avx2, h, HOR); + init_ipred(32, avx2, tm, TM_VP8); + } + +#undef init_fpel +#undef init_subpel1 +#undef init_subpel2 +#undef init_subpel3 + +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init.h b/media/ffvpx/libavcodec/x86/vp9dsp_init.h new file mode 100644 index 0000000000..fc1e0557fa --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init.h @@ -0,0 +1,192 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VP9DSP_INIT_H +#define AVCODEC_X86_VP9DSP_INIT_H + +#include "libavutil/attributes.h" +#include "libavutil/mem_internal.h" + +#include "libavcodec/vp9dsp.h" + +// hack to force-expand BPC +#define cat(a, bpp, b) a##bpp##b +/* Prototype of a full-pel function named ff_vp9_{avg}{sz}{bpp}_{opt}. */ +#define decl_fpel_func(avg, sz, bpp, opt) \ +void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) +/* Prototype of a 1-D 8-tap filter named ff_vp9_{avg}_8tap_1d_{dir}_{sz}_{bpp}_{opt}; filter points to rows of f_sz entries of the given type. */ +#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \ +void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const type (*filter)[f_sz]) +/* Declares the four put/avg by h/v 1-D filters for one size. */ +#define decl_mc_funcs(sz, opt, type, fsz, bpp) \ +decl_mc_func(put, sz, h, opt, type, fsz, bpp); \ +decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \ +decl_mc_func(put, sz, v, opt, type, fsz, bpp); \ +decl_mc_func(avg, sz, v, opt, type, fsz, bpp) +/* Prototype of an intra predictor named ff_vp9_ipred_{type}_{sz}x{sz}_{bpp}_{opt}. */ +#define decl_ipred_fn(type, sz, bpp, opt) \ +void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + const uint8_t *l, \ + const uint8_t *a) +/* Declares the 4x4 predictor with opt4 and the 8x8, 16x16 and 32x32 predictors with opt8_16_32. */ +#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \ +decl_ipred_fn(type, 4, bpp, opt4); \ +decl_ipred_fn(type, 8, bpp, opt8_16_32); \ +decl_ipred_fn(type, 16, bpp, opt8_16_32); \ +decl_ipred_fn(type, 32, bpp, opt8_16_32) +/* Prototype of an itxfm_add function; the name is assembled through cat() (the force-expand hack defined earlier in this header). */ +#define decl_itxfm_func(typea, typeb, size, bpp, opt) \ +void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t *dst, \ + ptrdiff_t stride, \ + int16_t *block, \ + int eob) + +#define decl_itxfm_funcs(size, bpp, opt) \ +decl_itxfm_func(idct, idct, size, bpp, opt); \ +decl_itxfm_func(iadst, idct, size, bpp, opt); \ +decl_itxfm_func(idct, iadst, size, bpp, opt); \ +decl_itxfm_func(iadst, iadst, size, bpp, 
opt) +/* Defines an inline sz-wide 1-D filter that calls the hsz-wide one twice; the second call has dst and src advanced by hszb bytes. */ +#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \ +static av_always_inline void \ +ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, const type (*filter)[f_sz]) \ +{ \ + ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \ + src_stride, h, filter); \ + ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \ + src_stride, h, filter); \ +} + +#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \ +mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp) \ +mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp) \ +mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp) \ +mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp) +/* Defines a static wrapper taking (h, mx, my) that selects ff_filters_{f_opt}[f][dvar - 1] and calls the 1-D filter for direction dir. */ +#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \ +static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \ + h, ff_filters_##f_opt[f][dvar - 1]); \ +} + +#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \ +filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt) + +#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \ +filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \ +filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt) + +#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \ +filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt) +/* Defines a static hv wrapper: a horizontal put pass writes h plus 7 rows, starting 3 rows above src, into an aligned temp buffer of pitch 64 * bytes; the vertical op pass then reads temp from its row 3 and writes dst. */ +#define 
filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \ +static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, \ + int h, int mx, int my) \ +{ \ + LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \ + ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \ + src_stride, h + 7, \ + ff_filters_##f_opt[f][mx - 1]); \ + ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \ + 64 * bytes, h, \ + ff_filters_##f_opt[f][my - 1]); \ +} + +#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \ +filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt) + +#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \ +filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \ +filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt) +/* Installs one full-pel function into dsp->mc[idx1][*][idx2][0][0] for the smooth, regular, sharp and bilinear filter slots. */ +#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ + dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt +/* Installs the smooth, regular and sharp 8-tap wrappers of one size and direction into dsp->mc[idx1][*][idx2][idxh][idxv]. */ +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \ + dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \ + type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \ + type##_8tap_regular_##sz##dir##_##bpp##_##opt; \ + dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = \ + 
type##_8tap_sharp_##sz##dir##_##bpp##_##opt + +#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \ + init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \ + init_subpel1(idx1, idx2, 0, 1, sz, v, type, bpp, opt); \ + init_subpel1(idx1, idx2, 1, 0, sz, h, type, bpp, opt) + +#define init_subpel3_32_64(idx, type, bpp, opt) \ + init_subpel2(0, idx, 64, type, bpp, opt); \ + init_subpel2(1, idx, 32, type, bpp, opt) + +#define init_subpel3_8to64(idx, type, bpp, opt) \ + init_subpel3_32_64(idx, type, bpp, opt); \ + init_subpel2(2, idx, 16, type, bpp, opt); \ + init_subpel2(3, idx, 8, type, bpp, opt) + +#define init_subpel3(idx, type, bpp, opt) \ + init_subpel3_8to64(idx, type, bpp, opt); \ + init_subpel2(4, idx, 4, type, bpp, opt) +/* Installs one intra predictor into dsp->intra_pred[TX_{sz}X{sz}][{enum}_PRED]. */ +#define init_ipred_func(type, enum, sz, bpp, opt) \ + dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \ + cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt) + +#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \ + init_ipred_func(type, enum, 8, bpp, opt); \ + init_ipred_func(type, enum, 16, bpp, opt); \ + init_ipred_func(type, enum, 32, bpp, opt) + +#define init_ipred_funcs(type, enum, bpp, opt) \ + init_ipred_func(type, enum, 4, bpp, opt); \ + init_8_16_32_ipred_funcs(type, enum, bpp, opt) + +void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp, int bitexact); +void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp, int bitexact); +void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp); + +#endif /* AVCODEC_X86_VP9DSP_INIT_H */ diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c new file mode 100644 index 0000000000..2694c06cb2 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_10bpp.c @@ -0,0 +1,25 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +/* Instantiates vp9dsp_init_16bpp_template.c with BPC set to 10; INIT_FUNC is the name the template gives its init function. */ +#define BPC 10 +#define INIT_FUNC ff_vp9dsp_init_10bpp_x86 +#include "vp9dsp_init_16bpp_template.c" diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c new file mode 100644 index 0000000000..5da3bc1840 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_12bpp.c @@ -0,0 +1,25 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +/* Instantiates vp9dsp_init_16bpp_template.c with BPC set to 12; INIT_FUNC is the name the template gives its init function. */ +#define BPC 12 +#define INIT_FUNC ff_vp9dsp_init_12bpp_x86 +#include "vp9dsp_init_16bpp_template.c" diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c new file mode 100644 index 0000000000..e5afea1512 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp.c @@ -0,0 +1,152 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp9dsp.h" +#include "libavcodec/x86/vp9dsp_init.h" + +#if HAVE_X86ASM +/* Prototypes of the full-pel and intra-prediction functions that the init function in this file installs. */ +decl_fpel_func(put, 8, , mmx); +decl_fpel_func(avg, 8, _16, mmxext); +decl_fpel_func(put, 16, , sse); +decl_fpel_func(put, 32, , sse); +decl_fpel_func(put, 64, , sse); +decl_fpel_func(put, 128, , sse); +decl_fpel_func(avg, 16, _16, sse2); +decl_fpel_func(avg, 32, _16, sse2); +decl_fpel_func(avg, 64, _16, sse2); +decl_fpel_func(avg, 128, _16, sse2); +decl_fpel_func(put, 32, , avx); +decl_fpel_func(put, 64, , avx); +decl_fpel_func(put, 128, , avx); +decl_fpel_func(avg, 32, _16, avx2); +decl_fpel_func(avg, 64, _16, avx2); +decl_fpel_func(avg, 128, _16, avx2); + +decl_ipred_fns(v, 16, mmx, sse); +decl_ipred_fns(h, 16, mmxext, sse2); +decl_ipred_fns(dc, 16, mmxext, sse2); +decl_ipred_fns(dc_top, 16, mmxext, sse2); +decl_ipred_fns(dc_left, 16, mmxext, sse2); +decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dl, 32, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); +decl_ipred_fn(dr, 32, 16, avx2); +decl_ipred_fn(vl, 16, 16, avx2); +decl_ipred_fn(hd, 16, 16, avx2); +/* Declares the sse2, ssse3 and avx variants, in all four block sizes, of one directional predictor. */ +#define decl_ipred_dir_funcs(type) \ +decl_ipred_fns(type, 16, sse2, sse2); \ +decl_ipred_fns(type, 16, ssse3, ssse3); \ +decl_ipred_fns(type, 16, avx, avx) + +decl_ipred_dir_funcs(dl); +decl_ipred_dir_funcs(dr); +decl_ipred_dir_funcs(vl); +decl_ipred_dir_funcs(vr); +decl_ipred_dir_funcs(hu); +decl_ipred_dir_funcs(hd); +#endif /* HAVE_X86ASM */ +/* Fills dsp->mc and dsp->intra_pred from the CPU flags. The checks run in the order MMX, MMXEXT, SSE, SSE2, SSSE3, AVX, AVX2, so a later block overwrites any slot an earlier block already set. Does nothing when HAVE_X86ASM is 0. */ +av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + init_fpel_func(4, 0, 8, put, , mmx); + init_ipred_func(v, VERT, 4, 16, mmx); + } + + if 
(EXTERNAL_MMXEXT(cpu_flags)) { + init_fpel_func(4, 1, 8, avg, _16, mmxext); + init_ipred_func(h, HOR, 4, 16, mmxext); + init_ipred_func(dc, DC, 4, 16, mmxext); + init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext); + init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext); + } + + if (EXTERNAL_SSE(cpu_flags)) { + init_fpel_func(3, 0, 16, put, , sse); + init_fpel_func(2, 0, 32, put, , sse); + init_fpel_func(1, 0, 64, put, , sse); + init_fpel_func(0, 0, 128, put, , sse); + init_8_16_32_ipred_funcs(v, VERT, 16, sse); + } + + if (EXTERNAL_SSE2(cpu_flags)) { + init_fpel_func(3, 1, 16, avg, _16, sse2); + init_fpel_func(2, 1, 32, avg, _16, sse2); + init_fpel_func(1, 1, 64, avg, _16, sse2); + init_fpel_func(0, 1, 128, avg, _16, sse2); + init_8_16_32_ipred_funcs(h, HOR, 16, sse2); + init_8_16_32_ipred_funcs(dc, DC, 16, sse2); + init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2); + init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2); + init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2); + init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2); + init_ipred_funcs(vl, VERT_LEFT, 16, sse2); + init_ipred_funcs(vr, VERT_RIGHT, 16, sse2); + init_ipred_funcs(hu, HOR_UP, 16, sse2); + init_ipred_funcs(hd, HOR_DOWN, 16, sse2); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3); + init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3); + init_ipred_funcs(vl, VERT_LEFT, 16, ssse3); + init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3); + init_ipred_funcs(hu, HOR_UP, 16, ssse3); + init_ipred_funcs(hd, HOR_DOWN, 16, ssse3); + } + + if (EXTERNAL_AVX_FAST(cpu_flags)) { + init_fpel_func(2, 0, 32, put, , avx); + init_fpel_func(1, 0, 64, put, , avx); + init_fpel_func(0, 0, 128, put, , avx); + init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx); + init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx); + init_ipred_funcs(vl, VERT_LEFT, 16, avx); + init_ipred_funcs(vr, VERT_RIGHT, 16, avx); + init_ipred_funcs(hu, HOR_UP, 16, avx); + init_ipred_funcs(hd, HOR_DOWN, 16, avx); + } + + if 
(EXTERNAL_AVX2_FAST(cpu_flags)) { + init_fpel_func(2, 1, 32, avg, _16, avx2); + init_fpel_func(1, 1, 64, avg, _16, avx2); + init_fpel_func(0, 1, 128, avg, _16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); + init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); + init_ipred_func(vl, VERT_LEFT, 16, 16, avx2); + init_ipred_func(hd, HOR_DOWN, 16, 16, avx2); +#if ARCH_X86_64 + init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2); +#endif + } + +#endif /* HAVE_X86ASM */ +} diff --git a/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c new file mode 100644 index 0000000000..f93ea2468e --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -0,0 +1,239 @@ +/* + * VP9 SIMD optimizations + * + * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/vp9dsp.h" +#include "libavcodec/x86/vp9dsp_init.h" + +#if HAVE_X86ASM + +extern const int16_t ff_filters_16bpp[3][15][4][16]; + +decl_mc_funcs(4, sse2, int16_t, 16, BPC); +decl_mc_funcs(8, sse2, int16_t, 16, BPC); +decl_mc_funcs(16, avx2, int16_t, 16, BPC); + +mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC) +mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC) +mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC) +#if HAVE_AVX2_EXTERNAL +mc_rep_funcs(32, 16, 32, avx2, int16_t, 16, BPC) +mc_rep_funcs(64, 32, 64, avx2, int16_t, 16, BPC) +#endif + +filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp) +filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp) +#if HAVE_AVX2_EXTERNAL +filters_8tap_2d_fn(put, 64, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(avg, 64, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(put, 32, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(avg, 32, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(put, 16, 32, BPC, 2, avx2, 16bpp) +filters_8tap_2d_fn(avg, 16, 32, BPC, 2, avx2, 16bpp) +#endif + +filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp) +filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp) +#if HAVE_AVX2_EXTERNAL +filters_8tap_1d_fn2(put, 64, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(avg, 64, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(put, 32, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(avg, 32, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp) +filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp) +#endif +/* Prototype of a loop filter named ff_vp9_loop_filter_{dir}_{wd}_{bpp}_{opt}. */ +#define decl_lpf_func(dir, wd, bpp, opt) \ +void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +#define decl_lpf_funcs(dir, wd, bpp) \ +decl_lpf_func(dir, 
wd, bpp, sse2); \ +decl_lpf_func(dir, wd, bpp, ssse3); \ +decl_lpf_func(dir, wd, bpp, avx) + +#define decl_lpf_funcs_wd(dir) \ +decl_lpf_funcs(dir, 4, BPC); \ +decl_lpf_funcs(dir, 8, BPC); \ +decl_lpf_funcs(dir, 16, BPC) + +decl_lpf_funcs_wd(h); +decl_lpf_funcs_wd(v); +/* Defines static loop_filter_{dir}_16_{bpp}_{opt}, which calls the wd 16 filter twice with the same E, I, H: once at dst and once at dst advanced by off. */ +#define lpf_16_wrapper(dir, off, bpp, opt) \ +static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \ + ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \ +} + +#define lpf_16_wrappers(bpp, opt) \ +lpf_16_wrapper(h, 8 * stride, bpp, opt) \ +lpf_16_wrapper(v, 16, bpp, opt) + +lpf_16_wrappers(BPC, sse2) +lpf_16_wrappers(BPC, ssse3) +lpf_16_wrappers(BPC, avx) +/* Defines static loop_filter_{dir}_{wd1}{wd2}_{bpp}_{opt}: the low byte of E, I and H is passed to the wd1 filter at dst, and the value shifted right by 8 is passed to the wd2 filter at dst advanced by off. */ +#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \ +static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \ + E & 0xff, I & 0xff, H & 0xff); \ + ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \ + E >> 8, I >> 8, H >> 8); \ +} + +#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \ +lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt) \ +lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt) + +#define lpf_mix2_wrappers_set(bpp, opt) \ +lpf_mix2_wrappers(4, 4, bpp, opt) \ +lpf_mix2_wrappers(4, 8, bpp, opt) \ +lpf_mix2_wrappers(8, 4, bpp, opt) \ +lpf_mix2_wrappers(8, 8, bpp, opt) \ + +lpf_mix2_wrappers_set(BPC, sse2) +lpf_mix2_wrappers_set(BPC, ssse3) +lpf_mix2_wrappers_set(BPC, avx) + +decl_ipred_fns(tm, BPC, mmxext, sse2); + +decl_itxfm_func(iwht, iwht, 4, BPC, mmxext); +#if BPC == 10 +decl_itxfm_func(idct, idct, 4, BPC, mmxext); +decl_itxfm_funcs(4, BPC, ssse3); +#else +decl_itxfm_func(idct, idct, 4, BPC, sse2); +#endif +decl_itxfm_func(idct, iadst, 4, BPC, sse2); +decl_itxfm_func(iadst, idct, 4, BPC, sse2); +decl_itxfm_func(iadst, iadst, 4, BPC, sse2); 
+decl_itxfm_funcs(8, BPC, sse2); +decl_itxfm_funcs(16, BPC, sse2); +decl_itxfm_func(idct, idct, 32, BPC, sse2); +#endif /* HAVE_X86ASM */ +/* Init function whose name is supplied by the including file through INIT_FUNC. Installs the BPC-specific mc, loop-filter, tm-predictor and itxfm functions according to the CPU flags, then always calls ff_vp9dsp_init_16bpp_x86(dsp). When bitexact is nonzero, the 4x4 iwht and (for BPC 10) the 4x4 transforms are not installed. */ +av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + +#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \ + dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt +#define init_lpf_16_func(idx, dir, bpp, opt) \ + dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt +#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \ + dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt + +#define init_lpf_funcs(bpp, opt) \ + init_lpf_8_func(0, 0, h, 4, bpp, opt); \ + init_lpf_8_func(0, 1, v, 4, bpp, opt); \ + init_lpf_8_func(1, 0, h, 8, bpp, opt); \ + init_lpf_8_func(1, 1, v, 8, bpp, opt); \ + init_lpf_8_func(2, 0, h, 16, bpp, opt); \ + init_lpf_8_func(2, 1, v, 16, bpp, opt); \ + init_lpf_16_func(0, h, bpp, opt); \ + init_lpf_16_func(1, v, bpp, opt); \ + init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \ + init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \ + init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \ + init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \ + init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \ + init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \ + init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \ + init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt) + +#define init_itx_func(idxa, idxb, typea, typeb, size, bpp, opt) \ + dsp->itxfm_add[idxa][idxb] = \ + cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt); +#define init_itx_func_one(idx, typea, typeb, size, bpp, opt) \ + init_itx_func(idx, DCT_DCT, typea, typeb, size, bpp, opt); \ + init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \ + init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \ + init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt) +#define init_itx_funcs(idx, size, bpp, opt) 
\ + init_itx_func(idx, DCT_DCT, idct, idct, size, bpp, opt); \ + init_itx_func(idx, ADST_DCT, idct, iadst, size, bpp, opt); \ + init_itx_func(idx, DCT_ADST, iadst, idct, size, bpp, opt); \ + init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt); \ + + if (EXTERNAL_MMXEXT(cpu_flags)) { + init_ipred_func(tm, TM_VP8, 4, BPC, mmxext); + if (!bitexact) { + init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext); +#if BPC == 10 + init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext); +#endif + } + } + + if (EXTERNAL_SSE2(cpu_flags)) { + init_subpel3(0, put, BPC, sse2); + init_subpel3(1, avg, BPC, sse2); + init_lpf_funcs(BPC, sse2); + init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2); +#if BPC == 10 + if (!bitexact) { + init_itx_func(TX_4X4, ADST_DCT, idct, iadst, 4, 10, sse2); + init_itx_func(TX_4X4, DCT_ADST, iadst, idct, 4, 10, sse2); + init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2); + } +#else + init_itx_funcs(TX_4X4, 4, 12, sse2); +#endif + init_itx_funcs(TX_8X8, 8, BPC, sse2); + init_itx_funcs(TX_16X16, 16, BPC, sse2); + init_itx_func_one(TX_32X32, idct, idct, 32, BPC, sse2); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + init_lpf_funcs(BPC, ssse3); +#if BPC == 10 + if (!bitexact) { + init_itx_funcs(TX_4X4, 4, BPC, ssse3); + } +#endif + } + + if (EXTERNAL_AVX(cpu_flags)) { + init_lpf_funcs(BPC, avx); + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { +#if HAVE_AVX2_EXTERNAL + init_subpel3_32_64(0, put, BPC, avx2); + init_subpel3_32_64(1, avg, BPC, avx2); + init_subpel2(2, 0, 16, put, BPC, avx2); + init_subpel2(2, 1, 16, avg, BPC, avx2); +#endif + } + +#endif /* HAVE_X86ASM */ + + ff_vp9dsp_init_16bpp_x86(dsp); +} diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred.asm b/media/ffvpx/libavcodec/x86/vp9intrapred.asm new file mode 100644 index 0000000000..31f7d449fd --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9intrapred.asm @@ -0,0 +1,2044 @@ +;****************************************************************************** +;* VP9 Intra 
prediction SIMD optimizations +;* +;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* Parts based on: +;* H.264 intra prediction asm optimizations +;* Copyright (c) 2010 Fiona Glaser +;* Copyright (c) 2010 Holger Lubitz +;* Copyright (c) 2010 Loren Merritt +;* Copyright (c) 2010 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pw_m256: times 16 dw -256 +pw_m255: times 16 dw -255 +pw_4096: times 8 dw 4096 + +pb_4x3_4x2_4x1_4x0: times 4 db 3 + times 4 db 2 + times 4 db 1 + times 4 db 0 +pb_8x1_8x0: times 8 db 1 + times 8 db 0 +pb_8x3_8x2: times 8 db 3 + times 8 db 2 +pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7 + times 8 db -1 +pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6 + times 9 db 7 +pb_1to6_10x7: db 1, 2, 3, 4, 5, 6 + times 10 db 7 +pb_2to6_3x7: +pb_2to6_11x7: db 2, 3, 4, 5, 6 + times 11 db 7 +pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +pb_13456_3xm1: db 1, 3, 4, 5, 6 + times 3 db -1 +pb_6012_4xm1: db 6, 0, 1, 2 + times 4 db -1 +pb_6xm1_246_8toE: times 6 db -1 + db 2, 4, 6, 8, 9, 10, 11, 12, 
13, 14 +pb_6xm1_BDF_0to6: times 6 db -1 + db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6 +pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + +pb_15x0_1xm1: times 15 db 0 + db -1 +pb_0to2_5x3: db 0, 1, 2 + times 5 db 3 +pb_6xm1_2x0: times 6 db -1 + times 2 db 0 +pb_6x0_2xm1: times 6 db 0 + times 2 db -1 + +cextern pb_1 +cextern pb_2 +cextern pb_3 +cextern pb_15 +cextern pw_2 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_255 +cextern pw_512 +cextern pw_1024 +cextern pw_2048 +cextern pw_8192 + +SECTION .text + +; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) +; psadbw against zero sums the l and a bytes; the rounded mean is broadcast and stored to every row. +%macro DC_4to8_FUNCS 0 +cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_4096] + pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + RET + +cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [lq] + movq m1, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] + pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET +%endmacro + +INIT_MMX mmxext +DC_4to8_FUNCS +INIT_MMX ssse3 +DC_4to8_FUNCS + +%macro DC_16to32_FUNCS 0 +cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a + mova m0, [lq] + mova m1, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw 
m1, m2 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] + pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a + mova m0, [lq] + mova m1, [lq+16] + mova m2, [aq] + mova m3, [aq+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m4, m4 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m4 + psadbw m3, m4 + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_512] + pshufb m0, m4 +%else + paddw m0, [pw_32] + psraw m0, 6 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DC_16to32_FUNCS +INIT_XMM ssse3 +DC_16to32_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [lq] + mova m1, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_512] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q 
], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + +; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) +; As dc, but only the single edge selected by macro argument 2 (a or l) is summed. +%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l) +cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [%2q] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_8192] + pshufb m0, m1 +%else + paddw m0, [pw_2] + psraw m0, 2 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + RET + +cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [%2q] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pxor m1, m1 + psadbw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_4096] + pshufb m0, m1 +%else + paddw m0, [pw_4] + psraw m0, 3 + punpcklbw m0, m0 + pshufw m0, m0, q0000 +%endif + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET +%endmacro + +INIT_MMX mmxext +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l +INIT_MMX ssse3 +DC_1D_4to8_FUNCS top, a +DC_1D_4to8_FUNCS left, l + +%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l) +cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] + pshufb m0, m2 +%else + paddw m0, [pw_8] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +cglobal 
vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + mova m1, [%2q+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] + pshufb m0, m2 +%else + paddw m0, [pw_16] + psraw m0, 5 + punpcklbw m0, m0 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +%endif + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l +INIT_XMM ssse3 +DC_1D_16to32_FUNCS top, a +DC_1D_16to32_FUNCS left, l + +%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l) +%if HAVE_AVX2_EXTERNAL +cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_1024] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif +%endmacro + +INIT_YMM avx2 +DC_1D_AVX2_FUNCS top, a +DC_1D_AVX2_FUNCS left, l + +; v +; Loads the a row once and stores it unchanged to every row of the block. +INIT_MMX mmx +cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a + movq m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, 
[dstq+strideq*4] + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse +cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a + mova m0, [aq] + mova m1, [aq+16] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 8 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_YMM avx +cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +; h +; Each output row is one byte of l broadcast across the row; the first row uses the last byte of l and later rows walk l backwards (four bytes per loop pass in the looped sizes). +%macro H_XMM_FUNCS 2 +%if notcpuflag(avx) +cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3 + movd m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_4x3_4x2_4x1_4x0] +%else + punpcklbw m0, m0 + pshuflw m0, m0, q0123 + punpcklwd m0, m0 +%endif + lea stride3q, [strideq*3] + movd [dstq+strideq*0], m0 + psrldq m0, 4 + movd [dstq+strideq*1], m0 + psrldq m0, 4 + movd [dstq+strideq*2], m0 + psrldq m0, 4 + movd [dstq+stride3q ], m0 + RET +%endif + +cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt +%if 
cpuflag(ssse3) + mova m2, [pb_8x1_8x0] + mova m3, [pb_8x3_8x2] +%endif + lea stride3q, [strideq*3] + mov cntq, 1 +.loop: + movd m0, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m1, m0, m3 + pshufb m0, m2 +%else + punpcklbw m0, m0 + punpcklwd m0, m0 + pshufd m1, m0, q2233 + pshufd m0, m0, q0011 +%endif + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + movq [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 +%endif + lea stride3q, [strideq*3] + mov cntq, 3 +.loop: + movd m3, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m0, m3, m7 + pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 +%if cpuflag(ssse3) + pshufb m2, m3, m5 + pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt +%if cpuflag(ssse3) + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 +%endif + lea stride3q, [strideq*3] + mov cntq, 7 +.loop: + movd m3, [lq+cntq*4] +%if cpuflag(ssse3) + pshufb m0, m3, m7 + pshufb m1, m3, m6 +%else + punpcklbw m3, m3 + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 +%endif + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 +%if cpuflag(ssse3) + pshufb m2, m3, m5 + pshufb m3, m4 +%else + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 +%endif + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET +%endmacro + +INIT_XMM 
sse2 +H_XMM_FUNCS 2, 4 +INIT_XMM ssse3 +H_XMM_FUNCS 4, 8 +INIT_XMM avx +H_XMM_FUNCS 4, 8 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 + lea stride3q, [strideq*3] + mov cntq, 7 +.loop: + movd xm3, [lq+cntq*4] + vinserti128 m3, m3, xm3, 1 + pshufb m0, m3, m7 + pshufb m1, m3, m6 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufb m2, m3, m5 + pshufb m3, m4 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET +%endif + +; tm + +%macro TM_MMX_FUNCS 0 +cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a + pxor m1, m1 + movd m0, [aq] + pinsrw m2, [aq-1], 0 + punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + pshufw m2, m2, q0000 +%endif + psubw m0, m2 + mov cntq, 1 +.loop: + pinsrw m2, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m4, m2, m1 + pshufb m2, m3 +%else + punpcklbw m2, m1 + pshufw m4, m2, q1111 + pshufw m2, m2, q0000 +%endif + paddw m4, m0 + paddw m2, m0 + packuswb m4, m4 + packuswb m2, m2 + movd [dstq+strideq*0], m4 + movd [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%endmacro + +INIT_MMX mmxext +TM_MMX_FUNCS +INIT_MMX ssse3 +TM_MMX_FUNCS + +%macro TM_XMM_FUNCS 0 +cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a + pxor m1, m1 + movh m0, [aq] + pinsrw m2, [aq-1], 0 + punpcklbw m0, m1 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m3, [pw_m256] + mova m1, [pw_m255] + pshufb m2, m3 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif + psubw m0, m2 + mov cntq, 3 +.loop: + pinsrw m2, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m4, m2, m1 + pshufb m2, m3 +%else + punpcklbw m2, m1 + punpcklwd m2, m2 + pshufd m4, m2, q1111 + pshufd m2, m2, q0000 +%endif + paddw m4, m0 + paddw m2, m0 + 
packuswb m4, m2 + movh [dstq+strideq*0], m4 + movhps [dstq+strideq*1], m4 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET + +cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a + pxor m3, m3 + mova m0, [aq] + pinsrw m2, [aq-1], 0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) + mova m4, [pw_m256] + mova m3, [pw_m255] + pshufb m2, m4 +%else + punpcklbw m2, m3 + punpcklwd m2, m2 + pshufd m2, m2, q0000 +%endif + psubw m1, m2 + psubw m0, m2 + mov cntq, 7 +.loop: + pinsrw m7, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m5, m7, m3 + pshufb m7, m4 +%else + punpcklbw m7, m3 + punpcklwd m7, m7 + pshufd m5, m7, q1111 + pshufd m7, m7, q0000 +%endif + paddw m2, m5, m0 + paddw m5, m1 + paddw m6, m7, m0 + paddw m7, m1 + packuswb m2, m5 + packuswb m6, m7 + mova [dstq+strideq*0], m2 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET + +%if ARCH_X86_64 +%define mem 0 +%else +%define mem 64 +%endif +cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a + pxor m5, m5 + pinsrw m4, [aq-1], 0 + mova m0, [aq] + mova m2, [aq+16] + DEFINE_ARGS dst, stride, l, cnt +%if cpuflag(ssse3) +%if ARCH_X86_64 + mova m12, [pw_m256] + mova m13, [pw_m255] +%define pw_m256_reg m12 +%define pw_m255_reg m13 +%else +%define pw_m256_reg [pw_m256] +%define pw_m255_reg [pw_m255] +%endif + pshufb m4, pw_m256_reg +%else + punpcklbw m4, m5 + punpcklwd m4, m4 + pshufd m4, m4, q0000 +%endif + punpckhbw m1, m0, m5 + punpckhbw m3, m2, m5 + punpcklbw m0, m5 + punpcklbw m2, m5 + psubw m1, m4 + psubw m0, m4 + psubw m3, m4 + psubw m2, m4 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%else + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + mova [rsp+3*16], m3 +%endif + mov cntq, 15 +.loop: + pinsrw m3, [lq+cntq*2], 0 +%if cpuflag(ssse3) + pshufb m7, m3, pw_m255_reg + pshufb m3, pw_m256_reg +%else + pxor m7, m7 + punpcklbw m3, m7 + punpcklwd m3, m3 + pshufd m7, m3, q1111 + pshufd 
m3, m3, q0000 +%endif +%if ARCH_X86_64 + paddw m4, m7, m8 + paddw m5, m7, m9 + paddw m6, m7, m10 + paddw m7, m11 + paddw m0, m3, m8 + paddw m1, m3, m9 + paddw m2, m3, m10 + paddw m3, m11 +%else + paddw m4, m7, [rsp+0*16] + paddw m5, m7, [rsp+1*16] + paddw m6, m7, [rsp+2*16] + paddw m7, [rsp+3*16] + paddw m0, m3, [rsp+0*16] + paddw m1, m3, [rsp+1*16] + paddw m2, m3, [rsp+2*16] + paddw m3, [rsp+3*16] +%endif + packuswb m4, m5 + packuswb m6, m7 + packuswb m0, m1 + packuswb m2, m3 + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m6 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m2 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%undef pw_m256_reg +%undef pw_m255_reg +%undef mem +%endmacro + +INIT_XMM sse2 +TM_XMM_FUNCS +INIT_XMM ssse3 +TM_XMM_FUNCS +INIT_XMM avx +TM_XMM_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a + pxor m3, m3 + pinsrw xm2, [aq-1], 0 + vinserti128 m2, m2, xm2, 1 + mova m0, [aq] + DEFINE_ARGS dst, stride, l, cnt + mova m4, [pw_m256] + mova m5, [pw_m255] + pshufb m2, m4 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + psubw m1, m2 + psubw m0, m2 + mov cntq, 15 +.loop: + pinsrw xm7, [lq+cntq*2], 0 + vinserti128 m7, m7, xm7, 1 + pshufb m3, m7, m5 + pshufb m7, m4 + paddw m2, m3, m0 + paddw m3, m1 + paddw m6, m7, m0 + paddw m7, m1 + packuswb m2, m3 + packuswb m6, m7 + mova [dstq+strideq*0], m2 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%endif + +; dl + +%macro LOWPASS 4 ; left [dst], center, right, tmp + pxor m%4, m%1, m%3 + pand m%4, [pb_1] + pavgb m%1, m%3 + psubusb m%1, m%4 + pavgb m%1, m%2 +%endmacro + +%macro DL_MMX_FUNCS 0 +cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a + movq m1, [aq] +%if cpuflag(ssse3) + pshufb m0, m1, [pb_0to5_2x7] + pshufb m2, m1, [pb_2to6_3x7] +%else + punpckhbw m3, m1, m1 ; 44556677 + pand m0, m1, [pb_6xm1_2x0] ; 012345__ + pand m3, [pb_6x0_2xm1] ; ______77 + psrlq m2, m1, 16 ; 234567__ + por 
m0, m3 ; 01234577 + por m2, m3 ; 23456777 +%endif + psrlq m1, 8 + LOWPASS 0, 1, 2, 3 + + pshufw m1, m0, q3321 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*2], m1 + psrlq m0, 8 + psrlq m1, 8 + add dstq, strideq + movd [dstq+strideq*0], m0 + movd [dstq+strideq*2], m1 + RET +%endmacro + +INIT_MMX mmxext +DL_MMX_FUNCS +INIT_MMX ssse3 +DL_MMX_FUNCS + +%macro DL_XMM_FUNCS 0 +cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a + movq m0, [aq] + lea stride5q, [strideq*5] +%if cpuflag(ssse3) + pshufb m1, m0, [pb_1to6_10x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4x4,4x5,4x6,4x7 +%endif + shufps m0, m1, q3310 +%if notcpuflag(ssse3) + psrldq m1, m0, 1 + shufps m1, m0, q3210 +%endif + psrldq m2, m1, 1 + LOWPASS 0, 1, 2, 3 + + pshufd m1, m0, q3321 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*4], m1 + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*1], m0 + movq [dstq+stride5q ], m1 + lea dstq, [dstq+strideq*2] + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*4], m1 + psrldq m0, 1 + psrldq m1, 1 + movq [dstq+strideq*1], m0 + movq [dstq+stride5q ], m1 + RET + +cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a + mova m0, [aq] +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] + pshufb m1, m0, m5 + pshufb m2, m1, m5 + pshufb m4, m0, [pb_15] +%else + pand m5, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m5 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m5 ; 23456789ABCDEFFF + pshufhw m4, m1, q3333 ; xxxxxxxxFFFFFFFF +%endif + LOWPASS 0, 1, 2, 3 + DEFINE_ARGS dst, stride, cnt, stride9 + lea stride9q, [strideq+strideq*8] + mov cntd, 4 + +.loop: + movhlps m4, m0 + mova [dstq+strideq*0], m0 +%if cpuflag(ssse3) + pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif + mova [dstq+strideq*8], m4 + movhlps m4, m0 + mova [dstq+strideq*1], m0 +%if cpuflag(ssse3) + pshufb m0, m5 +%else + psrldq m0, 1 + por m0, m5 +%endif + mova [dstq+stride9q ], m4 + lea 
dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16 + mova m0, [aq] + mova m1, [aq+16] + PALIGNR m2, m1, m0, 1, m4 + PALIGNR m3, m1, m0, 2, m4 + LOWPASS 0, 2, 3, 4 +%if cpuflag(ssse3) + mova m5, [pb_1toE_2xF] + pshufb m2, m1, m5 + pshufb m3, m2, m5 + pshufb m6, m1, [pb_15] + mova m7, m6 +%else + pand m5, m1, [pb_15x0_1xm1] ; _______________F + psrldq m2, m1, 1 ; 123456789ABCDEF_ + por m2, m5 ; 123456789ABCDEFF + psrldq m3, m2, 1 ; 23456789ABCDEFF_ + por m3, m5 ; 23456789ABCDEFFF + pshufhw m7, m2, q3333 ; xxxxxxxxFFFFFFFF + pshufd m6, m7, q3333 +%endif + LOWPASS 1, 2, 3, 4 + lea dst16q, [dstq +strideq*8] + mov cntd, 8 + lea dst16q, [dst16q+strideq*8] +.loop: + movhlps m7, m1 + mova [dstq +strideq*0+ 0], m0 + mova [dstq +strideq*0+16], m1 + movhps [dstq+strideq*8+ 0], m0 + movq [dstq +strideq*8+ 8], m1 + mova [dstq +strideq*8+16], m7 + mova [dst16q+strideq*0+ 0], m1 + mova [dst16q+strideq*0+16], m6 + mova [dst16q+strideq*8+ 0], m7 + mova [dst16q+strideq*8+16], m6 +%if cpuflag(avx) + vpalignr m0, m1, m0, 1 + pshufb m1, m5 +%elif cpuflag(ssse3) + palignr m2, m1, m0, 1 + pshufb m1, m5 + mova m0, m2 +%else + mova m4, m1 + psrldq m0, 1 + pslldq m4, 15 + psrldq m1, 1 + por m0, m4 + por m1, m5 +%endif + add dstq, strideq + add dst16q, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DL_XMM_FUNCS +INIT_XMM ssse3 +DL_XMM_FUNCS +INIT_XMM avx +DL_XMM_FUNCS + +; dr + +%macro DR_MMX_FUNCS 0 +cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq-1] + movd m1, [aq+3] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + PALIGNR m1, m0, 1, m3 + psrlq m2, m1, 8 + LOWPASS 0, 1, 2, 3 + + movd [dstq+stride3q ], m0 + psrlq m0, 8 + movd [dstq+strideq*2], m0 + psrlq m0, 8 + movd [dstq+strideq*1], m0 + psrlq m0, 8 + movd [dstq+strideq*0], m0 + RET +%endmacro + +INIT_MMX mmxext +DR_MMX_FUNCS +INIT_MMX ssse3 +DR_MMX_FUNCS + +%macro DR_XMM_FUNCS 0 +cglobal 
vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a + movq m1, [lq] + movhps m1, [aq-1] + movd m2, [aq+7] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pslldq m0, m1, 1 + PALIGNR m2, m1, 1, m3 + LOWPASS 0, 1, 2, 3 + + movhps [dstq+strideq*0], m0 + pslldq m0, 1 + movhps [dstq+strideq*1], m0 + pslldq m0, 1 + movhps [dstq+strideq*2], m0 + pslldq m0, 1 + movhps [dstq+stride3q ], m0 + pslldq m0, 1 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], m0 + pslldq m0, 1 + movhps [dstq+strideq*1], m0 + pslldq m0, 1 + movhps [dstq+strideq*2], m0 + pslldq m0, 1 + movhps [dstq+stride3q ], m0 + RET + +cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a + mova m1, [lq] + movu m2, [aq-1] + movd m4, [aq+15] + DEFINE_ARGS dst, stride, stride9, cnt + lea stride9q, [strideq *3] + mov cntd, 4 + lea stride9q, [stride9q*3] + PALIGNR m4, m2, 1, m5 + PALIGNR m3, m2, m1, 15, m5 + LOWPASS 3, 2, 4, 5 + pslldq m0, m1, 1 + PALIGNR m2, m1, 1, m4 + LOWPASS 0, 1, 2, 4 + +.loop: + mova [dstq+strideq*0 ], m3 + movhps [dstq+strideq*8+0], m0 + movq [dstq+strideq*8+8], m3 + PALIGNR m3, m0, 15, m1 + pslldq m0, 1 + mova [dstq+strideq*1 ], m3 + movhps [dstq+stride9q +0], m0 + movq [dstq+stride9q +8], m3 + PALIGNR m3, m0, 15, m1 + pslldq m0, 1 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a + mova m1, [lq] + mova m2, [lq+16] + movu m3, [aq-1] + movu m4, [aq+15] + movd m5, [aq+31] + DEFINE_ARGS dst, stride, stride8, cnt + lea stride8q, [strideq*8] + PALIGNR m5, m4, 1, m7 + PALIGNR m6, m4, m3, 15, m7 + LOWPASS 5, 4, 6, 7 + PALIGNR m4, m3, 1, m7 + PALIGNR m6, m3, m2, 15, m7 + LOWPASS 4, 3, 6, 7 + PALIGNR m3, m2, 1, m7 + PALIGNR m6, m2, m1, 15, m7 + LOWPASS 3, 2, 6, 7 + PALIGNR m2, m1, 1, m6 + pslldq m0, m1, 1 + LOWPASS 2, 1, 0, 6 + mov cntd, 16 + + ; out=m2/m3/m4/m5 +.loop: + mova [dstq+stride8q*0+ 0], m4 + mova [dstq+stride8q*0+16], m5 + mova [dstq+stride8q*2+ 0], m3 + mova [dstq+stride8q*2+16], m4 + PALIGNR m5, m4, 15, m6 
+ PALIGNR m4, m3, 15, m6 + PALIGNR m3, m2, 15, m6 + pslldq m2, 1 + add dstq, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DR_XMM_FUNCS +INIT_XMM ssse3 +DR_XMM_FUNCS +INIT_XMM avx +DR_XMM_FUNCS + +; vl + +INIT_MMX mmxext +cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a + movq m0, [aq] + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + psrlq m1, 8 + psrlq m2, 8 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + RET + +%macro VL_XMM_FUNCS 0 +cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a + movq m0, [aq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 + punpckhwd m1, m1 + shufps m0, m1, q3310 +%endif + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psrldq m1, m0, 1 + psrldq m2, m0, 2 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + movq [dstq+strideq*0], m1 + movq [dstq+strideq*1], m2 + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*0], m1 + movq [dstq+strideq*1], m2 + psrldq m1, 1 + psrldq m2, 1 + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] + pshufb m1, m0, m4 + pshufb m2, m1, m4 +%else + pand m4, m0, [pb_15x0_1xm1] ; _______________F + psrldq m1, m0, 1 ; 123456789ABCDEF_ + por m1, m4 ; 123456789ABCDEFF + psrldq m2, m1, 1 ; 23456789ABCDEFF_ + por m2, m4 ; 23456789ABCDEFFF +%endif + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 +%if cpuflag(ssse3) + pshufb m1, m4 + pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 +%if 
cpuflag(ssse3) + pshufb m1, m4 + pshufb m2, m4 +%else + psrldq m1, 1 + psrldq m2, 1 + por m1, m4 + por m2, m4 +%endif + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a + mova m0, [aq] + mova m5, [aq+16] + DEFINE_ARGS dst, stride, dst16, cnt + PALIGNR m2, m5, m0, 1, m4 + PALIGNR m3, m5, m0, 2, m4 + lea dst16q, [dstq +strideq*8] + LOWPASS 3, 2, 0, 6 + pavgb m2, m0 +%if cpuflag(ssse3) + mova m4, [pb_1toE_2xF] + pshufb m0, m5, m4 + pshufb m1, m0, m4 +%else + pand m4, m5, [pb_15x0_1xm1] ; _______________F + psrldq m0, m5, 1 ; 123456789ABCDEF_ + por m0, m4 ; 123456789ABCDEFF + psrldq m1, m0, 1 ; 23456789ABCDEFF_ + por m1, m4 ; 23456789ABCDEFFF +%endif + lea dst16q, [dst16q+strideq*8] + LOWPASS 1, 0, 5, 6 + pavgb m0, m5 +%if cpuflag(ssse3) + pshufb m5, [pb_15] +%else + punpckhbw m5, m4, m4 + pshufhw m5, m5, q3333 + punpckhqdq m5, m5 +%endif + mov cntd, 8 + +.loop: +%macro %%write 3 + mova [dstq+stride%1+ 0], %2 + mova [dstq+stride%1+16], %3 + movhps [dst16q+stride%1 ], %2 + movu [dst16q+stride%1+ 8], %3 + movq [dst16q+stride%1+24], m5 +%if cpuflag(avx) + palignr %2, %3, %2, 1 + pshufb %3, m4 +%elif cpuflag(ssse3) + palignr m6, %3, %2, 1 + pshufb %3, m4 + mova %2, m6 +%else + pslldq m6, %3, 15 + psrldq %3, 1 + psrldq %2, 1 + por %3, m4 + por %2, m6 +%endif +%endmacro + + %%write q*0, m2, m0 + %%write q*1, m3, m1 + lea dstq, [dstq +strideq*2] + lea dst16q, [dst16q+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +VL_XMM_FUNCS +INIT_XMM ssse3 +VL_XMM_FUNCS +INIT_XMM avx +VL_XMM_FUNCS + +; vr + +%macro VR_MMX_FUNCS 0 +cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a + movq m1, [aq-1] + punpckldq m2, [lq] + movd m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pavgb m0, m1 + PALIGNR m1, m2, 5, m3 + psrlq m2, m1, 8 + psllq m3, m1, 8 + LOWPASS 2, 1, 3, 4 + + ; ABCD <- for the following predictor: + ; EFGH + ; IABC | m0 contains ABCDxxxx + ; JEFG | m2 contains xJIEFGHx 
+ +%if cpuflag(ssse3) + punpckldq m0, m2 + pshufb m2, [pb_13456_3xm1] + movd [dstq+strideq*0], m0 + pshufb m0, [pb_6012_4xm1] + movd [dstq+stride3q ], m2 + psrlq m2, 8 + movd [dstq+strideq*2], m0 + movd [dstq+strideq*1], m2 +%else + psllq m1, m2, 40 + psrlq m2, 24 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m2 + PALIGNR m0, m1, 7, m3 + psllq m1, 8 + PALIGNR m2, m1, 7, m3 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m2 +%endif + RET +%endmacro + +INIT_MMX mmxext +VR_MMX_FUNCS +INIT_MMX ssse3 +VR_MMX_FUNCS + +%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16 +cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a + movu m1, [aq-1] + movhps m2, [lq] + movq m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pavgb m0, m1 + PALIGNR m1, m2, 9, m3 + pslldq m2, m1, 1 + pslldq m3, m1, 2 + LOWPASS 1, 2, 3, 4 + + ; ABCDEFGH <- for the following predictor: + ; IJKLMNOP + ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx + ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP + ; SQABCDEF + ; TRIJKLMN + ; USQABCDE + ; VTRIJKLM + +%if cpuflag(ssse3) + punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ +%endif + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m1 +%if cpuflag(ssse3) + pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG + pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO +%else + psrlw m2, m1, 8 ; x_U_S_Q_xxxxxxxx + pand m3, m1, [pw_255] ; x_V_T_R_xxxxxxxx + packuswb m3, m2 ; xVTRxxxxxUSQxxxx + pslldq m3, 4 ; xxxxxVTRxxxxxUSQ + PALIGNR m0, m3, 7, m4 ; xxxxxxUSQABCDEFG + psrldq m1, 8 + pslldq m3, 8 + PALIGNR m1, m3, 7, m4 ; xxxxxxVTRIJKLMNO +%endif + movhps [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + pslldq m0, 1 + pslldq m1, 1 + movhps [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m1 + pslldq m0, 1 + pslldq m1, 1 + movhps [dstq+strideq*2], m0 + movhps [dstq+stride3q ], m1 + RET + +cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a + mova m0, [aq] + movu m1, [aq-1] + mova m2, [lq] + DEFINE_ARGS dst, stride, stride3, cnt + 
lea stride3q, [strideq*3] + PALIGNR m3, m1, m2, 15, m6 + LOWPASS 3, 1, 0, 4 + pavgb m0, m1 + PALIGNR m1, m2, 1, m6 + pslldq m4, m2, 1 + LOWPASS 1, 2, 4, 5 +%if cpuflag(ssse3) + pshufb m1, [pb_02468ACE_13579BDF] +%else + psrlw m5, m1, 8 + pand m1, [pw_255] + packuswb m1, m5 +%endif + mov cntd, 4 + +.loop: + movlhps m2, m1 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m4, m0, m1, 15, m6 + PALIGNR m5, m3, m2, 15, m6 + mova [dstq+strideq*2], m4 + mova [dstq+stride3q ], m5 + lea dstq, [dstq+strideq*4] + PALIGNR m0, m1, 14, m6 + PALIGNR m3, m2, 14, m6 + pslldq m1, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a + mova m0, [aq] + mova m2, [aq+16] + movu m1, [aq-1] + PALIGNR m3, m2, m0, 15, m6 + PALIGNR m4, m2, m0, 14, m6 + LOWPASS 4, 3, 2, 5 + pavgb m3, m2 + mova m2, [lq+16] + PALIGNR m5, m1, m2, 15, m6 + LOWPASS 5, 1, 0, 6 + pavgb m0, m1 + mova m6, [lq] +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova [dstq], m0 +%endif + PALIGNR m1, m2, 1, m0 + PALIGNR m7, m2, m6, 15, m0 + LOWPASS 1, 2, 7, 0 + PALIGNR m2, m6, 1, m0 + pslldq m7, m6, 1 + LOWPASS 2, 6, 7, 0 +%if cpuflag(ssse3) + pshufb m1, [pb_02468ACE_13579BDF] + pshufb m2, [pb_02468ACE_13579BDF] +%else + psrlw m0, m1, 8 + psrlw m6, m2, 8 + pand m1, [pw_255] + pand m2, [pw_255] + packuswb m1, m0 + packuswb m2, m6 +%endif + DEFINE_ARGS dst, stride, dst16, cnt + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + SBUTTERFLY qdq, 2, 1, 6 +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova m0, [dstq] +%endif + mov cntd, 8 + +.loop: + ; even lines (0, 2, 4, ...): m1 | m0, m3 + ; odd lines (1, 3, 5, ...): m2 | m5, m4 +%macro %%write 4 + mova [dstq+stride%1+ 0], %3 + mova [dstq+stride%1+16], %4 + movhps [dst16q+stride%1 ], %2 + movu [dst16q+stride%1+ 8], %3 + movq [dst16q+stride%1+24], %4 + PALIGNR %4, %3, 15, m6 + PALIGNR %3, %2, 15, m6 + pslldq %2, 1 +%endmacro + + %%write q*0, m1, m0, m3 + %%write q*1, m2, m5, m4 + lea dstq, [dstq +strideq*2] + lea dst16q, 
[dst16q+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +VR_XMM_FUNCS 7 +INIT_XMM ssse3 +VR_XMM_FUNCS 6 +INIT_XMM avx +VR_XMM_FUNCS 6 + +; hd + +INIT_MMX mmxext +cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a + movd m0, [lq] + punpckldq m0, [aq-1] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + ; DHIJ <- for the following predictor: + ; CGDH + ; BFCG | m1 contains ABCDxxxx + ; AEBF | m2 contains EFGHIJxx + + punpcklbw m1, m2 + punpckhdq m0, m1, m2 + + ; m1 contains AEBFCGDH + ; m0 contains CGDHIJxx + + movd [dstq+stride3q ], m1 + movd [dstq+strideq*1], m0 + psrlq m1, 16 + psrlq m0, 16 + movd [dstq+strideq*2], m1 + movd [dstq+strideq*0], m0 + RET + +%macro HD_XMM_FUNCS 0 +cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a + movq m0, [lq] + movhps m0, [aq-1] + DEFINE_ARGS dst, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + psrldq m1, m0, 1 + psrldq m2, m1, 1 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + + ; HPQRSTUV <- for the following predictor + ; GOHPQRST + ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx + ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx + ; DLEMFNGO + ; CKDLEMFN + ; BJCKDLEM + ; AIBJCKDL + + punpcklbw m1, m2 + movhlps m2, m2 + + ; m1 contains AIBJCKDLEMFNGOHP + ; m2 contains QRSTUVxxxxxxxxxx + + movhps [dstq +stride3q ], m1 + movq [dst4q+stride3q ], m1 + PALIGNR m3, m2, m1, 2, m4 + movhps [dstq +strideq*2], m3 + movq [dst4q+strideq*2], m3 + PALIGNR m3, m2, m1, 4, m4 + movhps [dstq +strideq*1], m3 + movq [dst4q+strideq*1], m3 + PALIGNR m2, m1, 6, m4 + movhps [dstq +strideq*0], m2 + movq [dst4q+strideq*0], m2 + RET + +cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a + mova m0, [lq] + movu m3, [aq-1] + DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12 + lea stride4q, [strideq*4] + lea dst4q, [dstq +stride4q] + lea dst8q, [dst4q+stride4q] + lea dst12q, [dst8q+stride4q] + psrldq m4, m3, 1 + psrldq m5, m3, 2 + 
LOWPASS 5, 4, 3, 6 + PALIGNR m1, m3, m0, 1, m6 + PALIGNR m2, m3, m0, 2, m6 + LOWPASS 2, 1, 0, 6 + pavgb m1, m0 + SBUTTERFLY bw, 1, 2, 6 + + ; I PROBABLY INVERTED L0 ad L16 here + ; m1, m2, m5 +.loop: + sub stride4q, strideq + movhps [dstq +stride4q +0], m2 + movq [dstq +stride4q +8], m5 + mova [dst4q+stride4q ], m2 + movhps [dst8q+stride4q +0], m1 + movq [dst8q+stride4q +8], m2 + mova [dst12q+stride4q ], m1 +%if cpuflag(avx) + palignr m1, m2, m1, 2 + palignr m2, m5, m2, 2 +%elif cpuflag(ssse3) + palignr m3, m2, m1, 2 + palignr m0, m5, m2, 2 + mova m1, m3 + mova m2, m0 +%else + ; slightly modified version of PALIGNR + mova m6, m2 + mova m4, m5 + pslldq m6, 14 + pslldq m4, 14 + psrldq m1, 2 + psrldq m2, 2 + por m1, m6 + por m2, m4 +%endif + psrldq m5, 2 + jg .loop + RET + +cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a + mova m0, [lq] + mova m1, [lq+16] + movu m2, [aq-1] + movu m3, [aq+15] + DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24 + lea stride8q, [strideq*8] + lea dst8q, [dstq +stride8q] + lea dst16q, [dst8q +stride8q] + lea dst24q, [dst16q+stride8q] + psrldq m4, m3, 1 + psrldq m5, m3, 2 + LOWPASS 5, 4, 3, 6 + PALIGNR m4, m3, m2, 2, m6 + PALIGNR m3, m2, 1, m6 + LOWPASS 4, 3, 2, 6 + PALIGNR m3, m2, m1, 2, m6 + PALIGNR m2, m1, 1, m6 + LOWPASS 3, 2, 1, 6 + pavgb m2, m1 + PALIGNR m6, m1, m0, 1, m7 + PALIGNR m1, m0, 2, m7 + LOWPASS 1, 6, 0, 7 + pavgb m0, m6 + SBUTTERFLY bw, 2, 3, 6 + SBUTTERFLY bw, 0, 1, 6 + + ; m0, m1, m2, m3, m4, m5 +.loop: + sub stride8q, strideq + mova [dstq +stride8q+ 0], m3 + mova [dstq +stride8q+16], m4 + mova [dst8q +stride8q+ 0], m2 + mova [dst8q +stride8q+16], m3 + mova [dst16q+stride8q+ 0], m1 + mova [dst16q+stride8q+16], m2 + mova [dst24q+stride8q+ 0], m0 + mova [dst24q+stride8q+16], m1 +%if cpuflag(avx) + palignr m0, m1, m0, 2 + palignr m1, m2, m1, 2 + palignr m2, m3, m2, 2 + palignr m3, m4, m3, 2 + palignr m4, m5, m4, 2 + psrldq m5, 2 +%elif cpuflag(ssse3) + psrldq m6, m5, 2 + palignr m5, m4, 2 + palignr m4, m3, 2 + 
palignr m3, m2, 2 + palignr m2, m1, 2 + palignr m1, m0, 2 + mova m0, m1 + mova m1, m2 + mova m2, m3 + mova m3, m4 + mova m4, m5 + mova m5, m6 +%else + ; sort of a half-integrated version of PALIGNR + pslldq m7, m4, 14 + pslldq m6, m5, 14 + psrldq m4, 2 + psrldq m5, 2 + por m4, m6 + pslldq m6, m3, 14 + psrldq m3, 2 + por m3, m7 + pslldq m7, m2, 14 + psrldq m2, 2 + por m2, m6 + pslldq m6, m1, 14 + psrldq m1, 2 + por m1, m7 + psrldq m0, 2 + por m0, m6 +%endif + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HD_XMM_FUNCS +INIT_XMM ssse3 +HD_XMM_FUNCS +INIT_XMM avx +HD_XMM_FUNCS + +%macro HU_MMX_FUNCS 0 +cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l + movd m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to2_5x3] +%else + punpcklbw m1, m0, m0 ; 00112233 + pshufw m1, m1, q3333 ; 33333333 + punpckldq m0, m1 ; 01233333 +%endif + psrlq m1, m0, 8 + psrlq m2, m1, 8 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + SBUTTERFLY bw, 1, 2, 0 + PALIGNR m2, m1, 2, m0 + movd [dstq+strideq*0], m1 + movd [dstq+strideq*1], m2 + punpckhdq m1, m1 + punpckhdq m2, m2 + movd [dstq+strideq*2], m1 + movd [dstq+stride3q ], m2 + RET +%endmacro + +INIT_MMX mmxext +HU_MMX_FUNCS +INIT_MMX ssse3 +HU_MMX_FUNCS + +%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32 +cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l + movq m0, [lq] +%if cpuflag(ssse3) + pshufb m0, [pb_0to6_9x7] +%else + punpcklbw m1, m0, m0 ; 0011223344556677 + punpckhwd m1, m1 ; 4444555566667777 + shufps m0, m1, q3310 ; 0123456777777777 +%endif + psrldq m1, m0, 1 + psrldq m2, m1, 1 + LOWPASS 2, 1, 0, 3 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + SBUTTERFLY bw, 1, 2, 0 + movq [dstq +strideq*0], m1 + movhps [dst4q+strideq*0], m1 + PALIGNR m0, m2, m1, 2, m3 + movq [dstq +strideq*1], m0 + movhps [dst4q+strideq*1], m0 + PALIGNR m0, m2, m1, 4, m3 + movq [dstq +strideq*2], m0 + movhps [dst4q+strideq*2], m0 + PALIGNR m2, m1, 6, 
m3 + movq [dstq +stride3q ], m2 + movhps [dst4q+stride3q ], m2 + RET + +cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l + mova m0, [lq] +%if cpuflag(ssse3) + mova m3, [pb_2toE_3xF] + pshufb m1, m0, [pb_1toE_2xF] + pshufb m2, m0, m3 +%else + pand m3, m0, [pb_15x0_1xm1] + psrldq m1, m0, 1 + por m1, m3 + punpckhbw m3, m3 + psrldq m2, m0, 2 + por m2, m3 +%endif + LOWPASS 2, 1, 0, 4 + pavgb m1, m0 + DEFINE_ARGS dst, stride, stride9, cnt + lea stride9q, [strideq*8+strideq] + mov cntd, 4 + SBUTTERFLY bw, 1, 2, 0 + +.loop: + mova [dstq+strideq*0], m1 + mova [dstq+strideq*8], m2 + PALIGNR m0, m2, m1, 2, m4 +%if cpuflag(ssse3) + pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif + mova [dstq+strideq*1], m0 + mova [dstq+stride9q ], m2 + PALIGNR m1, m2, m0, 2, m4 +%if cpuflag(ssse3) + pshufb m2, m3 +%else + psrldq m2, 2 + por m2, m3 +%endif + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l + mova m1, [lq] + mova m0, [lq+16] + PALIGNR m2, m0, m1, 1, m5 + PALIGNR m3, m0, m1, 2, m5 + LOWPASS 3, 2, 1, 5 + pavgb m2, m1 +%if cpuflag(ssse3) + mova m4, [pb_2toE_3xF] + pshufb m5, m0, [pb_1toE_2xF] + pshufb m1, m0, m4 +%else + pand m4, m0, [pb_15x0_1xm1] + psrldq m5, m0, 1 + por m5, m4 + punpckhbw m4, m4 + psrldq m1, m0, 2 + por m1, m4 +%endif + LOWPASS 1, 5, 0, 6 + pavgb m0, m5 + DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24 + mov cntd, 8 + xor stride0q, stride0q + lea dst8q, [dstq +strideq*8] + lea dst16q, [dst8q +strideq*8] + lea dst24q, [dst16q+strideq*8] + SBUTTERFLY bw, 0, 1, 5 + SBUTTERFLY bw, 2, 3, 5 +%if cpuflag(ssse3) + pshufb m6, m1, [pb_15] +%else + pshufhw m6, m4, q3333 + punpckhqdq m6, m6 +%endif + +.loop: + mova [dstq +stride0q+ 0], m2 + mova [dstq +stride0q+16], m3 + mova [dst8q +stride0q+ 0], m3 + mova [dst8q +stride0q+16], m0 + mova [dst16q+stride0q+ 0], m0 + mova [dst16q+stride0q+16], m1 + mova [dst24q+stride0q+ 0], m1 + mova [dst24q+stride0q+16], m6 +%if cpuflag(avx) + palignr m2, 
m3, m2, 2 + palignr m3, m0, m3, 2 + palignr m0, m1, m0, 2 + pshufb m1, m4 +%elif cpuflag(ssse3) + pshufb m5, m1, m4 + palignr m1, m0, 2 + palignr m0, m3, 2 + palignr m3, m2, 2 + mova m2, m3 + mova m3, m0 + mova m0, m1 + mova m1, m5 +%else + ; half-integrated version of PALIGNR + pslldq m5, m1, 14 + pslldq m7, m0, 14 + psrldq m1, 2 + psrldq m0, 2 + por m1, m4 + por m0, m5 + pslldq m5, m3, 14 + psrldq m3, 2 + por m3, m7 + psrldq m2, 2 + por m2, m5 +%endif + add stride0q, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HU_XMM_FUNCS 8 +INIT_XMM ssse3 +HU_XMM_FUNCS 7 +INIT_XMM avx +HU_XMM_FUNCS 7 + +; FIXME 127, 128, 129 ? diff --git a/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm new file mode 100644 index 0000000000..808056a809 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9intrapred_16bpp.asm @@ -0,0 +1,2497 @@ +;****************************************************************************** +;* VP9 Intra prediction SIMD optimizations +;* +;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> +;* Copyright (c) 2015 Henrik Gramner <henrik gramner com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pd_2: times 8 dd 2 +pd_4: times 8 dd 4 +pd_8: times 8 dd 8 + +pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 +pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 +pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 + +cextern pw_1 +cextern pw_1023 +cextern pw_4095 +cextern pd_16 +cextern pd_32 +cextern pd_65535; + +; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take +; only 3 registers on x86-32, which would make it one cycle faster, but that +; would make the code quite a bit uglier... + +SECTION .text + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +INIT_MMX mmx +cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova 
[dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse +cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + mova m1, [aq+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse +cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] + mova m1, [aq+mmsize*1] + mova m2, [aq+mmsize*2] + mova m3, [aq+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m2 + mova [dstq+strideq*1+48], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a + mova m3, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a + mova m2, [lq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + punpckhwd m3, m2, m2 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m3, q1111 + pshufd m1, m3, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + punpcklwd m2, m2 + pshufd m0, m2, 
q3333 + pshufd m1, m2, q2222 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufd m0, m2, q1111 + pshufd m1, m2, q0000 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 3 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt + mov cntd, 7 + lea stride3q, [strideq*3] +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m1 + mova [dstq+strideq*1+32], m1 + mova [dstq+strideq*1+48], m1 + mova [dstq+strideq*2+ 0], m2 + mova [dstq+strideq*2+16], m2 + mova [dstq+strideq*2+32], m2 + mova [dstq+strideq*2+48], m2 + mova [dstq+stride3q + 0], m3 + mova [dstq+stride3q +16], m3 + mova [dstq+stride3q +32], m3 + mova [dstq+stride3q +48], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +INIT_MMX mmxext +cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + 
+INIT_XMM sse2 +cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [aq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq] + paddw m0, [lq+mmsize] + paddw m0, [aq] + paddw m0, [aq+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [lq+mmsize*0] + paddw m0, [lq+mmsize*1] + paddw m0, [lq+mmsize*2] + paddw m0, [lq+mmsize*3] + paddw m0, [aq+mmsize*0] + paddw m0, [aq+mmsize*1] + paddw m0, [aq+mmsize*2] + paddw m0, [aq+mmsize*3] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_32] + paddd m0, m1 + psrad m0, 6 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], 
m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET + +%macro DC_1D_FNS 2 +INIT_MMX mmxext +cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufw m1, m0, q3232 + paddd m0, [pd_2] + paddd m0, m1 + psrad m0, 2 + pshufw m0, m0, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_4] + paddd m0, m1 + psrad m0, 3 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2] + paddw m0, [%2+mmsize] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_8] + paddd m0, m1 + psrad m0, 4 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*2+ 0], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q + 0], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET + +INIT_XMM sse2 +cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a + mova m0, [%2+mmsize*0] + paddw m0, 
[%2+mmsize*1] + paddw m0, [%2+mmsize*2] + paddw m0, [%2+mmsize*3] + DEFINE_ARGS dst, stride, cnt + mov cntd, 16 + pmaddwd m0, [pw_1] + pshufd m1, m0, q3232 + paddd m0, m1 + pshufd m1, m0, q1111 + paddd m0, [pd_16] + paddd m0, m1 + psrad m0, 5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m0 + mova [dstq+strideq*0+32], m0 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m0 + mova [dstq+strideq*1+32], m0 + mova [dstq+strideq*1+48], m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop + RET +%endmacro + +DC_1D_FNS top, aq +DC_1D_FNS left, lq + +INIT_MMX mmxext +cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a + mova m5, [pw_1023] +.body: + mova m4, [aq] + mova m3, [lq] + movd m0, [aq-4] + pshufw m0, m0, q1111 + psubw m4, m0 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufw m0, m3, q3333 + pshufw m1, m3, q2222 + pshufw m2, m3, q1111 + pshufw m3, m3, q0000 + paddw m0, m4 + paddw m1, m4 + paddw m2, m4 + paddw m3, m4 + pxor m4, m4 + pmaxsw m0, m4 + pmaxsw m1, m4 + pmaxsw m2, m4 + pmaxsw m3, m4 + pminsw m0, m5 + pminsw m1, m5 + pminsw m2, m5 + pminsw m3, m5 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + RET + +cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a + mova m4, [pw_1023] +.body: + pxor m6, m6 + mova m5, [aq] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 1 +.loop: + movh m3, [lq+cntq*8] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m1, 
m6 + pmaxsw m2, m6 + pmaxsw m3, m6 + pminsw m0, m4 + pminsw m1, m4 + pminsw m2, m4 + pminsw m3, m4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a + mova m4, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a + mova m7, [pw_1023] +.body: + pxor m6, m6 + mova m4, [aq] + mova m5, [aq+mmsize] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 7 +.loop: + movd m3, [lq+cntq*4] + punpcklwd m3, m3 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + paddw m0, m2, m4 + paddw m2, m5 + paddw m1, m3, m4 + paddw m3, m5 + pmaxsw m0, m6 + pmaxsw m2, m6 + pmaxsw m1, m6 + pmaxsw m3, m6 + pminsw m0, m7 + pminsw m2, m7 + pminsw m1, m7 + pminsw m3, m7 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m2 + mova [dstq+strideq*1+ 0], m1 + mova [dstq+strideq*1+16], m3 + lea dstq, [dstq+strideq*2] + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a + mova m7, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body + +INIT_XMM sse2 +cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a + mova m0, [pw_1023] +.body: + pxor m1, m1 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 +%define reg_min m9 +%define reg_max m8 +%else + mova [rsp+ 0], m0 + mova [rsp+16], m1 +%define reg_min [rsp+16] +%define reg_max [rsp+ 0] +%endif + + mova m4, [aq+mmsize*0] + mova m5, [aq+mmsize*1] + mova m6, [aq+mmsize*2] + mova m7, [aq+mmsize*3] + movd m0, [aq-4] + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + psubw m4, m0 + psubw m5, m0 + psubw m6, m0 + psubw m7, m0 + DEFINE_ARGS dst, stride, l, cnt + mov cntd, 31 +.loop: + pinsrw m3, [lq+cntq*2], 0 + punpcklwd 
m3, m3 + pshufd m3, m3, q0000 + paddw m0, m3, m4 + paddw m1, m3, m5 + paddw m2, m3, m6 + paddw m3, m7 + pmaxsw m0, reg_min + pmaxsw m1, reg_min + pmaxsw m2, reg_min + pmaxsw m3, reg_min + pminsw m0, reg_max + pminsw m1, reg_max + pminsw m2, reg_max + pminsw m3, reg_max + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 + add dstq, strideq + dec cntd + jge .loop + RET + +cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body + +; Directional intra prediction functions +; +; in the functions below, 'abcdefgh' refers to above data (sometimes simply +; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply +; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered +; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered +; top-left data. + +; left=(left+2*center+right+2)>>2 +%macro LOWPASS 3 ; left [dst], center, right + paddw m%1, m%3 + psraw m%1, 1 + pavgw m%1, m%2 +%endmacro + +; abcdefgh (src) -> bcdefghh (dst) +; dst/src can be the same register +%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg] +%if cpuflag(ssse3) + pshufb %1, %2, %3 ; abcdefgh -> bcdefghh +%else + psrldq %1, %2, 2 ; abcdefgh -> bcdefgh. + pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh +%endif +%endmacro + +; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2) +%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg] +%if cpuflag(ssse3) + pshufb %1, %3, %4 ; abcdefgh -> bcdefghh + pshufb %2, %1, %4 ; bcdefghh -> cdefghhh +%else + psrldq %1, %3, 2 ; abcdefgh -> bcdefgh. + psrldq %2, %3, 4 ; abcdefgh -> cdefgh.. + pshufhw %1, %1, q2210 ; bcdefgh. -> bcdefghh + pshufhw %2, %2, q1110 ; cdefgh.. 
-> cdefghhh +%endif +%endmacro + +%macro DL_FUNCS 0 +cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a + movifnidn aq, amp + movu m1, [aq] ; abcdefgh + pshufhw m0, m1, q3310 ; abcdefhh + SHIFT_RIGHT m1, m1 ; bcdefghh + psrldq m2, m1, 2 ; cdefghh. + LOWPASS 0, 1, 2 ; BCDEFGh. + pshufd m1, m0, q3321 ; DEFGh... + movh [dstq+strideq*0], m0 + movh [dstq+strideq*2], m1 + add dstq, strideq + psrldq m0, 2 ; CDEFGh.. + psrldq m1, 2 ; EFGh.... + movh [dstq+strideq*0], m0 + movh [dstq+strideq*2], m1 + RET + +cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m4 ; bcdefghh/cdefghhh + LOWPASS 0, 1, 2 ; BCDEFGHh + shufps m1, m0, m2, q3332 ; FGHhhhhh + shufps m3, m0, m1, q2121 ; DEFGHhhh + DEFINE_ARGS dst, stride, stride5 + lea stride5q, [strideq*5] + + mova [dstq+strideq*0], m0 + mova [dstq+strideq*4], m1 + SHIFT_RIGHT m0, m0, m4 ; CDEFGHhh + pshuflw m1, m1, q3321 ; GHhhhhhh + pshufd m2, m0, q3321 ; EFGHhhhh + mova [dstq+strideq*1], m0 + mova [dstq+stride5q ], m1 + lea dstq, [dstq+strideq*2] + pshuflw m1, m1, q3321 ; Hhhhhhhh + mova [dstq+strideq*0], m3 + mova [dstq+strideq*4], m1 + pshuflw m1, m1, q3321 ; hhhhhhhh + mova [dstq+strideq*1], m2 + mova [dstq+stride5q ], m1 + RET + +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh + mova m3, [aq+mmsize] ; ijklmnop + PALIGNR m1, m3, m0, 2, m4 ; bcdefghi + PALIGNR m2, m3, m0, 4, m4 ; cdefghij + LOWPASS 0, 1, 2 ; BCDEFGHI +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m2, m1, m3, m4 ; jklmnopp/klmnoppp + LOWPASS 1, 2, 3 ; JKLMNOPp + pshufd m2, m2, q3333 ; pppppppp + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*8+ 0], m1 + mova [dstq+strideq*8+16], m2 + add dstq, strideq +%if cpuflag(avx) + vpalignr m0, m1, m0, 2 +%else + 
PALIGNR m3, m1, m0, 2, m4 + mova m0, m3 +%endif + SHIFT_RIGHT m1, m1, m4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] ; abcdefgh + mova m1, [aq+mmsize*1] ; ijklmnop + mova m2, [aq+mmsize*2] ; qrstuvwx + mova m3, [aq+mmsize*3] ; yz012345 + PALIGNR m4, m1, m0, 2, m6 + PALIGNR m5, m1, m0, 4, m6 + LOWPASS 0, 4, 5 ; BCDEFGHI + PALIGNR m4, m2, m1, 2, m6 + PALIGNR m5, m2, m1, 4, m6 + LOWPASS 1, 4, 5 ; JKLMNOPQ + PALIGNR m4, m3, m2, 2, m6 + PALIGNR m5, m3, m2, 4, m6 + LOWPASS 2, 4, 5 ; RSTUVWXY +%if cpuflag(ssse3) + mova m6, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m4, m5, m3, m6 + LOWPASS 3, 4, 5 ; Z0123455 + pshufd m4, m4, q3333 ; 55555555 + DEFINE_ARGS dst, stride, stride8, stride24, cnt + mov cntd, 8 + lea stride8q, [strideq*8] + lea stride24q, [stride8q*3] + +.loop: + mova [dstq+stride8q*0+ 0], m0 + mova [dstq+stride8q*0+16], m1 + mova [dstq+stride8q*0+32], m2 + mova [dstq+stride8q*0+48], m3 + mova [dstq+stride8q*1+ 0], m1 + mova [dstq+stride8q*1+16], m2 + mova [dstq+stride8q*1+32], m3 + mova [dstq+stride8q*1+48], m4 + mova [dstq+stride8q*2+ 0], m2 + mova [dstq+stride8q*2+16], m3 + mova [dstq+stride8q*2+32], m4 + mova [dstq+stride8q*2+48], m4 + mova [dstq+stride24q + 0], m3 + mova [dstq+stride24q +16], m4 + mova [dstq+stride24q +32], m4 + mova [dstq+stride24q +48], m4 + add dstq, strideq +%if cpuflag(avx) + vpalignr m0, m1, m0, 2 + vpalignr m1, m2, m1, 2 + vpalignr m2, m3, m2, 2 +%else + PALIGNR m5, m1, m0, 2, m6 + mova m0, m5 + PALIGNR m5, m2, m1, 2, m6 + mova m1, m5 + PALIGNR m5, m3, m2, 2, m6 + mova m2, m5 +%endif + SHIFT_RIGHT m3, m3, m6 + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DL_FUNCS +INIT_XMM ssse3 +DL_FUNCS +INIT_XMM avx +DL_FUNCS + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; 
ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp + LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp + vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 2 + lea stride3q, [strideq*3] + +.loop: + mova [dstq+strideq*0], m0 + vpalignr m3, m2, m0, 2 + vpalignr m4, m2, m0, 4 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 6 + vpalignr m4, m2, m0, 8 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m4 + vpalignr m3, m2, m0, 10 + vpalignr m4, m2, m0, 12 + mova [dstq+strideq*1], m3 + mova [dstq+strideq*2], m4 + vpalignr m3, m2, m0, 14 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + mova m0, m2 + vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 + vperm2i128 m2, m1, m4, q0201 ; Z......555555555 + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq*0 + 0], m0 + mova [dstq+strideq*0 +32], m1 + vpalignr m3, m5, m0, 2 + vpalignr m4, m2, m1, 2 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 4 + vpalignr m4, m2, m1, 4 + mova [dstq+strideq*2 + 0], m3 + mova [dstq+strideq*2 +32], m4 + vpalignr m3, m5, m0, 6 + vpalignr m4, m2, m1, 6 + mova [dstq+stride3q*1+ 0], m3 + mova 
[dstq+stride3q*1+32], m4 + lea dstq, [dstq+strideq*4] + vpalignr m3, m5, m0, 8 + vpalignr m4, m2, m1, 8 + mova [dstq+strideq*0 + 0], m3 + mova [dstq+strideq*0 +32], m4 + vpalignr m3, m5, m0, 10 + vpalignr m4, m2, m1, 10 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 12 + vpalignr m4, m2, m1, 12 + mova [dstq+strideq*2+ 0], m3 + mova [dstq+strideq*2+32], m4 + vpalignr m3, m5, m0, 14 + vpalignr m4, m2, m1, 14 + mova [dstq+stride3q+ 0], m3 + mova [dstq+stride3q+ 32], m4 + vpalignr m3, m5, m0, 16 + vpalignr m4, m2, m1, 16 + vperm2i128 m5, m3, m4, q0201 + vperm2i128 m2, m4, m4, q0101 + mova m0, m3 + mova m1, m4 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + +%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a + movh m0, [lq] ; wxyz.... + movhps m0, [aq-2] ; wxyz*abc + movd m1, [aq+6] ; d....... + PALIGNR m1, m0, 2, m2 ; xyz*abcd + psrldq m2, m1, 2 ; yz*abcd. + LOWPASS 0, 1, 2 ; XYZ#ABC. + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+stride3q ], m0 + psrldq m0, 2 ; YZ#ABC.. + movh [dstq+strideq*2], m0 + psrldq m0, 2 ; Z#ABC... + movh [dstq+strideq*1], m0 + psrldq m0, 2 ; #ABC.... + movh [dstq+strideq*0], m0 + RET + +cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a + mova m0, [lq] ; stuvwxyz + movu m1, [aq-2] ; *abcdefg + mova m2, [aq] ; abcdefgh + psrldq m3, m2, 2 ; bcdefgh. + LOWPASS 3, 2, 1 ; ABCDEFG. 
+ PALIGNR m1, m0, 2, m4 ; tuvwxyz* + PALIGNR m2, m1, 2, m4 ; uvwxyz*a + LOWPASS 2, 1, 0 ; TUVWXYZ# + DEFINE_ARGS dst, stride, dst4, stride3 + lea stride3q, [strideq*3] + lea dst4q, [dstq+strideq*4] + + movhps [dstq +stride3q +0], m2 + movh [dstq+ stride3q +8], m3 + mova [dst4q+stride3q +0], m2 + PALIGNR m1, m3, m2, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*2+0], m1 + movh [dstq+ strideq*2+8], m3 + mova [dst4q+strideq*2+0], m1 + PALIGNR m2, m3, m1, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*1+0], m2 + movh [dstq+ strideq*1+8], m3 + mova [dst4q+strideq*1+0], m2 + PALIGNR m1, m3, m2, 2, m0 + psrldq m3, 2 + movhps [dstq +strideq*0+0], m1 + movh [dstq+ strideq*0+8], m3 + mova [dst4q+strideq*0+0], m1 + RET + +cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a + mova m0, [lq] ; klmnopqr + mova m1, [lq+mmsize] ; stuvwxyz + movu m2, [aq-2] ; *abcdefg + movu m3, [aq+mmsize-2] ; hijklmno + mova m4, [aq] ; abcdefgh + mova m5, [aq+mmsize] ; ijklmnop + psrldq m6, m5, 2 ; jklmnop. + LOWPASS 6, 5, 3 ; IJKLMNO. 
+ PALIGNR m5, m4, 2, m3 ; bcdefghi + LOWPASS 5, 4, 2 ; ABCDEFGH + PALIGNR m2, m1, 2, m3 ; tuvwxyz* + PALIGNR m4, m2, 2, m3 ; uvwxyz*a + LOWPASS 4, 2, 1 ; TUVWXYZ# + PALIGNR m1, m0, 2, m3 ; lmnopqrs + PALIGNR m2, m1, 2, m3 ; mnopqrst + LOWPASS 2, 1, 0 ; LMNOPQRS + DEFINE_ARGS dst, stride, dst8, cnt + lea dst8q, [dstq+strideq*8] + mov cntd, 8 + +.loop: + sub dst8q, strideq + mova [dst8q+strideq*0+ 0], m4 + mova [dst8q+strideq*0+16], m5 + mova [dst8q+strideq*8+ 0], m2 + mova [dst8q+strideq*8+16], m4 +%if cpuflag(avx) + vpalignr m2, m4, m2, 2 + vpalignr m4, m5, m4, 2 + vpalignr m5, m6, m5, 2 +%else + PALIGNR m0, m4, m2, 2, m1 + mova m2, m0 + PALIGNR m0, m5, m4, 2, m1 + mova m4, m0 + PALIGNR m0, m6, m5, 2, m1 + mova m5, m0 +%endif + psrldq m6, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \ + %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a + mova m0, [aq+mmsize*3] ; a[24-31] + movu m1, [aq+mmsize*3-2] ; a[23-30] + psrldq m2, m0, 2 ; a[25-31]. + LOWPASS 2, 0, 1 ; A[24-30]. 
+ mova m1, [aq+mmsize*2] ; a[16-23] + movu m3, [aq+mmsize*2-2] ; a[15-22] + PALIGNR m0, m1, 2, m4 ; a[17-24] + LOWPASS 0, 1, 3 ; A[16-23] + mova m3, [aq+mmsize*1] ; a[8-15] + movu m4, [aq+mmsize*1-2] ; a[7-14] + PALIGNR m1, m3, 2, m5 ; a[9-16] + LOWPASS 1, 3, 4 ; A[8-15] + mova m4, [aq+mmsize*0] ; a[0-7] + movu m5, [aq+mmsize*0-2] ; *a[0-6] + PALIGNR m3, m4, 2, m6 ; a[1-8] + LOWPASS 3, 4, 5 ; A[0-7] + SCRATCH 1, 8, rsp+0*mmsize + SCRATCH 3, 9, rsp+1*mmsize +%if notcpuflag(ssse3) + SCRATCH 0, 10, rsp+2*mmsize +%endif + mova m6, [lq+mmsize*3] ; l[24-31] + PALIGNR m5, m6, 2, m0 ; l[25-31]* + PALIGNR m4, m5, 2, m0 ; l[26-31]*a + LOWPASS 4, 5, 6 ; L[25-31]# + mova m7, [lq+mmsize*2] ; l[16-23] + PALIGNR m6, m7, 2, m0 ; l[17-24] + PALIGNR m5, m6, 2, m0 ; l[18-25] + LOWPASS 5, 6, 7 ; L[17-24] + mova m1, [lq+mmsize*1] ; l[8-15] + PALIGNR m7, m1, 2, m0 ; l[9-16] + PALIGNR m6, m7, 2, m0 ; l[10-17] + LOWPASS 6, 7, 1 ; L[9-16] + mova m3, [lq+mmsize*0] ; l[0-7] + PALIGNR m1, m3, 2, m0 ; l[1-8] + PALIGNR m7, m1, 2, m0 ; l[2-9] + LOWPASS 7, 1, 3 ; L[1-8] +%if cpuflag(ssse3) +%if cpuflag(avx) + UNSCRATCH 1, 8, rsp+0*mmsize +%endif + UNSCRATCH 3, 9, rsp+1*mmsize +%else + UNSCRATCH 0, 10, rsp+2*mmsize +%endif + DEFINE_ARGS dst8, stride, stride8, stride24, cnt + lea stride8q, [strideq*8] + lea stride24q, [stride8q*3] + lea dst8q, [dst8q+strideq*8] + mov cntd, 8 + +.loop: + sub dst8q, strideq +%if notcpuflag(avx) + UNSCRATCH 1, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 3, 9, rsp+1*mmsize +%endif +%endif + mova [dst8q+stride8q*0+ 0], m4 + mova [dst8q+stride8q*0+16], m3 + mova [dst8q+stride8q*0+32], m1 + mova [dst8q+stride8q*0+48], m0 + mova [dst8q+stride8q*1+ 0], m5 + mova [dst8q+stride8q*1+16], m4 + mova [dst8q+stride8q*1+32], m3 + mova [dst8q+stride8q*1+48], m1 + mova [dst8q+stride8q*2+ 0], m6 + mova [dst8q+stride8q*2+16], m5 + mova [dst8q+stride8q*2+32], m4 + mova [dst8q+stride8q*2+48], m3 + mova [dst8q+stride24q + 0], m7 + mova [dst8q+stride24q +16], m6 + mova 
[dst8q+stride24q +32], m5 + mova [dst8q+stride24q +48], m4 +%if cpuflag(avx) + vpalignr m7, m6, m7, 2 + vpalignr m6, m5, m6, 2 + vpalignr m5, m4, m5, 2 + vpalignr m4, m3, m4, 2 + vpalignr m3, m1, m3, 2 + vpalignr m1, m0, m1, 2 + vpalignr m0, m2, m0, 2 +%else + SCRATCH 2, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + SCRATCH 0, 9, rsp+1*mmsize +%endif + PALIGNR m2, m6, m7, 2, m0 + mova m7, m2 + PALIGNR m2, m5, m6, 2, m0 + mova m6, m2 + PALIGNR m2, m4, m5, 2, m0 + mova m5, m2 + PALIGNR m2, m3, m4, 2, m0 + mova m4, m2 + PALIGNR m2, m1, m3, 2, m0 + mova m3, m2 +%if notcpuflag(ssse3) + UNSCRATCH 0, 9, rsp+1*mmsize + SCRATCH 3, 9, rsp+1*mmsize +%endif + PALIGNR m2, m0, m1, 2, m3 + mova m1, m2 + UNSCRATCH 2, 8, rsp+0*mmsize + SCRATCH 1, 8, rsp+0*mmsize + PALIGNR m1, m2, m0, 2, m3 + mova m0, m1 +%endif + psrldq m2, 2 + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +DR_FUNCS 3 +INIT_XMM ssse3 +DR_FUNCS 2 +INIT_XMM avx +DR_FUNCS 2 + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a + mova m0, [lq] ; klmnopqrstuvwxyz + movu m1, [aq-2] ; *abcdefghijklmno + mova m2, [aq] ; abcdefghijklmnop + vperm2i128 m4, m2, m2, q2001 ; ijklmnop........ + vpalignr m5, m4, m2, 2 ; bcdefghijklmnop. + vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg + LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. 
+ vpalignr m4, m3, m0, 2 ; lmnopqrstuvwxyz* + vpalignr m5, m3, m0, 4 ; mnopqrstuvwxyz*a + LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# + vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH + DEFINE_ARGS dst, stride, stride3, stride5, dst3 + lea dst3q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + vpalignr m3, m5, m0, 2 + vpalignr m4, m1, m5, 2 + mova [dst3q+stride5q*2], m3 ; 14 + mova [ dstq+stride3q*2], m4 ; 6 + vpalignr m3, m5, m0, 4 + vpalignr m4, m1, m5, 4 + sub dst3q, strideq + mova [dst3q+stride5q*2], m3 ; 13 + mova [dst3q+strideq*2 ], m4 ; 5 + mova [dst3q+stride3q*4], m0 ; 15 + vpalignr m3, m5, m0, 6 + vpalignr m4, m1, m5, 6 + mova [dstq+stride3q*4], m3 ; 12 + mova [dst3q+strideq*1], m4 ; 4 + vpalignr m3, m5, m0, 8 + vpalignr m4, m1, m5, 8 + mova [dst3q+strideq*8], m3 ; 11 + mova [dst3q+strideq*0], m4 ; 3 + vpalignr m3, m5, m0, 10 + vpalignr m4, m1, m5, 10 + mova [dstq+stride5q*2], m3 ; 10 + mova [dstq+strideq*2 ], m4 ; 2 + vpalignr m3, m5, m0, 12 + vpalignr m4, m1, m5, 12 + mova [dst3q+stride3q*2], m3 ; 9 + mova [dstq+strideq*1 ], m4 ; 1 + vpalignr m3, m5, m0, 14 + vpalignr m4, m1, m5, 14 + mova [dstq+strideq*8], m3 ; 8 + mova [dstq+strideq*0], m4 ; 0 + mova [dst3q+strideq*4], m5 ; 7 + RET + +cglobal vp9_ipred_vl_16x16_16, 4, 5, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm1, [aq+30] ; pppppppp + vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp + vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp + vperm2i128 m4, m3, m1, q0201 ; jklmnopppppppppp + vpalignr m5, m2, m0, 4 ; cdefghijklmnoppp + vperm2i128 m6, m5, m1, q0201 ; klmnoppppppppppp + LOWPASS 5, 3, 0 ; BCDEFGHIJKLMNOPP + LOWPASS 6, 4, 2 ; JKLMNOPPPPPPPPPP + pavgw m3, m0 ; abcdefghijklmnop + pavgw m4, m2 ; ijklmnoppppppppp + DEFINE_ARGS dst, stride, stride3, stride5, dst4 + lea dst4q, [dstq+strideq*4] + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + + mova [dstq+strideq*0], m3 ; 0 abcdefghijklmnop + mova [dstq+strideq*1], 
m5 ; 1 BCDEFGHIJKLMNOPP + vpalignr m0, m4, m3, 2 + vpalignr m1, m6, m5, 2 + mova [dstq+strideq*2 ], m0 ; 2 bcdefghijklmnopp + mova [dstq+stride3q*1], m1 ; 3 CDEFGHIJKLMNOPPP + vpalignr m0, m4, m3, 4 + vpalignr m1, m6, m5, 4 + mova [dst4q+strideq*0], m0 ; 4 cdefghijklmnoppp + mova [dstq+stride5q*1], m1 ; 5 DEFGHIJKLMNOPPPP + vpalignr m0, m4, m3, 6 + vpalignr m1, m6, m5, 6 + mova [ dstq+stride3q*2], m0 ; 6 defghijklmnopppp + mova [dst4q+stride3q*1], m1 ; 7 EFGHIJKLMNOPPPPP + vpalignr m0, m4, m3, 8 + vpalignr m1, m6, m5, 8 + mova [ dstq+strideq*8], m0 ; 8 efghijklmnoppppp + mova [dst4q+stride5q*1], m1 ; 9 FGHIJKLMNOPPPPPP + vpalignr m0, m4, m3, 10 + mova [dstq+stride5q*2], m0 ; 10 fghijklmnopppppp + vpalignr m0, m4, m3, 12 + mova [dst4q+strideq*8], m0 ; 12 ghijklmnoppppppp + vpalignr m0, m4, m3, 14 + mova [dst4q+stride5q*2], m0 ; 14 hijklmnopppppppp + sub dst4q, strideq + vpalignr m1, m6, m5, 10 + mova [dst4q+strideq*8], m1 ; 11 GHIJKLMNOPPPPPPP + vpalignr m1, m6, m5, 12 + mova [dst4q+stride5q*2], m1 ; 13 HIJKLMNOPPPPPPPP + vpalignr m1, m6, m5, 14 + mova [dst4q+stride3q*4], m1 ; 15 IJKLMNOPPPPPPPPP + RET + +cglobal vp9_ipred_hd_16x16_16, 4, 5, 7, dst, stride, l, a + movu m0, [aq-2] ; *abcdefghijklmno + mova m1, [lq] ; klmnopqrstuvwxyz + vperm2i128 m2, m1, m0, q0201 ; stuvwxyz*abcdefg + vpalignr m3, m2, m1, 2 ; lmnopqrstuvwxyz* + vpalignr m4, m2, m1, 4 ; mnopqrstuvwxyz*a + LOWPASS 4, 3, 1 ; LMNOPQRSTUVWXYZ# + pavgw m3, m1 ; klmnopqrstuvwxyz + mova m1, [aq] ; abcdefghijklmnop + movu m2, [aq+2] ; bcdefghijklmnop. + LOWPASS 2, 1, 0 ; ABCDEFGHIJKLMNO. 
+ vpunpcklwd m0, m3, m4 ; kLlMmNnOsTtUuVvW + vpunpckhwd m1, m3, m4 ; oPpQqRrSwXxYyZz# + vperm2i128 m3, m1, m0, q0002 ; kLlMmNnOoPpQqRrS + vperm2i128 m4, m0, m1, q0301 ; sTtUuVvWwXxYyZz# + vperm2i128 m0, m4, m2, q0201 ; wXxYyZz#ABCDEFGH + vperm2i128 m1, m3, m4, q0201 ; oPpQqRrSsTtUuVvW + DEFINE_ARGS dst, stride, stride3, stride5, dst5 + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + lea dst5q, [dstq+stride5q] + + mova [dst5q+stride5q*2], m3 ; 15 kLlMmNnOoPpQqRrS + mova [dst5q+stride3q*2], m1 ; 11 oPpQqRrSsTtUuVvW + mova [dst5q+strideq*2], m4 ; 7 sTtUuVvWwXxYyZz# + mova [dstq+stride3q*1], m0 ; 3 wXxYyZz#ABCDEFGH + vpalignr m5, m4, m1, 4 + mova [dstq+stride5q*2], m5 ; 10 pQqRrSsTtUuVvWwX + vpalignr m5, m0, m4, 4 + vpalignr m6, m2, m0, 4 + mova [dstq+stride3q*2], m5 ; 6 tUuVvWwXxYyZz#AB + mova [dstq+strideq*2], m6 ; 2 xYyZz#ABCDEFGHIJ + vpalignr m5, m4, m1, 8 + mova [dst5q+strideq*4], m5 ; 9 qRrSsTtUuVvWwXxY + vpalignr m5, m0, m4, 8 + vpalignr m6, m2, m0, 8 + mova [dstq+stride5q*1], m5 ; 5 uVvWwXxYyZz#ABCD + mova [dstq+strideq*1], m6 ; 1 yZz#ABCDEFGHIJKL + vpalignr m5, m1, m3, 12 + vpalignr m6, m4, m1, 12 + mova [dstq+stride3q*4], m5 ; 12 nOoPpQqRrSsTtUuV + mova [dst5q+stride3q], m6 ; 8 rSsTtUuVvWwXxYyZ + vpalignr m5, m0, m4, 12 + vpalignr m6, m2, m0, 12 + mova [dstq+strideq*4], m5 ; 4 vWwXxYyZz#ABCDEF + mova [dstq+strideq*0], m6 ; 0 z#ABCDEFGHIJKLMN + sub dst5q, strideq + vpalignr m5, m1, m3, 4 + mova [dst5q+stride5q*2], m5 ; 14 lMmNnOoPpQqRrSsT + sub dst5q, strideq + vpalignr m5, m1, m3, 8 + mova [dst5q+stride5q*2], m5 ; 13 mNnOoPpQqRrSsTtU + RET + +%if ARCH_X86_64 +cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a + mova m0, [lq+mmsize*0+0] ; l[0-15] + mova m1, [lq+mmsize*1+0] ; l[16-31] + movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno + mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop + mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345 + vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0 + vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01 + vpalignr m7, m5, m0, 4 
; nopqrstuvwxyz012 + LOWPASS 0, 6, 7 ; L[0-15] + vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg + vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz* + vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a + LOWPASS 1, 5, 6 ; L[16-31]# + vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx + vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq + LOWPASS 2, 3, 6 ; A[0-15] + movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234 + vperm2i128 m6, m4, m4, q2001 ; yz012345........ + vpalignr m7, m6, m4, 2 ; rstuvwxyz012345. + LOWPASS 3, 4, 7 ; A[16-31]. + vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH + vperm2i128 m5, m0, m1, q0201 ; L[8-15]L[16-23] + vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX + DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt + lea stride3q, [strideq*3] + lea stride5q, [stride3q+strideq*2] + lea stride7q, [strideq*4+stride3q] + lea dst24q, [dst8q+stride3q*8] + lea dst8q, [dst8q+strideq*8] + mov cntd, 2 + +.loop: + mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7 + mova [dst24q+stride7q+32], m1 + mova [dst8q+stride7q+0], m1 + mova [dst8q+stride7q+32], m2 + vpalignr m6, m4, m1, 2 + vpalignr m7, m5, m0, 2 + vpalignr m9, m8, m2, 2 + mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6 + mova [dst24q+stride3q*2+32], m6 + mova [dst8q+stride3q*2+0], m6 + mova [dst8q+stride3q*2+32], m9 + vpalignr m6, m4, m1, 4 + vpalignr m7, m5, m0, 4 + vpalignr m9, m8, m2, 4 + mova [dst24q+stride5q+0], m7 ; 29 21 13 5 + mova [dst24q+stride5q+32], m6 + mova [dst8q+stride5q+0], m6 + mova [dst8q+stride5q+32], m9 + vpalignr m6, m4, m1, 6 + vpalignr m7, m5, m0, 6 + vpalignr m9, m8, m2, 6 + mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4 + mova [dst24q+strideq*4+32], m6 + mova [dst8q+strideq*4+0], m6 + mova [dst8q+strideq*4+32], m9 + vpalignr m6, m4, m1, 8 + vpalignr m7, m5, m0, 8 + vpalignr m9, m8, m2, 8 + mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3 + mova [dst24q+stride3q+32], m6 + mova [dst8q+stride3q+0], m6 + mova [dst8q+stride3q+32], m9 + vpalignr m6, m4, m1, 10 + vpalignr m7, m5, m0, 10 + vpalignr m9, m8, m2, 10 + mova 
[dst24q+strideq*2+0 ], m7 ; 26 18 10 2 + mova [dst24q+strideq*2+32], m6 + mova [dst8q+strideq*2+0], m6 + mova [dst8q+strideq*2+32], m9 + vpalignr m6, m4, m1, 12 + vpalignr m7, m5, m0, 12 + vpalignr m9, m8, m2, 12 + mova [dst24q+strideq+0 ], m7 ; 25 17 9 1 + mova [dst24q+strideq+32], m6 + mova [dst8q+strideq+0], m6 + mova [dst8q+strideq+32], m9 + vpalignr m6, m4, m1, 14 + vpalignr m7, m5, m0, 14 + vpalignr m9, m8, m2, 14 + mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0 + mova [dst24q+strideq*0+32], m6 + mova [dst8q+strideq*0+0], m6 + mova [dst8q+strideq*0+32], m9 + mova m0, m5 + mova m5, m1 + mova m1, m4 + mova m4, m2 + mova m2, m8 + mova m8, m3 + sub dst24q, stride7q + sub dst24q, strideq + sub dst8q, stride7q + sub dst8q, strideq + dec cntd + jg .loop + RET +%endif +%endif + +%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a + movifnidn aq, amp + movu m0, [aq] ; abcdefgh + psrldq m1, m0, 2 ; bcdefgh. + psrldq m2, m0, 4 ; cdefgh.. + LOWPASS 2, 1, 0 ; BCDEFGH. + pavgw m1, m0 ; ABCDEFG. 
+ DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+strideq*0], m1 + movh [dstq+strideq*1], m2 + psrldq m1, 2 + psrldq m2, 2 + movh [dstq+strideq*2], m1 + movh [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefgh +%if cpuflag(ssse3) + mova m3, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m3 ; bcdefghh/cdefghhh + LOWPASS 2, 1, 0 ; BCDEFGHh + pavgw m1, m0 ; ABCDEFGh + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + SHIFT_RIGHT m1, m1, m3 + SHIFT_RIGHT m2, m2, m3 + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] + mova m1, [aq+mmsize] + PALIGNR m2, m1, m0, 2, m3 + PALIGNR m3, m1, m0, 4, m4 + LOWPASS 3, 2, 0 + pavgw m2, m0 +%if cpuflag(ssse3) + mova m4, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m5, m0, m1, m4 + LOWPASS 0, 5, 1 + pavgw m1, m5 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m2 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*1+ 0], m3 + mova [dstq+strideq*1+16], m0 + lea dstq, [dstq+strideq*2] +%if cpuflag(avx) + vpalignr m2, m1, m2, 2 + vpalignr m3, m0, m3, 2 +%else + PALIGNR m5, m1, m2, 2, m4 + mova m2, m5 + PALIGNR m5, m0, m3, 2, m4 + mova m3, m5 +%endif + SHIFT_RIGHT m1, m1, m4 + SHIFT_RIGHT m0, m0, m4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0] + mova m1, [aq+mmsize*1] + mova m2, [aq+mmsize*2] + PALIGNR m6, m1, m0, 2, m5 + PALIGNR m7, m1, m0, 4, m5 + LOWPASS 7, 6, 0 + 
pavgw m6, m0 + SCRATCH 6, 8, rsp+0*mmsize + PALIGNR m4, m2, m1, 2, m0 + PALIGNR m5, m2, m1, 4, m0 + LOWPASS 5, 4, 1 + pavgw m4, m1 + mova m0, [aq+mmsize*3] + PALIGNR m1, m0, m2, 2, m6 + PALIGNR m3, m0, m2, 4, m6 + LOWPASS 3, 1, 2 + pavgw m2, m1 +%if cpuflag(ssse3) + PRELOAD 10, pb_2to15_14_15, shuf +%endif + SHIFT_RIGHTx2 m6, m1, m0, reg_shuf + LOWPASS 1, 6, 0 + pavgw m0, m6 +%if ARCH_X86_64 + pshufd m9, m6, q3333 +%endif +%if cpuflag(avx) + UNSCRATCH 6, 8, rsp+0*mmsize +%endif + DEFINE_ARGS dst, stride, cnt, stride16, stride17 + mov stride16q, strideq + mov cntd, 8 + shl stride16q, 4 + lea stride17q, [stride16q+strideq] + + ; FIXME m8 is unused for avx, so we could save one register here for win64 +.loop: +%if notcpuflag(avx) + UNSCRATCH 6, 8, rsp+0*mmsize +%endif + mova [dstq+strideq*0+ 0], m6 + mova [dstq+strideq*0+16], m4 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+ 0], m7 + mova [dstq+strideq*1+16], m5 + mova [dstq+strideq*1+32], m3 + mova [dstq+strideq*1+48], m1 + mova [dstq+stride16q+ 0], m4 + mova [dstq+stride16q+16], m2 + mova [dstq+stride16q+32], m0 +%if ARCH_X86_64 + mova [dstq+stride16q+48], m9 +%endif + mova [dstq+stride17q+ 0], m5 + mova [dstq+stride17q+16], m3 + mova [dstq+stride17q+32], m1 +%if ARCH_X86_64 + mova [dstq+stride17q+48], m9 +%endif + lea dstq, [dstq+strideq*2] +%if cpuflag(avx) + vpalignr m6, m4, m6, 2 + vpalignr m4, m2, m4, 2 + vpalignr m2, m0, m2, 2 + vpalignr m7, m5, m7, 2 + vpalignr m5, m3, m5, 2 + vpalignr m3, m1, m3, 2 +%else + SCRATCH 3, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + SCRATCH 1, 10, rsp+1*mmsize +%endif + PALIGNR m3, m4, m6, 2, m1 + mova m6, m3 + PALIGNR m3, m2, m4, 2, m1 + mova m4, m3 + PALIGNR m3, m0, m2, 2, m1 + mova m2, m3 + PALIGNR m3, m5, m7, 2, m1 + mova m7, m3 + UNSCRATCH 3, 8, rsp+0*mmsize + SCRATCH 6, 8, rsp+0*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 1, 10, rsp+1*mmsize + SCRATCH 7, 10, rsp+1*mmsize +%endif + PALIGNR m6, m3, m5, 2, m7 + mova m5, m6 + PALIGNR m6, m1, 
m3, 2, m7 + mova m3, m6 +%if notcpuflag(ssse3) + UNSCRATCH 7, 10, rsp+1*mmsize +%endif +%endif + SHIFT_RIGHT m1, m1, reg_shuf + SHIFT_RIGHT m0, m0, reg_shuf + dec cntd + jg .loop + +%if ARCH_X86_32 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] +%assign %%n 0 +%rep 4 + mova [dstq+strideq*0+48], m0 + mova [dstq+strideq*1+48], m0 + mova [dstq+strideq*2+48], m0 + mova [dstq+stride3q +48], m0 +%if %%n < 3 + lea dstq, [dstq+strideq*4] +%endif +%assign %%n (%%n+1) +%endrep +%endif + RET +%endmacro + +INIT_XMM sse2 +VL_FUNCS 2 +INIT_XMM ssse3 +VL_FUNCS 1 +INIT_XMM avx +VL_FUNCS 1 + +%macro VR_FUNCS 0 +cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a + movu m0, [aq-2] + movhps m1, [lq] + PALIGNR m0, m1, 10, m2 ; xyz*abcd + pslldq m1, m0, 2 ; .xyz*abc + pslldq m2, m0, 4 ; ..xyz*ab + LOWPASS 2, 1, 0 ; ..YZ#ABC + pavgw m1, m0 ; ....#ABC + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m2 + shufps m0, m2, m1, q3210 +%if cpuflag(ssse3) + pshufb m2, [pb_4_5_8to13_8x0] +%else + pshuflw m2, m2, q2222 + psrldq m2, 6 +%endif + psrldq m0, 6 + movh [dstq+strideq*2], m0 + movh [dstq+stride3q ], m2 + RET + +cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a + movu m1, [aq-2] ; *abcdefg + movu m2, [lq] ; stuvwxyz + mova m0, [aq] ; abcdefgh + PALIGNR m3, m1, m2, 14, m4 ; z*abcdef + LOWPASS 3, 1, 0 + pavgw m0, m1 + PALIGNR m1, m2, 2, m4 ; tuvwxyz* + pslldq m4, m2, 2 ; .stuvwxy + LOWPASS 4, 2, 1 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m1 + pslldq m4, 2 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m1 + pslldq m4, 2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m3 + PALIGNR m0, m4, 14, m1 + pslldq m4, 2 + PALIGNR m3, m4, 14, m4 + mova 
[dstq+strideq*2], m0 + mova [dstq+stride3q ], m3 + RET + +cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a + movu m1, [aq-2] ; *abcdefg + movu m2, [aq+mmsize-2] ; hijklmno + mova m3, [aq] ; abcdefgh + mova m4, [aq+mmsize] ; ijklmnop + mova m5, [lq+mmsize] ; stuvwxyz + PALIGNR m0, m1, m5, 14, m6 ; z*abcdef + movu m6, [aq+mmsize-4] ; ghijklmn + LOWPASS 6, 2, 4 + pavgw m2, m4 + LOWPASS 0, 1, 3 + pavgw m3, m1 + PALIGNR m1, m5, 2, m7 ; tuvwxyz* + movu m7, [lq+mmsize-2] ; rstuvwxy + LOWPASS 1, 5, 7 + movu m5, [lq+2] ; lmnopqrs + pslldq m4, m5, 2 ; .lmnopqr + pslldq m7, m5, 4 ; ..lmnopq + LOWPASS 5, 4, 7 + psrld m4, m1, 16 + psrld m7, m5, 16 + pand m1, [pd_65535] + pand m5, [pd_65535] + packssdw m7, m4 + packssdw m5, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 + +.loop: + mova [dstq+strideq*0+ 0], m3 + mova [dstq+strideq*0+16], m2 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+16], m6 + lea dstq, [dstq+strideq*2] + PALIGNR m2, m3, 14, m4 + PALIGNR m3, m7, 14, m4 + pslldq m7, 2 + PALIGNR m6, m0, 14, m4 + PALIGNR m0, m5, 14, m4 + pslldq m5, 2 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a + movu m0, [aq+mmsize*0-2] ; *a[0-6] + movu m1, [aq+mmsize*1-2] ; a[7-14] + movu m2, [aq+mmsize*2-2] ; a[15-22] + movu m3, [aq+mmsize*3-2] ; a[23-30] + mova m4, [aq+mmsize*3+0] ; a[24-31] + movu m5, [aq+mmsize*3-4] ; a[22-29] + LOWPASS 5, 3, 4 ; A[23-30] + SCRATCH 5, 8, rsp+0*mmsize + pavgw m3, m4 + mova m4, [aq+mmsize*2+0] ; a[16-23] + movu m6, [aq+mmsize*2-4] ; a[14-21] + LOWPASS 6, 2, 4 ; A[15-22] + SCRATCH 6, 9, rsp+1*mmsize + pavgw m2, m4 + mova m4, [aq+mmsize*1+0] ; a[8-15] + movu m7, [aq+mmsize*1-4] ; a[6-13] + LOWPASS 7, 1, 4 ; A[7-14] + SCRATCH 7, 10, rsp+2*mmsize + pavgw m1, m4 + mova m4, [aq+mmsize*0+0] ; a[0-7] + mova m5, [lq+mmsize*3+0] ; l[24-31] + PALIGNR m6, m0, m5, 14, m7 ; l[31]*a[0-5] + LOWPASS 6, 0, 4 ; #A[0-6] + SCRATCH 6, 11, rsp+3*mmsize + pavgw m4, m0 + PALIGNR m0, m5, 2, m7 ; 
l[25-31]* + movu m7, [lq+mmsize*3-2] ; l[23-30] + LOWPASS 0, 5, 7 ; L[24-31] + movu m5, [lq+mmsize*2-2] ; l[15-22] + mova m7, [lq+mmsize*2+0] ; l[16-23] + movu m6, [lq+mmsize*2+2] ; l[17-24] + LOWPASS 5, 7, 6 ; L[16-23] + psrld m7, m0, 16 + psrld m6, m5, 16 + pand m0, [pd_65535] + pand m5, [pd_65535] + packssdw m6, m7 + packssdw m5, m0 + SCRATCH 5, 12, rsp+4*mmsize + SCRATCH 6, 13, rsp+5*mmsize + movu m6, [lq+mmsize*1-2] ; l[7-14] + mova m0, [lq+mmsize*1+0] ; l[8-15] + movu m5, [lq+mmsize*1+2] ; l[9-16] + LOWPASS 6, 0, 5 ; L[8-15] + movu m0, [lq+mmsize*0+2] ; l[1-8] + pslldq m5, m0, 2 ; .l[1-7] + pslldq m7, m0, 4 ; ..l[1-6] + LOWPASS 0, 5, 7 + psrld m5, m6, 16 + psrld m7, m0, 16 + pand m6, [pd_65535] + pand m0, [pd_65535] + packssdw m7, m5 + packssdw m0, m6 + UNSCRATCH 6, 13, rsp+5*mmsize + DEFINE_ARGS dst, stride, stride16, cnt, stride17 + mov stride16q, strideq + mov cntd, 8 + shl stride16q, 4 +%if ARCH_X86_64 + lea stride17q, [stride16q+strideq] +%endif + +.loop: + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m1 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m3 +%if ARCH_X86_64 + mova [dstq+strideq*1+ 0], m11 + mova [dstq+strideq*1+16], m10 + mova [dstq+strideq*1+32], m9 + mova [dstq+strideq*1+48], m8 +%endif + mova [dstq+stride16q+ 0], m6 + mova [dstq+stride16q+16], m4 + mova [dstq+stride16q+32], m1 + mova [dstq+stride16q+48], m2 +%if ARCH_X86_64 + mova [dstq+stride17q+ 0], m12 + mova [dstq+stride17q+16], m11 + mova [dstq+stride17q+32], m10 + mova [dstq+stride17q+48], m9 +%endif + lea dstq, [dstq+strideq*2] + PALIGNR m3, m2, 14, m5 + PALIGNR m2, m1, 14, m5 + PALIGNR m1, m4, 14, m5 + PALIGNR m4, m6, 14, m5 + PALIGNR m6, m7, 14, m5 + pslldq m7, 2 +%if ARCH_X86_64 + PALIGNR m8, m9, 14, m5 + PALIGNR m9, m10, 14, m5 + PALIGNR m10, m11, 14, m5 + PALIGNR m11, m12, 14, m5 + PALIGNR m12, m0, 14, m5 + pslldq m0, 2 +%endif + dec cntd + jg .loop + +%if ARCH_X86_32 + UNSCRATCH 5, 12, rsp+4*mmsize + UNSCRATCH 4, 11, rsp+3*mmsize + UNSCRATCH 3, 10, 
rsp+2*mmsize + UNSCRATCH 2, 9, rsp+1*mmsize + UNSCRATCH 1, 8, rsp+0*mmsize + mov dstq, dstm + mov cntd, 8 + add dstq, strideq +.loop2: + mova [dstq+strideq*0+ 0], m4 + mova [dstq+strideq*0+16], m3 + mova [dstq+strideq*0+32], m2 + mova [dstq+strideq*0+48], m1 + mova [dstq+stride16q+ 0], m5 + mova [dstq+stride16q+16], m4 + mova [dstq+stride16q+32], m3 + mova [dstq+stride16q+48], m2 + lea dstq, [dstq+strideq*2] + PALIGNR m1, m2, 14, m6 + PALIGNR m2, m3, 14, m6 + PALIGNR m3, m4, 14, m6 + PALIGNR m4, m5, 14, m6 + PALIGNR m5, m0, 14, m6 + pslldq m0, 2 + dec cntd + jg .loop2 +%endif + RET +%endmacro + +INIT_XMM sse2 +VR_FUNCS +INIT_XMM ssse3 +VR_FUNCS +INIT_XMM avx +VR_FUNCS + +%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function +cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a + movh m0, [lq] ; abcd +%if cpuflag(ssse3) + pshufb m0, [pb_0to7_67x4] ; abcddddd +%else + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 ; abcddddd +%endif + psrldq m1, m0, 2 ; bcddddd. + psrldq m2, m0, 4 ; cddddd.. + LOWPASS 2, 1, 0 ; BCDddd.. 
+ pavgw m1, m0 ; abcddddd + SBUTTERFLY wd, 1, 2, 0 ; aBbCcDdd, dddddddd + PALIGNR m2, m1, 4, m0 ; bCcDdddd + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+strideq*0], m1 ; aBbC + movh [dstq+strideq*1], m2 ; bCcD + movhps [dstq+strideq*2], m1 ; cDdd + movhps [dstq+stride3q ], m2 ; dddd + RET + +cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a + mova m0, [lq] +%if cpuflag(ssse3) + mova m3, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m2, m0, m3 + LOWPASS 2, 1, 0 + pavgw m1, m0 + SBUTTERFLY wd, 1, 2, 0 + shufps m0, m1, m2, q1032 + pshufd m3, m2, q3332 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq *0], m1 + mova [dstq+strideq *2], m0 + mova [dstq+strideq *4], m2 + mova [dstq+stride3q*2], m3 + add dstq, strideq +%if cpuflag(avx) + vpalignr m1, m2, m1, 4 +%else + PALIGNR m0, m2, m1, 4, m3 + mova m1, m0 +%endif + pshufd m2, m2, q3321 + shufps m0, m1, m2, q1032 + pshufd m3, m2, q3332 + mova [dstq+strideq *0], m1 + mova [dstq+strideq *2], m0 + mova [dstq+strideq *4], m2 + mova [dstq+stride3q*2], m3 + RET + +cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a + mova m0, [lq] + mova m3, [lq+mmsize] + movu m1, [lq+2] + movu m2, [lq+4] + LOWPASS 2, 1, 0 + pavgw m1, m0 + SBUTTERFLY wd, 1, 2, 0 +%if cpuflag(ssse3) + mova m5, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m0, m4, m3, m5 + LOWPASS 4, 0, 3 + pavgw m3, m0 + SBUTTERFLY wd, 3, 4, 5 + pshufd m0, m0, q3333 + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq *0+ 0], m1 + mova [dstq+strideq *0+16], m2 + mova [dstq+strideq *4+ 0], m2 + mova [dstq+strideq *4+16], m3 + mova [dstq+strideq *8+ 0], m3 + mova [dstq+strideq *8+16], m4 + mova [dstq+stride3q*4+ 0], m4 + mova [dstq+stride3q*4+16], m0 + add dstq, strideq +%if cpuflag(avx) + vpalignr m1, m2, m1, 4 + vpalignr m2, m3, m2, 4 + vpalignr m3, m4, m3, 4 + vpalignr m4, m0, m4, 4 +%else + PALIGNR m5, m2, m1, 4, m6 + mova 
m1, m5 + PALIGNR m5, m3, m2, 4, m6 + mova m2, m5 + PALIGNR m5, m4, m3, 4, m6 + mova m3, m5 + PALIGNR m5, m0, m4, 4, m6 + mova m4, m5 +%endif + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \ + %1 * -mmsize * ARCH_X86_32, dst, stride, l, a + mova m2, [lq+mmsize*0+0] + movu m1, [lq+mmsize*0+2] + movu m0, [lq+mmsize*0+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + SBUTTERFLY wd, 1, 0, 2 + SCRATCH 1, 8, rsp+0*mmsize + mova m4, [lq+mmsize*1+0] + movu m3, [lq+mmsize*1+2] + movu m2, [lq+mmsize*1+4] + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 3, 2, 4 + mova m6, [lq+mmsize*2+0] + movu m5, [lq+mmsize*2+2] + movu m4, [lq+mmsize*2+4] + LOWPASS 4, 5, 6 + pavgw m5, m6 + SBUTTERFLY wd, 5, 4, 6 + mova m7, [lq+mmsize*3+0] + SCRATCH 0, 9, rsp+1*mmsize +%if cpuflag(ssse3) + mova m0, [pb_2to15_14_15] +%endif + SHIFT_RIGHTx2 m1, m6, m7, m0 + LOWPASS 6, 1, 7 + pavgw m7, m1 + SBUTTERFLY wd, 7, 6, 0 + pshufd m1, m1, q3333 + UNSCRATCH 0, 9, rsp+1*mmsize + DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 + lea stride3q, [strideq*3] + lea stride4q, [strideq*4] + lea stride28q, [stride4q*8] + lea stride20q, [stride4q*5] + sub stride28q, stride4q + mov cntd, 4 + +.loop: +%if ARCH_X86_64 + SWAP 1, 8 +%else + mova [rsp+1*mmsize], m1 + mova m1, [rsp+0*mmsize] +%endif + mova [dstq+strideq *0+ 0], m1 + mova [dstq+strideq *0+16], m0 + mova [dstq+strideq *0+32], m3 + mova [dstq+strideq *0+48], m2 + mova [dstq+stride4q*1+ 0], m0 + mova [dstq+stride4q*1+16], m3 + mova [dstq+stride4q*1+32], m2 + mova [dstq+stride4q*1+48], m5 + mova [dstq+stride4q*2+ 0], m3 + mova [dstq+stride4q*2+16], m2 + mova [dstq+stride4q*2+32], m5 + mova [dstq+stride4q*2+48], m4 +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 +%else + SCRATCH 6, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + SCRATCH 7, 10, rsp+3*mmsize +%endif + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, 
m7 + mova m3, m6 + UNSCRATCH 6, 9, rsp+2*mmsize + SCRATCH 0, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 7, 10, rsp+3*mmsize + SCRATCH 3, 10, rsp+3*mmsize +%endif +%endif +%if ARCH_X86_64 + SWAP 1, 8 +%else + mova [rsp+0*mmsize], m1 + mova m1, [rsp+1*mmsize] +%endif + mova [dstq+stride3q*4+ 0], m2 + mova [dstq+stride3q*4+16], m5 + mova [dstq+stride3q*4+32], m4 + mova [dstq+stride3q*4+48], m7 + mova [dstq+stride4q*4+ 0], m5 + mova [dstq+stride4q*4+16], m4 + mova [dstq+stride4q*4+32], m7 + mova [dstq+stride4q*4+48], m6 + mova [dstq+stride20q + 0], m4 + mova [dstq+stride20q +16], m7 + mova [dstq+stride20q +32], m6 + mova [dstq+stride20q +48], m1 + mova [dstq+stride3q*8+ 0], m7 + mova [dstq+stride3q*8+16], m6 + mova [dstq+stride3q*8+32], m1 + mova [dstq+stride3q*8+48], m1 + mova [dstq+stride28q + 0], m6 + mova [dstq+stride28q +16], m1 + mova [dstq+stride28q +32], m1 + mova [dstq+stride28q +48], m1 +%if cpuflag(avx) + vpalignr m2, m5, m2, 4 + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 + vpalignr m6, m1, m6, 4 +%else + PALIGNR m0, m5, m2, 4, m3 + mova m2, m0 + PALIGNR m0, m4, m5, 4, m3 + mova m5, m0 + PALIGNR m0, m7, m4, 4, m3 + mova m4, m0 + PALIGNR m0, m6, m7, 4, m3 + mova m7, m0 + PALIGNR m0, m1, m6, 4, m3 + mova m6, m0 + UNSCRATCH 0, 9, rsp+2*mmsize +%if notcpuflag(ssse3) + UNSCRATCH 3, 10, rsp+3*mmsize +%endif +%endif + add dstq, strideq + dec cntd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +HU_FUNCS 4 +INIT_XMM ssse3 +HU_FUNCS 3 +INIT_XMM avx +HU_FUNCS 2 + +%macro HD_FUNCS 0 +cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a + movh m0, [lq] + movhps m0, [aq-2] + psrldq m1, m0, 2 + psrldq m2, m0, 4 + LOWPASS 2, 1, 0 + pavgw m1, m0 + punpcklwd m1, m2 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movh [dstq+stride3q ], m1 + movhps [dstq+strideq*1], m1 + movhlps m2, m2 + PALIGNR m2, m1, 4, m0 + movh [dstq+strideq*2], m2 + movhps [dstq+strideq*0], m2 + RET + +cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, 
stride, l, a + mova m0, [lq] + movu m1, [aq-2] + PALIGNR m2, m1, m0, 2, m3 + PALIGNR m3, m1, m0, 4, m4 + LOWPASS 3, 2, 0 + pavgw m2, m0 + SBUTTERFLY wd, 2, 3, 0 + psrldq m0, m1, 2 + psrldq m4, m1, 4 + LOWPASS 1, 0, 4 + DEFINE_ARGS dst8, mstride, cnt + lea dst8q, [dst8q+mstrideq*8] + neg mstrideq + mov cntd, 4 + +.loop: + add dst8q, mstrideq + mova [dst8q+mstrideq*0], m2 + mova [dst8q+mstrideq*4], m3 +%if cpuflag(avx) + vpalignr m2, m3, m2, 4 + vpalignr m3, m1, m3, 4 +%else + PALIGNR m0, m3, m2, 4, m4 + mova m2, m0 + PALIGNR m0, m1, m3, 4, m4 + mova m3, m0 +%endif + psrldq m1, 4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a + mova m2, [lq] + movu m1, [lq+2] + movu m0, [lq+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + mova m4, [lq+mmsize] + movu m5, [aq-2] + PALIGNR m3, m5, m4, 2, m6 + PALIGNR m2, m5, m4, 4, m6 + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 1, 0, 4 + SBUTTERFLY wd, 3, 2, 4 + mova m6, [aq] + movu m4, [aq+2] + LOWPASS 4, 6, 5 + movu m5, [aq+mmsize-2] + psrldq m6, m5, 2 + psrldq m7, m5, 4 + LOWPASS 5, 6, 7 + DEFINE_ARGS dst, mstride, mstride3, cnt + lea dstq, [dstq+mstrideq*8] + lea dstq, [dstq+mstrideq*8] + neg mstrideq + lea mstride3q, [mstrideq*3] + mov cntd, 4 + +.loop: + add dstq, mstrideq + mova [dstq+mstride3q*4+ 0], m2 + mova [dstq+mstride3q*4+16], m4 + mova [dstq+mstrideq *8+ 0], m3 + mova [dstq+mstrideq *8+16], m2 + mova [dstq+mstrideq *4+ 0], m0 + mova [dstq+mstrideq *4+16], m3 + mova [dstq+mstrideq *0+ 0], m1 + mova [dstq+mstrideq *0+16], m0 +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 + vpalignr m2, m4, m2, 4 + vpalignr m4, m5, m4, 4 +%else + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + PALIGNR m6, m4, m2, 4, m7 + mova m2, m6 + PALIGNR m6, m5, m4, 4, m7 + mova m4, m6 +%endif + psrldq m5, 4 + dec cntd + jg .loop + RET + +cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 
14, \ + 10 * -mmsize * ARCH_X86_32, dst, stride, l, a + mova m2, [lq+mmsize*0+0] + movu m1, [lq+mmsize*0+2] + movu m0, [lq+mmsize*0+4] + LOWPASS 0, 1, 2 + pavgw m1, m2 + SBUTTERFLY wd, 1, 0, 2 + mova m4, [lq+mmsize*1+0] + movu m3, [lq+mmsize*1+2] + movu m2, [lq+mmsize*1+4] + LOWPASS 2, 3, 4 + pavgw m3, m4 + SBUTTERFLY wd, 3, 2, 4 + SCRATCH 0, 8, rsp+0*mmsize + SCRATCH 1, 9, rsp+1*mmsize + SCRATCH 2, 10, rsp+2*mmsize + SCRATCH 3, 11, rsp+3*mmsize + mova m6, [lq+mmsize*2+0] + movu m5, [lq+mmsize*2+2] + movu m4, [lq+mmsize*2+4] + LOWPASS 4, 5, 6 + pavgw m5, m6 + SBUTTERFLY wd, 5, 4, 6 + mova m0, [lq+mmsize*3+0] + movu m1, [aq+mmsize*0-2] + PALIGNR m7, m1, m0, 2, m2 + PALIGNR m6, m1, m0, 4, m2 + LOWPASS 6, 7, 0 + pavgw m7, m0 + SBUTTERFLY wd, 7, 6, 0 + mova m2, [aq+mmsize*0+0] + movu m0, [aq+mmsize*0+2] + LOWPASS 0, 2, 1 + movu m1, [aq+mmsize*1-2] + mova m2, [aq+mmsize*1+0] + movu m3, [aq+mmsize*1+2] + LOWPASS 1, 2, 3 + SCRATCH 6, 12, rsp+6*mmsize + SCRATCH 7, 13, rsp+7*mmsize + movu m2, [aq+mmsize*2-2] + mova m3, [aq+mmsize*2+0] + movu m6, [aq+mmsize*2+2] + LOWPASS 2, 3, 6 + movu m3, [aq+mmsize*3-2] + psrldq m6, m3, 2 + psrldq m7, m3, 4 + LOWPASS 3, 6, 7 + UNSCRATCH 6, 12, rsp+6*mmsize + UNSCRATCH 7, 13, rsp+7*mmsize +%if ARCH_X86_32 + mova [rsp+4*mmsize], m4 + mova [rsp+5*mmsize], m5 + ; we already backed up m6/m7 earlier on x86-32 in SCRATCH, so we don't need + ; to do it again here +%endif + DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28 + mov cntd, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + lea stride4q, [strideq*4] + lea stride28q, [stride4q*8] + lea stride20q, [stride4q*5] + sub stride28q, stride4q +%endif + add dstq, stride3q + + ; x86-32 doesn't have enough registers, so on that platform, we split + ; the loop in 2... 
Otherwise you spend most of the loop (un)scratching +.loop: +%if ARCH_X86_64 + mova [dstq+stride28q + 0], m9 + mova [dstq+stride28q +16], m8 + mova [dstq+stride28q +32], m11 + mova [dstq+stride28q +48], m10 + mova [dstq+stride3q*8+ 0], m8 + mova [dstq+stride3q*8+16], m11 + mova [dstq+stride3q*8+32], m10 + mova [dstq+stride3q*8+48], m5 + mova [dstq+stride20q + 0], m11 + mova [dstq+stride20q +16], m10 + mova [dstq+stride20q +32], m5 + mova [dstq+stride20q +48], m4 + mova [dstq+stride4q*4+ 0], m10 + mova [dstq+stride4q*4+16], m5 + mova [dstq+stride4q*4+32], m4 + mova [dstq+stride4q*4+48], m7 +%endif + mova [dstq+stride3q*4+ 0], m5 + mova [dstq+stride3q*4+16], m4 + mova [dstq+stride3q*4+32], m7 + mova [dstq+stride3q*4+48], m6 + mova [dstq+strideq* 8+ 0], m4 + mova [dstq+strideq* 8+16], m7 + mova [dstq+strideq* 8+32], m6 + mova [dstq+strideq* 8+48], m0 + mova [dstq+strideq* 4+ 0], m7 + mova [dstq+strideq* 4+16], m6 + mova [dstq+strideq* 4+32], m0 + mova [dstq+strideq* 4+48], m1 + mova [dstq+strideq* 0+ 0], m6 + mova [dstq+strideq* 0+16], m0 + mova [dstq+strideq* 0+32], m1 + mova [dstq+strideq* 0+48], m2 + sub dstq, strideq +%if cpuflag(avx) +%if ARCH_X86_64 + vpalignr m9, m8, m9, 4 + vpalignr m8, m11, m8, 4 + vpalignr m11, m10, m11, 4 + vpalignr m10, m5, m10, 4 +%endif + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 + vpalignr m6, m0, m6, 4 + vpalignr m0, m1, m0, 4 + vpalignr m1, m2, m1, 4 + vpalignr m2, m3, m2, 4 +%else +%if ARCH_X86_64 + PALIGNR m12, m8, m9, 4, m13 + mova m9, m12 + PALIGNR m12, m11, m8, 4, m13 + mova m8, m12 + PALIGNR m12, m10, m11, 4, m13 + mova m11, m12 + PALIGNR m12, m5, m10, 4, m13 + mova m10, m12 +%endif + SCRATCH 3, 12, rsp+8*mmsize, sh +%if notcpuflag(ssse3) + SCRATCH 2, 13, rsp+9*mmsize +%endif + PALIGNR m3, m4, m5, 4, m2 + mova m5, m3 + PALIGNR m3, m7, m4, 4, m2 + mova m4, m3 + PALIGNR m3, m6, m7, 4, m2 + mova m7, m3 + PALIGNR m3, m0, m6, 4, m2 + mova m6, m3 + PALIGNR m3, m1, m0, 4, m2 + mova m0, m3 +%if 
notcpuflag(ssse3) + UNSCRATCH 2, 13, rsp+9*mmsize + SCRATCH 0, 13, rsp+9*mmsize +%endif + PALIGNR m3, m2, m1, 4, m0 + mova m1, m3 + PALIGNR m3, reg_sh, m2, 4, m0 + mova m2, m3 +%if notcpuflag(ssse3) + UNSCRATCH 0, 13, rsp+9*mmsize +%endif + UNSCRATCH 3, 12, rsp+8*mmsize, sh +%endif + psrldq m3, 4 + dec cntd + jg .loop + +%if ARCH_X86_32 + UNSCRATCH 0, 8, rsp+0*mmsize + UNSCRATCH 1, 9, rsp+1*mmsize + UNSCRATCH 2, 10, rsp+2*mmsize + UNSCRATCH 3, 11, rsp+3*mmsize + mova m4, [rsp+4*mmsize] + mova m5, [rsp+5*mmsize] + mova m6, [rsp+6*mmsize] + mova m7, [rsp+7*mmsize] + DEFINE_ARGS dst, stride, stride5, stride3 + lea stride5q, [strideq*5] + lea dstq, [dstq+stride5q*4] + DEFINE_ARGS dst, stride, cnt, stride3 + mov cntd, 4 +.loop_2: + mova [dstq+stride3q*4+ 0], m1 + mova [dstq+stride3q*4+16], m0 + mova [dstq+stride3q*4+32], m3 + mova [dstq+stride3q*4+48], m2 + mova [dstq+strideq* 8+ 0], m0 + mova [dstq+strideq* 8+16], m3 + mova [dstq+strideq* 8+32], m2 + mova [dstq+strideq* 8+48], m5 + mova [dstq+strideq* 4+ 0], m3 + mova [dstq+strideq* 4+16], m2 + mova [dstq+strideq* 4+32], m5 + mova [dstq+strideq* 4+48], m4 + mova [dstq+strideq* 0+ 0], m2 + mova [dstq+strideq* 0+16], m5 + mova [dstq+strideq* 0+32], m4 + mova [dstq+strideq* 0+48], m7 + sub dstq, strideq +%if cpuflag(avx) + vpalignr m1, m0, m1, 4 + vpalignr m0, m3, m0, 4 + vpalignr m3, m2, m3, 4 + vpalignr m2, m5, m2, 4 + vpalignr m5, m4, m5, 4 + vpalignr m4, m7, m4, 4 + vpalignr m7, m6, m7, 4 +%else + SCRATCH 6, 12, rsp+8*mmsize, sh +%if notcpuflag(ssse3) + SCRATCH 7, 13, rsp+9*mmsize +%endif + PALIGNR m6, m0, m1, 4, m7 + mova m1, m6 + PALIGNR m6, m3, m0, 4, m7 + mova m0, m6 + PALIGNR m6, m2, m3, 4, m7 + mova m3, m6 + PALIGNR m6, m5, m2, 4, m7 + mova m2, m6 + PALIGNR m6, m4, m5, 4, m7 + mova m5, m6 +%if notcpuflag(ssse3) + UNSCRATCH 7, 13, rsp+9*mmsize + SCRATCH 5, 13, rsp+9*mmsize +%endif + PALIGNR m6, m7, m4, 4, m5 + mova m4, m6 + PALIGNR m6, reg_sh, m7, 4, m5 + mova m7, m6 +%if notcpuflag(ssse3) + UNSCRATCH 5, 13, 
rsp+9*mmsize +%endif + UNSCRATCH 6, 12, rsp+8*mmsize, sh +%endif + psrldq m6, 4 + dec cntd + jg .loop_2 +%endif + RET +%endmacro + +INIT_XMM sse2 +HD_FUNCS +INIT_XMM ssse3 +HD_FUNCS +INIT_XMM avx +HD_FUNCS diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm.asm b/media/ffvpx/libavcodec/x86/vp9itxfm.asm new file mode 100644 index 0000000000..2c63fe514a --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9itxfm.asm @@ -0,0 +1,3197 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2013 Clément Bœsch <u pkh me> +;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" +%include "vp9itxfm_template.asm" + +SECTION_RODATA 32 + +%macro VP9_IDCT_COEFFS 2-3 0 +const pw_m%1_%2 +times 8 dw -%1, %2 +const pw_%2_%1 +times 8 dw %2, %1 + +%if %3 == 1 +const pw_m%2_m%1 +times 8 dw -%2, -%1 +%if %1 != %2 +const pw_m%2_%1 +times 8 dw -%2, %1 +const pw_%1_%2 +times 8 dw %1, %2 +%endif +%endif + +%if %1 < 11585 +pw_m%1x2: times 16 dw -%1*2 +%elif %1 > 11585 +pw_%1x2: times 16 dw %1*2 +%else +const pw_%1x2 +times 16 dw %1*2 +%endif + +%if %2 != %1 +pw_%2x2: times 16 dw %2*2 +%endif +%endmacro + +VP9_IDCT_COEFFS 16364, 804 +VP9_IDCT_COEFFS 16305, 1606 +VP9_IDCT_COEFFS 16069, 3196, 1 +VP9_IDCT_COEFFS 15893, 3981 +VP9_IDCT_COEFFS 15137, 6270, 1 +VP9_IDCT_COEFFS 14811, 7005 +VP9_IDCT_COEFFS 14449, 7723 +VP9_IDCT_COEFFS 13160, 9760 +VP9_IDCT_COEFFS 11585, 11585, 1 +VP9_IDCT_COEFFS 11003, 12140 +VP9_IDCT_COEFFS 10394, 12665 +VP9_IDCT_COEFFS 9102, 13623, 1 +VP9_IDCT_COEFFS 8423, 14053 +VP9_IDCT_COEFFS 5520, 15426 +VP9_IDCT_COEFFS 4756, 15679 +VP9_IDCT_COEFFS 2404, 16207 + +const pw_5283_13377 +times 4 dw 5283, 13377 +const pw_9929_13377 +times 4 dw 9929, 13377 +const pw_15212_m13377 +times 4 dw 15212, -13377 +const pw_15212_9929 +times 4 dw 15212, 9929 +const pw_m5283_m15212 +times 4 dw -5283, -15212 +const pw_13377x2 +times 8 dw 13377*2 +const pw_m13377_13377 +times 4 dw -13377, 13377 +const pw_13377_0 +times 4 dw 13377, 0 + +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_512 +cextern pw_1024 +cextern pw_2048 +cextern pw_m1 +cextern pd_8192 + +SECTION .text + +%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2 + punpckhwd m%4, m%2, m%1 + punpcklwd m%2, m%1 + pmaddwd m%3, m%4, 
[pw_m%5_%6] + pmaddwd m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_m%5_%6] + pmaddwd m%2, [pw_%6_%5] +%endmacro + +%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round + SUMSUB_BA d, %1, %2, %5 + SUMSUB_BA d, %3, %4, %5 + paddd m%1, %6 + paddd m%2, %6 + paddd m%3, %6 + paddd m%4, %6 + psrad m%1, 14 + psrad m%2, 14 + psrad m%3, 14 + psrad m%4, 14 + packssdw m%1, m%3 + packssdw m%2, m%4 +%endmacro + +%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst +%if mmsize == 32 + pmovzxbw m%3, [%6] + pmovzxbw m%4, [%6+strideq] +%else + movh m%3, [%6] + movh m%4, [%6+strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 +%endif + paddw m%3, m%1 + paddw m%4, m%2 +%if mmsize == 32 + packuswb m%3, m%4 + ; Intel... + vpermq m%3, m%3, q3120 + mova [%6], xm%3 + vextracti128 [%6+strideq], m%3, 1 +%elif mmsize == 16 + packuswb m%3, m%4 + movh [%6], m%3 + movhps [%6+strideq], m%3 +%else + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [%6], m%3 + movh [%6+strideq], m%4 +%endif +%endmacro + +%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg +%assign %%y 0 +%rep %3 +%assign %%x 0 +%rep %3*2/mmsize + mova [%1+%%y+%%x], %4 +%assign %%x (%%x+mmsize) +%endrep +%assign %%y (%%y+%2) +%endrep +%endmacro + +;------------------------------------------------------------------------------------------- +; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +INIT_MMX mmx +cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob + mova m0, [blockq+0*8] + mova m1, [blockq+1*8] + mova m2, [blockq+2*8] + mova m3, [blockq+3*8] + psraw m0, 2 + psraw m1, 2 + psraw m2, 2 + psraw m3, 2 + + VP9_IWHT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IWHT4_1D + + pxor m4, m4 + VP9_STORE_2X 0, 1, 5, 6, 4 + lea dstq, [dstq+strideq*2] + VP9_STORE_2X 2, 3, 5, 6, 4 + ZERO_BLOCK blockq, 8, 4, m4 + RET + 
+;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +; 2x2 top left corner +%macro VP9_IDCT4_2x2_1D 0 + pmulhrsw m0, m5 ; m0=t1 + mova m2, m0 ; m2=t0 + mova m3, m1 + pmulhrsw m1, m6 ; m1=t2 + pmulhrsw m3, m7 ; m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +%macro VP9_IDCT4_WRITEOUT 0 +%if cpuflag(ssse3) + mova m5, [pw_2048] + pmulhrsw m0, m5 ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 + pmulhrsw m1, m5 +%else + mova m5, [pw_8] + paddw m0, m5 + paddw m1, m5 + psraw m0, 4 + psraw m1, 4 +%endif + VP9_STORE_2X 0, 1, 6, 7, 4 + lea dstq, [dstq+2*strideq] +%if cpuflag(ssse3) + pmulhrsw m2, m5 + pmulhrsw m3, m5 +%else + paddw m2, m5 + paddw m3, m5 + psraw m2, 4 + psraw m3, 4 +%endif + VP9_STORE_2X 2, 3, 6, 7, 4 +%endmacro + +%macro IDCT_4x4_FN 1 +INIT_MMX %1 +cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob + +%if cpuflag(ssse3) + cmp eobd, 4 ; 2x2 or smaller + jg .idctfull + + cmp eobd, 1 ; faster path for when only DC is set + jne .idct2x2 +%else + cmp eobd, 1 + jg .idctfull +%endif + +%if cpuflag(ssse3) + movd m0, [blockq] + mova m5, [pw_11585x2] + pmulhrsw m0, m5 + pmulhrsw m0, m5 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (8 << 14) + 8192 + sar coefd, 14 + 4 + movd m0, coefd +%endif + pshufw m0, m0, 0 + pxor m4, m4 + movh [blockq], m4 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 +%endif + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4 + RET + +%if cpuflag(ssse3) +; faster path for when only top left 2x2 block is set +.idct2x2: + movd m0, [blockq+0] + movd m1, [blockq+8] + mova m5, [pw_11585x2] + mova m6, [pw_6270x2] + mova m7, [pw_15137x2] + 
VP9_IDCT4_2x2_1D + ; partial 2x4 transpose + punpcklwd m0, m1 + punpcklwd m2, m3 + SBUTTERFLY dq, 0, 2, 1 + SWAP 1, 2 + VP9_IDCT4_2x2_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + movh [blockq+ 0], m4 + movh [blockq+ 8], m4 + VP9_IDCT4_WRITEOUT + RET +%endif + +.idctfull: ; generic full 4x4 idct/idct + mova m0, [blockq+ 0] + mova m1, [blockq+ 8] + mova m2, [blockq+16] + mova m3, [blockq+24] +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif + mova m7, [pd_8192] ; rounding + VP9_IDCT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 8], m4 + mova [blockq+16], m4 + mova [blockq+24], m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +IDCT_4x4_FN mmxext +IDCT_4x4_FN ssse3 + +;------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro IADST4_FN 5 +INIT_MMX %5 +cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob +%if WIN64 && notcpuflag(ssse3) + WIN64_SPILL_XMM 8 +%endif + movdqa xmm5, [pd_8192] + mova m0, [blockq+ 0] + mova m1, [blockq+ 8] + mova m2, [blockq+16] + mova m3, [blockq+24] +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif +%ifnidn %1%3, iadstiadst + movdq2q m7, xmm5 +%endif + VP9_%2_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_%4_1D + pxor m4, m4 ; used for the block reset, and VP9_STORE_2X + mova [blockq+ 0], m4 + mova [blockq+ 8], m4 + mova [blockq+16], m4 + mova [blockq+24], m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +IADST4_FN idct, IDCT4, iadst, IADST4, sse2 +IADST4_FN iadst, IADST4, idct, IDCT4, sse2 +IADST4_FN iadst, IADST4, iadst, IADST4, sse2 + +IADST4_FN idct, IDCT4, iadst, IADST4, ssse3 +IADST4_FN iadst, IADST4, idct, IDCT4, ssse3 +IADST4_FN iadst, IADST4, iadst, IADST4, ssse3 + +%macro SCRATCH 3 +%if 
ARCH_X86_64 + SWAP %1, %2 +%else + mova [%3], m%1 +%endif +%endmacro + +%macro UNSCRATCH 3 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%endmacro + +;------------------------------------------------------------------------------------------- +; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;------------------------------------------------------------------------------------------- + +%macro VP9_IDCT8_1D_FINALIZE 0 + SUMSUB_BA w, 3, 6, 5 ; m3=t0+t7, m6=t0-t7 + SUMSUB_BA w, 1, 2, 5 ; m1=t1+t6, m2=t1-t6 + SUMSUB_BA w, 7, 0, 5 ; m7=t2+t5, m0=t2-t5 + + UNSCRATCH 5, 8, blockq+ 0 + SCRATCH 2, 8, blockq+ 0 + + SUMSUB_BA w, 5, 4, 2 ; m5=t3+t4, m4=t3-t4 + SWAP 7, 6, 2 + SWAP 3, 5, 0 + +%if ARCH_X86_64 + SWAP 6, 8 +%endif +%endmacro + +; x86-32 +; - in: m0/m4 is in mem +; - out: m6 is in mem +; x86-64: +; - everything is in registers (m0-7) +%macro VP9_IDCT8_1D 0 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 4, 9 +%endif + + VP9_UNPACK_MULSUB_2W_4X 5, 3, 9102, 13623, D_8192_REG, 0, 4 ; m5=t5a, m3=t6a + VP9_UNPACK_MULSUB_2W_4X 1, 7, 16069, 3196, D_8192_REG, 0, 4 ; m1=t4a, m7=t7a + SUMSUB_BA w, 5, 1, 0 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a) + SUMSUB_BA w, 3, 7, 0 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) +%if cpuflag(ssse3) + SUMSUB_BA w, 1, 7, 0 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) + pmulhrsw m1, W_11585x2_REG ; m1=t6 + pmulhrsw m7, W_11585x2_REG ; m7=t5 +%else + VP9_UNPACK_MULSUB_2W_4X 7, 1, 11585, 11585, D_8192_REG, 0, 4 +%endif + VP9_UNPACK_MULSUB_2W_4X 2, 6, 15137, 6270, D_8192_REG, 0, 4 ; m2=t2a, m6=t3a + + UNSCRATCH 0, 8, blockq+ 0 ; IN(0) + UNSCRATCH 4, 9, blockq+64 ; IN(4) + SCRATCH 5, 8, blockq+ 0 + +%if cpuflag(ssse3) + SUMSUB_BA w, 4, 0, 5 ; m4=IN(0)+IN(4) m0=IN(0)-IN(4) + pmulhrsw m4, W_11585x2_REG ; m4=t0a + pmulhrsw m0, W_11585x2_REG ; m0=t1a +%else + SCRATCH 7, 9, blockq+64 + VP9_UNPACK_MULSUB_2W_4X 0, 4, 11585, 11585, D_8192_REG, 5, 7 + UNSCRATCH 7, 9, blockq+64 +%endif + SUMSUB_BA w, 6, 4, 5 ; m6=t0a+t3a (t0), 
m4=t0a-t3a (t3) + SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_4x4_1D 0 + pmulhrsw m0, W_11585x2_REG ; m0=t1a/t0a + pmulhrsw m6, m2, [pw_15137x2] ; m6=t3a + pmulhrsw m2, [pw_6270x2] ; m2=t2a + pmulhrsw m7, m1, [pw_16069x2] ; m7=t7a + pmulhrsw m1, [pw_3196x2] ; m1=t4a + pmulhrsw m5, m3, [pw_m9102x2] ; m5=t5a + pmulhrsw m3, [pw_13623x2] ; m3=t6a + SUMSUB_BA w, 5, 1, 4 ; m5=t4a+t5a (t4), m1=t4a-t5a (t5a) + SUMSUB_BA w, 3, 7, 4 ; m3=t7a+t6a (t7), m7=t7a-t6a (t6a) + SUMSUB_BA w, 1, 7, 4 ; m1=t6a+t5a (t6), m7=t6a-t5a (t5) + pmulhrsw m1, W_11585x2_REG ; m1=t6 + pmulhrsw m7, W_11585x2_REG ; m7=t5 + psubw m4, m0, m6 ; m4=t0a-t3a (t3) + paddw m6, m0 ; m6=t0a+t3a (t0) + SCRATCH 5, 8, blockq+ 0 + SUMSUB_BA w, 2, 0, 5 ; m2=t1a+t2a (t1), m0=t1a-t2a (t2) + VP9_IDCT8_1D_FINALIZE +%endmacro + +%macro VP9_IDCT8_2x2_1D 1 + pmulhrsw m0, W_11585x2_REG ; m0=t0 + pmulhrsw m3, m1, W_16069x2_REG ; m3=t7 + pmulhrsw m1, W_3196x2_REG ; m1=t4 + psubw m7, m3, m1 ; t5 = t7a - t4a + paddw m5, m3, m1 ; t6 = t7a + t4a + pmulhrsw m7, W_11585x2_REG ; m7=t5 + pmulhrsw m5, W_11585x2_REG ; m5=t6 + SWAP 5, 1 + ; merged VP9_IDCT8_1D_FINALIZE to make register-sharing w/ avx easier + psubw m6, m0, m3 ; m6=t0-t7 + paddw m3, m0 ; m3=t0+t7 + psubw m2, m0, m1 ; m2=t1-t6 + paddw m1, m0 ; m1=t1+t6 +%if %1 == 1 + punpcklwd m3, m1 +%define SCRATCH_REG 1 +%elif ARCH_X86_32 + mova [blockq+ 0], m2 +%define SCRATCH_REG 2 +%else +%define SCRATCH_REG 8 +%endif + psubw m4, m0, m5 ; m4=t3-t4 + paddw m5, m0 ; m5=t3+t4 + SUMSUB_BA w, 7, 0, SCRATCH_REG ; m7=t2+t5, m0=t2-t5 + SWAP 7, 6, 2 + SWAP 3, 5, 0 +%undef SCRATCH_REG +%endmacro + +%macro VP9_IDCT8_WRITEx2 6-8 5 ; line1, line2, tmp1, tmp2, zero, pw_1024/pw_16, shift, dst +%if cpuflag(ssse3) + pmulhrsw m%1, %6 ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 + pmulhrsw m%2, %6 +%else + paddw m%1, %6 + paddw m%2, %6 + psraw m%1, %7 + psraw m%2, %7 +%endif +%if %0 <= 7 + VP9_STORE_2X %1, %2, %3, %4, %5 +%else +
VP9_STORE_2X %1, %2, %3, %4, %5, %8 +%endif +%endmacro + +; x86-32: +; - m6 is in mem +; x86-64: +; - m8 holds m6 (SWAP) +; m6 holds zero +%macro VP9_IDCT8_WRITEOUT 0 +%if ARCH_X86_64 +%if cpuflag(ssse3) + mova m9, [pw_1024] +%else + mova m9, [pw_16] +%endif +%define ROUND_REG m9 +%else +%if cpuflag(ssse3) +%define ROUND_REG [pw_1024] +%else +%define ROUND_REG [pw_16] +%endif +%endif + SCRATCH 5, 10, blockq+16 + SCRATCH 7, 11, blockq+32 + VP9_IDCT8_WRITEx2 0, 1, 5, 7, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 5, 7, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + UNSCRATCH 5, 10, blockq+16 + UNSCRATCH 7, 11, blockq+32 + VP9_IDCT8_WRITEx2 4, 5, 0, 1, 6, ROUND_REG + lea dstq, [dstq+2*strideq] + UNSCRATCH 5, 8, blockq+ 0 + VP9_IDCT8_WRITEx2 5, 7, 0, 1, 6, ROUND_REG + +%undef ROUND_REG +%endmacro + +%macro VP9_IDCT_IDCT_8x8_ADD_XMM 2 +INIT_XMM %1 +cglobal vp9_idct_idct_8x8_add, 4, 4, %2, dst, stride, block, eob + +%if cpuflag(ssse3) +%if ARCH_X86_64 + mova m12, [pw_11585x2] ; often used +%define W_11585x2_REG m12 +%else +%define W_11585x2_REG [pw_11585x2] +%endif + + cmp eobd, 12 ; top left half or less + jg .idctfull + + cmp eobd, 3 ; top left corner or less + jg .idcthalf + + cmp eobd, 1 ; faster path for when only DC is set + jne .idcttopleftcorner +%else + cmp eobd, 1 + jg .idctfull +%endif + +%if cpuflag(ssse3) + movd m0, [blockq] + pmulhrsw m0, W_11585x2_REG + pmulhrsw m0, W_11585x2_REG +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (16 << 14) + 8192 + sar coefd, 14 + 5 + movd m0, coefd +%endif + SPLATW m0, m0, 0 + pxor m4, m4 + movd [blockq], m4 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_1024] ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5 +%endif +%rep 3 + VP9_STORE_2X 0, 0, 6, 7, 4 + lea dstq, [dstq+2*strideq] +%endrep + VP9_STORE_2X 0, 0, 6, 7, 4 + RET + +%if cpuflag(ssse3) +; faster path for when only left corner is set (3 input: DC, 
right to DC, below +; to DC). Note: also working with a 2x2 block +.idcttopleftcorner: + movd m0, [blockq+0] + movd m1, [blockq+16] +%if ARCH_X86_64 + mova m10, [pw_3196x2] + mova m11, [pw_16069x2] +%define W_3196x2_REG m10 +%define W_16069x2_REG m11 +%else +%define W_3196x2_REG [pw_3196x2] +%define W_16069x2_REG [pw_16069x2] +%endif + VP9_IDCT8_2x2_1D 1 + ; partial 2x8 transpose + ; punpcklwd m0, m1 already done inside idct + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + punpckldq m0, m2 + punpckldq m4, m6 + SBUTTERFLY qdq, 0, 4, 1 + SWAP 1, 4 + VP9_IDCT8_2x2_1D 2 +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT +%if ARCH_X86_64 + movd [blockq+ 0], m6 + movd [blockq+16], m6 +%else + mova [blockq+ 0], m6 + mova [blockq+16], m6 + mova [blockq+32], m6 +%endif + RET + +.idcthalf: + movh m0, [blockq + 0] + movh m1, [blockq +16] + movh m2, [blockq +32] + movh m3, [blockq +48] + VP9_IDCT8_4x4_1D + ; partial 4x8 transpose +%if ARCH_X86_32 + mova m6, [blockq+ 0] +%endif + punpcklwd m0, m1 + punpcklwd m2, m3 + punpcklwd m4, m5 + punpcklwd m6, m7 + SBUTTERFLY dq, 0, 2, 1 + SBUTTERFLY dq, 4, 6, 5 + SBUTTERFLY qdq, 0, 4, 1 + SBUTTERFLY qdq, 2, 6, 5 + SWAP 1, 4 + SWAP 3, 6 + VP9_IDCT8_4x4_1D +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 + VP9_IDCT8_WRITEOUT +%if ARCH_X86_64 + movh [blockq+ 0], m6 + movh [blockq+16], m6 + movh [blockq+32], m6 +%else + mova [blockq+ 0], m6 + mova [blockq+16], m6 + mova [blockq+32], m6 +%endif + movh [blockq+48], m6 + RET +%endif + +.idctfull: ; generic full 8x8 idct/idct +%if ARCH_X86_64 + mova m0, [blockq+ 0] ; IN(0) +%endif + mova m1, [blockq+ 16] ; IN(1) + mova m2, [blockq+ 32] ; IN(2) + mova m3, [blockq+ 48] ; IN(3) +%if ARCH_X86_64 + mova m4, [blockq+ 64] ; IN(4) +%endif + mova m5, [blockq+ 80] ; IN(5) + mova m6, [blockq+ 96] ; IN(6) + mova m7, [blockq+112] ; IN(7) +%if ARCH_X86_64 + mova m11, [pd_8192] ; rounding +%define D_8192_REG m11 +%else +%define 
D_8192_REG [pd_8192] +%endif + VP9_IDCT8_1D +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 + mova [blockq+0], m0 +%endif + VP9_IDCT8_1D + +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT + ZERO_BLOCK blockq, 16, 8, m6 + RET +%undef W_11585x2_REG +%endmacro + +VP9_IDCT_IDCT_8x8_ADD_XMM sse2, 12 +VP9_IDCT_IDCT_8x8_ADD_XMM ssse3, 13 +VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13 + +;--------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +; x86-32: +; - in: m0/3/4/7 are in mem [blockq+N*16] +; - out: m6 is in mem [blockq+0] +; x86-64: +; - everything is in registers +%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/4/5/6/7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 3, 9 + SWAP 4, 10 + SWAP 7, 11 +%endif + + VP9_UNPACK_MULSUB_2D_4X 5, 2, 0, 3, 14449, 7723 ; m5/0=t3[d], m2/3=t2[d] + VP9_UNPACK_MULSUB_2D_4X 1, 6, 4, 7, 4756, 15679 ; m1/4=t7[d], m6/7=t6[d] + SCRATCH 4, 12, blockq+1*16 + VP9_RND_SH_SUMSUB_BA 6, 2, 7, 3, 4, D_8192_REG ; m6=t2[w], m2=t6[w] + UNSCRATCH 4, 12, blockq+1*16 + VP9_RND_SH_SUMSUB_BA 1, 5, 4, 0, 3, D_8192_REG ; m1=t3[w], m5=t7[w] + + UNSCRATCH 0, 8, blockq+16*0 + UNSCRATCH 3, 9, blockq+16*3 + UNSCRATCH 4, 10, blockq+16*4 + UNSCRATCH 7, 11, blockq+16*7 + SCRATCH 1, 8, blockq+16*1 + SCRATCH 2, 9, blockq+16*2 + SCRATCH 5, 10, blockq+16*5 + SCRATCH 6, 11, blockq+16*6 + + VP9_UNPACK_MULSUB_2D_4X 7, 0, 1, 2, 16305, 1606 ; m7/1=t1[d], m0/2=t0[d] + VP9_UNPACK_MULSUB_2D_4X 3, 4, 5, 6, 10394, 12665 ; m3/5=t5[d], m4/6=t4[d] + SCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 4, 0, 6, 2, 1, D_8192_REG ; m4=t0[w], m0=t4[w] + UNSCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 3, 7, 5, 1, 2, D_8192_REG
; m3=t1[w], m7=t5[w] + + UNSCRATCH 2, 9, blockq+16*2 + UNSCRATCH 5, 10, blockq+16*5 + SCRATCH 3, 9, blockq+16*3 + SCRATCH 4, 10, blockq+16*4 + + ; m4=t0, m3=t1, m6=t2, m1=t3, m0=t4, m7=t5, m2=t6, m5=t7 + + VP9_UNPACK_MULSUB_2D_4X 0, 7, 1, 3, 15137, 6270 ; m0/1=t5[d], m7/3=t4[d] + VP9_UNPACK_MULSUB_2D_4X 5, 2, 4, 6, 6270, 15137 ; m5/4=t6[d], m2/6=t7[d] + SCRATCH 1, 12, blockq+ 0*16 + VP9_RND_SH_SUMSUB_BA 5, 7, 4, 3, 1, D_8192_REG + UNSCRATCH 1, 12, blockq+ 0*16 + PSIGNW m5, W_M1_REG ; m5=out1[w], m7=t6[w] + VP9_RND_SH_SUMSUB_BA 2, 0, 6, 1, 3, D_8192_REG ; m2=out6[w], m0=t7[w] + + UNSCRATCH 1, 8, blockq+16*1 + UNSCRATCH 3, 9, blockq+16*3 + UNSCRATCH 4, 10, blockq+16*4 + UNSCRATCH 6, 11, blockq+16*6 + SCRATCH 2, 8, blockq+16*0 + + SUMSUB_BA w, 6, 4, 2 ; m6=out0[w], m4=t2[w] + SUMSUB_BA w, 1, 3, 2 + PSIGNW m1, W_M1_REG ; m1=out7[w], m3=t3[w] + + ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7 + + ; unfortunately, the code below overflows in some cases +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 3, 4, 2 + SUMSUB_BA w, 0, 7, 2 + pmulhrsw m3, W_11585x2_REG + pmulhrsw m7, W_11585x2_REG + pmulhrsw m4, W_11585x2_REG ; out4 + pmulhrsw m0, W_11585x2_REG ; out2 +%else + SCRATCH 5, 9, blockq+16*1 + VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, D_8192_REG, 2, 5 + VP9_UNPACK_MULSUB_2W_4X 7, 0, 11585, 11585, D_8192_REG, 2, 5 + UNSCRATCH 5, 9, blockq+16*1 +%endif + PSIGNW m3, W_M1_REG ; out3 + PSIGNW m7, W_M1_REG ; out5 + + ; m6=out0, m5=out1, m0=out2, m3=out3, m4=out4, m7=out5, m2=out6, m1=out7 + +%if ARCH_X86_64 + SWAP 2, 8 +%endif + SWAP 0, 6, 2 + SWAP 7, 1, 5 +%endmacro + +%macro IADST8_FN 6 +INIT_XMM %5 +cglobal vp9_%1_%3_8x8_add, 3, 3, %6, dst, stride, block, eob + +%ifidn %1, idct +%define first_is_idct 1 +%else +%define first_is_idct 0 +%endif + +%ifidn %3, idct +%define second_is_idct 1 +%else +%define second_is_idct 0 +%endif + +%if ARCH_X86_64 + mova m0, [blockq+ 0] ; IN(0) +%endif + mova m1, [blockq+ 16] ; IN(1) + mova m2, [blockq+ 32] ; IN(2) +%if ARCH_X86_64 
|| first_is_idct + mova m3, [blockq+ 48] ; IN(3) +%endif +%if ARCH_X86_64 + mova m4, [blockq+ 64] ; IN(4) +%endif + mova m5, [blockq+ 80] ; IN(5) + mova m6, [blockq+ 96] ; IN(6) +%if ARCH_X86_64 || first_is_idct + mova m7, [blockq+112] ; IN(7) +%endif +%if ARCH_X86_64 +%if cpuflag(ssse3) + mova m15, [pw_11585x2] ; often used +%endif + mova m13, [pd_8192] ; rounding + mova m14, [pw_m1] +%define W_11585x2_REG m15 +%define D_8192_REG m13 +%define W_M1_REG m14 +%else +%define W_11585x2_REG [pw_11585x2] +%define D_8192_REG [pd_8192] +%define W_M1_REG [pw_m1] +%endif + + ; note different calling conventions for idct8 vs. iadst8 on x86-32 + VP9_%2_1D +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [blockq+0], [blockq+64], 1 + mova [blockq+ 0], m0 +%if second_is_idct == 0 + mova [blockq+ 48], m3 + mova [blockq+112], m7 +%endif +%endif + VP9_%4_1D + +%if ARCH_X86_64 + SWAP 6, 8 +%endif + pxor m6, m6 ; used for the block reset, and VP9_STORE_2X + VP9_IDCT8_WRITEOUT + ZERO_BLOCK blockq, 16, 8, m6 + RET + +%undef W_11585x2_REG +%undef first_is_idct +%undef second_is_idct + +%endmacro + +IADST8_FN idct, IDCT8, iadst, IADST8, sse2, 15 +IADST8_FN iadst, IADST8, idct, IDCT8, sse2, 15 +IADST8_FN iadst, IADST8, iadst, IADST8, sse2, 15 +IADST8_FN idct, IDCT8, iadst, IADST8, ssse3, 16 +IADST8_FN idct, IDCT8, iadst, IADST8, avx, 16 +IADST8_FN iadst, IADST8, idct, IDCT8, ssse3, 16 +IADST8_FN iadst, IADST8, idct, IDCT8, avx, 16 +IADST8_FN iadst, IADST8, iadst, IADST8, ssse3, 16 +IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 + +;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +; x86-64: +; at the end of this macro, m7 is stored in [%4+15*%5] +; everything else (t0-6 and t8-15) is stored in m0-6 
and m8-15 +; the following sumsubs have not been done yet: +; SUMSUB_BA w, 6, 9, 15 ; t6, t9 +; SUMSUB_BA w, 7, 8, 15 ; t7, t8 +; or (x86-32) t0-t5 are in m0-m5, t10-t15 are in x11/9/7/5/3/1, +; and the following sumsubs have not been done yet: +; SUMSUB_BA w, x13, x14, 7 ; t6, t9 +; SUMSUB_BA w, x15, x12, 7 ; t7, t8 + +%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst +%if %2 <= 4 + mova m3, [%1+ 1*%3] ; IN(1) + mova m0, [%1+ 3*%3] ; IN(3) + + pmulhrsw m4, m3, [pw_16305x2] ; t14-15 + pmulhrsw m3, [pw_1606x2] ; t8-9 + pmulhrsw m7, m0, [pw_m4756x2] ; t10-11 + pmulhrsw m0, [pw_15679x2] ; t12-13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 1, 6 ; t9, t14 + SCRATCH 4, 10, %4+ 1*%5 + SCRATCH 5, 11, %4+ 7*%5 + VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 + UNSCRATCH 5, 11, %4+ 7*%5 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 +%else + mova m5, [%1+ 1*%3] ; IN(1) + mova m4, [%1+ 7*%3] ; IN(7) +%if %2 <= 8 + pmulhrsw m2, m5, [pw_16305x2] ; t15 + pmulhrsw m5, [pw_1606x2] ; t8 + pmulhrsw m3, m4, [pw_m10394x2] ; t9 + pmulhrsw m4, [pw_12665x2] ; t14 +%else + mova m3, [%1+ 9*%3] ; IN(9) + mova m2, [%1+15*%3] ; IN(15) + + ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7 + ; m11=in8, m3=in9, m12=in10, m0=in11, m8=in12, m1=in13, m13=in14, m2=in15 + + VP9_UNPACK_MULSUB_2W_4X 5, 2, 16305, 1606, [pd_8192], 0, 1 ; t8, t15 + VP9_UNPACK_MULSUB_2W_4X 3, 4, 10394, 12665, [pd_8192], 0, 1 ; t9, t14 +%endif + + SUMSUB_BA w, 3, 5, 0 ; t8, t9 + SUMSUB_BA w, 4, 2, 0 ; t15, t14 + + VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 0, 1 ; t9, t14 + + SCRATCH 4, 10, %4+ 1*%5 + SCRATCH 5, 11, %4+ 7*%5 + + mova m6, [%1+ 3*%3] ; IN(3) + mova m7, [%1+ 5*%3] ; IN(5) +%if %2 <= 8 +
pmulhrsw m0, m7, [pw_14449x2] ; t13 + pmulhrsw m7, [pw_7723x2] ; t10 + pmulhrsw m1, m6, [pw_m4756x2] ; t11 + pmulhrsw m6, [pw_15679x2] ; t12 +%else + mova m0, [%1+11*%3] ; IN(11) + mova m1, [%1+13*%3] ; IN(13) + + VP9_UNPACK_MULSUB_2W_4X 7, 0, 14449, 7723, [pd_8192], 4, 5 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 1, 6, 4756, 15679, [pd_8192], 4, 5 ; t11, t12 +%endif + + ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7 + ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15 + + SUMSUB_BA w, 7, 1, 4 ; t11, t10 + SUMSUB_BA w, 0, 6, 4 ; t12, t13 + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7 + ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15 + + VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 4, 5 ; t10, t13 + + UNSCRATCH 5, 11, %4+ 7*%5 +%endif + + ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7 + ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15 + + SUMSUB_BA w, 7, 3, 4 ; t8, t11 + + ; backup first register + mova [%4+15*%5], m7 + + SUMSUB_BA w, 6, 2, 7 ; t9, t10 + UNSCRATCH 4, 10, %4+ 1*%5 + SUMSUB_BA w, 0, 4, 7 ; t15, t12 + SUMSUB_BA w, 1, 5, 7 ; t14,
t13 + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15 + +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 2, 5, 7 + SUMSUB_BA w, 3, 4, 7 + pmulhrsw m5, [pw_11585x2] ; t10 + pmulhrsw m4, [pw_11585x2] ; t11 + pmulhrsw m3, [pw_11585x2] ; t12 + pmulhrsw m2, [pw_11585x2] ; t13 +%else + SCRATCH 6, 10, %4+ 1*%5 + VP9_UNPACK_MULSUB_2W_4X 5, 2, 11585, 11585, [pd_8192], 6, 7 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 4, 3, 11585, 11585, [pd_8192], 6, 7 ; t11, t12 + UNSCRATCH 6, 10, %4+ 1*%5 +%endif + + ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7 + ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15 + + SCRATCH 0, 8, %4+ 1*%5 + SCRATCH 1, 9, %4+ 3*%5 + SCRATCH 2, 10, %4+ 5*%5 + SCRATCH 3, 11, %4+ 7*%5 + SCRATCH 4, 12, %4+ 9*%5 + SCRATCH 5, 13, %4+11*%5 + SCRATCH 6, 14, %4+13*%5 + + ; even (tx8x8) +%if %2 <= 4 + mova m3, [%1+ 0*%3] ; IN(0) + mova m4, [%1+ 2*%3] ; IN(2) + + pmulhrsw m3, [pw_11585x2] ; t0-t3 + pmulhrsw m7, m4, [pw_16069x2] ; t6-7 + pmulhrsw m4, [pw_3196x2] ; t4-5 + +%if 0 ; overflows :( + paddw m6, m7, m4 + psubw m5, m7, m4 + pmulhrsw m5, [pw_11585x2] ; t5 + pmulhrsw m6, [pw_11585x2] ; t6 +%else + VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6 +%endif + + psubw m0, m3, m7 + paddw m7, m3 + psubw m1, m3, m6 + paddw m6, m3 + psubw m2, m3, m5 + paddw m5, m3 + +%if ARCH_X86_32 + SWAP 0, 7 +%endif + SCRATCH 7, 15, %4+12*%5 +%else + mova m6, [%1+ 2*%3] ; IN(2) + mova m1, [%1+ 4*%3] ; IN(4) + mova m7, [%1+ 6*%3] ; IN(6) +%if %2 <= 8 + pmulhrsw m0, m1, [pw_15137x2] ; t3 + pmulhrsw m1, [pw_6270x2] ; t2 + pmulhrsw m5, m6, [pw_16069x2] ; t7 + pmulhrsw m6, [pw_3196x2] ; t4 + pmulhrsw m4, m7, [pw_m9102x2] ; t5 + pmulhrsw m7, [pw_13623x2] ; t6 +%else + mova m4, [%1+10*%3] ; IN(10) + mova m0, [%1+12*%3] ; IN(12) + mova m5, [%1+14*%3] ; IN(14) + + VP9_UNPACK_MULSUB_2W_4X 1, 0, 15137, 6270, [pd_8192], 2, 3 ; t2, t3 + VP9_UNPACK_MULSUB_2W_4X 6, 5, 
16069, 3196, [pd_8192], 2, 3 ; t4, t7 + VP9_UNPACK_MULSUB_2W_4X 4, 7, 9102, 13623, [pd_8192], 2, 3 ; t5, t6 +%endif + + SUMSUB_BA w, 4, 6, 2 ; t4, t5 + SUMSUB_BA w, 7, 5, 2 ; t7, t6 + +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 6, 5, 2 + pmulhrsw m5, [pw_11585x2] ; t5 + pmulhrsw m6, [pw_11585x2] ; t6 +%else + VP9_UNPACK_MULSUB_2W_4X 5, 6, 11585, 11585, [pd_8192], 2, 3 ; t5, t6 +%endif + + SCRATCH 5, 15, %4+10*%5 + mova m2, [%1+ 0*%3] ; IN(0) +%if %2 <= 8 + pmulhrsw m2, [pw_11585x2] ; t0 and t1 + psubw m3, m2, m0 + paddw m0, m2 + + SUMSUB_BA w, 7, 0, 5 ; t0, t7 +%else + mova m3, [%1+ 8*%3] ; IN(8) + + ; from 3 stages back +%if cpuflag(ssse3) && %6 == 0 + SUMSUB_BA w, 3, 2, 5 + pmulhrsw m3, [pw_11585x2] ; t0 + pmulhrsw m2, [pw_11585x2] ; t1 +%else + mova [%1+ 0*%3], m0 + VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 0 ; t0, t1 + mova m0, [%1+ 0*%3] +%endif + + ; from 2 stages back + SUMSUB_BA w, 0, 3, 5 ; t0, t3 + + SUMSUB_BA w, 7, 0, 5 ; t0, t7 +%endif + UNSCRATCH 5, 15, %4+10*%5 +%if ARCH_X86_32 + SWAP 0, 7 +%endif + SCRATCH 7, 15, %4+12*%5 + SUMSUB_BA w, 1, 2, 7 ; t1, t2 + + ; from 1 stage back + SUMSUB_BA w, 6, 1, 7 ; t1, t6 + SUMSUB_BA w, 5, 2, 7 ; t2, t5 +%endif + SUMSUB_BA w, 4, 3, 7 ; t3, t4 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 + SWAP 5, 13 + SWAP 6, 14 + + SUMSUB_BA w, 0, 15, 7 ; t0, t15 + SUMSUB_BA w, 1, 14, 7 ; t1, t14 + SUMSUB_BA w, 2, 13, 7 ; t2, t13 + SUMSUB_BA w, 3, 12, 7 ; t3, t12 + SUMSUB_BA w, 4, 11, 7 ; t4, t11 + SUMSUB_BA w, 5, 10, 7 ; t5, t10 +%else + SWAP 1, 6 + SWAP 2, 5 + SWAP 3, 4 + mova [%4+14*%5], m6 + +%macro %%SUMSUB_BA_STORE 5 ; reg, from_mem, to_mem, scratch, scratch_stride + mova m6, [%4+%2*%5] + SUMSUB_BA w, 6, %1, 7 + SWAP %1, 6 + mova [%4+%3*%5], m6 +%endmacro + + %%SUMSUB_BA_STORE 0, 1, 1, %4, %5 ; t0, t15 + %%SUMSUB_BA_STORE 1, 3, 3, %4, %5 ; t1, t14 + %%SUMSUB_BA_STORE 2, 5, 5, %4, %5 ; t2, t13 + %%SUMSUB_BA_STORE 3, 7, 7, %4, %5 ; t3, t12 + %%SUMSUB_BA_STORE 4, 9, 9, %4, 
%5 ; t4, t11 + %%SUMSUB_BA_STORE 5, 11, 11, %4, %5 ; t5, t10 +%endif +%endmacro + +%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst +%if %2 == 1 + VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4 + +%if ARCH_X86_64 + ; backup a different register + mova m7, [tmpq+15*16] + mova [tmpq+ 1*16], m15 + + SUMSUB_BA w, 6, 9, 15 ; t6, t9 + SUMSUB_BA w, 7, 8, 15 ; t7, t8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15 + mova [tmpq+ 0], m0 + mova [tmpq+ 32], m1 + mova [tmpq+ 64], m2 + mova [tmpq+ 96], m3 + mova [tmpq+128], m4 + mova [tmpq+160], m5 + mova [tmpq+192], m6 + mova [tmpq+224], m7 + + mova m15, [tmpq+ 1*16] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [tmpq+ 16], m8 + mova [tmpq+ 48], m9 + mova [tmpq+ 80], m10 + mova [tmpq+112], m11 + mova [tmpq+144], m12 + mova [tmpq+176], m13 + mova [tmpq+208], m14 + mova [tmpq+240], m15 +%else + mova m6, [tmpq+13*16] + mova m7, [tmpq+14*16] + SUMSUB_BA w, 6, 7 ; t6, t9 + mova [tmpq+14*16], m6 + mova [tmpq+13*16], m7 + mova m7, [tmpq+15*16] + mova m6, [tmpq+12*16] + SUMSUB_BA w, 7, 6 ; t7, t8 + mova [tmpq+15*16], m6 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+14*16], [tmpq+ 8*16], 1 + mova [tmpq+ 0*16], m0 + mova [tmpq+ 2*16], m1 + mova [tmpq+ 4*16], m2 + mova [tmpq+ 6*16], m3 + mova [tmpq+10*16], m5 + mova [tmpq+12*16], m6 + mova [tmpq+14*16], m7 + + mova m0, [tmpq+15*16] + mova m1, [tmpq+13*16] + mova m2, [tmpq+11*16] + mova m3, [tmpq+ 9*16] + mova m4, [tmpq+ 7*16] + mova m5, [tmpq+ 5*16] + mova m7, [tmpq+ 1*16] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+ 3*16], [tmpq+ 9*16], 1 + mova [tmpq+ 1*16], m0 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m2 + mova [tmpq+ 7*16], m3 + mova [tmpq+11*16], m5 + mova [tmpq+13*16], m6 + mova [tmpq+15*16], m7 +%endif +%else ; %2 == 2 + VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + + pxor m7, m7 +%if ARCH_X86_64 + ; backup more registers + mova [%1+ 2*32], m8 + mova [%1+ 3*32], m9 + + 
VP9_IDCT8_WRITEx2 0, 1, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 2, 3, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 4, 5, 8, 9, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + ; restore from cache + SWAP 0, 7 ; move zero from m7 to m0 + mova m7, [%1+15*32] + mova m8, [%1+ 2*32] + mova m9, [%1+ 3*32] + + SUMSUB_BA w, 6, 9, 3 ; t6, t9 + SUMSUB_BA w, 7, 8, 3 ; t7, t8 + + VP9_IDCT8_WRITEx2 6, 7, 3, 4, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 8, 9, 3, 4, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, 0, ROUND_REG, 6 +%else + mova [tmpq+ 0*32], m5 + + VP9_IDCT8_WRITEx2 0, 1, 5, 6, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 2, 3, 5, 6, 7, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + SWAP 0, 7 ; move zero from m7 to m0 + mova m5, [tmpq+ 0*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+13*32] + mova m7, [tmpq+14*32] + mova m5, [tmpq+15*32] + mova m6, [tmpq+12*32] + SUMSUB_BADC w, 4, 7, 5, 6, 1 + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+11*32] + mova m5, [tmpq+ 9*32] + mova m6, [tmpq+ 7*32] + mova m7, [tmpq+ 5*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m4, [tmpq+ 3*32] + mova m5, [tmpq+ 1*32] + + VP9_IDCT8_WRITEx2 4, 5, 1, 2, 0, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] +%endif + +%undef ROUND_REG +%endif ; %2 == 1/2 +%endmacro + +%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride + mova m%3, [dstq] + mova m%5, [dstq+%7] + punpcklbw m%2, m%3, m%6 + 
punpckhbw m%3, m%6 + punpcklbw m%4, m%5, m%6 + punpckhbw m%5, m%6 + paddw m%2, m%1 + paddw m%3, m%1 + paddw m%4, m%1 + paddw m%5, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq], m%2 + mova [dstq+%7], m%4 +%endmacro + +%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob +%if cpuflag(ssse3) + ; 2x2=eob=3, 4x4=eob=10 + cmp eobd, 38 + jg .idctfull + cmp eobd, 1 ; faster path for when only DC is set + jne .idct8x8 +%else + cmp eobd, 1 ; faster path for when only DC is set + jg .idctfull +%endif + + ; dc-only +%if cpuflag(ssse3) + movd m0, [blockq] + mova m1, [pw_11585x2] + pmulhrsw m0, m1 + pmulhrsw m0, m1 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (32 << 14) + 8192 + sar coefd, 14 + 6 + movd m0, coefd +%endif + SPLATW m0, m0, q0000 +%if cpuflag(ssse3) + pmulhrsw m0, [pw_512] +%endif + pxor m5, m5 + movd [blockq], m5 +%rep 7 + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+2*strideq] +%endrep + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5 + RET + + DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp +%if cpuflag(ssse3) +.idct8x8: + mov tmpq, rsp + VP9_IDCT16_1D blockq, 1, 8, 0 + + mov cntd, 2 + mov dst_bakq, dstq +.loop2_8x8: + VP9_IDCT16_1D tmpq, 2, 8, 0 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_8x8 + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 8, m0 + RET +%endif + +.idctfull: + mov cntd, 2 + mov tmpq, rsp +.loop1_full: + VP9_IDCT16_1D blockq, 1, 16, 0 + add blockq, 16 + add tmpq, 256 + dec cntd + jg .loop1_full + sub blockq, 32 + + mov cntd, 2 + mov tmpq, rsp + mov dst_bakq, dstq +.loop2_full: + VP9_IDCT16_1D tmpq, 2, 16, 0 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block 
coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +VP9_IDCT_IDCT_16x16_ADD_XMM sse2 +VP9_IDCT_IDCT_16x16_ADD_XMM ssse3 +VP9_IDCT_IDCT_16x16_ADD_XMM avx + +%macro VP9_IDCT16_YMM_1D 0 ; rows 0 and 4 are read from [blockq+0] and [blockq+128] inside this macro, the other rows come from m1-m3 and m5-m15; out6 is stored to [blockq+192] + VP9_UNPACK_MULSUB_2W_4X 1, 15, 16305, 1606, [pd_8192], 0, 4 ; t8, t15 + VP9_UNPACK_MULSUB_2W_4X 9, 7, 10394, 12665, [pd_8192], 0, 4 ; t9, t14 + + SUMSUB_BA w, 9, 1, 0 ; t8, t9 + SUMSUB_BA w, 7, 15, 0 ; t15, t14 + + VP9_UNPACK_MULSUB_2W_4X 15, 1, 15137, 6270, [pd_8192], 0, 4 ; t9, t14 + + VP9_UNPACK_MULSUB_2W_4X 5, 11, 14449, 7723, [pd_8192], 0, 4 ; t10, t13 + VP9_UNPACK_MULSUB_2W_4X 13, 3, 4756, 15679, [pd_8192], 0, 4 ; t11, t12 + + SUMSUB_BA w, 5, 13, 0 ; t11, t10 + SUMSUB_BA w, 11, 3, 0 ; t12, t13 + + VP9_UNPACK_MULSUB_2W_4X 3, 13, 6270, m15137, [pd_8192], 0, 4 ; t10, t13 + + SUMSUB_BA w, 5, 9, 0 ; t8, t11 + SUMSUB_BA w, 3, 15, 0 ; t9, t10 + SUMSUB_BA w, 11, 7, 0 ; t15, t12 + SUMSUB_BA w, 13, 1, 0 ; t14, t13 + + SUMSUB_BA w, 15, 1, 0 + SUMSUB_BA w, 9, 7, 0 + pmulhrsw m1, [pw_11585x2] ; t10 + pmulhrsw m7, [pw_11585x2] ; t11 + pmulhrsw m9, [pw_11585x2] ; t12 + pmulhrsw m15, [pw_11585x2] ; t13 + + ; even (tx8x8) + mova m4, [blockq+128] ; row 4 comes from memory + mova [blockq+128], m5 ; spill m5; it is reloaded just before the out7/out8 butterfly + VP9_UNPACK_MULSUB_2W_4X 4, 12, 15137, 6270, [pd_8192], 0, 5 ; t2, t3 + VP9_UNPACK_MULSUB_2W_4X 2, 14, 16069, 3196, [pd_8192], 0, 5 ; t4, t7 + VP9_UNPACK_MULSUB_2W_4X 10, 6, 9102, 13623, [pd_8192], 0, 5 ; t5, t6 + mova m0, [blockq+ 0] ; row 0 comes from memory + SUMSUB_BA w, 8, 0, 5 + pmulhrsw m8, [pw_11585x2] ; t0 + pmulhrsw m0, [pw_11585x2] ; t1 + + SUMSUB_BA w, 10, 2, 5 ; t4, t5 + SUMSUB_BA w, 6, 14, 5 ; t7, t6 + SUMSUB_BA w, 12, 8, 5 ; t0, t3 + SUMSUB_BA w, 4, 0, 5 ; t1, t2 + + SUMSUB_BA w, 2, 14, 5 + pmulhrsw m14, [pw_11585x2] ; t5 + pmulhrsw m2, [pw_11585x2] ; t6 + + SUMSUB_BA w, 6, 12, 5 ; t0, t7 + SUMSUB_BA w, 2, 4, 5 ; t1, t6 + SUMSUB_BA w, 14, 0, 5 ; t2, t5 + SUMSUB_BA w, 10, 8, 5 ; t3, t4 + + ; final stage + SUMSUB_BA w, 11, 6, 5 ; out0, out15 + SUMSUB_BA w, 13, 2, 5 ; out1, out14 + SUMSUB_BA w, 15, 14, 5 ; out2, out13 + SUMSUB_BA 
w, 9, 10, 5 ; out3, out12 + SUMSUB_BA w, 7, 8, 5 ; out4, out11 + SUMSUB_BA w, 1, 0, 5 ; out5, out10 + SUMSUB_BA w, 3, 4, 5 ; out6, out9 + mova m5, [blockq+128] ; reload the value spilled at the start of the even half + mova [blockq+192], m3 ; park out6 in memory; m3 is reused by the next SUMSUB_BA + SUMSUB_BA w, 5, 12, 3 ; out7, out8 + + SWAP 0, 11, 8, 12, 10 + SWAP 1, 13, 14, 2, 15, 6, 3, 9, 4, 7, 5 +%endmacro + +; this is almost identical to VP9_STORE_2X, but it does four rows (two per +; ymm register) for slightly improved interleaving, and it omits vpermq since +; the input is DC so all values are identical +%macro VP9_STORE_YMM_DC_4X 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero + mova xm%2, [dstq] + mova xm%4, [dstq+strideq*2] + vinserti128 m%2, m%2, [dstq+strideq], 1 + vinserti128 m%4, m%4, [dstq+stride3q], 1 + punpckhbw m%3, m%2, m%6 + punpcklbw m%2, m%6 + punpckhbw m%5, m%4, m%6 + punpcklbw m%4, m%6 + paddw m%3, m%1 + paddw m%2, m%1 + paddw m%5, m%1 + paddw m%4, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq], xm%2 + mova [dstq+strideq*2], xm%4 + vextracti128 [dstq+strideq], m%2, 1 + vextracti128 [dstq+stride3q], m%4, 1 +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_idct_idct_16x16_add, 4, 4, 16, dst, stride, block, eob + cmp eobd, 1 ; faster path for when only DC is set + jg .idctfull + + ; dc-only + mova m1, [pw_11585x2] + vpbroadcastw m0, [blockq] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pxor m5, m5 + pmulhrsw m0, [pw_512] + movd [blockq], xm5 ; clear the DC coefficient + + DEFINE_ARGS dst, stride, stride3, cnt + mov cntd, 4 ; 4 iterations x 4 rows per store = 16 rows + lea stride3q, [strideq*3] +.loop_dc: + VP9_STORE_YMM_DC_4X 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+4*strideq] + dec cntd + jg .loop_dc + RET + + DEFINE_ARGS dst, stride, block, eob +.idctfull: ; rows 0 and 4 are not loaded here; VP9_IDCT16_YMM_1D reads them from blockq + mova m1, [blockq+ 32] + mova m2, [blockq+ 64] + mova m3, [blockq+ 96] + mova m5, [blockq+160] + mova m6, [blockq+192] + mova m7, [blockq+224] + mova m8, [blockq+256] + mova m9, [blockq+288] + mova m10, [blockq+320] + mova m11, [blockq+352] + mova m12, [blockq+384] + mova m13, [blockq+416] + mova m14, [blockq+448] + mova m15, [blockq+480] + + VP9_IDCT16_YMM_1D + 
TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + [blockq+192], [blockq+128], 1 + mova [blockq+ 0], m0 ; the second pass reads row 0 back from [blockq+0] + VP9_IDCT16_YMM_1D + + mova [blockq+224], m7 ; save out7: m6/m7 are passed as the 3rd/4th operands of the next three stores and reloaded afterwards + + ; store + VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + mova m6, [blockq+192] ; out6 as left in memory by VP9_IDCT16_YMM_1D + mova m7, [blockq+224] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + + ; m0 carried output data above, so clear it explicitly here + ; and use it to zero out block coefficients + pxor m0, m0 + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endif + +;--------------------------------------------------------------------------------------------- +; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_IADST16_1D 2 ; src, pass +%assign %%str 16*%2 ; scratch row stride in tmpq: 16 bytes in pass 1, 32 bytes in pass 2 + mova m0, [%1+ 0*32] ; in0 + mova m1, [%1+15*32] ; in15 + mova m2, [%1+ 7*32] ; in7 + mova m3, [%1+ 8*32] ; in8 + + VP9_UNPACK_MULSUB_2D_4X 1, 0, 4, 5, 16364, 804 ; m1/4=t1[d], m0/5=t0[d] + VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 6, 11003, 12140 ; m2/7=t9[d], m3/6=t8[d] + SCRATCH 4, 8, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 3, 0, 6, 5, 4, [pd_8192] ; m3=t0[w], m0=t8[w] + UNSCRATCH 4, 8, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 2, 1, 7, 4, 5, [pd_8192] ; m2=t1[w], m1=t9[w] + + SCRATCH 0, 10, tmpq+ 0*%%str + SCRATCH 1, 11, tmpq+15*%%str + mova [tmpq+ 7*%%str], m2 + mova [tmpq+ 8*%%str], m3 + + mova m1, [%1+ 
2*32] ; in2 + mova m0, [%1+13*32] ; in13 + mova m3, [%1+ 5*32] ; in5 + mova m2, [%1+10*32] ; in10 + + VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 15893, 3981 ; m0/6=t3[d], m1/7=t2[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d] + SCRATCH 4, 12, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 2, 1, 5, 7, 4, [pd_8192] ; m2=t2[w], m1=t10[w] + UNSCRATCH 4, 12, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 3, 0, 4, 6, 5, [pd_8192] ; m3=t3[w], m0=t11[w] + + SCRATCH 0, 12, tmpq+ 2*%%str + SCRATCH 1, 13, tmpq+13*%%str + mova [tmpq+ 5*%%str], m2 + mova [tmpq+10*%%str], m3 + + mova m2, [%1+ 4*32] ; in4 + mova m3, [%1+11*32] ; in11 + mova m0, [%1+ 3*32] ; in3 + mova m1, [%1+12*32] ; in12 + + VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 14811, 7005 ; m3/7=t5[d], m2/6=t4[d] + VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 5520, 15426 ; m0/4=t13[d], m1/5=t12[d] + SCRATCH 4, 9, tmpq+ 4*%%str + VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t4[w], m2=t12[w] + UNSCRATCH 4, 9, tmpq+ 4*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t5[w], m3=t13[w] + + SCRATCH 0, 8, tmpq+ 4*%%str + mova [tmpq+11*%%str], m1 ; t4:m1->r11 + UNSCRATCH 0, 10, tmpq+ 0*%%str + UNSCRATCH 1, 11, tmpq+15*%%str + + ; round 2 interleaved part 1 + VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 5, 4, 3196, 16069 ; m3/5=t12[d], m2/4=t13[d] + SCRATCH 4, 9, tmpq+ 3*%%str + VP9_RND_SH_SUMSUB_BA 3, 1, 5, 7, 4, [pd_8192] ; m3=t8[w], m1=t12[w] + UNSCRATCH 4, 9, tmpq+ 3*%%str + VP9_RND_SH_SUMSUB_BA 2, 0, 4, 6, 5, [pd_8192] ; m2=t9[w], m0=t13[w] + + SCRATCH 0, 10, tmpq+ 0*%%str + SCRATCH 1, 11, tmpq+15*%%str + SCRATCH 2, 14, tmpq+ 3*%%str + SCRATCH 3, 15, tmpq+12*%%str + + mova m2, [%1+ 6*32] ; in6 + mova m3, [%1+ 9*32] ; in9 + mova m0, [%1+ 1*32] ; in1 + mova m1, [%1+14*32] ; in14 + + VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d] + VP9_UNPACK_MULSUB_2D_4X 0, 1, 4, 5, 2404, 16207 ; m0/4=t15[d], m1/5=t14[d] + SCRATCH 4, 9, tmpq+ 6*%%str + 
VP9_RND_SH_SUMSUB_BA 1, 2, 5, 6, 4, [pd_8192] ; m1=t6[w], m2=t14[w] + UNSCRATCH 4, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 4, 7, 6, [pd_8192] ; m0=t7[w], m3=t15[w] + + ; r8=t0, r7=t1, r5=t2, r10=t3, r11=t4, m8|r4=t5, m1=t6, m0=t7 + ; m10|r0=t8, m11|r15=t9, m13|r13=t10, m12|r2=t11, m14|r3=t12, m15|r12=t13, m2=t14, m3=t15 + + UNSCRATCH 4, 12, tmpq+ 2*%%str + UNSCRATCH 5, 13, tmpq+13*%%str + SCRATCH 0, 12, tmpq+ 1*%%str + SCRATCH 1, 13, tmpq+14*%%str + + ; remainder of round 2 (rest of t8-15) + VP9_UNPACK_MULSUB_2D_4X 5, 4, 6, 7, 9102, 13623 ; m5/6=t11[d], m4/7=t10[d] + VP9_UNPACK_MULSUB_2D_4X 3, 2, 1, 0, 13623, 9102 ; m3/1=t14[d], m2/0=t15[d] + SCRATCH 0, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 3, 4, 1, 7, 0, [pd_8192] ; m3=t10[w], m4=t14[w] + UNSCRATCH 0, 9, tmpq+ 6*%%str + VP9_RND_SH_SUMSUB_BA 2, 5, 0, 6, 1, [pd_8192] ; m2=t11[w], m5=t15[w] + + ; m15|r12=t8, m14|r3=t9, m3=t10, m2=t11, m11|r15=t12, m10|r0=t13, m4=t14, m5=t15 + + UNSCRATCH 6, 14, tmpq+ 3*%%str + UNSCRATCH 7, 15, tmpq+12*%%str + + SUMSUB_BA w, 3, 7, 1 + PSIGNW m3, [pw_m1] ; m3=out1[w], m7=t10[w] + SUMSUB_BA w, 2, 6, 1 ; m2=out14[w], m6=t11[w] + + ; unfortunately, the code below overflows in some cases, e.g. 
+ ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8.webm +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 1 + pmulhrsw m7, [pw_11585x2] ; m7=out6[w] + pmulhrsw m6, [pw_11585x2] ; m6=out9[w] +%else + VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, 11585, [pd_8192], 1, 0 +%endif + + mova [tmpq+ 3*%%str], m6 + mova [tmpq+ 6*%%str], m7 + UNSCRATCH 6, 10, tmpq+ 0*%%str + UNSCRATCH 7, 11, tmpq+15*%%str + mova [tmpq+13*%%str], m2 + SCRATCH 3, 11, tmpq+ 9*%%str + + VP9_UNPACK_MULSUB_2D_4X 7, 6, 2, 3, 15137, 6270 ; m6/3=t13[d], m7/2=t12[d] + VP9_UNPACK_MULSUB_2D_4X 5, 4, 1, 0, 6270, 15137 ; m5/1=t14[d], m4/0=t15[d] + SCRATCH 0, 9, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 5, 6, 1, 3, 0, [pd_8192] ; m5=out2[w], m6=t14[w] + UNSCRATCH 0, 9, tmpq+ 2*%%str + VP9_RND_SH_SUMSUB_BA 4, 7, 0, 2, 1, [pd_8192] + PSIGNW m4, [pw_m1] ; m4=out13[w], m7=t15[w] + + ; unfortunately, the code below overflows in some cases +%if 0; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 1 + pmulhrsw m7, [pw_m11585x2] ; m7=out5[w] + pmulhrsw m6, [pw_11585x2] ; m6=out10[w] +%else + PSIGNW m7, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 1, 0 +%endif + + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, m6=out10, m4=out13, r2=out14 + + mova m2, [tmpq+ 8*%%str] + mova m3, [tmpq+ 7*%%str] + mova m1, [tmpq+11*%%str] + mova [tmpq+ 7*%%str], m6 + mova [tmpq+11*%%str], m4 + mova m4, [tmpq+ 5*%%str] + SCRATCH 5, 14, tmpq+ 5*%%str + SCRATCH 7, 15, tmpq+ 8*%%str + UNSCRATCH 6, 8, tmpq+ 4*%%str + UNSCRATCH 5, 12, tmpq+ 1*%%str + UNSCRATCH 7, 13, tmpq+14*%%str + + ; m2=t0, m3=t1, m9=t2, m0=t3, m1=t4, m8=t5, m13=t6, m12=t7 + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 + + SUMSUB_BA w, 1, 2, 0 ; m1=t0[w], m2=t4[w] + mova m0, [tmpq+10*%%str] + SCRATCH 1, 12, tmpq+ 1*%%str + SUMSUB_BA w, 6, 3, 1 ; m8=t1[w], m3=t5[w] + SCRATCH 6, 13, tmpq+ 4*%%str + SUMSUB_BA w, 7, 4, 1 ; m13=t2[w], m9=t6[w] + SCRATCH 7, 8, tmpq+10*%%str + SUMSUB_BA w, 5, 0, 1 ; m12=t3[w], 
m0=t7[w] + SCRATCH 5, 9, tmpq+14*%%str + + VP9_UNPACK_MULSUB_2D_4X 2, 3, 7, 5, 15137, 6270 ; m2/6=t5[d], m3/10=t4[d] + VP9_UNPACK_MULSUB_2D_4X 0, 4, 1, 6, 6270, 15137 ; m0/14=t6[d], m9/15=t7[d] + SCRATCH 6, 10, tmpq+ 0*%%str + VP9_RND_SH_SUMSUB_BA 0, 3, 1, 5, 6, [pd_8192] + UNSCRATCH 6, 10, tmpq+ 0*%%str + PSIGNW m0, [pw_m1] ; m0=out3[w], m3=t6[w] + VP9_RND_SH_SUMSUB_BA 4, 2, 6, 7, 5, [pd_8192] ; m9=out12[w], m2=t7[w] + + UNSCRATCH 1, 8, tmpq+10*%%str + UNSCRATCH 5, 9, tmpq+14*%%str + UNSCRATCH 6, 12, tmpq+ 1*%%str + UNSCRATCH 7, 13, tmpq+ 4*%%str + SCRATCH 4, 9, tmpq+14*%%str + + SUMSUB_BA w, 1, 6, 4 ; m13=out0[w], m1=t2[w] + SUMSUB_BA w, 5, 7, 4 + PSIGNW m5, [pw_m1] ; m12=out15[w], m8=t3[w] + + ; unfortunately, the code below overflows in some cases, e.g. + ; http://downloads.webmproject.org/test_data/libvpx/vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm +%if 0 ; cpuflag(ssse3) + SUMSUB_BA w, 7, 6, 4 + pmulhrsw m7, [pw_m11585x2] ; m8=out7[w] + pmulhrsw m6, [pw_11585x2] ; m1=out8[w] + SWAP 6, 7 + SUMSUB_BA w, 3, 2, 4 + pmulhrsw m3, [pw_11585x2] ; m3=out4[w] + pmulhrsw m2, [pw_11585x2] ; m2=out11[w] +%else + SCRATCH 5, 8, tmpq+10*%%str + VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4 + VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4 + UNSCRATCH 5, 8, tmpq+10*%%str +%endif + + ; m13=out0, m0=out3, m3=out4, m8=out7, m1=out8, m2=out11, m9=out12, m12=out15 + ; m11|r13=out1, m5=out2, m7=out5, r15=out6, r3=out9, r10=out10, r11=out13, r2=out14 + +%if %2 == 1 +%if ARCH_X86_64 + mova m13, [tmpq+ 6*%%str] + TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10 + mova [tmpq+ 0*16], m1 + mova [tmpq+ 2*16], m11 + mova [tmpq+ 4*16], m14 + mova [tmpq+ 6*16], m0 + mova m1, [tmpq+ 3*%%str] + mova m11, [tmpq+ 7*%%str] + mova m14, [tmpq+11*%%str] + mova m0, [tmpq+13*%%str] + mova [tmpq+ 8*16], m3 + mova [tmpq+10*16], m15 + mova [tmpq+12*16], m13 + mova [tmpq+14*16], m6 + + TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10 + mova [tmpq+ 1*16], m7 + mova [tmpq+ 3*16], m1 + mova 
[tmpq+ 5*16], m11 + mova [tmpq+ 7*16], m2 + mova [tmpq+ 9*16], m9 + mova [tmpq+11*16], m14 + mova [tmpq+13*16], m0 + mova [tmpq+15*16], m5 +%else + mova [tmpq+12*%%str], m2 + mova [tmpq+ 1*%%str], m5 + mova [tmpq+15*%%str], m7 + mova m2, [tmpq+ 9*%%str] + mova m5, [tmpq+ 5*%%str] + mova m7, [tmpq+ 8*%%str] + TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1 + mova [tmpq+ 0*16], m1 + mova [tmpq+ 2*16], m2 + mova [tmpq+ 4*16], m5 + mova [tmpq+ 6*16], m0 + mova [tmpq+10*16], m7 + mova m3, [tmpq+12*%%str] + mova [tmpq+12*16], m4 + mova m4, [tmpq+14*%%str] + mova [tmpq+14*16], m6 + + mova m0, [tmpq+15*%%str] + mova m1, [tmpq+ 3*%%str] + mova m2, [tmpq+ 7*%%str] + mova m5, [tmpq+11*%%str] + mova m7, [tmpq+ 1*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+13*%%str], [tmpq+ 9*%%str], 1 + mova [tmpq+ 1*16], m0 + mova [tmpq+ 3*16], m1 + mova [tmpq+ 5*16], m2 + mova [tmpq+ 7*16], m3 + mova [tmpq+11*16], m5 + mova [tmpq+13*16], m6 + mova [tmpq+15*16], m7 +%endif +%else + pxor m4, m4 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + +%if ARCH_X86_64 + mova m12, [tmpq+ 6*%%str] + VP9_IDCT8_WRITEx2 1, 11, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 14, 0, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m1, [tmpq+ 3*%%str] + mova m11, [tmpq+ 7*%%str] + mova m14, [tmpq+11*%%str] + mova m0, [tmpq+13*%%str] + + VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 9, 14, 10, 8, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + VP9_IDCT8_WRITEx2 0, 5, 10, 8, 4, ROUND_REG, 6 +%else + mova [tmpq+ 0*%%str], m2 + mova [tmpq+ 1*%%str], m5 + mova [tmpq+ 2*%%str], m7 + mova m2, [tmpq+ 9*%%str] + 
VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 5*%%str] + VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 8*%%str] + VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m5, [tmpq+ 6*%%str] + VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + + mova m0, [tmpq+ 2*%%str] + mova m3, [tmpq+ 3*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+ 7*%%str] + mova m3, [tmpq+ 0*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+14*%%str] + mova m3, [tmpq+11*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 + lea dstq, [dstq+strideq*2] + mova m0, [tmpq+13*%%str] + mova m3, [tmpq+ 1*%%str] + VP9_IDCT8_WRITEx2 0, 3, 1, 2, 4, ROUND_REG, 6 +%endif + + SWAP 0, 4 ; zero: m4 was cleared by the pxor at the start of this branch; rename it to m0 for the caller's ZERO_BLOCK +%undef ROUND_REG +%endif +%endmacro + +%macro IADST16_FN 5 ; type1, macro1, type2, macro2, opt (function-name parts and 1-D macros for pass 1 and pass 2, then the instruction set) +INIT_XMM %5 +cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp + mov cntd, 2 ; each pass runs its 1-D macro twice + mov tmpq, rsp ; pass-1 output goes to the stack: 2 iterations x 256 bytes +.loop1_full: + VP9_%2_1D blockq, 1 + add blockq, 16 + add tmpq, 256 + dec cntd + jg .loop1_full + sub blockq, 32 ; undo the two 16-byte advances made by the loop above + + mov cntd, 2 + mov tmpq, rsp + mov dst_bakq, dstq +.loop2_full: + VP9_%4_1D tmpq, 2 + lea dstq, [dst_bakq+8] + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m0 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +IADST16_FN idct, IDCT16, iadst, IADST16, sse2 +IADST16_FN iadst, IADST16, idct, IDCT16, sse2 +IADST16_FN iadst, IADST16, iadst, IADST16, sse2 +IADST16_FN idct, IDCT16, iadst, IADST16, ssse3 +IADST16_FN iadst, IADST16, idct, IDCT16, ssse3 +IADST16_FN iadst, IADST16, iadst, IADST16, ssse3 +IADST16_FN idct, IDCT16, iadst, IADST16, avx +IADST16_FN iadst, IADST16, idct, IDCT16, avx +IADST16_FN iadst, IADST16, iadst, IADST16, avx + +; in: data in m[0-15] except m0/m4, which are in 
[blockq+0] and [blockq+128] +; out: m[0-15] except m6, which is in [blockq+192] +; uses blockq as scratch space +%macro VP9_IADST16_YMM_1D 0 + mova [blockq+ 32], m3 + mova [blockq+ 64], m7 + mova [blockq+ 96], m8 + + ; first half of round 1 + VP9_UNPACK_MULSUB_2D_4X 9, 6, 0, 3, 13160, 9760 ; m9/x=t7[d], m6/x=t6[d] + VP9_UNPACK_MULSUB_2D_4X 1, 14, 4, 7, 2404, 16207 ; m1/x=t15[d], m14/x=t14[d] + VP9_RND_SH_SUMSUB_BA 14, 6, 7, 3, 8, [pd_8192] ; m14=t6[w], m6=t14[w] + VP9_RND_SH_SUMSUB_BA 1, 9, 4, 0, 8, [pd_8192] ; m1=t7[w], m9=t15[w] + + VP9_UNPACK_MULSUB_2D_4X 13, 2, 4, 7, 15893, 3981 ; m13/x=t3[d], m2/x=t2[d] + VP9_UNPACK_MULSUB_2D_4X 5, 10, 0, 3, 8423, 14053 ; m5/x=t11[d], m10/x=t10[d] + VP9_RND_SH_SUMSUB_BA 10, 2, 3, 7, 8, [pd_8192] ; m10=t2[w], m2=t10[w] + VP9_RND_SH_SUMSUB_BA 5, 13, 0, 4, 8, [pd_8192] ; m5=t3[w], m13=t11[w] + + ; half of round 2 t8-15 + VP9_UNPACK_MULSUB_2D_4X 2, 13, 4, 7, 9102, 13623 ; m2/x=t11[d], m13/x=t10[d] + VP9_UNPACK_MULSUB_2D_4X 9, 6, 3, 0, 13623, 9102 ; m9/x=t14[d], m6/x=t15[d] + VP9_RND_SH_SUMSUB_BA 9, 13, 3, 7, 8, [pd_8192] ; m9=t10[w], m13=t14[w] + VP9_RND_SH_SUMSUB_BA 6, 2, 0, 4, 8, [pd_8192] ; m6=t11[w], m2=t15[w] + + SUMSUB_BA w, 14, 10, 8 ; m14=t2, m10=t6 + SUMSUB_BA w, 1, 5, 8 ; m1=t3, m5=t7 + + mova m0, [blockq+ 0] + mova m4, [blockq+128] + mova m3, [blockq+ 32] + mova m7, [blockq+ 64] + mova m8, [blockq+ 96] + mova [blockq+ 0], m1 + mova [blockq+128], m14 + mova [blockq+ 32], m6 + mova [blockq+ 64], m9 + mova [blockq+ 96], m10 + + ; second half of round 1 + VP9_UNPACK_MULSUB_2D_4X 15, 0, 1, 9, 16364, 804 ; m15/x=t1[d], m0/x=t0[d] + VP9_UNPACK_MULSUB_2D_4X 7, 8, 10, 6, 11003, 12140 ; m7/x=t9[d], m8/x=t8[d] + VP9_RND_SH_SUMSUB_BA 8, 0, 6, 9, 14, [pd_8192] ; m8=t0[w], m0=t8[w] + VP9_RND_SH_SUMSUB_BA 7, 15, 10, 1, 14, [pd_8192] ; m7=t1[w], m15=t9[w] + + VP9_UNPACK_MULSUB_2D_4X 11, 4, 10, 6, 14811, 7005 ; m11/x=t5[d], m4/x=t4[d] + VP9_UNPACK_MULSUB_2D_4X 3, 12, 1, 9, 5520, 15426 ; m3/x=t13[d], m12/x=t12[d] + VP9_RND_SH_SUMSUB_BA 
12, 4, 9, 6, 14, [pd_8192] ; m12=t4[w], m4=t12[w] + VP9_RND_SH_SUMSUB_BA 3, 11, 1, 10, 14, [pd_8192] ; m3=t5[w], m11=t13[w] + + ; second half of round 2 t8-15 + VP9_UNPACK_MULSUB_2D_4X 0, 15, 6, 10, 16069, 3196 ; m15/x=t8[d], m0/x=t9[d] + VP9_UNPACK_MULSUB_2D_4X 11, 4, 9, 1, 3196, 16069 ; m11/x=t12[d], m4/x=t13[d] + VP9_RND_SH_SUMSUB_BA 11, 15, 9, 10, 14, [pd_8192] ; m11=t8[w], m15=t12[w] + VP9_RND_SH_SUMSUB_BA 4, 0, 1, 6, 14, [pd_8192] ; m4=t9[w], m0=t13[w] + + SUMSUB_BA w, 12, 8, 14 ; m12=t0, m8=t4 + SUMSUB_BA w, 3, 7, 14 ; m3=t1, m7=t5 + + mova m10, [blockq+ 96] + mova [blockq+ 96], m12 + + ; round 3 + VP9_UNPACK_MULSUB_2D_4X 15, 0, 9, 12, 15137, 6270 ; m15/x=t13[d], m0/x=t12[d] + VP9_UNPACK_MULSUB_2D_4X 2, 13, 1, 6, 6270, 15137 ; m2/x=t14[d], m13/x=t15[d] + VP9_RND_SH_SUMSUB_BA 2, 0, 1, 12, 14, [pd_8192] ; m2=out2[w], m0=t14a[w] + VP9_RND_SH_SUMSUB_BA 13, 15, 6, 9, 14, [pd_8192] + PSIGNW m13, [pw_m1] ; m13=out13[w], m15=t15a[w] + + VP9_UNPACK_MULSUB_2D_4X 8, 7, 12, 9, 15137, 6270 ; m8/x=t5[d], m7/x=t4[d] + VP9_UNPACK_MULSUB_2D_4X 5, 10, 1, 6, 6270, 15137 ; m5/x=t6[d], m10/x=t7[d] + VP9_RND_SH_SUMSUB_BA 5, 7, 1, 9, 14, [pd_8192] + PSIGNW m5, [pw_m1] ; m5=out3[w], m7=t6[w] + VP9_RND_SH_SUMSUB_BA 10, 8, 6, 12, 14, [pd_8192] ; m10=out12[w], m8=t7[w] + + mova m1, [blockq+ 0] + mova m14, [blockq+128] + mova m6, [blockq+ 32] + mova m9, [blockq+ 64] + mova m12, [blockq+ 96] + mova [blockq+ 0], m10 + mova [blockq+128], m5 + + SUMSUB_BA w, 14, 12, 5 ; m14=out0, m12=t2a + SUMSUB_BA w, 1, 3, 5 + PSIGNW m1, [pw_m1] ; m1=out15, m3=t3a + + SUMSUB_BA w, 9, 11, 5 + PSIGNW m9, [pw_m1] ; m9=out1, m11=t10 + SUMSUB_BA w, 6, 4, 5 ; m6=out14, m4=t11 + + VP9_UNPACK_MULSUB_2W_4X 4, 11, 11585, 11585, [pd_8192], 5, 10 ; m4=out9, m11=out6 + mova m5, [blockq+128] + mova [blockq+192], m11 + PSIGNW m15, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 15, 0, 11585, 11585, [pd_8192], 10, 11 ; m15=out5, m0=out10 + + PSIGNW m3, [pw_m1] + VP9_UNPACK_MULSUB_2W_4X 3, 12, 11585, 11585, [pd_8192], 10, 11 ; 
m3=out7,m12=out8 + VP9_UNPACK_MULSUB_2W_4X 8, 7, 11585, 11585, [pd_8192], 10, 11 ; m8=out11,m7=out4 + + mova m10, [blockq+ 0] ; reload the value stored to [blockq+0] earlier in this macro (out12 per the comment there) + + SWAP 0, 14, 6, 11, 8, 12, 10 + SWAP 1, 9, 15, 4, 7, 3, 5 + SWAP 5, 9, 15 +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +%macro IADST16_YMM_FN 4 ; type1, macro1, type2, macro2 (function-name parts and 1-D macros for pass 1 and pass 2) +INIT_YMM avx2 +cglobal vp9_%1_%3_16x16_add, 4, 4, 16, dst, stride, block, eob + mova m1, [blockq+ 32] ; rows 0 and 4 stay in memory; the 1-D macros read them from blockq + mova m2, [blockq+ 64] + mova m3, [blockq+ 96] + mova m5, [blockq+160] + mova m6, [blockq+192] + mova m7, [blockq+224] + mova m8, [blockq+256] + mova m9, [blockq+288] + mova m10, [blockq+320] + mova m11, [blockq+352] + mova m12, [blockq+384] + mova m13, [blockq+416] + mova m14, [blockq+448] + mova m15, [blockq+480] + + VP9_%2_YMM_1D + TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + [blockq+192], [blockq+128], 1 + mova [blockq+ 0], m0 ; the second pass reads row 0 back from [blockq+0] + VP9_%4_YMM_1D + + mova [blockq+224], m7 ; save out7: m6/m7 are passed as the 3rd/4th operands of the next three stores and reloaded afterwards + + ; store + VP9_IDCT8_WRITEx2 0, 1, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 2, 3, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 4, 5, 6, 7, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + mova m6, [blockq+192] + mova m7, [blockq+224] + VP9_IDCT8_WRITEx2 6, 7, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 8, 9, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 10, 11, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 12, 13, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + VP9_IDCT8_WRITEx2 14, 15, 1, 2, unused, [pw_512], 6 + lea dstq, [dstq+2*strideq] + + ; m0 carried output data above, so clear it explicitly here + ; and use it to zero out block coefficients + pxor m0, m0 + ZERO_BLOCK blockq, 32, 16, m0 + RET +%endmacro + +IADST16_YMM_FN idct, IDCT16, iadst, IADST16 +IADST16_YMM_FN iadst, IADST16, idct, IDCT16 +IADST16_YMM_FN iadst, IADST16, iadst, IADST16 +%endif + 
+;--------------------------------------------------------------------------------------------- +; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +;--------------------------------------------------------------------------------------------- + +%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc +%if %2 == 1 +%assign %%str mmsize +%else +%assign %%str 64 +%endif + + ; first do t0-15, this can be done identical to idct16x16 + VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1 + + ; store everything on stack to make space available for t16-31 + ; we store interleaved with the output of the second half (t16-31) + ; so we don't need to allocate extra stack space + mova [tmpq+ 0*%%str], m0 ; t0 + mova [tmpq+ 4*%%str], m1 ; t1 + mova [tmpq+ 8*%%str], m2 ; t2 + mova [tmpq+12*%%str], m3 ; t3 + mova [tmpq+16*%%str], m4 ; t4 + mova [tmpq+20*%%str], m5 ; t5 +%if ARCH_X86_64 + mova [tmpq+22*%%str], m10 ; t10 + mova [tmpq+18*%%str], m11 ; t11 + mova [tmpq+14*%%str], m12 ; t12 + mova [tmpq+10*%%str], m13 ; t13 + mova [tmpq+ 6*%%str], m14 ; t14 + mova [tmpq+ 2*%%str], m15 ; t15 +%endif + + mova m0, [tmpq+ 30*%%str] + UNSCRATCH 1, 6, tmpq+26*%%str + UNSCRATCH 2, 8, tmpq+24*%%str + UNSCRATCH 3, 9, tmpq+28*%%str + SUMSUB_BA w, 1, 3, 4 ; t6, t9 + SUMSUB_BA w, 0, 2, 4 ; t7, t8 + + mova [tmpq+24*%%str], m1 ; t6 + mova [tmpq+28*%%str], m0 ; t7 + mova [tmpq+30*%%str], m2 ; t8 + mova [tmpq+26*%%str], m3 ; t9 + + ; then, secondly, do t16-31 +%if %3 <= 8 + mova m4, [%1+ 1*64] + mova m7, [%1+ 7*64] + + pmulhrsw m1, m4, [pw_16364x2] ;t31 + pmulhrsw m4, [pw_804x2] ;t16 + + VP9_UNPACK_MULSUB_2W_4X 5, 0, 1, 4, 16069, 3196, [pd_8192], 6, 2 ; t17, t30 + + pmulhrsw m3, m7, [pw_m5520x2] ;t19 + pmulhrsw m7, [pw_15426x2] ;t28 + + SCRATCH 4, 13, tmpq+ 1*%%str + SCRATCH 5, 12, tmpq+15*%%str + + VP9_UNPACK_MULSUB_2W_4X 2, 6, 7, 3, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 +%else + mova m0, [%1+ 1*64] + mova m1, [%1+15*64] +%if %3 <= 16 + pmulhrsw m5, m0, 
[pw_16364x2] + pmulhrsw m0, [pw_804x2] + pmulhrsw m4, m1, [pw_m11003x2] + pmulhrsw m1, [pw_12140x2] +%else + mova m4, [%1+17*64] + mova m5, [%1+31*64] + + VP9_UNPACK_MULSUB_2W_4X 0, 5, 16364, 804, [pd_8192], 2, 3 ; t16, t31 + VP9_UNPACK_MULSUB_2W_4X 4, 1, 11003, 12140, [pd_8192], 2, 3 ; t17, t30 +%endif + SUMSUB_BA w, 4, 0, 2 + SUMSUB_BA w, 1, 5, 2 + + VP9_UNPACK_MULSUB_2W_4X 5, 0, 16069, 3196, [pd_8192], 2, 3 ; t17, t30 + + SCRATCH 4, 13, tmpq+ 1*%%str + SCRATCH 5, 12, tmpq+15*%%str + + mova m2, [%1+ 7*64] + mova m3, [%1+ 9*64] +%if %3 <= 16 + pmulhrsw m7, m3, [pw_14811x2] + pmulhrsw m3, [pw_7005x2] + pmulhrsw m6, m2, [pw_m5520x2] + pmulhrsw m2, [pw_15426x2] +%else + mova m7, [%1+23*64] + mova m6, [%1+25*64] + + VP9_UNPACK_MULSUB_2W_4X 3, 7, 14811, 7005, [pd_8192], 4, 5 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 6, 2, 5520, 15426, [pd_8192], 4, 5 ; t19, t28 +%endif + SUMSUB_BA w, 3, 6, 4 + SUMSUB_BA w, 7, 2, 4 + + VP9_UNPACK_MULSUB_2W_4X 2, 6, 3196, m16069, [pd_8192], 4, 5 ; t18, t29 +%endif + + UNSCRATCH 5, 12, tmpq+15*%%str + SUMSUB_BA w, 6, 0, 4 + mova [tmpq+25*%%str], m6 ; t19 + UNSCRATCH 4, 13, tmpq+ 1*%%str + SUMSUB_BA w, 7, 1, 6 + SUMSUB_BA w, 3, 4, 6 + mova [tmpq+23*%%str], m3 ; t16 + SUMSUB_BA w, 2, 5, 6 + + VP9_UNPACK_MULSUB_2W_4X 0, 5, 15137, 6270, [pd_8192], 6, 3 ; t18, t29 + VP9_UNPACK_MULSUB_2W_4X 1, 4, 15137, 6270, [pd_8192], 6, 3 ; t19, t28 + + SCRATCH 0, 10, tmpq+ 1*%%str + SCRATCH 1, 11, tmpq+ 7*%%str + SCRATCH 2, 9, tmpq+ 9*%%str + SCRATCH 4, 14, tmpq+15*%%str + SCRATCH 5, 15, tmpq+17*%%str + SCRATCH 7, 13, tmpq+31*%%str + +%if %3 <= 8 + mova m0, [%1+ 5*64] + mova m3, [%1+ 3*64] + + pmulhrsw m5, m0, [pw_15893x2] ;t27 + pmulhrsw m0, [pw_3981x2] ;t20 + + VP9_UNPACK_MULSUB_2W_4X 1, 4, 5, 0, 9102, 13623, [pd_8192], 7, 2 ; t21, t26 + + pmulhrsw m6, m3, [pw_m2404x2] ;t23 + pmulhrsw m3, [pw_16207x2] ;t24 + + SCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 12, tmpq+11*%%str + + VP9_UNPACK_MULSUB_2W_4X 7, 2, 3, 6, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 +%else + 
mova m4, [%1+ 5*64] + mova m5, [%1+11*64] +%if %3 <= 16 + pmulhrsw m1, m4, [pw_15893x2] + pmulhrsw m4, [pw_3981x2] + pmulhrsw m0, m5, [pw_m8423x2] + pmulhrsw m5, [pw_14053x2] +%else + mova m0, [%1+21*64] + mova m1, [%1+27*64] + + VP9_UNPACK_MULSUB_2W_4X 4, 1, 15893, 3981, [pd_8192], 2, 3 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 0, 5, 8423, 14053, [pd_8192], 2, 3 ; t21, t26 +%endif + SUMSUB_BA w, 0, 4, 2 + SUMSUB_BA w, 5, 1, 2 + + VP9_UNPACK_MULSUB_2W_4X 1, 4, 9102, 13623, [pd_8192], 2, 3 ; t21, t26 + + SCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 12, tmpq+11*%%str + + mova m7, [%1+ 3*64] + mova m6, [%1+13*64] +%if %3 <= 16 + pmulhrsw m3, m6, [pw_13160x2] + pmulhrsw m6, [pw_9760x2] + pmulhrsw m2, m7, [pw_m2404x2] + pmulhrsw m7, [pw_16207x2] +%else + mova m2, [%1+29*64] + mova m3, [%1+19*64] + VP9_UNPACK_MULSUB_2W_4X 6, 3, 13160, 9760, [pd_8192], 4, 5 ; t22, t25 + VP9_UNPACK_MULSUB_2W_4X 2, 7, 2404, 16207, [pd_8192], 4, 5 ; t23, t24 +%endif + SUMSUB_BA w, 6, 2, 4 + SUMSUB_BA w, 3, 7, 4 + + VP9_UNPACK_MULSUB_2W_4X 7, 2, 13623, m9102, [pd_8192], 4, 5 ; t22, t25 +%endif + + ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, + ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 + + UNSCRATCH 4, 12, tmpq+11*%%str + SUMSUB_BA w, 0, 6, 5 + SUMSUB_BA w, 4, 2, 5 + UNSCRATCH 5, 8, tmpq+ 5*%%str + SCRATCH 4, 8, tmpq+11*%%str + SUMSUB_BA w, 1, 7, 4 + SUMSUB_BA w, 5, 3, 4 + SCRATCH 5, 12, tmpq+ 5*%%str + + VP9_UNPACK_MULSUB_2W_4X 3, 6, 6270, m15137, [pd_8192], 4, 5 ; t20, t27 + VP9_UNPACK_MULSUB_2W_4X 2, 7, 6270, m15137, [pd_8192], 4, 5 ; t21, t26 + + ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23, + ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31 + + UNSCRATCH 5, 9, tmpq+ 9*%%str + mova m4, [tmpq+23*%%str] ; t16 +%if ARCH_X86_64 + SUMSUB_BA w, 1, 5, 9 + SUMSUB_BA w, 0, 4, 9 +%else + SUMSUB_BADC w, 1, 5, 0, 4 +%endif + mova [tmpq+29*%%str], m1 ; t17 + mova [tmpq+21*%%str], m0 ; t16 + UNSCRATCH 0, 10, tmpq+ 
1*%%str + UNSCRATCH 1, 11, tmpq+ 7*%%str +%if ARCH_X86_64 + SUMSUB_BA w, 2, 0, 9 + SUMSUB_BA w, 3, 1, 9 +%else + SUMSUB_BADC w, 2, 0, 3, 1 +%endif + mova [tmpq+ 9*%%str], m2 ; t18 + mova [tmpq+13*%%str], m3 ; t19 + SCRATCH 0, 10, tmpq+23*%%str + SCRATCH 1, 11, tmpq+27*%%str + + UNSCRATCH 2, 14, tmpq+15*%%str + UNSCRATCH 3, 15, tmpq+17*%%str + SUMSUB_BA w, 6, 2, 0 + SUMSUB_BA w, 7, 3, 0 + SCRATCH 6, 14, tmpq+ 3*%%str + SCRATCH 7, 15, tmpq+ 7*%%str + + UNSCRATCH 0, 8, tmpq+11*%%str + mova m1, [tmpq+25*%%str] ; t19 + UNSCRATCH 6, 12, tmpq+ 5*%%str + UNSCRATCH 7, 13, tmpq+31*%%str +%if ARCH_X86_64 + SUMSUB_BA w, 0, 1, 9 + SUMSUB_BA w, 6, 7, 9 +%else + SUMSUB_BADC w, 0, 1, 6, 7 +%endif + + ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23, + ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31 + +%if 0; cpuflag(ssse3) +%if ARCH_X86_64 + SUMSUB_BA w, 4, 7, 8 + SUMSUB_BA w, 5, 1, 8 +%else + SUMSUB_BADC w, 4, 7, 5, 1 +%endif + + pmulhrsw m7, [pw_11585x2] + pmulhrsw m4, [pw_11585x2] + pmulhrsw m1, [pw_11585x2] + pmulhrsw m5, [pw_11585x2] + + mova [tmpq+ 5*%%str], m7 ; t23 + SCRATCH 1, 13, tmpq+25*%%str + UNSCRATCH 7, 10, tmpq+23*%%str + UNSCRATCH 1, 11, tmpq+27*%%str + +%if ARCH_X86_64 + SUMSUB_BA w, 7, 3, 10 + SUMSUB_BA w, 1, 2, 10 +%else + SUMSUB_BADC w, 7, 3, 1, 2 +%endif + + pmulhrsw m3, [pw_11585x2] + pmulhrsw m7, [pw_11585x2] + pmulhrsw m2, [pw_11585x2] + pmulhrsw m1, [pw_11585x2] +%else + SCRATCH 0, 8, tmpq+15*%%str + SCRATCH 6, 9, tmpq+17*%%str + VP9_UNPACK_MULSUB_2W_4X 7, 4, 11585, 11585, [pd_8192], 0, 6 + mova [tmpq+ 5*%%str], m7 ; t23 + UNSCRATCH 7, 10, tmpq+23*%%str + VP9_UNPACK_MULSUB_2W_4X 1, 5, 11585, 11585, [pd_8192], 0, 6 + SCRATCH 1, 13, tmpq+25*%%str + UNSCRATCH 1, 11, tmpq+27*%%str + VP9_UNPACK_MULSUB_2W_4X 3, 7, 11585, 11585, [pd_8192], 0, 6 + VP9_UNPACK_MULSUB_2W_4X 2, 1, 11585, 11585, [pd_8192], 0, 6 + UNSCRATCH 0, 8, tmpq+15*%%str + UNSCRATCH 6, 9, tmpq+17*%%str +%endif + + ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, 
m5=t21, m6=t22, m7=t23, + ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31 + + ; then do final pass to sumsub+store the two halves +%if %2 == 1 + mova [tmpq+17*%%str], m2 ; t20 + mova [tmpq+ 1*%%str], m3 ; t21 +%if ARCH_X86_64 + mova [tmpq+25*%%str], m13 ; t22 + + mova m8, [tmpq+ 0*%%str] ; t0 + mova m9, [tmpq+ 4*%%str] ; t1 + mova m12, [tmpq+ 8*%%str] ; t2 + mova m11, [tmpq+12*%%str] ; t3 + mova m2, [tmpq+16*%%str] ; t4 + mova m3, [tmpq+20*%%str] ; t5 + mova m13, [tmpq+24*%%str] ; t6 + + SUMSUB_BA w, 6, 8, 10 + mova [tmpq+ 3*%%str], m8 ; t15 + SUMSUB_BA w, 0, 9, 8 + SUMSUB_BA w, 15, 12, 8 + SUMSUB_BA w, 14, 11, 8 + SUMSUB_BA w, 1, 2, 8 + SUMSUB_BA w, 7, 3, 8 + SUMSUB_BA w, 5, 13, 8 + mova m10, [tmpq+28*%%str] ; t7 + SUMSUB_BA w, 4, 10, 8 +%if cpuflag(avx2) + ; the awkward part of this idct is that the final pass does the outermost + ; interleave sumsubs (t0/31, t1/30, etc) but the tN for the 16x16 need + ; to be sequential, which means I need to load/store half of the sumsub + ; intermediates back to/from memory to get a 16x16 transpose going... + ; This would be easier if we had more (e.g. 32) YMM regs here. 
+ mova [tmpq+ 7*%%str], m9 + mova [tmpq+11*%%str], m12 + mova [tmpq+15*%%str], m11 + mova [tmpq+19*%%str], m2 + mova [tmpq+23*%%str], m3 + mova [tmpq+27*%%str], m13 + mova [tmpq+31*%%str], m10 + mova [tmpq+12*%%str], m5 + + mova m13, [tmpq+30*%%str] ; t8 + mova m12, [tmpq+26*%%str] ; t9 + mova m11, [tmpq+22*%%str] ; t10 + mova m10, [tmpq+18*%%str] ; t11 + mova m9, [tmpq+17*%%str] ; t20 + mova m8, [tmpq+ 1*%%str] ; t21 + mova m3, [tmpq+25*%%str] ; t22 + mova m2, [tmpq+ 5*%%str] ; t23 + + SUMSUB_BA w, 9, 10, 5 + SUMSUB_BA w, 8, 11, 5 + SUMSUB_BA w, 3, 12, 5 + SUMSUB_BA w, 2, 13, 5 + mova [tmpq+ 1*%%str], m10 + mova [tmpq+ 5*%%str], m11 + mova [tmpq+17*%%str], m12 + mova [tmpq+25*%%str], m13 + + mova m13, [tmpq+14*%%str] ; t12 + mova m12, [tmpq+10*%%str] ; t13 + mova m11, [tmpq+ 9*%%str] ; t18 + mova m10, [tmpq+13*%%str] ; t19 + + SUMSUB_BA w, 11, 12, 5 + SUMSUB_BA w, 10, 13, 5 + mova [tmpq+ 9*%%str], m13 + mova [tmpq+13*%%str], m12 + mova [tmpq+10*%%str], m10 + mova [tmpq+14*%%str], m11 + + mova m13, [tmpq+ 6*%%str] ; t14 + mova m12, [tmpq+ 2*%%str] ; t15 + mova m11, [tmpq+21*%%str] ; t16 + mova m10, [tmpq+29*%%str] ; t17 + SUMSUB_BA w, 11, 12, 5 + SUMSUB_BA w, 10, 13, 5 + mova [tmpq+21*%%str], m12 + mova [tmpq+29*%%str], m13 + mova m12, [tmpq+10*%%str] + mova m13, [tmpq+14*%%str] + + TRANSPOSE16x16W 6, 0, 15, 14, 1, 7, 5, 4, \ + 2, 3, 8, 9, 12, 13, 10, 11, \ + [tmpq+12*%%str], [tmpq+ 8*%%str], 1 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 2*%%str], m0 + mova [tmpq+ 4*%%str], m15 + mova [tmpq+ 6*%%str], m14 + mova [tmpq+10*%%str], m7 + mova [tmpq+12*%%str], m5 + mova [tmpq+14*%%str], m4 + mova [tmpq+16*%%str], m2 + mova [tmpq+18*%%str], m3 + mova [tmpq+20*%%str], m8 + mova [tmpq+22*%%str], m9 + mova [tmpq+24*%%str], m12 + mova [tmpq+26*%%str], m13 + mova [tmpq+28*%%str], m10 + mova [tmpq+30*%%str], m11 + + mova m0, [tmpq+21*%%str] + mova m1, [tmpq+29*%%str] + mova m2, [tmpq+13*%%str] + mova m3, [tmpq+ 9*%%str] + mova m4, [tmpq+ 1*%%str] + mova m5, [tmpq+ 5*%%str] + mova 
m7, [tmpq+25*%%str] + mova m8, [tmpq+31*%%str] + mova m9, [tmpq+27*%%str] + mova m10, [tmpq+23*%%str] + mova m11, [tmpq+19*%%str] + mova m12, [tmpq+15*%%str] + mova m13, [tmpq+11*%%str] + mova m14, [tmpq+ 7*%%str] + mova m15, [tmpq+ 3*%%str] + TRANSPOSE16x16W 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + [tmpq+17*%%str], [tmpq+ 9*%%str], 1 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 3*%%str], m1 + mova [tmpq+ 5*%%str], m2 + mova [tmpq+ 7*%%str], m3 + mova [tmpq+11*%%str], m5 + mova [tmpq+13*%%str], m6 + mova [tmpq+15*%%str], m7 + mova [tmpq+17*%%str], m8 + mova [tmpq+19*%%str], m9 + mova [tmpq+21*%%str], m10 + mova [tmpq+23*%%str], m11 + mova [tmpq+25*%%str], m12 + mova [tmpq+27*%%str], m13 + mova [tmpq+29*%%str], m14 + mova [tmpq+31*%%str], m15 +%else ; !avx2 + TRANSPOSE8x8W 6, 0, 15, 14, 1, 7, 5, 4, 8 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 4*%%str], m0 + mova [tmpq+ 8*%%str], m15 + mova [tmpq+12*%%str], m14 + mova [tmpq+16*%%str], m1 + mova [tmpq+20*%%str], m7 + mova [tmpq+24*%%str], m5 + mova [tmpq+28*%%str], m4 + + mova m8, [tmpq+ 3*%%str] ; t15 + TRANSPOSE8x8W 10, 13, 3, 2, 11, 12, 9, 8, 0 + mova [tmpq+ 3*%%str], m10 + mova [tmpq+ 7*%%str], m13 + mova [tmpq+11*%%str], m3 + mova [tmpq+15*%%str], m2 + mova [tmpq+19*%%str], m11 + mova [tmpq+23*%%str], m12 + mova [tmpq+27*%%str], m9 + mova [tmpq+31*%%str], m8 + + mova m15, [tmpq+30*%%str] ; t8 + mova m14, [tmpq+26*%%str] ; t9 + mova m13, [tmpq+22*%%str] ; t10 + mova m12, [tmpq+18*%%str] ; t11 + mova m11, [tmpq+14*%%str] ; t12 + mova m10, [tmpq+10*%%str] ; t13 + mova m9, [tmpq+ 6*%%str] ; t14 + mova m8, [tmpq+ 2*%%str] ; t15 + mova m7, [tmpq+21*%%str] ; t16 + mova m6, [tmpq+29*%%str] ; t17 + mova m5, [tmpq+ 9*%%str] ; t18 + mova m4, [tmpq+13*%%str] ; t19 + mova m3, [tmpq+17*%%str] ; t20 + mova m2, [tmpq+ 1*%%str] ; t21 + mova m1, [tmpq+25*%%str] ; t22 + + SUMSUB_BA w, 7, 8, 0 + mova [tmpq+ 2*%%str], m8 + mova m0, [tmpq+ 5*%%str] ; t23 + SUMSUB_BA w, 6, 9, 8 + SUMSUB_BA w, 5, 10, 8 + SUMSUB_BA w, 4, 
11, 8 + SUMSUB_BA w, 3, 12, 8 + SUMSUB_BA w, 2, 13, 8 + SUMSUB_BA w, 1, 14, 8 + SUMSUB_BA w, 0, 15, 8 + + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 5*%%str], m1 + mova [tmpq+ 9*%%str], m2 + mova [tmpq+13*%%str], m3 + mova [tmpq+17*%%str], m4 + mova [tmpq+21*%%str], m5 + mova [tmpq+25*%%str], m6 + mova [tmpq+29*%%str], m7 + + mova m8, [tmpq+ 2*%%str] + TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0 + mova [tmpq+ 2*%%str], m8 + mova [tmpq+ 6*%%str], m9 + mova [tmpq+10*%%str], m10 + mova [tmpq+14*%%str], m11 + mova [tmpq+18*%%str], m12 + mova [tmpq+22*%%str], m13 + mova [tmpq+26*%%str], m14 + mova [tmpq+30*%%str], m15 +%endif ; avx2 +%else + mova m2, [tmpq+24*%%str] ; t6 + mova m3, [tmpq+28*%%str] ; t7 + SUMSUB_BADC w, 5, 2, 4, 3 + mova [tmpq+24*%%str], m5 + mova [tmpq+23*%%str], m2 + mova [tmpq+28*%%str], m4 + mova [tmpq+19*%%str], m3 + + mova m2, [tmpq+16*%%str] ; t4 + mova m3, [tmpq+20*%%str] ; t5 + SUMSUB_BA w, 1, 2, 5 + SUMSUB_BA w, 7, 3, 5 + mova [tmpq+15*%%str], m2 + mova [tmpq+11*%%str], m3 + + mova m2, [tmpq+ 0*%%str] ; t0 + mova m3, [tmpq+ 4*%%str] ; t1 + SUMSUB_BA w, 6, 2, 5 + SUMSUB_BA w, 0, 3, 5 + mova [tmpq+31*%%str], m2 + mova [tmpq+27*%%str], m3 + + mova m2, [tmpq+ 8*%%str] ; t2 + mova m3, [tmpq+12*%%str] ; t3 + mova m5, [tmpq+ 7*%%str] + mova m4, [tmpq+ 3*%%str] + SUMSUB_BADC w, 5, 2, 4, 3 + mova [tmpq+ 7*%%str], m2 + mova [tmpq+ 3*%%str], m3 + + mova m3, [tmpq+28*%%str] + TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+24*%%str], [tmpq+16*%%str], 1 + mova [tmpq+ 0*%%str], m6 + mova [tmpq+ 4*%%str], m0 + mova [tmpq+ 8*%%str], m5 + mova [tmpq+12*%%str], m4 + mova [tmpq+20*%%str], m7 + mova [tmpq+24*%%str], m2 + mova [tmpq+28*%%str], m3 + + mova m6, [tmpq+19*%%str] + mova m0, [tmpq+23*%%str] + mova m5, [tmpq+11*%%str] + mova m4, [tmpq+15*%%str] + mova m1, [tmpq+ 3*%%str] + mova m7, [tmpq+ 7*%%str] + mova m3, [tmpq+31*%%str] + TRANSPOSE8x8W 6, 0, 5, 4, 1, 7, 2, 3, [tmpq+27*%%str], [tmpq+19*%%str], 1 + mova [tmpq+ 
3*%%str], m6 + mova [tmpq+ 7*%%str], m0 + mova [tmpq+11*%%str], m5 + mova [tmpq+15*%%str], m4 + mova [tmpq+23*%%str], m7 + mova [tmpq+27*%%str], m2 + mova [tmpq+31*%%str], m3 + + mova m1, [tmpq+ 6*%%str] ; t14 + mova m0, [tmpq+ 2*%%str] ; t15 + mova m7, [tmpq+21*%%str] ; t16 + mova m6, [tmpq+29*%%str] ; t17 + SUMSUB_BA w, 7, 0, 2 + SUMSUB_BA w, 6, 1, 2 + mova [tmpq+29*%%str], m7 + mova [tmpq+ 2*%%str], m0 + mova [tmpq+21*%%str], m6 + mova [tmpq+ 6*%%str], m1 + + mova m1, [tmpq+14*%%str] ; t12 + mova m0, [tmpq+10*%%str] ; t13 + mova m5, [tmpq+ 9*%%str] ; t18 + mova m4, [tmpq+13*%%str] ; t19 + SUMSUB_BA w, 5, 0, 2 + SUMSUB_BA w, 4, 1, 2 + mova [tmpq+10*%%str], m0 + mova [tmpq+14*%%str], m1 + + mova m1, [tmpq+22*%%str] ; t10 + mova m0, [tmpq+18*%%str] ; t11 + mova m3, [tmpq+17*%%str] ; t20 + mova m2, [tmpq+ 1*%%str] ; t21 + SUMSUB_BA w, 3, 0, 6 + SUMSUB_BA w, 2, 1, 6 + mova [tmpq+18*%%str], m0 + mova [tmpq+22*%%str], m1 + + mova m7, [tmpq+30*%%str] ; t8 + mova m6, [tmpq+26*%%str] ; t9 + mova m1, [tmpq+25*%%str] ; t22 + mova m0, [tmpq+ 5*%%str] ; t23 + SUMSUB_BADC w, 1, 6, 0, 7 + mova [tmpq+26*%%str], m6 + mova [tmpq+30*%%str], m7 + + mova m7, [tmpq+29*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+21*%%str], [tmpq+17*%%str], 1 + mova [tmpq+ 1*%%str], m0 + mova [tmpq+ 5*%%str], m1 + mova [tmpq+ 9*%%str], m2 + mova [tmpq+13*%%str], m3 + mova [tmpq+21*%%str], m5 + mova [tmpq+25*%%str], m6 + mova [tmpq+29*%%str], m7 + + mova m0, [tmpq+ 2*%%str] + mova m1, [tmpq+ 6*%%str] + mova m2, [tmpq+10*%%str] + mova m3, [tmpq+14*%%str] + mova m4, [tmpq+18*%%str] + mova m5, [tmpq+22*%%str] + mova m7, [tmpq+30*%%str] + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [tmpq+26*%%str], [tmpq+18*%%str], 1 + mova [tmpq+ 2*%%str], m0 + mova [tmpq+ 6*%%str], m1 + mova [tmpq+10*%%str], m2 + mova [tmpq+14*%%str], m3 + mova [tmpq+22*%%str], m5 + mova [tmpq+26*%%str], m6 + mova [tmpq+30*%%str], m7 +%endif +%else + ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str] + ; t8-15 is in 
[tmpq+{2,6,10,14,18,22,26,30}*%%str] + ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str] + ; t20-22 is in m4-6 + ; t24-31 is in m8-15 + +%if cpuflag(ssse3) +%define ROUND_REG [pw_512] +%else +%define ROUND_REG [pw_32] +%endif + +%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs + SUMSUB_BA w, %4, %1, %5 + SUMSUB_BA w, %3, %2, %5 + VP9_IDCT8_WRITEx2 %4, %3, %5, %6, %7, ROUND_REG, 6 +%if %8 == 1 + add dstq, stride2q +%endif + VP9_IDCT8_WRITEx2 %2, %1, %5, %6, %7, ROUND_REG, 6, dst_endq +%if %8 == 1 + sub dst_endq, stride2q +%endif +%endmacro + +%if ARCH_X86_64 + pxor m10, m10 + + ; store t0-1 and t30-31 + mova m8, [tmpq+ 0*%%str] + mova m9, [tmpq+ 4*%%str] + %%STORE_2X2 8, 9, 0, 6, 12, 11, 10 + + ; store t2-3 and t28-29 + mova m8, [tmpq+ 8*%%str] + mova m9, [tmpq+12*%%str] + %%STORE_2X2 8, 9, 14, 15, 12, 11, 10 + + ; store t4-5 and t26-27 + mova m8, [tmpq+16*%%str] + mova m9, [tmpq+20*%%str] + %%STORE_2X2 8, 9, 7, 1, 12, 11, 10 + + ; store t6-7 and t24-25 + mova m8, [tmpq+24*%%str] + mova m9, [tmpq+28*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 + + ; store t8-9 and t22-23 + mova m8, [tmpq+30*%%str] + mova m9, [tmpq+26*%%str] + mova m0, [tmpq+ 5*%%str] + %%STORE_2X2 8, 9, 13, 0, 12, 11, 10 + + ; store t10-11 and t20-21 + mova m8, [tmpq+22*%%str] + mova m9, [tmpq+18*%%str] + %%STORE_2X2 8, 9, 2, 3, 12, 11, 10 + + ; store t12-13 and t18-19 + mova m8, [tmpq+14*%%str] + mova m9, [tmpq+10*%%str] + mova m5, [tmpq+13*%%str] + mova m4, [tmpq+ 9*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10 + + ; store t14-17 + mova m8, [tmpq+ 6*%%str] + mova m9, [tmpq+ 2*%%str] + mova m5, [tmpq+29*%%str] + mova m4, [tmpq+21*%%str] + %%STORE_2X2 8, 9, 4, 5, 12, 11, 10, 0 + + SWAP 1, 10 ; zero +%else + mova [tmpq+ 1*%%str], m1 + mova [tmpq+11*%%str], m2 + mova [tmpq+15*%%str], m3 + mova [tmpq+17*%%str], m4 + mova [tmpq+19*%%str], m5 + pxor m1, m1 + + ; store t0-1 and t30-31 + mova m2, [tmpq+ 0*%%str] + mova m3, [tmpq+ 4*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t2-3 and 
t28-29 + mova m2, [tmpq+ 8*%%str] + mova m3, [tmpq+12*%%str] + mova m0, [tmpq+ 3*%%str] + mova m6, [tmpq+ 7*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t4-5 and t26-27 + mova m2, [tmpq+16*%%str] + mova m3, [tmpq+20*%%str] + mova m0, [tmpq+ 1*%%str] + %%STORE_2X2 2, 3, 7, 0, 4, 5, 1 + + ; store t6-7 and t24-25 + mova m2, [tmpq+24*%%str] + mova m3, [tmpq+28*%%str] + mova m0, [tmpq+17*%%str] + mova m6, [tmpq+19*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t8-9 and t22-23 + mova m2, [tmpq+30*%%str] + mova m3, [tmpq+26*%%str] + mova m0, [tmpq+25*%%str] + mova m6, [tmpq+ 5*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t10-11 and t20-21 + mova m2, [tmpq+22*%%str] + mova m3, [tmpq+18*%%str] + mova m0, [tmpq+11*%%str] + mova m6, [tmpq+15*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t12-13 and t18-19 + mova m2, [tmpq+14*%%str] + mova m3, [tmpq+10*%%str] + mova m6, [tmpq+13*%%str] + mova m0, [tmpq+ 9*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1 + + ; store t14-17 + mova m2, [tmpq+ 6*%%str] + mova m3, [tmpq+ 2*%%str] + mova m6, [tmpq+29*%%str] + mova m0, [tmpq+21*%%str] + %%STORE_2X2 2, 3, 0, 6, 4, 5, 1, 0 +%endif +%undef ROUND_REG +%endif +%endmacro + +%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1 +INIT_XMM %1 +cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob + movifnidn eobd, dword eobm +%if cpuflag(ssse3) + cmp eobd, 135 + jg .idctfull + cmp eobd, 34 + jg .idct16x16 + cmp eobd, 1 + jg .idct8x8 +%else + cmp eobd, 1 + jg .idctfull +%endif + + ; dc-only case + movifnidn blockq, blockmp + movifnidn dstq, dstmp + movifnidn strideq, stridemp +%if cpuflag(ssse3) + movd m0, [blockq] + mova m1, [pw_11585x2] + pmulhrsw m0, m1 + pmulhrsw m0, m1 +%else + DEFINE_ARGS dst, stride, block, coef + movsx coefd, word [blockq] + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, (32 << 14) + 8192 + sar coefd, 14 + 6 + movd m0, coefd +%endif + SPLATW m0, m0, q0000 +%if cpuflag(ssse3) + pmulhrsw m0, 
[pw_512] +%endif + pxor m5, m5 + movd [blockq], m5 +%rep 31 + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize + add dstq, strideq +%endrep + VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize + RET + +%if ARCH_X86_64 + DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp +%else +%define dst_bakq r0mp +%endif +%if cpuflag(ssse3) +.idct8x8: +%if ARCH_X86_32 + DEFINE_ARGS block, u1, u2, u3, u4, tmp + mov blockq, r2mp +%endif + mov tmpq, rsp + VP9_IDCT32_1D blockq, 1, 8 + +%if ARCH_X86_32 + DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp + mov strideq, r1mp +%define cntd dword r3m +%endif + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + sub stride30q, stride2q ; stride*30 +.loop2_8x8: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2, 8 + add dst_bakq, 8 + add tmpq, 16 + dec cntd + jg .loop2_8x8 + + ; at the end of the loop, m1 should still be zero + ; use that to zero out block coefficients +%if ARCH_X86_32 + DEFINE_ARGS block + mov blockq, r2mp +%endif + ZERO_BLOCK blockq, 64, 8, m1 + RET + +.idct16x16: +%if ARCH_X86_32 + DEFINE_ARGS block, tmp, cnt + mov blockq, r2mp +%endif + mov cntd, 2 + mov tmpq, rsp +.loop1_16x16: + VP9_IDCT32_1D blockq, 1, 16 + add blockq, 16 + add tmpq, 512 + dec cntd + jg .loop1_16x16 + +%if ARCH_X86_64 + sub blockq, 32 +%else + DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp + mov strideq, r1mp +%define cntd dword r3m +%endif + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + mov tmpq, rsp + sub stride30q, stride2q ; stride*30 +.loop2_16x16: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2, 16 + add dst_bakq, 8 + add tmpq, 16 + dec cntd + jg .loop2_16x16 + + ; at the end of the loop, m1 should still be zero + ; use that to zero out block coefficients +%if ARCH_X86_32 + DEFINE_ARGS block + mov blockq, r2mp +%endif + 
ZERO_BLOCK blockq, 64, 16, m1 + RET +%endif + +.idctfull: +%if ARCH_X86_32 + DEFINE_ARGS block, tmp, cnt + mov blockq, r2mp +%endif + mov cntd, 4 + mov tmpq, rsp +.loop1_full: + VP9_IDCT32_1D blockq, 1 + add blockq, 16 + add tmpq, 512 + dec cntd + jg .loop1_full + +%if ARCH_X86_64 + sub blockq, 64 +%else + DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp + mov strideq, r1mp +%define cntd dword r3m +%endif + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 4 + mov tmpq, rsp + sub stride30q, stride2q ; stride*30 +.loop2_full: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2 + add dst_bakq, 8 + add tmpq, 16 + dec cntd + jg .loop2_full + + ; at the end of the loop, m1 should still be zero + ; use that to zero out block coefficients +%if ARCH_X86_32 + DEFINE_ARGS block + mov blockq, r2mp +%endif + ZERO_BLOCK blockq, 64, 32, m1 + RET +%endmacro + +VP9_IDCT_IDCT_32x32_ADD_XMM sse2 +VP9_IDCT_IDCT_32x32_ADD_XMM ssse3 +VP9_IDCT_IDCT_32x32_ADD_XMM avx + +; this is almost identical to VP9_STORE_2X, but it does two rows +; for slightly improved interleaving, and it omits vpermq since the +; input is DC so all values are identical +%macro VP9_STORE_YMM_DC_2X2 6 ; reg, tmp1, tmp2, tmp3, tmp4, zero + mova m%2, [dstq] + mova m%4, [dstq+strideq] + punpckhbw m%3, m%2, m%6 + punpcklbw m%2, m%6 + punpckhbw m%5, m%4, m%6 + punpcklbw m%4, m%6 + paddw m%3, m%1 + paddw m%2, m%1 + paddw m%5, m%1 + paddw m%4, m%1 + packuswb m%2, m%3 + packuswb m%4, m%5 + mova [dstq+strideq*0], m%2 + mova [dstq+strideq*1], m%4 +%endmacro + +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob + cmp eobd, 135 + jg .idctfull + cmp eobd, 1 + jg .idct16x16 + + ; dc-only case + mova m1, [pw_11585x2] + vpbroadcastw m0, [blockq] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pxor m5, m5 + pmulhrsw m0, [pw_512] + movd [blockq], xm5 + + DEFINE_ARGS 
dst, stride, cnt + mov cntd, 16 +.loop_dc: + VP9_STORE_YMM_DC_2X2 0, 1, 2, 3, 4, 5 + lea dstq, [dstq+2*strideq] + dec cntd + jg .loop_dc + RET + + DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp +.idct16x16: + mov tmpq, rsp + VP9_IDCT32_1D blockq, 1, 16 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 2 + sub stride30q, stride2q ; stride*30 +.loop2_16x16: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2, 16 + add dst_bakq, 16 + add tmpq, 32 + dec cntd + jg .loop2_16x16 + + ; at the end of the loop, m1 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 16, m1 + RET + +.idctfull: + mov cntd, 2 + mov tmpq, rsp +.loop1_full: + VP9_IDCT32_1D blockq, 1 + add blockq, 32 + add tmpq, 1024 + dec cntd + jg .loop1_full + + sub blockq, 64 + + mov stride30q, strideq ; stride + lea stride2q, [strideq*2] ; stride*2 + shl stride30q, 5 ; stride*32 + mov cntd, 2 + mov tmpq, rsp + sub stride30q, stride2q ; stride*30 +.loop2_full: + mov dstq, dst_bakq + lea dst_endq, [dstq+stride30q] + VP9_IDCT32_1D tmpq, 2 + add dst_bakq, 16 + add tmpq, 32 + dec cntd + jg .loop2_full + + ; at the end of the loop, m1 should still be zero + ; use that to zero out block coefficients + ZERO_BLOCK blockq, 64, 32, m1 + RET +%endif diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm new file mode 100644 index 0000000000..902685edf6 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9itxfm_16bpp.asm @@ -0,0 +1,2044 @@ +;****************************************************************************** +;* VP9 inverse transform x86 SIMD optimizations +;* +;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" +%include "vp9itxfm_template.asm" + +SECTION_RODATA + +cextern pw_8 +cextern pw_1023 +cextern pw_2048 +cextern pw_4095 +cextern pw_m1 +cextern pd_1 +cextern pd_16 +cextern pd_32 +cextern pd_8192 + +pd_8: times 4 dd 8 +pd_3fff: times 4 dd 0x3fff + +cextern pw_11585x2 + +cextern pw_5283_13377 +cextern pw_9929_13377 +cextern pw_15212_m13377 +cextern pw_15212_9929 +cextern pw_m5283_m15212 +cextern pw_13377x2 +cextern pw_m13377_13377 +cextern pw_13377_0 + +pw_9929_m5283: times 4 dw 9929, -5283 + +%macro COEF_PAIR 2-3 +cextern pw_m%1_%2 +cextern pw_%2_%1 +%if %0 == 3 +cextern pw_m%1_m%2 +%if %1 != %2 +cextern pw_m%2_%1 +cextern pw_%1_%2 +%endif +%endif +%endmacro + +COEF_PAIR 2404, 16207 +COEF_PAIR 3196, 16069, 1 +COEF_PAIR 4756, 15679 +COEF_PAIR 5520, 15426 +COEF_PAIR 6270, 15137, 1 +COEF_PAIR 8423, 14053 +COEF_PAIR 10394, 12665 +COEF_PAIR 11003, 12140 +COEF_PAIR 11585, 11585, 1 +COEF_PAIR 13160, 9760 +COEF_PAIR 13623, 9102, 1 +COEF_PAIR 14449, 7723 +COEF_PAIR 14811, 7005 +COEF_PAIR 15893, 3981 +COEF_PAIR 16305, 1606 +COEF_PAIR 16364, 804 + +default_8x8: +times 12 db 1 +times 52 db 2 +row_8x8: +times 18 db 1 +times 46 db 2 
+col_8x8: +times 6 db 1 +times 58 db 2 +default_16x16: +times 10 db 1 +times 28 db 2 +times 51 db 3 +times 167 db 4 +row_16x16: +times 21 db 1 +times 45 db 2 +times 60 db 3 +times 130 db 4 +col_16x16: +times 5 db 1 +times 12 db 2 +times 25 db 3 +times 214 db 4 +default_32x32: +times 9 db 1 +times 25 db 2 +times 36 db 3 +times 65 db 4 +times 105 db 5 +times 96 db 6 +times 112 db 7 +times 576 db 8 + +SECTION .text + +%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst + mova m%3, [%7] + mova m%4, [%7+strideq] + paddw m%3, m%1 + paddw m%4, m%2 + pmaxsw m%3, m%5 + pmaxsw m%4, m%5 + pminsw m%3, m%6 + pminsw m%4, m%6 + mova [%7], m%3 + mova [%7+strideq], m%4 +%endmacro + +%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg +%assign %%y 0 +%rep %3 +%assign %%x 0 +%rep %3*4/mmsize + mova [%1+%%y+%%x], %4 +%assign %%x (%%x+mmsize) +%endrep +%assign %%y (%%y+%2) +%endrep +%endmacro + +; the input coefficients are scaled up by 2 bit (which we downscale immediately +; in the iwht), and is otherwise orthonormally increased by 1 bit per iwht_1d. +; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling, +; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits +; add 2 bits, we need to scale before converting to word in 12bpp, since the +; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp +; we can scale after converting to words (which is half the instructions), +; since the input is only 14+sign bit, which fits in 15+sign words directly. 
+ +; IWHT4_FN emits vp9_iwht_iwht_4x4_add_<bpp>: the four 16-byte coefficient rows are +; shifted right by 2 and packed from dwords to words (shift before the pack when +; bpp >= 12, after it otherwise), run through VP9_IWHT4_1D, TRANSPOSE4x4W and +; VP9_IWHT4_1D again, added to dst with clamping to [0, pw_<max>], and the +; coefficient block is then zeroed with ZERO_BLOCK. +%macro IWHT4_FN 2 ; bpp, max +cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob + mova m7, [pw_%2] + mova m0, [blockq+0*16+0] + mova m1, [blockq+1*16+0] +%if %1 >= 12 + mova m4, [blockq+0*16+8] + mova m5, [blockq+1*16+8] + psrad m0, 2 + psrad m1, 2 + psrad m4, 2 + psrad m5, 2 + packssdw m0, m4 + packssdw m1, m5 +%else + packssdw m0, [blockq+0*16+8] + packssdw m1, [blockq+1*16+8] + psraw m0, 2 + psraw m1, 2 +%endif + mova m2, [blockq+2*16+0] + mova m3, [blockq+3*16+0] +%if %1 >= 12 + mova m4, [blockq+2*16+8] + mova m5, [blockq+3*16+8] + psrad m2, 2 + psrad m3, 2 + psrad m4, 2 + psrad m5, 2 + packssdw m2, m4 + packssdw m3, m5 +%else + packssdw m2, [blockq+2*16+8] + packssdw m3, [blockq+3*16+8] + psraw m2, 2 + psraw m3, 2 +%endif + + VP9_IWHT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IWHT4_1D + + pxor m6, m6 + VP9_STORE_2X 0, 1, 4, 5, 6, 7 + lea dstq, [dstq+strideq*2] + VP9_STORE_2X 2, 3, 4, 5, 6, 7 + ZERO_BLOCK blockq, 16, 4, m6 + RET +%endmacro + +INIT_MMX mmxext +IWHT4_FN 10, 1023 +INIT_MMX mmxext +IWHT4_FN 12, 4095 + +; VP9_IDCT4_WRITEOUT rounds m0-m3 as (x+8)>>4 (pmulhrsw by pw_2048 on ssse3), then +; adds them to four rows of dst via VP9_STORE_2X, using m4 as the lower clamp +; (the visible callers zero it first) and pw_1023 as the upper clamp. +; Overwrites m5-m7 and advances dstq by two rows. +%macro VP9_IDCT4_WRITEOUT 0 +%if cpuflag(ssse3) + mova m5, [pw_2048] + pmulhrsw m0, m5 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + pmulhrsw m3, m5 +%else + mova m5, [pw_8] + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + psraw m0, 4 + psraw m1, 4 + psraw m2, 4 + psraw m3, 4 +%endif + mova m5, [pw_1023] + VP9_STORE_2X 0, 1, 6, 7, 4, 5 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 2, 3, 6, 7, 4, 5 +%endmacro + +; DC_ONLY leaves the dc-only result in coefd: it reads the dword at [blockq], +; overwrites that dword with %2, then applies (x*11585 + 8192) >> 14 twice, the +; second time merged with the final rounding shift by %1 bits. +%macro DC_ONLY 2 ; shift, zero + mov coefd, dword [blockq] + movd [blockq], %2 + imul coefd, 11585 + add coefd, 8192 + sar coefd, 14 + imul coefd, 11585 + add coefd, ((1 << (%1 - 1)) << 14) + 8192 + sar coefd, 14 + %1 +%endmacro + +; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits +; in 15+1 words without additional effort, since the coefficients are 15bpp. 
+ +; IDCT4_10_FN emits vp9_idct_idct_4x4_add_10 for the currently selected instruction +; set. eob <= 1 takes the dc-only path; otherwise the dword coefficients are packed +; to words and run through VP9_IDCT4_1D, TRANSPOSE4x4W and VP9_IDCT4_1D again, the +; block is zeroed, and VP9_IDCT4_WRITEOUT adds the result to dst. +%macro IDCT4_10_FN 0 +cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob + cmp eobd, 1 + jg .idctfull + + ; dc-only + pxor m4, m4 +%if cpuflag(ssse3) + movd m0, [blockq] + movd [blockq], m4 + mova m5, [pw_11585x2] + pmulhrsw m0, m5 + pmulhrsw m0, m5 +%else + DEFINE_ARGS dst, stride, block, coef + DC_ONLY 4, m4 + movd m0, coefd +%endif + pshufw m0, m0, 0 + mova m5, [pw_1023] +%if cpuflag(ssse3) + pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4 +%endif + VP9_STORE_2X 0, 0, 6, 7, 4, 5 + lea dstq, [dstq+2*strideq] + VP9_STORE_2X 0, 0, 6, 7, 4, 5 + RET + +.idctfull: + mova m0, [blockq+0*16+0] + mova m1, [blockq+1*16+0] + packssdw m0, [blockq+0*16+8] + packssdw m1, [blockq+1*16+8] + mova m2, [blockq+2*16+0] + mova m3, [blockq+3*16+0] + packssdw m2, [blockq+2*16+8] + packssdw m3, [blockq+3*16+8] + +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif + mova m7, [pd_8192] ; rounding + VP9_IDCT4_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_IDCT4_1D + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +INIT_MMX mmxext +IDCT4_10_FN +INIT_MMX ssse3 +IDCT4_10_FN + +; IADST4_FN emits vp9_<%1>_<%3>_4x4_add_10. %1/%3 are the transform names used in +; the symbol; %2/%4 are the stems of the first and second 1-D pass macros +; (VP9_%2_1D, then a 4x4 transpose, then VP9_%4_1D). pd_8192 is loaded into xmm5 +; and copied to m7 unless both passes are iadst. +; NOTE(review): presumably VP9_IDCT4_1D reads its rounding constant from m7, as in +; IDCT4_10_FN above -- confirm against vp9itxfm_template.asm. +%macro IADST4_FN 4 +cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob +%if WIN64 && notcpuflag(ssse3) + WIN64_SPILL_XMM 8 +%endif + movdqa xmm5, [pd_8192] + mova m0, [blockq+0*16+0] + mova m1, [blockq+1*16+0] + packssdw m0, [blockq+0*16+8] + packssdw m1, [blockq+1*16+8] + mova m2, [blockq+2*16+0] + mova m3, [blockq+3*16+0] + packssdw m2, [blockq+2*16+8] + packssdw m3, [blockq+3*16+8] + +%if cpuflag(ssse3) + mova m6, [pw_11585x2] +%endif +%ifnidn %1%3, iadstiadst + movdq2q m7, xmm5 +%endif + VP9_%2_1D + TRANSPOSE4x4W 0, 1, 2, 3, 4 + VP9_%4_1D + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + VP9_IDCT4_WRITEOUT + RET +%endmacro + +INIT_MMX sse2 +IADST4_FN idct, IDCT4, iadst, IADST4 +IADST4_FN iadst, IADST4, idct, IDCT4 +IADST4_FN iadst, IADST4, iadst, IADST4 + +INIT_MMX ssse3 +IADST4_FN idct, IDCT4, iadst, IADST4 +IADST4_FN iadst, IADST4, idct, IDCT4 
+IADST4_FN iadst, IADST4, iadst, IADST4 + +; inputs and outputs are dwords, coefficients are words +; +; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14 +; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14 +%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask + pand m%3, m%1, %8 + pand m%4, m%2, %8 + psrad m%1, 14 + psrad m%2, 14 + packssdw m%4, m%2 + packssdw m%3, m%1 + punpckhwd m%2, m%4, m%3 + punpcklwd m%4, m%3 + pmaddwd m%3, m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_%6_%5] + pmaddwd m%4, [pw_m%5_%6] + pmaddwd m%2, [pw_m%5_%6] + paddd m%3, %7 + paddd m%4, %7 + psrad m%3, 14 + psrad m%4, 14 + paddd m%1, m%3 + paddd m%2, m%4 +%endmacro + +%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1 + SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2 + SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2 + SUMSUB_BA d, %4, %3, %7 + SUMSUB_BA d, %6, %5, %7 + SWAP %4, %6, %3 +%endmacro + +%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max + movh m%1, [dstq+strideq*0] + movh m%2, [dstq+strideq*2] + movhps m%1, [dstq+strideq*1] + movhps m%2, [dstq+stride3q ] + paddw m%1, m%3 + paddw m%2, m%4 + pmaxsw m%1, %5 + pmaxsw m%2, %5 + pminsw m%1, %6 + pminsw m%2, %6 + movh [dstq+strideq*0], m%1 + movhps [dstq+strideq*1], m%1 + movh [dstq+strideq*2], m%2 + movhps [dstq+stride3q ], m%2 +%endmacro + +%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift + paddd m%1, %7 + paddd m%2, %7 + paddd m%3, %7 + paddd m%4, %7 + psrad m%1, %8 + psrad m%2, %8 + psrad m%3, %8 + psrad m%4, %8 + packssdw m%1, m%2 + packssdw m%3, m%4 + STORE_4x4 %2, %4, %1, %3, %5, %6 +%endmacro + +INIT_XMM sse2 +cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob + cmp eobd, 1 + jg .idctfull + + ; dc-only - this is special, since for 4x4 12bpp, the max coef size is + ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the + ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a + ; dword. 
After the final shift (4), the result is 13+sign bits, so we + ; don't need any additional processing to fit it in a word + DEFINE_ARGS dst, stride, block, coef + pxor m4, m4 + DC_ONLY 4, m4 + movd m0, coefd + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova m5, [pw_4095] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + STORE_4x4 1, 3, 0, 0, m4, m5 + RET + +.idctfull: + DEFINE_ARGS dst, stride, block, eob + mova m0, [blockq+0*16] + mova m1, [blockq+1*16] + mova m2, [blockq+2*16] + mova m3, [blockq+3*16] + mova m6, [pd_8192] + mova m7, [pd_3fff] + + IDCT4_12BPP_1D m6, m7 + TRANSPOSE4x4D 0, 1, 2, 3, 4 + IDCT4_12BPP_1D m6, m7 + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + + ; writeout + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova m5, [pw_4095] + mova m6, [pd_8] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 + RET + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +; out0 = 5283 * in0 + 13377 * in1 + 15212 * in2 + 9929 * in3 + rnd >> 14 +; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15212 * in3 + rnd >> 14 +; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14 +; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14 +%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask + pand m4, m0, %2 + pand m5, m1, %2 + psrad m0, 14 + psrad m1, 14 + packssdw m5, m1 + packssdw m4, m0 + punpckhwd m1, m4, m5 + punpcklwd m4, m5 + pand m5, m2, %2 + pand m6, m3, %2 + psrad m2, 14 + psrad m3, 14 + packssdw m6, m3 + packssdw m5, m2 + punpckhwd m3, m5, m6 + punpcklwd m5, m6 + SCRATCH 1, 8, rsp+0*mmsize, a + 
SCRATCH 5, 9, rsp+1*mmsize, b + + ; m1/3 have the high bits of 0,1,2,3 + ; m4/5 have the low bits of 0,1,2,3 + ; m0/2/6/7 are free + + mova m2, [pw_15212_9929] + mova m0, [pw_5283_13377] + pmaddwd m7, m2, reg_b + pmaddwd m6, m4, m0 + pmaddwd m2, m3 + pmaddwd m0, reg_a + paddd m6, m7 + paddd m0, m2 + mova m1, [pw_m13377_13377] + mova m5, [pw_13377_0] + pmaddwd m7, m1, reg_b + pmaddwd m2, m4, m5 + pmaddwd m1, m3 + pmaddwd m5, reg_a + paddd m2, m7 + paddd m1, m5 + paddd m6, %1 + paddd m2, %1 + psrad m6, 14 + psrad m2, 14 + paddd m0, m6 ; t0 + paddd m2, m1 ; t2 + + mova m7, [pw_m5283_m15212] + mova m5, [pw_9929_13377] + pmaddwd m1, m7, reg_b + pmaddwd m6, m4, m5 + pmaddwd m7, m3 + pmaddwd m5, reg_a + paddd m6, m1 + paddd m7, m5 + UNSCRATCH 5, 9, rsp+1*mmsize, b + pmaddwd m5, [pw_9929_m5283] + pmaddwd m4, [pw_15212_m13377] + pmaddwd m3, [pw_9929_m5283] + UNSCRATCH 1, 8, rsp+0*mmsize, a + pmaddwd m1, [pw_15212_m13377] + paddd m4, m5 + paddd m3, m1 + paddd m6, %1 + paddd m4, %1 + psrad m6, 14 + psrad m4, 14 + paddd m7, m6 ; t1 + paddd m3, m4 ; t3 + + SWAP 1, 7 +%endmacro + +%macro IADST4_12BPP_FN 4 +cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob + mova m0, [blockq+0*16] + mova m1, [blockq+1*16] + mova m2, [blockq+2*16] + mova m3, [blockq+3*16] + + PRELOAD 10, pd_8192, rnd + PRELOAD 11, pd_3fff, mask + %2_12BPP_1D reg_rnd, reg_mask + TRANSPOSE4x4D 0, 1, 2, 3, 4 + %4_12BPP_1D reg_rnd, reg_mask + + pxor m4, m4 + ZERO_BLOCK blockq, 16, 4, m4 + + ; writeout + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova m5, [pw_4095] + mova m6, [pd_8] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4 + RET +%endmacro + +INIT_XMM sse2 +IADST4_12BPP_FN idct, IDCT4, iadst, IADST4 +IADST4_12BPP_FN iadst, IADST4, idct, IDCT4 +IADST4_12BPP_FN iadst, IADST4, iadst, IADST4 + +; the following line has not been executed at the end of this macro: +; UNSCRATCH 6, 8, rsp+(%5+0)*mmsize +%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; 
src, rnd, mask, src_stride, stack_offset + mova m0, [%1+0*%4] + mova m2, [%1+2*%4] + mova m4, [%1+4*%4] + mova m6, [%1+6*%4] + IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3 + SCRATCH 4, 8, rsp+(%5+0)*mmsize + SCRATCH 6, 9, rsp+(%5+1)*mmsize + mova m1, [%1+1*%4] + mova m3, [%1+3*%4] + mova m5, [%1+5*%4] + mova m7, [%1+7*%4] + SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a + SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a + SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a + SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a + SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5 + SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7 + SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6 + UNSCRATCH 4, 8, rsp+(%5+0)*mmsize + UNSCRATCH 6, 9, rsp+(%5+1)*mmsize + SCRATCH 2, 8, rsp+(%5+0)*mmsize + SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5 + SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4 + SWAP 0, 5, 4, 6, 2, 7 +%endmacro + +%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max + mova m%1, [%6+%7*0] + mova m%2, [%6+%7*1] + paddw m%1, m%3 + paddw m%2, m%3 + pmaxsw m%1, %4 + pmaxsw m%2, %4 + pminsw m%1, %5 + pminsw m%2, %5 + mova [%6+%7*0], m%1 + mova [%6+%7*1], m%2 +%endmacro + +; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp +; storage also instead of allocating two more stack spaces. This doesn't +; matter much but it's something... 
+INIT_XMM sse2 +cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \ + 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + cmp eobd, 1 + jg .idctfull + + ; dc-only - the 10bit version can be done entirely in 32bit, since the max + ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily + ; fits in 32bit + DEFINE_ARGS dst, stride, block, coef + pxor m2, m2 + DC_ONLY 5, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 4 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +.idctfull: + SCRATCH 0, 12, rsp+16*mmsize, max + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [default_8x8] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [default_8x8+cntq-1] +%endif + mov skipd, 2 + sub skipd, cntd + mov ptrq, rsp + PRELOAD 10, pd_8192, rnd + PRELOAD 11, pd_3fff, mask + PRELOAD 13, pd_16, srnd +.loop_1: + IDCT8_1D blockq, reg_rnd, reg_mask + + TRANSPOSE4x4D 0, 1, 2, 3, 6 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 2*mmsize], m1 + mova [ptrq+ 4*mmsize], m2 + mova [ptrq+ 6*mmsize], m3 + UNSCRATCH 6, 8, rsp+17*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 1*mmsize], m4 + mova [ptrq+ 3*mmsize], m5 + mova [ptrq+ 5*mmsize], m6 + mova [ptrq+ 7*mmsize], m7 + add ptrq, 8 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + add ptrq, 4 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 2 + mov ptrq, rsp +.loop_2: + IDCT8_1D ptrq, reg_rnd, reg_mask + + pxor m6, 
m6 + ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 + lea dstq, [dstq+strideq*4] + UNSCRATCH 0, 8, rsp+17*mmsize + UNSCRATCH 1, 12, rsp+16*mmsize, max + UNSCRATCH 2, 13, pd_16, srnd + ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 + add ptrq, 16 +%if ARCH_X86_64 + lea dstq, [dstbakq+8] +%else + mov dstq, dstm + add dstq, 8 +%endif + dec cntd + jg .loop_2 + + ; m6 is still zero + ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 + RET + +%macro DC_ONLY_64BIT 2 ; shift, zero +%if ARCH_X86_64 + movsxd coefq, dword [blockq] + movd [blockq], %2 + imul coefq, 11585 + add coefq, 8192 + sar coefq, 14 + imul coefq, 11585 + add coefq, ((1 << (%1 - 1)) << 14) + 8192 + sar coefq, 14 + %1 +%else + mov coefd, dword [blockq] + movd [blockq], %2 + DEFINE_ARGS dst, stride, cnt, coef, coefl + mov cntd, 2 +.loop_dc_calc: + mov coefld, coefd + sar coefd, 14 + and coefld, 0x3fff + imul coefd, 11585 + imul coefld, 11585 + add coefld, 8192 + sar coefld, 14 + add coefd, coefld + dec cntd + jg .loop_dc_calc + add coefd, 1 << (%1 - 1) + sar coefd, %1 +%endif +%endmacro + +INIT_XMM sse2 +cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \ + 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + cmp eobd, 1 + jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull + + ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign + ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies + DEFINE_ARGS dst, stride, block, coef, coefl + pxor m2, m2 + DC_ONLY_64BIT 5, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 4 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0 + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +; inputs and outputs are dwords, coefficients are words +; +; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2 +; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1 +%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask + 
pand m%3, m%1, %7 + pand m%4, m%2, %7 + psrad m%1, 14 + psrad m%2, 14 + packssdw m%4, m%2 + packssdw m%3, m%1 + punpckhwd m%2, m%4, m%3 + punpcklwd m%4, m%3 + pmaddwd m%3, m%4, [pw_%6_%5] + pmaddwd m%1, m%2, [pw_%6_%5] + pmaddwd m%4, [pw_m%5_%6] + pmaddwd m%2, [pw_m%5_%6] +%endmacro + +; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14 +; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14 +%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd + SUMSUB_BA d, %1, %2, %5 + SUMSUB_BA d, %3, %4, %5 + paddd m%3, %6 + paddd m%4, %6 + psrad m%3, 14 + psrad m%4, 14 + paddd m%1, m%3 + paddd m%2, m%4 +%endmacro + +%macro NEGD 1 +%if cpuflag(ssse3) + psignd %1, [pw_m1] +%else + pxor %1, [pw_m1] + paddd %1, [pd_1] +%endif +%endmacro + +; the following line has not been executed at the end of this macro: +; UNSCRATCH 6, 8, rsp+17*mmsize +%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask + mova m0, [%1+ 0*mmsize] + mova m3, [%1+ 6*mmsize] + mova m4, [%1+ 8*mmsize] + mova m7, [%1+14*mmsize] + SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a + SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a + SCRATCH 0, 8, rsp+17*mmsize + SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4 + UNSCRATCH 0, 8, rsp+17*mmsize + SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5 + + SCRATCH 3, 8, rsp+17*mmsize + SCRATCH 4, 9, rsp+18*mmsize + SCRATCH 7, 10, rsp+19*mmsize + SCRATCH 0, 11, rsp+20*mmsize + + mova m1, [%1+ 2*mmsize] + mova m2, [%1+ 4*mmsize] + mova m5, [%1+10*mmsize] + mova m6, [%1+12*mmsize] + SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a + SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a + SCRATCH 2, 12, rsp+21*mmsize + SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6 + UNSCRATCH 2, 12, rsp+21*mmsize + SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7 + + UNSCRATCH 7, 10, rsp+19*mmsize + UNSCRATCH 0, 11, rsp+20*mmsize + SCRATCH 1, 10, rsp+19*mmsize + SCRATCH 6, 11, rsp+20*mmsize + + SUMSUB_MUL_D 7, 0, 3, 4, 
15137, 6270, %3 ; m7/8=t4a, m0/9=t5a + SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a + SCRATCH 2, 12, rsp+21*mmsize + SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6 + UNSCRATCH 2, 12, rsp+21*mmsize + NEGD m5 ; m5=out1 + SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7 + SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5 + NEGD m0 ; m0=out5 + + UNSCRATCH 3, 8, rsp+17*mmsize + UNSCRATCH 4, 9, rsp+18*mmsize + UNSCRATCH 1, 10, rsp+19*mmsize + UNSCRATCH 6, 11, rsp+20*mmsize + SCRATCH 2, 8, rsp+17*mmsize + SCRATCH 0, 9, rsp+18*mmsize + + SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2 + SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3 + NEGD m6 ; m6=out7 + SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4 + NEGD m3 ; m3=out3 + + UNSCRATCH 0, 9, rsp+18*mmsize + + SWAP 0, 1, 5 + SWAP 2, 7, 6 +%endmacro + +%macro IADST8_FN 5 +cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \ + 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + +.body: + SCRATCH 0, 13, rsp+16*mmsize, max + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [%5_8x8] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [%5_8x8+cntq-1] +%endif + mov skipd, 2 + sub skipd, cntd + mov ptrq, rsp + PRELOAD 14, pd_8192, rnd + PRELOAD 15, pd_3fff, mask +.loop_1: + %2_1D blockq, reg_rnd, reg_mask + + TRANSPOSE4x4D 0, 1, 2, 3, 6 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 2*mmsize], m1 + mova [ptrq+ 4*mmsize], m2 + mova [ptrq+ 6*mmsize], m3 + UNSCRATCH 6, 8, rsp+17*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 1*mmsize], m4 + mova [ptrq+ 3*mmsize], m5 + mova [ptrq+ 5*mmsize], m6 + mova [ptrq+ 7*mmsize], m7 + add ptrq, 8 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova 
[ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + add ptrq, 4 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 2 + mov ptrq, rsp +.loop_2: + %4_1D ptrq, reg_rnd, reg_mask + + pxor m6, m6 + PRELOAD 9, pd_16, srnd + ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5 + lea dstq, [dstq+strideq*4] + UNSCRATCH 0, 8, rsp+17*mmsize + UNSCRATCH 1, 13, rsp+16*mmsize, max + UNSCRATCH 2, 9, pd_16, srnd + ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5 + add ptrq, 16 +%if ARCH_X86_64 + lea dstq, [dstbakq+8] +%else + mov dstq, dstm + add dstq, 8 +%endif + dec cntd + jg .loop_2 + + ; m6 is still zero + ZERO_BLOCK blockq-2*mmsize, 32, 8, m6 + RET + +cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \ + 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +IADST8_FN idct, IDCT8, iadst, IADST8, row +IADST8_FN iadst, IADST8, idct, IDCT8, col +IADST8_FN iadst, IADST8, iadst, IADST8, default + +%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset + IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7 + ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6 + SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a + SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a + SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a + SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a + SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4 + mova [rsp+(%3+0)*mmsize], m5 ; t5 + mova [rsp+(%3+1)*mmsize], m7 ; t7 + + mova m0, [%1+ 1*%2] ; in1 + mova m3, [%1+ 7*%2] ; in7 + mova m4, [%1+ 9*%2] ; in9 + mova m7, [%1+15*%2] ; in15 + + SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a + SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a + SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9 + SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14 + SUMSUB_MUL 0, 7, 1, 
2, 15137, 6270 ; m0=t14a, m7=t9a + + mova m1, [%1+ 3*%2] ; in3 + mova m2, [%1+ 5*%2] ; in5 + mova m5, [%1+11*%2] ; in11 + mova m6, [%1+13*%2] ; in13 + + SCRATCH 0, 9, rsp+(%4+1)*mmsize + SCRATCH 7, 10, rsp+(%4+2)*mmsize + + SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a + SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a + SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10 + SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13 + NEGD m1 ; m1=-t10 + SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a + + UNSCRATCH 7, 10, rsp+(%4+2)*mmsize + SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a + SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10 + SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a + SCRATCH 5, 10, rsp+(%4+2)*mmsize + SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11 + UNSCRATCH 0, 9, rsp+(%4+1)*mmsize + SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13 + SCRATCH 6, 9, rsp+(%4+1)*mmsize + SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a + + ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2 + ; free: 6,5 + + UNSCRATCH 5, 15, rsp+(%4+7)*mmsize + SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15 + SCRATCH 5, 15, rsp+(%4+7)*mmsize + UNSCRATCH 5, 14, rsp+(%4+6)*mmsize + SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14 + SCRATCH 5, 14, rsp+(%4+6)*mmsize + UNSCRATCH 5, 13, rsp+(%4+5)*mmsize + SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13 + SCRATCH 5, 13, rsp+(%4+5)*mmsize + UNSCRATCH 5, 12, rsp+(%4+4)*mmsize + SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12 + SCRATCH 5, 12, rsp+(%4+4)*mmsize + UNSCRATCH 5, 11, rsp+(%4+3)*mmsize + SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11 + SCRATCH 4, 11, rsp+(%4+3)*mmsize + mova m4, [rsp+(%3+0)*mmsize] + SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10 + mova [rsp+(%3+0)*mmsize], m5 + UNSCRATCH 5, 8, rsp+(%4+0)*mmsize + UNSCRATCH 6, 9, rsp+(%4+1)*mmsize + SCRATCH 2, 8, rsp+(%4+0)*mmsize + SCRATCH 1, 9, rsp+(%4+1)*mmsize + UNSCRATCH 1, 10, rsp+(%4+2)*mmsize + SCRATCH 0, 10, rsp+(%4+2)*mmsize + mova m0, [rsp+(%3+1)*mmsize] + SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9 + SUMSUB_BA d, 1, 0, 2 ; 
m1=out7, m0=out8 + + SWAP 0, 3, 1, 7, 2, 6, 4 + + ; output order: 8-11|r67-70=out0-3 + ; 0-6,r65=out4-11 + ; 12-15|r71-74=out12-15 +%endmacro + +INIT_XMM sse2 +cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ + 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + cmp eobd, 1 + jg .idctfull + + ; dc-only - the 10bit version can be done entirely in 32bit, since the max + ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily + ; fits in 32bit + DEFINE_ARGS dst, stride, block, coef + pxor m2, m2 + DC_ONLY 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +.idctfull: + mova [rsp+64*mmsize], m0 + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [default_16x16] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [default_16x16+cntq-1] +%endif + mov skipd, 4 + sub skipd, cntd + mov ptrq, rsp +.loop_1: + IDCT16_1D blockq + + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 1*mmsize], m0 + mova [ptrq+ 5*mmsize], m1 + mova [ptrq+ 9*mmsize], m2 + mova [ptrq+13*mmsize], m3 + mova m7, [rsp+65*mmsize] + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 2*mmsize], m4 + mova [ptrq+ 6*mmsize], m5 + mova [ptrq+10*mmsize], m6 + mova [ptrq+14*mmsize], m7 + UNSCRATCH 0, 8, rsp+67*mmsize + UNSCRATCH 1, 9, rsp+68*mmsize + UNSCRATCH 2, 10, rsp+69*mmsize + UNSCRATCH 3, 11, rsp+70*mmsize + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 4*mmsize], m1 + mova [ptrq+ 8*mmsize], m2 + mova [ptrq+12*mmsize], m3 + UNSCRATCH 4, 12, rsp+71*mmsize + UNSCRATCH 5, 13, rsp+72*mmsize + UNSCRATCH 6, 14, rsp+73*mmsize + UNSCRATCH 7, 15, rsp+74*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 3*mmsize], m4 + 
mova [ptrq+ 7*mmsize], m5 + mova [ptrq+11*mmsize], m6 + mova [ptrq+15*mmsize], m7 + add ptrq, 16 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + mova [ptrq+mmsize*4], m0 + mova [ptrq+mmsize*5], m0 + mova [ptrq+mmsize*6], m0 + mova [ptrq+mmsize*7], m0 + add ptrq, 8 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 4 + mov ptrq, rsp +.loop_2: + IDCT16_1D ptrq + + pxor m7, m7 + lea dstq, [dstq+strideq*4] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 + lea dstq, [dstq+strideq*4] + mova m0, [rsp+65*mmsize] + mova m1, [rsp+64*mmsize] + mova m2, [pd_32] + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + +%if ARCH_X86_64 + DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst +%else + mov dstq, dstm +%endif + UNSCRATCH 0, 8, rsp+67*mmsize + UNSCRATCH 4, 9, rsp+68*mmsize + UNSCRATCH 5, 10, rsp+69*mmsize + UNSCRATCH 3, 11, rsp+70*mmsize + ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea dstq, [dstbakq+stride3q*4] +%else + lea dstq, [dstq+stride3q*4] +%endif + UNSCRATCH 4, 12, rsp+71*mmsize + UNSCRATCH 5, 13, rsp+72*mmsize + UNSCRATCH 6, 14, rsp+73*mmsize + UNSCRATCH 0, 15, rsp+74*mmsize + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + + add ptrq, mmsize +%if ARCH_X86_64 + add dstbakq, 8 + mov dstq, dstbakq +%else + add dword dstm, 8 + mov dstq, dstm +%endif + dec cntd + jg .loop_2 + + ; m7 is still zero + ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 + RET + +INIT_XMM sse2 +cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ + 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, 
[pw_4095] + cmp eobd, 1 + jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull + + ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign + ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies + DEFINE_ARGS dst, stride, block, coef, coefl + pxor m2, m2 + DC_ONLY_64BIT 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 8 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize + lea dstq, [dstq+strideq*2] + dec cntd + jg .loop_dc + RET + +; r65-69 are available for spills +; r70-77 are available on x86-32 only (x86-64 should use m8-15) +; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77 +%macro IADST16_1D 1 ; src + mova m0, [%1+ 0*4*mmsize] ; in0 + mova m1, [%1+ 7*4*mmsize] ; in7 + mova m2, [%1+ 8*4*mmsize] ; in8 + mova m3, [%1+15*4*mmsize] ; in15 + SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1 + SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9 + SCRATCH 0, 8, rsp+70*mmsize + SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a + UNSCRATCH 0, 8, rsp+70*mmsize + SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a + mova [rsp+67*mmsize], m1 + SCRATCH 2, 9, rsp+71*mmsize + SCRATCH 3, 12, rsp+74*mmsize + SCRATCH 0, 13, rsp+75*mmsize + + mova m0, [%1+ 3*4*mmsize] ; in3 + mova m1, [%1+ 4*4*mmsize] ; in4 + mova m2, [%1+11*4*mmsize] ; in11 + mova m3, [%1+12*4*mmsize] ; in12 + SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5 + SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13 + SCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a + UNSCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a + SCRATCH 0, 15, rsp+77*mmsize + SCRATCH 3, 11, rsp+73*mmsize + + UNSCRATCH 0, 12, rsp+74*mmsize ; t8a + UNSCRATCH 3, 13, rsp+75*mmsize ; t9a + SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9 + SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, 
m2/7=t12 + SCRATCH 1, 12, rsp+74*mmsize + SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a + UNSCRATCH 1, 12, rsp+74*mmsize + SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a + mova [rsp+65*mmsize], m2 + mova [rsp+66*mmsize], m1 + SCRATCH 0, 8, rsp+70*mmsize + SCRATCH 3, 12, rsp+74*mmsize + + mova m0, [%1+ 2*4*mmsize] ; in2 + mova m1, [%1+ 5*4*mmsize] ; in5 + mova m2, [%1+10*4*mmsize] ; in10 + mova m3, [%1+13*4*mmsize] ; in13 + SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3 + SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11 + SCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a + UNSCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a + mova [rsp+68*mmsize], m1 + mova [rsp+69*mmsize], m2 + SCRATCH 3, 13, rsp+75*mmsize + SCRATCH 0, 14, rsp+76*mmsize + + mova m0, [%1+ 1*4*mmsize] ; in1 + mova m1, [%1+ 6*4*mmsize] ; in6 + mova m2, [%1+ 9*4*mmsize] ; in9 + mova m3, [%1+14*4*mmsize] ; in14 + SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7 + SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15 + SCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a + UNSCRATCH 1, 10, rsp+72*mmsize + SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a + + UNSCRATCH 4, 13, rsp+75*mmsize ; t10a + UNSCRATCH 5, 14, rsp+76*mmsize ; t11a + SCRATCH 0, 13, rsp+75*mmsize + SCRATCH 3, 14, rsp+76*mmsize + SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11 + SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14 + SCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a + UNSCRATCH 0, 10, rsp+72*mmsize + SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a + + UNSCRATCH 0, 8, rsp+70*mmsize ; t12a + UNSCRATCH 3, 12, rsp+74*mmsize ; t13a + SCRATCH 2, 8, rsp+70*mmsize + SCRATCH 1, 12, rsp+74*mmsize + SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13 + SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14 + SCRATCH 2, 10, rsp+72*mmsize + SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, 
m0=t14a + UNSCRATCH 2, 10, rsp+72*mmsize + SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a + NEGD m5 ; m5=out13 + + UNSCRATCH 1, 9, rsp+71*mmsize ; t1a + mova m2, [rsp+68*mmsize] ; t2a + UNSCRATCH 6, 13, rsp+75*mmsize ; t6a + UNSCRATCH 7, 14, rsp+76*mmsize ; t7a + SCRATCH 4, 10, rsp+72*mmsize + SCRATCH 5, 13, rsp+75*mmsize + UNSCRATCH 4, 15, rsp+77*mmsize ; t4a + UNSCRATCH 5, 11, rsp+73*mmsize ; t5a + SCRATCH 0, 14, rsp+76*mmsize + SCRATCH 3, 15, rsp+77*mmsize + mova m0, [rsp+67*mmsize] ; t0a + SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4 + SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5 + SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6 + SCRATCH 4, 9, rsp+71*mmsize + mova m3, [rsp+69*mmsize] ; t3a + SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7 + + mova [rsp+67*mmsize], m5 + mova [rsp+68*mmsize], m6 + mova [rsp+69*mmsize], m7 + SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a + SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a + SCRATCH 1, 11, rsp+73*mmsize + SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6 + NEGD m2 ; m2=out3 + UNSCRATCH 1, 11, rsp+73*mmsize + SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7 + SCRATCH 2, 11, rsp+73*mmsize + UNSCRATCH 2, 12, rsp+74*mmsize ; t11a + SCRATCH 3, 12, rsp+74*mmsize + + UNSCRATCH 3, 8, rsp+70*mmsize ; t10a + mova m4, [rsp+65*mmsize] ; t8a + mova m5, [rsp+66*mmsize] ; t9a + SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10 + NEGD m3 ; m3=out1 + SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11 + UNSCRATCH 6, 9, rsp+71*mmsize ; t0 + UNSCRATCH 7, 14, rsp+76*mmsize ; t14a + SCRATCH 3, 9, rsp+71*mmsize + SCRATCH 2, 14, rsp+76*mmsize + + SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11 + mova [rsp+65*mmsize], m0 + SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9 + UNSCRATCH 0, 15, rsp+77*mmsize ; t15a + SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5 + + mova m2, [rsp+68*mmsize] ; t2 + SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a + SCRATCH 2, 8, rsp+70*mmsize + mova m2, [rsp+67*mmsize] ; t1 + mova m3, [rsp+69*mmsize] ; t3 + mova [rsp+67*mmsize], m7 + SUMSUB_BA 
d, 3, 2, 7 ; m3=-out15, m2=t3a + NEGD m3 ; m3=out15 + SCRATCH 3, 15, rsp+77*mmsize + SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7 + mova m7, [rsp+67*mmsize] + + SWAP 0, 1 + SWAP 2, 5, 4, 6, 7, 3 +%endmacro + +%macro IADST16_FN 7 +cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \ + 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + +.body: + mova [rsp+64*mmsize], m0 + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif +%ifdef PIC + lea ptrq, [%7_16x16] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [%7_16x16+cntq-1] +%endif + mov skipd, 4 + sub skipd, cntd + mov ptrq, rsp +.loop_1: + %2_1D blockq + + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 1*mmsize], m0 + mova [ptrq+ 5*mmsize], m1 + mova [ptrq+ 9*mmsize], m2 + mova [ptrq+13*mmsize], m3 + mova m7, [rsp+65*mmsize] + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 2*mmsize], m4 + mova [ptrq+ 6*mmsize], m5 + mova [ptrq+10*mmsize], m6 + mova [ptrq+14*mmsize], m7 + UNSCRATCH 0, 8, rsp+(%3+0)*mmsize + UNSCRATCH 1, 9, rsp+(%3+1)*mmsize + UNSCRATCH 2, 10, rsp+(%3+2)*mmsize + UNSCRATCH 3, 11, rsp+(%3+3)*mmsize + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 0*mmsize], m0 + mova [ptrq+ 4*mmsize], m1 + mova [ptrq+ 8*mmsize], m2 + mova [ptrq+12*mmsize], m3 + UNSCRATCH 4, 12, rsp+(%3+4)*mmsize + UNSCRATCH 5, 13, rsp+(%3+5)*mmsize + UNSCRATCH 6, 14, rsp+(%3+6)*mmsize + UNSCRATCH 7, 15, rsp+(%3+7)*mmsize + TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 3*mmsize], m4 + mova [ptrq+ 7*mmsize], m5 + mova [ptrq+11*mmsize], m6 + mova [ptrq+15*mmsize], m7 + add ptrq, 16 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + add skipd, skipd + lea blockq, [blockq+skipq*(mmsize/2)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + mova [ptrq+mmsize*4], m0 + 
mova [ptrq+mmsize*5], m0 + mova [ptrq+mmsize*6], m0 + mova [ptrq+mmsize*7], m0 + add ptrq, 8 * mmsize + dec skipd + jg .loop_z +.end: + + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 4 + mov ptrq, rsp +.loop_2: + %5_1D ptrq + + pxor m7, m7 + lea dstq, [dstq+strideq*4] + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6 + lea dstq, [dstq+strideq*4] + mova m0, [rsp+65*mmsize] + mova m1, [rsp+64*mmsize] + mova m2, [pd_32] + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + +%if ARCH_X86_64 + DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst +%else + mov dstq, dstm +%endif + UNSCRATCH 0, 8, rsp+(%6+0)*mmsize + UNSCRATCH 4, 9, rsp+(%6+1)*mmsize + UNSCRATCH 5, 10, rsp+(%6+2)*mmsize + UNSCRATCH 3, 11, rsp+(%6+3)*mmsize + ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6 +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea dstq, [dstbakq+stride3q*4] +%else + lea dstq, [dstq+stride3q*4] +%endif + UNSCRATCH 4, 12, rsp+(%6+4)*mmsize + UNSCRATCH 5, 13, rsp+(%6+5)*mmsize + UNSCRATCH 6, 14, rsp+(%6+6)*mmsize + UNSCRATCH 0, 15, rsp+(%6+7)*mmsize + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6 + + add ptrq, mmsize +%if ARCH_X86_64 + add dstbakq, 8 + mov dstq, dstbakq +%else + add dword dstm, 8 + mov dstq, dstm +%endif + dec cntd + jg .loop_2 + + ; m7 is still zero + ZERO_BLOCK blockq-4*mmsize, 64, 16, m7 + RET + +cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \ + 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row +IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col +IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default + +%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride + IDCT16_1D %2, 2 * %3, 272, 257 +%if ARCH_X86_64 + mova [rsp+257*mmsize], m8 + mova 
[rsp+258*mmsize], m9 + mova [rsp+259*mmsize], m10 + mova [rsp+260*mmsize], m11 + mova [rsp+261*mmsize], m12 + mova [rsp+262*mmsize], m13 + mova [rsp+263*mmsize], m14 + mova [rsp+264*mmsize], m15 +%endif + mova [rsp+265*mmsize], m0 + mova [rsp+266*mmsize], m1 + mova [rsp+267*mmsize], m2 + mova [rsp+268*mmsize], m3 + mova [rsp+269*mmsize], m4 + mova [rsp+270*mmsize], m5 + mova [rsp+271*mmsize], m6 + + ; r257-260: t0-3 + ; r265-272: t4/5a/6a/7/8/9a/10/11a + ; r261-264: t12a/13/14a/15 + ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit + + mova m0, [%2+ 1*%3] ; in1 + mova m1, [%2+15*%3] ; in15 + mova m2, [%2+17*%3] ; in17 + mova m3, [%2+31*%3] ; in31 + SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a + SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a + SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17 + SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30 + SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a + SCRATCH 0, 8, rsp+275*mmsize + SCRATCH 2, 9, rsp+276*mmsize + + ; end of stage 1-3 first quart + + mova m0, [%2+ 7*%3] ; in7 + mova m2, [%2+ 9*%3] ; in9 + mova m4, [%2+23*%3] ; in23 + mova m5, [%2+25*%3] ; in25 + SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a + SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a + SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18 + SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29 + SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a + + ; end of stage 1-3 second quart + + SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a + SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18 + UNSCRATCH 6, 8, rsp+275*mmsize ; t30a + UNSCRATCH 7, 9, rsp+276*mmsize ; t31 + mova [rsp+273*mmsize], m4 + mova [rsp+274*mmsize], m0 + SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a + SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29 + SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a + SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19 + SCRATCH 3, 10, rsp+277*mmsize + SCRATCH 1, 11, rsp+278*mmsize + SCRATCH 7, 12, rsp+279*mmsize + SCRATCH 6, 13, rsp+280*mmsize + SCRATCH 5, 14, rsp+281*mmsize + 
SCRATCH 2, 15, rsp+282*mmsize + + ; end of stage 4-5 first half + + mova m0, [%2+ 5*%3] ; in5 + mova m1, [%2+11*%3] ; in11 + mova m2, [%2+21*%3] ; in21 + mova m3, [%2+27*%3] ; in27 + SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a + SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a + SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21 + SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26 + SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a + SCRATCH 0, 8, rsp+275*mmsize + SCRATCH 2, 9, rsp+276*mmsize + + ; end of stage 1-3 third quart + + mova m0, [%2+ 3*%3] ; in3 + mova m2, [%2+13*%3] ; in13 + mova m4, [%2+19*%3] ; in19 + mova m5, [%2+29*%3] ; in29 + SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a + SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a + SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22 + SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25 + SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a + + ; end of stage 1-3 fourth quart + + SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a + SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21 + UNSCRATCH 6, 8, rsp+275*mmsize ; t26a + UNSCRATCH 7, 9, rsp+276*mmsize ; t27 + SCRATCH 3, 8, rsp+275*mmsize + SCRATCH 1, 9, rsp+276*mmsize + SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a + SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26 + SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20 + SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a + + ; end of stage 4-5 second half + + UNSCRATCH 1, 12, rsp+279*mmsize ; t28 + UNSCRATCH 3, 13, rsp+280*mmsize ; t29a + SCRATCH 4, 12, rsp+279*mmsize + SCRATCH 0, 13, rsp+280*mmsize + SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26 + SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a + UNSCRATCH 0, 14, rsp+281*mmsize ; t30 + UNSCRATCH 4, 15, rsp+282*mmsize ; t31a + SCRATCH 2, 14, rsp+281*mmsize + SCRATCH 5, 15, rsp+282*mmsize + SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a + SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24 + + mova m2, [rsp+273*mmsize] ; t16a + mova m5, [rsp+274*mmsize] ; t17 + mova [rsp+273*mmsize], m6 + mova [rsp+274*mmsize], m7 + UNSCRATCH 6, 10, rsp+277*mmsize 
; t18a + UNSCRATCH 7, 11, rsp+278*mmsize ; t19 + SCRATCH 4, 10, rsp+277*mmsize + SCRATCH 0, 11, rsp+278*mmsize + UNSCRATCH 4, 12, rsp+279*mmsize ; t20 + UNSCRATCH 0, 13, rsp+280*mmsize ; t21a + SCRATCH 3, 12, rsp+279*mmsize + SCRATCH 1, 13, rsp+280*mmsize + SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21 + SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a + UNSCRATCH 3, 8, rsp+275*mmsize ; t22 + UNSCRATCH 1, 9, rsp+276*mmsize ; t23a + SCRATCH 0, 8, rsp+275*mmsize + SCRATCH 4, 9, rsp+276*mmsize + SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a + SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23 + + ; end of stage 6 + + UNSCRATCH 0, 10, rsp+277*mmsize ; t24 + UNSCRATCH 4, 11, rsp+278*mmsize ; t25a + SCRATCH 1, 10, rsp+277*mmsize + SCRATCH 3, 11, rsp+278*mmsize + SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a + SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22 + UNSCRATCH 1, 12, rsp+279*mmsize ; t26 + UNSCRATCH 3, 13, rsp+280*mmsize ; t27a + SCRATCH 0, 12, rsp+279*mmsize + SCRATCH 4, 13, rsp+280*mmsize + SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20 + SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a + + ; end of stage 7 + + mova m0, [rsp+269*mmsize] ; t8 + mova m4, [rsp+270*mmsize] ; t9a + mova [rsp+269*mmsize], m1 ; t26a + mova [rsp+270*mmsize], m3 ; t27 + mova m3, [rsp+271*mmsize] ; t10 + SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23 + SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22 + SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21 + mova m1, [rsp+272*mmsize] ; t11a + mova [rsp+271*mmsize], m0 + SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20 + +%if %1 == 1 + TRANSPOSE4x4D 2, 5, 6, 7, 0 + mova [ptrq+ 2*mmsize], m2 + mova [ptrq+10*mmsize], m5 + mova [ptrq+18*mmsize], m6 + mova [ptrq+26*mmsize], m7 +%else ; %1 == 2 + pxor m0, m0 + lea dstq, [dstq+strideq*8] + ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 +%endif + mova m2, [rsp+271*mmsize] +%if %1 == 1 + TRANSPOSE4x4D 1, 3, 4, 2, 0 + mova [ptrq+ 5*mmsize], m1 + mova [ptrq+13*mmsize], m3 + mova [ptrq+21*mmsize], m4 + mova [ptrq+29*mmsize], 
m2 +%else ; %1 == 2 + lea dstq, [dstq+stride3q*4] + ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6 +%endif + + ; end of last stage + store for out8-11 and out20-23 + + UNSCRATCH 0, 9, rsp+276*mmsize ; t19a + UNSCRATCH 1, 8, rsp+275*mmsize ; t18 + UNSCRATCH 2, 11, rsp+278*mmsize ; t17a + UNSCRATCH 3, 10, rsp+277*mmsize ; t16 + mova m7, [rsp+261*mmsize] ; t12a + mova m6, [rsp+262*mmsize] ; t13 + mova m5, [rsp+263*mmsize] ; t14a + SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19 + SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18 + SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17 + mova m4, [rsp+264*mmsize] ; t15 + SCRATCH 7, 8, rsp+275*mmsize + SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16 + +%if %1 == 1 + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 3*mmsize], m0 + mova [ptrq+11*mmsize], m1 + mova [ptrq+19*mmsize], m2 + mova [ptrq+27*mmsize], m3 +%else ; %1 == 2 +%if ARCH_X86_64 + SWAP 7, 9 + lea dstq, [dstbakq+stride3q*4] +%else ; x86-32 + pxor m7, m7 + mov dstq, dstm + lea dstq, [dstq+stride3q*4] +%endif + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + UNSCRATCH 0, 8, rsp+275*mmsize ; out19 +%if %1 == 1 + TRANSPOSE4x4D 4, 5, 6, 0, 7 + mova [ptrq+ 4*mmsize], m4 + mova [ptrq+12*mmsize], m5 + mova [ptrq+20*mmsize], m6 + mova [ptrq+28*mmsize], m0 +%else ; %1 == 2 + lea dstq, [dstq+strideq*4] + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + + ; end of last stage + store for out12-19 + +%if ARCH_X86_64 + SWAP 7, 8 +%endif + mova m7, [rsp+257*mmsize] ; t0 + mova m6, [rsp+258*mmsize] ; t1 + mova m5, [rsp+259*mmsize] ; t2 + mova m4, [rsp+260*mmsize] ; t3 + mova m0, [rsp+274*mmsize] ; t31 + mova m1, [rsp+273*mmsize] ; t30a + UNSCRATCH 2, 15, rsp+282*mmsize ; t29 + SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31 + SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30 + SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29 + SCRATCH 0, 9, rsp+276*mmsize + UNSCRATCH 3, 14, rsp+281*mmsize ; t28a + SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28 + +%if %1 == 1 + 
TRANSPOSE4x4D 4, 5, 6, 7, 0 + mova [ptrq+ 7*mmsize], m4 + mova [ptrq+15*mmsize], m5 + mova [ptrq+23*mmsize], m6 + mova [ptrq+31*mmsize], m7 +%else ; %1 == 2 +%if ARCH_X86_64 + SWAP 0, 8 +%else ; x86-32 + pxor m0, m0 +%endif + lea dstq, [dstq+stride3q*4] + ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6 +%endif + UNSCRATCH 7, 9, rsp+276*mmsize ; out0 +%if %1 == 1 + TRANSPOSE4x4D 7, 1, 2, 3, 0 + mova [ptrq+ 0*mmsize], m7 + mova [ptrq+ 8*mmsize], m1 + mova [ptrq+16*mmsize], m2 + mova [ptrq+24*mmsize], m3 +%else ; %1 == 2 +%if ARCH_X86_64 + DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst +%else ; x86-32 + mov dstq, dstm +%endif + ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6 +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak +%endif +%endif + + ; end of last stage + store for out0-3 and out28-31 + +%if ARCH_X86_64 + SWAP 0, 8 +%endif + mova m7, [rsp+265*mmsize] ; t4 + mova m6, [rsp+266*mmsize] ; t5a + mova m5, [rsp+267*mmsize] ; t6a + mova m4, [rsp+268*mmsize] ; t7 + mova m0, [rsp+270*mmsize] ; t27 + mova m1, [rsp+269*mmsize] ; t26a + UNSCRATCH 2, 13, rsp+280*mmsize ; t25 + SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27 + SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26 + SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25 + UNSCRATCH 3, 12, rsp+279*mmsize ; t24a + SCRATCH 7, 9, rsp+276*mmsize + SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24 + +%if %1 == 1 + TRANSPOSE4x4D 0, 1, 2, 3, 7 + mova [ptrq+ 1*mmsize], m0 + mova [ptrq+ 9*mmsize], m1 + mova [ptrq+17*mmsize], m2 + mova [ptrq+25*mmsize], m3 +%else ; %1 == 2 +%if ARCH_X86_64 + SWAP 7, 8 + lea dstq, [dstbakq+strideq*4] +%else ; x86-32 + pxor m7, m7 + lea dstq, [dstq+strideq*4] +%endif + ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + UNSCRATCH 0, 9, rsp+276*mmsize ; out27 +%if %1 == 1 + TRANSPOSE4x4D 4, 5, 6, 0, 7 + mova [ptrq+ 6*mmsize], m4 + mova [ptrq+14*mmsize], m5 + mova [ptrq+22*mmsize], m6 + mova [ptrq+30*mmsize], m0 +%else ; %1 == 2 +%if 
ARCH_X86_64 + lea dstq, [dstbakq+stride3q*8] +%else + mov dstq, dstm + lea dstq, [dstq+stride3q*8] +%endif + ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6 +%endif + + ; end of last stage + store for out4-7 and out24-27 +%endmacro + +; vp9_idct_idct_32x32_add_10(dst, stride, block, eob) +; eob <= 1 takes the dc-only path; larger values jump to .idctfull, which runs +; IDCT32_1D as pass 1 into a buffer at rsp and then as pass 2 from that buffer. +; m0 = [pw_1023] is passed to STORE_2x8 (dc path) or saved at [rsp+256*mmsize] +; for ROUND_AND_STORE_4x4 (full path); presumably the 10-bit pixel maximum used +; for clamping - confirm against those macros. +INIT_XMM sse2 +cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \ + 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_1023] + cmp eobd, 1 + jg .idctfull + + ; dc-only - the 10bit version can be done entirely in 32bit, since the max + ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily + ; fits in 32bit + DEFINE_ARGS dst, stride, block, coef + pxor m2, m2 + DC_ONLY 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 32 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize + add dstq, strideq + dec cntd + jg .loop_dc + RET + +.idctfull: + mova [rsp+256*mmsize], m0 + DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak +%if ARCH_X86_64 + mov dstbakq, dstq + movsxd cntq, cntd +%endif + ; cnt aliases the 4th argument (eob); replace it with default_32x32[eob-1], + ; which is used below as the number of pass-1 iterations +%ifdef PIC + lea ptrq, [default_32x32] + movzx cntd, byte [ptrq+cntq-1] +%else + movzx cntd, byte [default_32x32+cntq-1] +%endif + mov skipd, 8 + sub skipd, cntd ; skip = 8 - cnt + mov ptrq, rsp +.loop_1: + IDCT32_1D 1, blockq + + add ptrq, 32 * mmsize + add blockq, mmsize + dec cntd + jg .loop_1 + + ; zero-pad the remainder (skipped cols) + test skipd, skipd + jz .end + ; each skipped pass-1 iteration would have advanced ptr by 32*mmsize and + ; block by mmsize: run 4*skip rounds of 8*mmsize zero stores and move + ; blockq forward by skip*mmsize + shl skipd, 2 + lea blockq, [blockq+skipq*(mmsize/4)] + pxor m0, m0 +.loop_z: + mova [ptrq+mmsize*0], m0 + mova [ptrq+mmsize*1], m0 + mova [ptrq+mmsize*2], m0 + mova [ptrq+mmsize*3], m0 + mova [ptrq+mmsize*4], m0 + mova [ptrq+mmsize*5], m0 + mova [ptrq+mmsize*6], m0 + mova [ptrq+mmsize*7], m0 + add ptrq, 8 * mmsize + dec skipd + jg .loop_z +.end: + + ; pass 2: 8 iterations, each reading from ptr (stepped by mmsize) and + ; restarting dst 8 bytes further along than the previous iteration + DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak + lea stride3q, [strideq*3] + mov cntd, 8 + mov ptrq, rsp +.loop_2: + IDCT32_1D 2, ptrq + + add ptrq, mmsize +%if 
ARCH_X86_64 + add dstbakq, 8 + mov dstq, dstbakq +%else + add dword dstm, 8 + mov dstq, dstm +%endif + dec cntd + jg .loop_2 + + ; m7 is still zero + ; blockq was moved forward by 8*mmsize in total above (cnt + skip = 8 steps + ; of mmsize), so blockq-8*mmsize is where it pointed on entry to .idctfull, + ; assuming IDCT32_1D itself leaves blockq unchanged + ZERO_BLOCK blockq-8*mmsize, 128, 32, m7 + RET + +; vp9_idct_idct_32x32_add_12(dst, stride, block, eob) +; Same layout as the _10 function with m0 = [pw_4095]; the dc-only path uses +; DC_ONLY_64BIT, and eob > 1 jumps into the _10 function's .idctfull code. +INIT_XMM sse2 +cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \ + 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \ + dst, stride, block, eob + mova m0, [pw_4095] + cmp eobd, 1 + jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull + + ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign + ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies + DEFINE_ARGS dst, stride, block, coef, coefl + pxor m2, m2 + DC_ONLY_64BIT 6, m2 + movd m1, coefd + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + DEFINE_ARGS dst, stride, cnt + mov cntd, 32 +.loop_dc: + STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize + STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize + add dstq, strideq + dec cntd + jg .loop_dc + RET diff --git a/media/ffvpx/libavcodec/x86/vp9itxfm_template.asm b/media/ffvpx/libavcodec/x86/vp9itxfm_template.asm new file mode 100644 index 0000000000..d2f2257d84 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9itxfm_template.asm @@ -0,0 +1,142 @@ +;****************************************************************************** +;* VP9 IDCT SIMD optimizations +;* +;* Copyright (C) 2013 Clément Bœsch <u pkh me> +;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; One 1-D pass over the four word vectors m0-m3 built from word adds and +; subtracts plus a single arithmetic >>1; m4 and m5 are used as temporaries. +; Presumably the 4-point inverse Walsh-Hadamard transform, going by the name. +%macro VP9_IWHT4_1D 0 + SWAP 1, 2, 3 + paddw m0, m2 + psubw m3, m1 + psubw m4, m0, m3 + psraw m4, 1 + psubw m5, m4, m1 + SWAP 5, 1 + psubw m4, m2 + SWAP 4, 2 + psubw m0, m1 + paddw m3, m2 + SWAP 3, 2, 1 +%endmacro + +; (a*x + b*y + round) >> shift +; shift is fixed at 14; m%1 receives the pmaddwd result for coefs1, and m%2 +; (the interleaved source) is overwritten with the result for coefs2 +%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2 + pmaddwd m%1, m%2, %4 + pmaddwd m%2, %5 + paddd m%1, %3 + paddd m%2, %3 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +; Runs VP9_MULSUB_2W_2X on the two interleaved sources m%6 and m%2 with the +; constant pair [pw_m<coef1>_<coef2>] / [pw_<coef2>_<coef1>], then packs the +; dword results back to saturated words in m%1 and m%2 +%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2 + VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3] + VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; Interleaves two word vectors (punpckhwd/punpcklwd) and feeds the halves to +; VP9_MULSUB_2W_4X. With 7 arguments dst1/dst2 double as the sources; with 9 +; arguments the sources are passed separately as %3/%4 +%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 +%if %0 == 7 + punpckhwd m%6, m%2, m%1 + punpcklwd m%2, m%1 + VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7 +%else + punpckhwd m%8, m%4, m%3 + punpcklwd m%2, m%4, m%3 + VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9 +%endif +%endmacro + +; Last butterfly stage of the 4-point IDCT: combines t0..t3 (m4 is the +; temporary) and renames the registers so the results sit in m0..m3 order +%macro VP9_IDCT4_1D_FINALIZE 0 + SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0 + SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1 + SWAP 0, 3, 2 ; 3102 -> 0123 +%endmacro + +; 1-D 4-point IDCT over m0-m3, with m4/m5 as temporaries. The ssse3 path +; multiplies by m6 with pmulhrsw, and m7 is passed as the rounding operand of +; VP9_UNPACK_MULSUB_2W_4X; both are presumably preloaded by the code that +; invokes this macro +%macro VP9_IDCT4_1D 0 +%if cpuflag(ssse3) + SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2) + pmulhrsw m2, m6 ; m2=t0 + pmulhrsw m0, m6 ; m0=t1 +%else ; <= sse2 + VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0 +%endif + VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro + +; 1-D 4-point transform over m0-m3 (presumably the inverse ADST, going by the +; name). The inputs are moved to xmm0-xmm3 with movq2dq, the pmaddwd sums are +; formed in xmm registers, and the results are moved back with movdq2q. +; xmm5 is added to each sum before the >>14, so it is presumably preloaded +; with the rounding constant by the code that invokes this macro +%macro VP9_IADST4_1D 0 + movq2dq xmm0, m0 + movq2dq xmm1, m1 + movq2dq xmm2, m2 + movq2dq xmm3, m3 +%if cpuflag(ssse3) + paddw m3, m0 +%endif + punpcklwd xmm0, xmm1 + punpcklwd 
xmm2, xmm3 + pmaddwd xmm1, xmm0, [pw_5283_13377] + pmaddwd xmm4, xmm0, [pw_9929_13377] +%if notcpuflag(ssse3) + pmaddwd xmm6, xmm0, [pw_13377_0] +%endif + pmaddwd xmm0, [pw_15212_m13377] + pmaddwd xmm3, xmm2, [pw_15212_9929] +%if notcpuflag(ssse3) + pmaddwd xmm7, xmm2, [pw_m13377_13377] +%endif + pmaddwd xmm2, [pw_m5283_m15212] +%if cpuflag(ssse3) + psubw m3, m2 +%else + paddd xmm6, xmm7 +%endif + paddd xmm0, xmm2 + paddd xmm3, xmm5 + paddd xmm2, xmm5 +%if notcpuflag(ssse3) + paddd xmm6, xmm5 +%endif + paddd xmm1, xmm3 + paddd xmm0, xmm3 + paddd xmm4, xmm2 + psrad xmm1, 14 + psrad xmm0, 14 + psrad xmm4, 14 +%if cpuflag(ssse3) + pmulhrsw m3, [pw_13377x2] ; out2 +%else + psrad xmm6, 14 +%endif + packssdw xmm0, xmm0 + packssdw xmm1, xmm1 + packssdw xmm4, xmm4 +%if notcpuflag(ssse3) + packssdw xmm6, xmm6 +%endif + movdq2q m0, xmm0 ; out3 + movdq2q m1, xmm1 ; out0 + movdq2q m2, xmm4 ; out1 +%if notcpuflag(ssse3) + movdq2q m3, xmm6 ; out2 +%endif + SWAP 0, 1, 2, 3 +%endmacro diff --git a/media/ffvpx/libavcodec/x86/vp9lpf.asm b/media/ffvpx/libavcodec/x86/vp9lpf.asm new file mode 100644 index 0000000000..4e7ede2235 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9lpf.asm @@ -0,0 +1,1211 @@ +;****************************************************************************** +;* VP9 loop filter SIMD optimizations +;* +;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me> +;* Copyright (C) 2014 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pb_3 +cextern pb_80 + +pb_4: times 16 db 0x04 +pb_10: times 16 db 0x10 +pb_40: times 16 db 0x40 +pb_81: times 16 db 0x81 +pb_f8: times 16 db 0xf8 +pb_fe: times 16 db 0xfe +pb_ff: times 16 db 0xff + +cextern pw_4 +cextern pw_8 + +; with mix functions, two 8-bit thresholds are stored in a 16-bit storage, +; the following mask is used to splat both in the same register +mask_mix: times 8 db 0 + times 8 db 1 + +mask_mix84: times 8 db 0xff + times 8 db 0x00 +mask_mix48: times 8 db 0x00 + times 8 db 0xff + +SECTION .text + +; Set register number %1 aside: when m8 is defined it is only renamed to +; register number %2 (SWAP), otherwise it is stored to memory at [%3] +%macro SCRATCH 3 +%ifdef m8 + SWAP %1, %2 +%else + mova [%3], m%1 +%endif +%endmacro + +; Counterpart of SCRATCH: bring the value back into register number %1, by +; renaming from register number %2 or by loading it from [%3] +%macro UNSCRATCH 3 +%ifdef m8 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%endmacro + +; %1 = abs(%2-%3) +; (unsigned bytes: the OR of the two saturating differences %3-%2 and %2-%3) +%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp +%ifdef m8 + psubusb %1, %3, %2 + psubusb %4, %2, %3 +%else + mova %1, %3 + mova %4, %2 + psubusb %1, %2 + psubusb %4, %3 +%endif + por %1, %4 +%endmacro + +; %1 = %1>%2 +; (signed byte compare; when a third argument is given it is XORed into %1 first) +%macro CMP_GT 2-3 ; src/dst, cmp, pb_80 +%if %0 == 3 + pxor %1, %3 +%endif + pcmpgtb %1, %2 +%endmacro + +; %1 = abs(%2-%3) > %4 +%macro ABSSUB_GT 5-6 [pb_80]; dst, src1, src2, cmp, tmp, [pb_80] + ABSSUB %1, %2, %3, %5 ; dst = abs(src1-src2) + CMP_GT %1, %4, %6 ; dst > cmp +%endmacro + +%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp + pand %1, %3 ; new &= mask + pandn %4, %3, %2 ; tmp = ~mask & old + por %1, %4 ; new&mask | old&~mask +%endmacro + +; punpck<%1>bw of %3 with %4 into %2 (%1 = l or h). Uses the three-operand +; form when m8 is defined, and a mova plus the two-operand form otherwise +%macro UNPACK 4 +%ifdef m8 + punpck%1bw %2, %3, %4 +%else + mova %2, %3 + punpck%1bw %2, %4 +%endif +%endmacro + +; One running-sum step on word rows kept at [rsp+stack_off+k*mmsize*2]: +; cache -= row[sub1] + row[sub2]; cache += row[add1] + row[add2]; +; dst = cache >> rshift. When %10 is non-empty, row[add2] is first unpacked +; from %10 against m0 (presumably zero) and stored to its stack slot +%macro FILTER_SUBx2_ADDx2 11 ; %1=dst %2=h/l %3=cache %4=stack_off %5=sub1 %6=sub2 %7=add1 + ; %8=add2 
%9=rshift, [unpack], [unpack_is_mem_on_x86_32] + psubw %3, [rsp+%4+%5*mmsize*2] + psubw %3, [rsp+%4+%6*mmsize*2] + paddw %3, [rsp+%4+%7*mmsize*2] +%ifnidn %10, "" +%if %11 == 0 + punpck%2bw %1, %10, m0 +%else + UNPACK %2, %1, %10, m0 +%endif + mova [rsp+%4+%8*mmsize*2], %1 + paddw %3, %1 +%else + paddw %3, [rsp+%4+%8*mmsize*2] +%endif + psraw %1, %3, %9 +%endmacro + +; FIXME interleave l/h better (for instruction pairing) +; Runs FILTER<filterid>_INIT for the low and the high byte half, packs both +; results to bytes, blends them with source under mask and stores to dstp +%macro FILTER_INIT 9 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, filterid, mask, source + FILTER%7_INIT %1, l, %3, %6 + 0 + FILTER%7_INIT %2, h, %4, %6 + mmsize + packuswb %1, %2 + MASK_APPLY %1, %9, %8, %2 + mova %5, %1 +%endmacro + + +; One FILTER_SUBx2_ADDx2 step for each byte half, then pack to bytes, blend +; under mask with source (or with the current contents of dstp when source is +; empty) and store to dstp. A non-empty %15 is loaded into %14 first, unless +; %16 is non-zero on x86-32 +%macro FILTER_UPDATE 12-16 "", "", "", 0 ; tmp1, tmp2, cacheL, cacheH, dstp, stack_off, -, -, +, +, rshift, + ; mask, [source], [unpack + src], [unpack_is_mem_on_x86_32] +; FIXME interleave this properly with the subx2/addx2 +%ifnidn %15, "" +%if %16 == 0 || ARCH_X86_64 + mova %14, %15 +%endif +%endif + FILTER_SUBx2_ADDx2 %1, l, %3, %6 + 0, %7, %8, %9, %10, %11, %14, %16 + FILTER_SUBx2_ADDx2 %2, h, %4, %6 + mmsize, %7, %8, %9, %10, %11, %14, %16 + packuswb %1, %2 +%ifnidn %13, "" + MASK_APPLY %1, %13, %12, %2 +%else + MASK_APPLY %1, %5, %12, %2 +%endif + mova %5, %1 +%endmacro + +; Per-byte arithmetic >>3 of %1 and %2: clear the low 3 bits of every byte +; (pb_f8), shift the whole register right by 3, then sign-extend each 5-bit +; result with (x ^ %3) - %3 +%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp + mova %4, [pb_f8] + pand %1, %4 + pand %2, %4 + psrlq %1, 3 + psrlq %2, 3 + pxor %1, %3 + pxor %2, %3 + psubb %1, %3 + psubb %2, %3 +%endmacro + +; Splits the signed bytes of %1 into two non-negative parts: +; %2 = -x where x < 0 (0 elsewhere), %3 = x where x >= 0 (0 elsewhere) +%macro EXTRACT_POS_NEG 3 ; i8, neg, pos + pxor %3, %3 + pxor %2, %2 + pcmpgtb %3, %1 ; i8 < 0 mask + psubb %2, %1 ; negated values (only the originally negative ones will be kept) + pand %2, %3 ; negative values of i8 (but stored as +) + pandn %3, %1 ; positive values of i8 +%endmacro + +; clip_u8(u8 + i8) +%macro SIGN_ADD 4 ; dst, u8, i8, tmp1 + EXTRACT_POS_NEG %3, %4, %1 + paddusb %1, %2 ; add the positives + psubusb %1, %4 ; sub the negatives +%endmacro + +; clip_u8(u8 - i8) +%macro SIGN_SUB 4 ; dst, u8, i8, tmp1 + EXTRACT_POS_NEG %3, %1, %4 + paddusb %1, 
%2 ; add the negatives + psubusb %1, %4 ; sub the positives +%endmacro + +; Starts the filter6 running sum for one byte half: unpacks p3, p2, p1, p0, q0 +; to words (p2 is taken from m1, the zero for unpacking from m0), saves them in +; stack slots 0..4 and leaves the unshifted sum in the cache register %3 +%macro FILTER6_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off + UNPACK %2, %1, rp3, m0 ; p3: B->W + mova [rsp+%4+0*mmsize*2], %1 + paddw %3, %1, %1 ; p3*2 + paddw %3, %1 ; p3*3 + punpck%2bw %1, m1, m0 ; p2: B->W + mova [rsp+%4+1*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2 + paddw %3, %1 ; p3*3 + p2*2 + UNPACK %2, %1, rp1, m0 ; p1: B->W + mova [rsp+%4+2*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + UNPACK %2, %1, rp0, m0 ; p0: B->W + mova [rsp+%4+3*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + UNPACK %2, %1, rq0, m0 ; q0: B->W + mova [rsp+%4+4*mmsize*2], %1 + paddw %3, %1 ; p3*3 + p2*2 + p1 + p0 + q0 + paddw %3, [pw_4] ; p3*3 + p2*2 + p1 + p0 + q0 + 4 + psraw %1, %3, 3 ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3 +%endmacro + +; Starts the filter14 running sum for one byte half: unpacks p7 (from m2), p6 +; (from m3), p5 and p4 into stack slots 8..11 and adds the word rows already +; sitting in slots 0..4 (p3..q0 in FILTER6_INIT's layout) +%macro FILTER14_INIT 4 ; %1=dst %2=h/l %3=cache, %4=stack_off + punpck%2bw %1, m2, m0 ; p7: B->W + mova [rsp+%4+ 8*mmsize*2], %1 + psllw %3, %1, 3 ; p7*8 + psubw %3, %1 ; p7*7 + punpck%2bw %1, m3, m0 ; p6: B->W + mova [rsp+%4+ 9*mmsize*2], %1 + paddw %3, %1 ; p7*7 + p6 + paddw %3, %1 ; p7*7 + p6*2 + UNPACK %2, %1, rp5, m0 ; p5: B->W + mova [rsp+%4+10*mmsize*2], %1 + paddw %3, %1 ; p7*7 + p6*2 + p5 + UNPACK %2, %1, rp4, m0 ; p4: B->W + mova [rsp+%4+11*mmsize*2], %1 + paddw %3, %1 ; p7*7 + p6*2 + p5 + p4 + paddw %3, [rsp+%4+ 0*mmsize*2] ; p7*7 + p6*2 + p5 + p4 + p3 + paddw %3, [rsp+%4+ 1*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p2 + paddw %3, [rsp+%4+ 2*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p1 + paddw %3, [rsp+%4+ 3*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + paddw %3, [rsp+%4+ 4*mmsize*2] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + paddw %3, [pw_8] ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8 + psraw %1, %3, 4 ; (p7*7 + p6*2 + p5 + .. 
+ p0 + q0 + 8) >> 4 +%endmacro + +; 16x16 byte transpose of register numbers %1..%16. %17 is a memory operand +; used to spill one register at a time, because every SBUTTERFLY needs a +; temporary register and all 16 are holding data +%macro TRANSPOSE16x16B 17 + mova %17, m%16 + SBUTTERFLY bw, %1, %2, %16 + SBUTTERFLY bw, %3, %4, %16 + SBUTTERFLY bw, %5, %6, %16 + SBUTTERFLY bw, %7, %8, %16 + SBUTTERFLY bw, %9, %10, %16 + SBUTTERFLY bw, %11, %12, %16 + SBUTTERFLY bw, %13, %14, %16 + mova m%16, %17 + mova %17, m%14 + SBUTTERFLY bw, %15, %16, %14 + SBUTTERFLY wd, %1, %3, %14 + SBUTTERFLY wd, %2, %4, %14 + SBUTTERFLY wd, %5, %7, %14 + SBUTTERFLY wd, %6, %8, %14 + SBUTTERFLY wd, %9, %11, %14 + SBUTTERFLY wd, %10, %12, %14 + SBUTTERFLY wd, %13, %15, %14 + mova m%14, %17 + mova %17, m%12 + SBUTTERFLY wd, %14, %16, %12 + SBUTTERFLY dq, %1, %5, %12 + SBUTTERFLY dq, %2, %6, %12 + SBUTTERFLY dq, %3, %7, %12 + SBUTTERFLY dq, %4, %8, %12 + SBUTTERFLY dq, %9, %13, %12 + SBUTTERFLY dq, %10, %14, %12 + SBUTTERFLY dq, %11, %15, %12 + mova m%12, %17 + mova %17, m%8 + SBUTTERFLY dq, %12, %16, %8 + SBUTTERFLY qdq, %1, %9, %8 + SBUTTERFLY qdq, %2, %10, %8 + SBUTTERFLY qdq, %3, %11, %8 + SBUTTERFLY qdq, %4, %12, %8 + SBUTTERFLY qdq, %5, %13, %8 + SBUTTERFLY qdq, %6, %14, %8 + SBUTTERFLY qdq, %7, %15, %8 + mova m%8, %17 + mova %17, m%1 + SBUTTERFLY qdq, %8, %16, %1 + mova m%1, %17 + SWAP %2, %9 + SWAP %3, %5 + SWAP %4, %13 + SWAP %6, %11 + SWAP %8, %15 + SWAP %12, %14 +%endmacro + +; Byte transpose using 8 registers (numbers %1..%8). Register %7 is only +; loaded inside the macro, from memory operand %9 with movdq<%10> (a/u); +; %11 is a memory scratch slot, and %12/%13 are memory operands that receive +; the low/high halves of m%2 before that register is reused as a temporary +%macro TRANSPOSE8x8B 13 + SBUTTERFLY bw, %1, %2, %7 + movdq%10 m%7, %9 + movdqa %11, m%2 + SBUTTERFLY bw, %3, %4, %2 + SBUTTERFLY bw, %5, %6, %2 + SBUTTERFLY bw, %7, %8, %2 + SBUTTERFLY wd, %1, %3, %2 + movdqa m%2, %11 + movdqa %11, m%3 + SBUTTERFLY wd, %2, %4, %3 + SBUTTERFLY wd, %5, %7, %3 + SBUTTERFLY wd, %6, %8, %3 + SBUTTERFLY dq, %1, %5, %3 + SBUTTERFLY dq, %2, %6, %3 + movdqa m%3, %11 + movh %12, m%2 + movhps %13, m%2 + SBUTTERFLY dq, %3, %7, %2 + SBUTTERFLY dq, %4, %8, %2 + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +; Defines P7..P0 / Q0..Q7 as address expressions relative to dstq (P7..Q0) and +; dst2q (Q1..Q7), built from strideq/stride3q and mstrideq/mstride3q, plus an +; optional byte offset %1 (default 0) +%macro DEFINE_REAL_P7_TO_Q7 0-1 0 +%define P7 dstq + 4*mstrideq + %1 +%define P6 dstq + mstride3q + %1 +%define P5 dstq + 2*mstrideq + %1 +%define P4 dstq + mstrideq + %1 +%define P3 dstq 
+ %1 +%define P2 dstq + strideq + %1 +%define P1 dstq + 2* strideq + %1 +%define P0 dstq + stride3q + %1 +%define Q0 dstq + 4* strideq + %1 +%define Q1 dst2q + mstride3q + %1 +%define Q2 dst2q + 2*mstrideq + %1 +%define Q3 dst2q + mstrideq + %1 +%define Q4 dst2q + %1 +%define Q5 dst2q + strideq + %1 +%define Q6 dst2q + 2* strideq + %1 +%define Q7 dst2q + stride3q + %1 +%endmacro + +; Redefines the same names as stack slots at rsp, mmsize bytes apart, plus an +; optional byte offset %1 (default 0). P3..Q3 take slots 0..7; the outer eight +; (P7..P4, Q4..Q7) take slots 8..15 and are only defined when mmsize == 16 +%macro DEFINE_TRANSPOSED_P7_TO_Q7 0-1 0 +%define P3 rsp + 0*mmsize + %1 +%define P2 rsp + 1*mmsize + %1 +%define P1 rsp + 2*mmsize + %1 +%define P0 rsp + 3*mmsize + %1 +%define Q0 rsp + 4*mmsize + %1 +%define Q1 rsp + 5*mmsize + %1 +%define Q2 rsp + 6*mmsize + %1 +%define Q3 rsp + 7*mmsize + %1 +%if mmsize == 16 +%define P7 rsp + 8*mmsize + %1 +%define P6 rsp + 9*mmsize + %1 +%define P5 rsp + 10*mmsize + %1 +%define P4 rsp + 11*mmsize + %1 +%define Q4 rsp + 12*mmsize + %1 +%define Q5 rsp + 13*mmsize + %1 +%define Q6 rsp + 14*mmsize + %1 +%define Q7 rsp + 15*mmsize + %1 +%endif +%endmacro + +; ..............AB -> AAAAAAAABBBBBBBB +%macro SPLATB_MIX 1-2 [mask_mix] +%if cpuflag(ssse3) + pshufb %1, %2 +%else + punpcklbw %1, %1 + punpcklwd %1, %1 + punpckldq %1, %1 +%endif +%endmacro + +%macro LOOPFILTER 5 ; %1=v/h %2=size1 %3+%4=stack, %5=mmx/32bit stack only +%assign %%ext 0 +%if ARCH_X86_32 || mmsize == 8 +%assign %%ext %5 +%endif + +%if UNIX64 +cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 5, 9, 16, %3 + %4 + %%ext, dst, stride, E, I, H, mstride, dst2, stride3, mstride3 +%else +%if WIN64 +cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 4, 8, 16, %3 + %4 + %%ext, dst, stride, E, I, mstride, dst2, stride3, mstride3 +%else +cglobal vp9_loop_filter_%1_%2_ %+ mmsize, 2, 6, 16, %3 + %4 + %%ext, dst, stride, mstride, dst2, stride3, mstride3 +%define Ed dword r2m +%define Id dword r3m +%endif +%define Hd dword r4m +%endif + + mov mstrideq, strideq + neg mstrideq + + lea stride3q, [strideq*3] + lea mstride3q, [mstrideq*3] + +%ifidn %1, h +%if %2 != 16 +%if mmsize == 16 +%define movx movh +%else +%define 
movx mova +%endif + lea dstq, [dstq + 4*strideq - 4] +%else +%define movx movu + lea dstq, [dstq + 4*strideq - 8] ; go from top center (h pos) to center left (v pos) +%endif +%else + lea dstq, [dstq + 4*mstrideq] +%endif + ; FIXME we shouldn't need two dts registers if mmsize == 8 + lea dst2q, [dstq + 8*strideq] + + DEFINE_REAL_P7_TO_Q7 + +%ifidn %1, h + movx m0, [P7] + movx m1, [P6] + movx m2, [P5] + movx m3, [P4] + movx m4, [P3] + movx m5, [P2] +%if (ARCH_X86_64 && mmsize == 16) || %2 > 16 + movx m6, [P1] +%endif + movx m7, [P0] +%ifdef m8 + movx m8, [Q0] + movx m9, [Q1] + movx m10, [Q2] + movx m11, [Q3] + movx m12, [Q4] + movx m13, [Q5] + movx m14, [Q6] + movx m15, [Q7] + DEFINE_TRANSPOSED_P7_TO_Q7 +%if %2 == 16 + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + mova [P7], m0 + mova [P6], m1 + mova [P5], m2 + mova [P4], m3 +%else ; %2 == 44/48/84/88 + ; 8x16 transpose + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + punpcklbw m8, m9 + punpcklbw m10, m11 + punpcklbw m12, m13 + punpcklbw m14, m15 + TRANSPOSE8x8W 0, 2, 4, 6, 8, 10, 12, 14, 15 + SWAP 0, 4 + SWAP 2, 5 + SWAP 0, 6 + SWAP 0, 7 + SWAP 10, 9 + SWAP 12, 10 + SWAP 14, 11 +%endif ; %2 + mova [P3], m4 + mova [P2], m5 + mova [P1], m6 + mova [P0], m7 + mova [Q0], m8 + mova [Q1], m9 + mova [Q2], m10 + mova [Q3], m11 +%if %2 == 16 + mova [Q4], m12 + mova [Q5], m13 + mova [Q6], m14 + mova [Q7], m15 +%endif ; %2 +%else ; x86-32 +%if %2 == 16 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [P1], u, [rsp+%3+%4], [rsp+64], [rsp+80] + DEFINE_TRANSPOSED_P7_TO_Q7 + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_REAL_P7_TO_Q7 + movx m0, [Q0] + movx m1, [Q1] + movx m2, [Q2] + movx m3, [Q3] + movx m4, [Q4] + movx m5, [Q5] + movx m7, [Q7] + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, 
[Q6], u, [rsp+%3+%4], [rsp+72], [rsp+88] + DEFINE_TRANSPOSED_P7_TO_Q7 8 + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_TRANSPOSED_P7_TO_Q7 +%elif %2 > 16 ; %2 == 44/48/84/88 + punpcklbw m0, m1 + punpcklbw m2, m3 + punpcklbw m4, m5 + punpcklbw m6, m7 + movx m1, [Q0] + movx m3, [Q1] + movx m5, [Q2] + movx m7, [Q3] + punpcklbw m1, m3 + punpcklbw m5, m7 + movx m3, [Q4] + movx m7, [Q5] + punpcklbw m3, m7 + mova [rsp], m3 + movx m3, [Q6] + movx m7, [Q7] + punpcklbw m3, m7 + DEFINE_TRANSPOSED_P7_TO_Q7 + TRANSPOSE8x8W 0, 2, 4, 6, 1, 5, 7, 3, [rsp], [Q0], 1 + mova [P3], m0 + mova [P2], m2 + mova [P1], m4 + mova [P0], m6 + mova [Q1], m5 + mova [Q2], m7 + mova [Q3], m3 +%else ; %2 == 4 || %2 == 8 + SBUTTERFLY bw, 0, 1, 6 + SBUTTERFLY bw, 2, 3, 6 + SBUTTERFLY bw, 4, 5, 6 + mova [rsp+4*mmsize], m5 + mova m6, [P1] + SBUTTERFLY bw, 6, 7, 5 + DEFINE_TRANSPOSED_P7_TO_Q7 + TRANSPOSE4x4W 0, 2, 4, 6, 5 + mova [P3], m0 + mova [P2], m2 + mova [P1], m4 + mova [P0], m6 + mova m5, [rsp+4*mmsize] + TRANSPOSE4x4W 1, 3, 5, 7, 0 + mova [Q0], m1 + mova [Q1], m3 + mova [Q2], m5 + mova [Q3], m7 +%endif ; %2 +%endif ; x86-32/64 +%endif ; %1 == h + + ; calc fm mask +%if %2 == 16 || mmsize == 8 +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m2, I, m0 ; I I I I ... + SPLATB_REG m3, E, m0 ; E E E E ... +%else +%if cpuflag(ssse3) + mova m0, [mask_mix] +%endif + movd m2, Id + movd m3, Ed + SPLATB_MIX m2, m0 + SPLATB_MIX m3, m0 +%endif + mova m0, [pb_80] + pxor m2, m0 + pxor m3, m0 +%ifdef m8 +%ifidn %1, v + mova m8, [P3] + mova m9, [P2] + mova m10, [P1] + mova m11, [P0] + mova m12, [Q0] + mova m13, [Q1] + mova m14, [Q2] + mova m15, [Q3] +%else + ; In case of horizontal, P3..Q3 are already present in some registers due + ; to the previous transpose, so we just swap registers. 
+ SWAP 8, 4, 12 + SWAP 9, 5, 13 + SWAP 10, 6, 14 + SWAP 11, 7, 15 +%endif +%define rp3 m8 +%define rp2 m9 +%define rp1 m10 +%define rp0 m11 +%define rq0 m12 +%define rq1 m13 +%define rq2 m14 +%define rq3 m15 +%else +%define rp3 [P3] +%define rp2 [P2] +%define rp1 [P1] +%define rp0 [P0] +%define rq0 [Q0] +%define rq1 [Q1] +%define rq2 [Q2] +%define rq3 [Q3] +%endif + ABSSUB_GT m5, rp3, rp2, m2, m7, m0 ; m5 = abs(p3-p2) <= I + ABSSUB_GT m1, rp2, rp1, m2, m7, m0 ; m1 = abs(p2-p1) <= I + por m5, m1 + ABSSUB_GT m1, rp1, rp0, m2, m7, m0 ; m1 = abs(p1-p0) <= I + por m5, m1 + ABSSUB_GT m1, rq0, rq1, m2, m7, m0 ; m1 = abs(q1-q0) <= I + por m5, m1 + ABSSUB_GT m1, rq1, rq2, m2, m7, m0 ; m1 = abs(q2-q1) <= I + por m5, m1 + ABSSUB_GT m1, rq2, rq3, m2, m7, m0 ; m1 = abs(q3-q2) <= I + por m5, m1 + ABSSUB m1, rp0, rq0, m7 ; abs(p0-q0) + paddusb m1, m1 ; abs(p0-q0) * 2 + ABSSUB m2, rp1, rq1, m7 ; abs(p1-q1) + pand m2, [pb_fe] ; drop lsb so shift can work + psrlq m2, 1 ; abs(p1-q1)/2 + paddusb m1, m2 ; abs(p0-q0)*2 + abs(p1-q1)/2 + pxor m1, m0 + pcmpgtb m1, m3 + por m1, m5 ; fm final value + SWAP 1, 3 + pxor m3, [pb_ff] + + ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3) + ; calc flat8in (if not 44_16) and hev masks +%if %2 != 44 && %2 != 4 + mova m6, [pb_81] ; [1 1 1 1 ...] ^ 0x80 + ABSSUB_GT m2, rp3, rp0, m6, m5 ; abs(p3 - p0) <= 1 +%ifdef m8 + mova m8, [pb_80] +%define rb80 m8 +%else +%define rb80 [pb_80] +%endif + ABSSUB_GT m1, rp2, rp0, m6, m5, rb80 ; abs(p2 - p0) <= 1 + por m2, m1 + ABSSUB m4, rp1, rp0, m5 ; abs(p1 - p0) +%if %2 <= 16 +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, H, m0 ; H H H H ... 
+%else + movd m7, Hd + SPLATB_MIX m7 +%endif + pxor m7, rb80 + pxor m4, rb80 + pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) + CMP_GT m4, m6 ; abs(p1 - p0) <= 1 + por m2, m4 ; (flat8in) + ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0) + pxor m4, rb80 + pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition) + por m0, m5 ; hev final value + CMP_GT m4, m6 ; abs(q1 - q0) <= 1 + por m2, m4 ; (flat8in) + ABSSUB_GT m1, rq2, rq0, m6, m5, rb80 ; abs(q2 - q0) <= 1 + por m2, m1 + ABSSUB_GT m1, rq3, rq0, m6, m5, rb80 ; abs(q3 - q0) <= 1 + por m2, m1 ; flat8in final value + pxor m2, [pb_ff] +%if %2 == 84 || %2 == 48 + pand m2, [mask_mix%2] +%endif +%else + mova m6, [pb_80] +%if %2 == 44 + movd m7, Hd + SPLATB_MIX m7 +%else +%if cpuflag(ssse3) + pxor m0, m0 +%endif + SPLATB_REG m7, H, m0 ; H H H H ... +%endif + pxor m7, m6 + ABSSUB m4, rp1, rp0, m1 ; abs(p1 - p0) + pxor m4, m6 + pcmpgtb m0, m4, m7 ; abs(p1 - p0) > H (1/2 hev condition) + ABSSUB m4, rq1, rq0, m1 ; abs(q1 - q0) + pxor m4, m6 + pcmpgtb m5, m4, m7 ; abs(q1 - q0) > H (2/2 hev condition) + por m0, m5 ; hev final value +%endif + +%if %2 == 16 + ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3) + ; calc flat8out mask +%ifdef m8 + mova m8, [P7] + mova m9, [P6] +%define rp7 m8 +%define rp6 m9 +%else +%define rp7 [P7] +%define rp6 [P6] +%endif + ABSSUB_GT m1, rp7, rp0, m6, m5 ; abs(p7 - p0) <= 1 + ABSSUB_GT m7, rp6, rp0, m6, m5 ; abs(p6 - p0) <= 1 + por m1, m7 +%ifdef m8 + mova m8, [P5] + mova m9, [P4] +%define rp5 m8 +%define rp4 m9 +%else +%define rp5 [P5] +%define rp4 [P4] +%endif + ABSSUB_GT m7, rp5, rp0, m6, m5 ; abs(p5 - p0) <= 1 + por m1, m7 + ABSSUB_GT m7, rp4, rp0, m6, m5 ; abs(p4 - p0) <= 1 + por m1, m7 +%ifdef m8 + mova m14, [Q4] + mova m15, [Q5] +%define rq4 m14 +%define rq5 m15 +%else +%define rq4 [Q4] +%define rq5 [Q5] +%endif + ABSSUB_GT m7, rq4, rq0, m6, m5 ; abs(q4 - q0) <= 1 + por m1, m7 + ABSSUB_GT m7, rq5, rq0, m6, m5 ; abs(q5 - q0) <= 1 + por m1, m7 +%ifdef m8 + mova m14, 
[Q6] + mova m15, [Q7] +%define rq6 m14 +%define rq7 m15 +%else +%define rq6 [Q6] +%define rq7 [Q7] +%endif + ABSSUB_GT m7, rq6, rq0, m6, m5 ; abs(q4 - q0) <= 1 + por m1, m7 + ABSSUB_GT m7, rq7, rq0, m6, m5 ; abs(q5 - q0) <= 1 + por m1, m7 ; flat8out final value + pxor m1, [pb_ff] +%endif + + ; if (fm) { + ; if (out && in) filter_14() + ; else if (in) filter_6() + ; else if (hev) filter_2() + ; else filter_4() + ; } + ; + ; f14: fm & out & in + ; f6: fm & ~f14 & in => fm & ~(out & in) & in => fm & ~out & in + ; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev + ; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev + + ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7) + ; filter2() +%if %2 != 44 && %2 != 4 + mova m6, [pb_80] ; already in m6 if 44_16 + SCRATCH 2, 15, rsp+%3+%4 +%if %2 == 16 + SCRATCH 1, 8, rsp+%3+%4+16 +%endif +%endif + pxor m2, m6, rq0 ; q0 ^ 0x80 + pxor m4, m6, rp0 ; p0 ^ 0x80 + psubsb m2, m4 ; (signed) q0 - p0 + pxor m4, m6, rp1 ; p1 ^ 0x80 + pxor m5, m6, rq1 ; q1 ^ 0x80 + psubsb m4, m5 ; (signed) p1 - q1 + paddsb m4, m2 ; (q0 - p0) + (p1 - q1) + paddsb m4, m2 ; 2*(q0 - p0) + (p1 - q1) + paddsb m4, m2 ; 3*(q0 - p0) + (p1 - q1) + paddsb m6, m4, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m4, [pb_3] ; m4: f2 = clip(f + 3, 127) +%ifdef m8 + mova m14, [pb_10] ; will be reused in filter4() +%define rb10 m14 +%else +%define rb10 [pb_10] +%endif + SRSHIFT3B_2X m6, m4, rb10, m7 ; f1 and f2 sign byte shift by 3 + SIGN_SUB m7, rq0, m6, m5 ; m7 = q0 - f1 + SIGN_ADD m1, rp0, m4, m5 ; m1 = p0 + f2 +%if %2 != 44 && %2 != 4 +%ifdef m8 + pandn m6, m15, m3 ; ~mask(in) & mask(fm) +%else + mova m6, [rsp+%3+%4] + pandn m6, m3 +%endif + pand m6, m0 ; (~mask(in) & mask(fm)) & mask(hev) +%else + pand m6, m3, m0 +%endif + MASK_APPLY m7, rq0, m6, m5 ; m7 = filter2(q0) & mask / we write it in filter4() + MASK_APPLY m1, rp0, m6, m5 ; m1 = filter2(p0) & mask / we 
write it in filter4() + + ; (m0: hev, m1: p0', m2: q0-p0, m3: fm, m7: q0', [m8: flat8out], m10..13: p1 p0 q0 q1, m14: pb_10, [m15: flat8in], ) + ; filter4() + mova m4, m2 + paddsb m2, m4 ; 2 * (q0 - p0) + paddsb m2, m4 ; 3 * (q0 - p0) + paddsb m6, m2, [pb_4] ; m6: f1 = clip(f + 4, 127) + paddsb m2, [pb_3] ; m2: f2 = clip(f + 3, 127) + SRSHIFT3B_2X m6, m2, rb10, m4 ; f1 and f2 sign byte shift by 3 +%if %2 != 44 && %2 != 4 +%ifdef m8 + pandn m5, m15, m3 ; ~mask(in) & mask(fm) +%else + mova m5, [rsp+%3+%4] + pandn m5, m3 +%endif + pandn m0, m5 ; ~mask(hev) & (~mask(in) & mask(fm)) +%else + pandn m0, m3 +%endif + SIGN_SUB m5, rq0, m6, m4 ; q0 - f1 + MASK_APPLY m5, m7, m0, m4 ; filter4(q0) & mask + mova [Q0], m5 + SIGN_ADD m7, rp0, m2, m4 ; p0 + f2 + MASK_APPLY m7, m1, m0, m4 ; filter4(p0) & mask + mova [P0], m7 + paddb m6, [pb_80] ; + pxor m1, m1 ; f=(f1+1)>>1 + pavgb m6, m1 ; + psubb m6, [pb_40] ; + SIGN_ADD m1, rp1, m6, m2 ; p1 + f + SIGN_SUB m4, rq1, m6, m2 ; q1 - f + MASK_APPLY m1, rp1, m0, m2 ; m1 = filter4(p1) + MASK_APPLY m4, rq1, m0, m2 ; m4 = filter4(q1) + mova [P1], m1 + mova [Q1], m4 + +%if %2 != 44 && %2 != 4 + UNSCRATCH 2, 15, rsp+%3+%4 +%endif + + ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1) + ; filter6() +%if %2 != 44 && %2 != 4 + pxor m0, m0 +%if %2 != 16 + pand m3, m2 +%else + pand m2, m3 ; mask(fm) & mask(in) +%ifdef m8 + pandn m3, m8, m2 ; ~mask(out) & (mask(fm) & mask(in)) +%else + mova m3, [rsp+%3+%4+16] + pandn m3, m2 +%endif +%endif +%ifdef m8 + mova m14, [P3] + mova m9, [Q3] +%define rp3 m14 +%define rq3 m9 +%else +%define rp3 [P3] +%define rq3 [Q3] +%endif + mova m1, [P2] + FILTER_INIT m4, m5, m6, m7, [P2], %4, 6, m3, m1 ; [p2] + mova m1, [Q2] + FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 0, 1, 2, 5, 3, m3, "", rq1, "", 1 ; [p1] -p3 -p2 +p1 +q1 + FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 0, 2, 3, 6, 3, m3, "", m1 ; [p0] -p3 -p1 +p0 +q2 + FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 0, 3, 4, 7, 3, m3, "", rq3, "", 1 ; [q0] -p3 -p0 +q0 +q3 
+ FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 1, 4, 5, 7, 3, m3, "" ; [q1] -p2 -q0 +q1 +q3 + FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 2, 5, 6, 7, 3, m3, m1 ; [q2] -p1 -q1 +q2 +q3 +%endif + +%if %2 == 16 + UNSCRATCH 1, 8, rsp+%3+%4+16 +%endif + + ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2) + ; filter14() + ; + ; m2 m3 m8 m9 m14 m15 m10 m11 m12 m13 + ; + ; q2 q3 p3 p2 p1 p0 q0 q1 + ; p6 -7 p7 p6 p5 p4 . . . . . + ; p5 -6 -p7 -p6 +p5 +q1 . . . . + ; p4 -5 -p7 -p5 +p4 +q2 . . . q2 + ; p3 -4 -p7 -p4 +p3 +q3 . . . q3 + ; p2 -3 -p7 -p3 +p2 +q4 . . . q4 + ; p1 -2 -p7 -p2 +p1 +q5 . . . q5 + ; p0 -1 -p7 -p1 +p0 +q6 . . . q6 + ; q0 +0 -p7 -p0 +q0 +q7 . . . q7 + ; q1 +1 -p6 -q0 +q1 +q7 q1 . . . + ; q2 +2 -p5 -q1 +q2 +q7 . q2 . . + ; q3 +3 -p4 -q2 +q3 +q7 . q3 . . + ; q4 +4 -p3 -q3 +q4 +q7 . q4 . . + ; q5 +5 -p2 -q4 +q5 +q7 . q5 . . + ; q6 +6 -p1 -q5 +q6 +q7 . q6 . . + +%if %2 == 16 + pand m1, m2 ; mask(out) & (mask(fm) & mask(in)) + mova m2, [P7] + mova m3, [P6] +%ifdef m8 + mova m8, [P5] + mova m9, [P4] +%define rp5 m8 +%define rp4 m9 +%define rp5s m8 +%define rp4s m9 +%define rp3s m14 +%define rq4 m8 +%define rq5 m9 +%define rq6 m14 +%define rq7 m15 +%define rq4s m8 +%define rq5s m9 +%define rq6s m14 +%else +%define rp5 [P5] +%define rp4 [P4] +%define rp5s "" +%define rp4s "" +%define rp3s "" +%define rq4 [Q4] +%define rq5 [Q5] +%define rq6 [Q6] +%define rq7 [Q7] +%define rq4s "" +%define rq5s "" +%define rq6s "" +%endif + FILTER_INIT m4, m5, m6, m7, [P6], %4, 14, m1, m3 ; [p6] + FILTER_UPDATE m4, m5, m6, m7, [P5], %4, 8, 9, 10, 5, 4, m1, rp5s ; [p5] -p7 -p6 +p5 +q1 + FILTER_UPDATE m4, m5, m6, m7, [P4], %4, 8, 10, 11, 6, 4, m1, rp4s ; [p4] -p7 -p5 +p4 +q2 + FILTER_UPDATE m4, m5, m6, m7, [P3], %4, 8, 11, 0, 7, 4, m1, rp3s ; [p3] -p7 -p4 +p3 +q3 + FILTER_UPDATE m4, m5, m6, m7, [P2], %4, 8, 0, 1, 12, 4, m1, "", rq4, [Q4], 1 ; [p2] -p7 -p3 +p2 +q4 + FILTER_UPDATE m4, m5, m6, m7, [P1], %4, 8, 1, 2, 13, 4, m1, "", rq5, [Q5], 1 ; [p1] -p7 -p2 +p1 +q5 + 
FILTER_UPDATE m4, m5, m6, m7, [P0], %4, 8, 2, 3, 14, 4, m1, "", rq6, [Q6], 1 ; [p0] -p7 -p1 +p0 +q6 + FILTER_UPDATE m4, m5, m6, m7, [Q0], %4, 8, 3, 4, 15, 4, m1, "", rq7, [Q7], 1 ; [q0] -p7 -p0 +q0 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q1], %4, 9, 4, 5, 15, 4, m1, "" ; [q1] -p6 -q0 +q1 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q2], %4, 10, 5, 6, 15, 4, m1, "" ; [q2] -p5 -q1 +q2 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q3], %4, 11, 6, 7, 15, 4, m1, "" ; [q3] -p4 -q2 +q3 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q4], %4, 0, 7, 12, 15, 4, m1, rq4s ; [q4] -p3 -q3 +q4 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q5], %4, 1, 12, 13, 15, 4, m1, rq5s ; [q5] -p2 -q4 +q5 +q7 + FILTER_UPDATE m4, m5, m6, m7, [Q6], %4, 2, 13, 14, 15, 4, m1, rq6s ; [q6] -p1 -q5 +q6 +q7 +%endif + +%ifidn %1, h +%if %2 == 16 + mova m0, [P7] + mova m1, [P6] + mova m2, [P5] + mova m3, [P4] + mova m4, [P3] + mova m5, [P2] +%if ARCH_X86_64 + mova m6, [P1] +%endif + mova m7, [P0] +%if ARCH_X86_64 + mova m8, [Q0] + mova m9, [Q1] + mova m10, [Q2] + mova m11, [Q3] + mova m12, [Q4] + mova m13, [Q5] + mova m14, [Q6] + mova m15, [Q7] + TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] + DEFINE_REAL_P7_TO_Q7 + movu [P7], m0 + movu [P6], m1 + movu [P5], m2 + movu [P4], m3 + movu [P3], m4 + movu [P2], m5 + movu [P1], m6 + movu [P0], m7 + movu [Q0], m8 + movu [Q1], m9 + movu [Q2], m10 + movu [Q3], m11 + movu [Q4], m12 + movu [Q5], m13 + movu [Q6], m14 + movu [Q7], m15 +%else + DEFINE_REAL_P7_TO_Q7 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, [rsp+32], a, [rsp+%3+%4], [Q0], [Q1] + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 + DEFINE_TRANSPOSED_P7_TO_Q7 + mova m0, [Q0] + mova m1, [Q1] + mova m2, [Q2] + mova m3, [Q3] + mova m4, [Q4] + mova m5, [Q5] + mova m7, [Q7] + DEFINE_REAL_P7_TO_Q7 8 + TRANSPOSE8x8B 0, 1, 2, 3, 4, 5, 6, 7, 
[rsp+224], a, [rsp+%3+%4], [Q0], [Q1] + movh [P7], m0 + movh [P5], m1 + movh [P3], m2 + movh [P1], m3 + movh [Q2], m5 + movh [Q4], m6 + movh [Q6], m7 + movhps [P6], m0 + movhps [P4], m1 + movhps [P2], m2 + movhps [P0], m3 + movhps [Q3], m5 + movhps [Q5], m6 + movhps [Q7], m7 +%endif +%elif %2 == 44 || %2 == 4 + SWAP 0, 1 ; m0 = p1 + SWAP 1, 7 ; m1 = p0 + SWAP 2, 5 ; m2 = q0 + SWAP 3, 4 ; m3 = q1 + DEFINE_REAL_P7_TO_Q7 2 + SBUTTERFLY bw, 0, 1, 4 + SBUTTERFLY bw, 2, 3, 4 + SBUTTERFLY wd, 0, 2, 4 + SBUTTERFLY wd, 1, 3, 4 +%if mmsize == 16 + movd [P7], m0 + movd [P3], m2 + movd [Q0], m1 + movd [Q4], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P6], m0 + movd [P2], m2 + movd [Q1], m1 + movd [Q5], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P5], m0 + movd [P1], m2 + movd [Q2], m1 + movd [Q6], m3 + psrldq m0, 4 + psrldq m1, 4 + psrldq m2, 4 + psrldq m3, 4 + movd [P4], m0 + movd [P0], m2 + movd [Q3], m1 + movd [Q7], m3 +%else + movd [P7], m0 + movd [P5], m2 + movd [P3], m1 + movd [P1], m3 + psrlq m0, 32 + psrlq m2, 32 + psrlq m1, 32 + psrlq m3, 32 + movd [P6], m0 + movd [P4], m2 + movd [P2], m1 + movd [P0], m3 +%endif +%else + ; the following code do a transpose of 8 full lines to 16 half + ; lines (high part). 
It is inlined to avoid the need of a staging area + mova m0, [P3] + mova m1, [P2] + mova m2, [P1] + mova m3, [P0] + mova m4, [Q0] + mova m5, [Q1] +%ifdef m8 + mova m6, [Q2] +%endif + mova m7, [Q3] + DEFINE_REAL_P7_TO_Q7 +%ifdef m8 + SBUTTERFLY bw, 0, 1, 8 + SBUTTERFLY bw, 2, 3, 8 + SBUTTERFLY bw, 4, 5, 8 + SBUTTERFLY bw, 6, 7, 8 + SBUTTERFLY wd, 0, 2, 8 + SBUTTERFLY wd, 1, 3, 8 + SBUTTERFLY wd, 4, 6, 8 + SBUTTERFLY wd, 5, 7, 8 + SBUTTERFLY dq, 0, 4, 8 + SBUTTERFLY dq, 1, 5, 8 + SBUTTERFLY dq, 2, 6, 8 + SBUTTERFLY dq, 3, 7, 8 +%else + SBUTTERFLY bw, 0, 1, 6 + mova [rsp+mmsize*4], m1 + mova m6, [rsp+mmsize*6] + SBUTTERFLY bw, 2, 3, 1 + SBUTTERFLY bw, 4, 5, 1 + SBUTTERFLY bw, 6, 7, 1 + SBUTTERFLY wd, 0, 2, 1 + mova [rsp+mmsize*6], m2 + mova m1, [rsp+mmsize*4] + SBUTTERFLY wd, 1, 3, 2 + SBUTTERFLY wd, 4, 6, 2 + SBUTTERFLY wd, 5, 7, 2 + SBUTTERFLY dq, 0, 4, 2 + SBUTTERFLY dq, 1, 5, 2 +%if mmsize == 16 + movh [Q0], m1 + movhps [Q1], m1 +%else + mova [P3], m1 +%endif + mova m2, [rsp+mmsize*6] + SBUTTERFLY dq, 2, 6, 1 + SBUTTERFLY dq, 3, 7, 1 +%endif + SWAP 3, 6 + SWAP 1, 4 +%if mmsize == 16 + movh [P7], m0 + movhps [P6], m0 + movh [P5], m1 + movhps [P4], m1 + movh [P3], m2 + movhps [P2], m2 + movh [P1], m3 + movhps [P0], m3 +%ifdef m8 + movh [Q0], m4 + movhps [Q1], m4 +%endif + movh [Q2], m5 + movhps [Q3], m5 + movh [Q4], m6 + movhps [Q5], m6 + movh [Q6], m7 + movhps [Q7], m7 +%else + mova [P7], m0 + mova [P6], m1 + mova [P5], m2 + mova [P4], m3 + mova [P2], m5 + mova [P1], m6 + mova [P0], m7 +%endif +%endif +%endif + + RET +%endmacro + +%macro LPF_16_VH 5 +INIT_XMM %5 +LOOPFILTER v, %1, %2, 0, %4 +LOOPFILTER h, %1, %2, %3, %4 +%endmacro + +%macro LPF_16_VH_ALL_OPTS 4 +LPF_16_VH %1, %2, %3, %4, sse2 +LPF_16_VH %1, %2, %3, %4, ssse3 +LPF_16_VH %1, %2, %3, %4, avx +%endmacro + +LPF_16_VH_ALL_OPTS 16, 512, 256, 32 +LPF_16_VH_ALL_OPTS 44, 0, 128, 0 +LPF_16_VH_ALL_OPTS 48, 256, 128, 16 +LPF_16_VH_ALL_OPTS 84, 256, 128, 16 +LPF_16_VH_ALL_OPTS 88, 256, 128, 16 + +INIT_MMX mmxext 
+LOOPFILTER v, 4, 0, 0, 0 +LOOPFILTER h, 4, 0, 64, 0 +LOOPFILTER v, 8, 128, 0, 8 +LOOPFILTER h, 8, 128, 64, 8 diff --git a/media/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm new file mode 100644 index 0000000000..c0888170c9 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9lpf_16bpp.asm @@ -0,0 +1,823 @@ +;****************************************************************************** +;* VP9 loop filter SIMD optimizations +;* +;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_511: times 16 dw 511 +pw_2047: times 16 dw 2047 +pw_16384: times 16 dw 16384 +pw_m512: times 16 dw -512 +pw_m2048: times 16 dw -2048 + +cextern pw_1 +cextern pw_3 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_256 +cextern pw_1023 +cextern pw_4095 +cextern pw_m1 + +SECTION .text + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +; calculate p or q portion of flat8out +%macro FLAT8OUT_HALF 0 + psubw m4, m0 ; q4-q0 + psubw m5, m0 ; q5-q0 + psubw m6, m0 ; q6-q0 + psubw m7, m0 ; q7-q0 + ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0) + ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0) + pcmpgtw m4, reg_F ; abs(q4-q0) > F + pcmpgtw m5, reg_F ; abs(q5-q0) > F + pcmpgtw m6, reg_F ; abs(q6-q0) > F + pcmpgtw m7, reg_F ; abs(q7-q0) > F + por m5, m4 + por m7, m6 + por m7, m5 ; !flat8out, q portion +%endmacro + +; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition) +%macro FLAT8IN_HALF 1 +%if %1 > 4 + psubw m4, m3, m0 ; q3-q0 + psubw m5, m2, m0 ; q2-q0 + ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0) + pcmpgtw m4, reg_F ; abs(q3-q0) > F + pcmpgtw m5, reg_F ; abs(q2-q0) > F +%endif + psubw m3, m2 ; q3-q2 + psubw m2, m1 ; q2-q1 + ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1) + pcmpgtw m3, 
reg_I ; abs(q3-q2) > I + pcmpgtw m2, reg_I ; abs(q2-q1) > I +%if %1 > 4 + por m4, m5 +%endif + por m2, m3 + psubw m3, m1, m0 ; q1-q0 + ABS1 m3, m5 ; abs(q1-q0) +%if %1 > 4 + pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F +%endif + pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H + pcmpgtw m3, reg_I ; abs(q1-q0) > I +%if %1 > 4 + por m4, m6 +%endif + por m2, m3 +%endmacro + +; one step in filter_14/filter_6 +; +; take sum $reg, downshift, apply mask and write into dst +; +; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next +; step's sum $reg. This is omitted for the last row in each filter. +; +; if dont_store is set, don't write the result into memory, instead keep the +; values in register so we can write it out later +%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \ + ; src/sub1, sub2, add1, add2, dont_store + psrlw %1, %2, %4 + psubw %1, %6 ; abs->delta +%ifnidn %7, "" + psubw %2, %6 + psubw %2, %7 + paddw %2, %8 + paddw %2, %9 +%endif + pand %1, reg_%3 ; apply mask +%if %10 == 1 + paddw %6, %1 ; delta->abs +%else + paddw %1, %6 ; delta->abs + mova [%5], %1 +%endif +%endmacro + +; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8} + +%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12] + +%if ARCH_X86_64 +%if %2 == 16 +%assign %%num_xmm_regs 16 +%elif %2 == 8 +%assign %%num_xmm_regs 15 +%else ; %2 == 4 +%assign %%num_xmm_regs 14 +%endif ; %2 +%assign %%bak_mem 0 +%else ; ARCH_X86_32 +%assign %%num_xmm_regs 8 +%if %2 == 16 +%assign %%bak_mem 7 +%elif %2 == 8 +%assign %%bak_mem 6 +%else ; %2 == 4 +%assign %%bak_mem 5 +%endif ; %2 +%endif ; ARCH_X86_64/32 + +%if %2 == 16 +%ifidn %1, v +%assign %%num_gpr_regs 6 +%else ; %1 == h +%assign %%num_gpr_regs 5 +%endif ; %1 +%assign %%wd_mem 6 +%else ; %2 == 8/4 +%assign %%num_gpr_regs 5 +%if ARCH_X86_32 && %2 == 8 +%assign %%wd_mem 2 +%else ; ARCH_X86_64 || %2 == 4 +%assign %%wd_mem 0 +%endif ; ARCH_X86_64/32 etc. 
+%endif ; %2 + +%ifidn %1, v +%assign %%tsp_mem 0 +%elif %2 == 16 ; && %1 == h +%assign %%tsp_mem 16 +%else ; %1 == h && %2 == 8/4 +%assign %%tsp_mem 8 +%endif ; %1/%2 + +%assign %%off %%wd_mem +%assign %%tspoff %%bak_mem+%%wd_mem +%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize) + +%if %3 == 10 +%define %%maxsgn 511 +%define %%minsgn m512 +%define %%maxusgn 1023 +%define %%maxf 4 +%else ; %3 == 12 +%define %%maxsgn 2047 +%define %%minsgn m2048 +%define %%maxusgn 4095 +%define %%maxf 16 +%endif ; %3 + +cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H + ; prepare E, I and H masks + shl Ed, %3-8 + shl Id, %3-8 + shl Hd, %3-8 +%if cpuflag(ssse3) + mova m0, [pw_256] +%endif + movd m1, Ed + movd m2, Id + movd m3, Hd +%if cpuflag(ssse3) + pshufb m1, m0 ; E << (bit_depth - 8) + pshufb m2, m0 ; I << (bit_depth - 8) + pshufb m3, m0 ; H << (bit_depth - 8) +%else + punpcklwd m1, m1 + punpcklwd m2, m2 + punpcklwd m3, m3 + pshufd m1, m1, q0000 + pshufd m2, m2, q0000 + pshufd m3, m3, q0000 +%endif + SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E + SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I + SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H +%if %2 > 4 + PRELOAD 11, pw_ %+ %%maxf, F +%endif + + ; set up variables to load data +%ifidn %1, v + DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12 + lea stride3q, [strideq*3] + neg strideq +%if %2 == 16 + lea dst0q, [dst8q+strideq*8] +%else + lea dst4q, [dst8q+strideq*4] +%endif + neg strideq +%if %2 == 16 + lea dst12q, [dst8q+strideq*4] + lea dst4q, [dst0q+strideq*4] +%endif + +%if %2 == 16 +%define %%p7 dst0q +%define %%p6 dst0q+strideq +%define %%p5 dst0q+strideq*2 +%define %%p4 dst0q+stride3q +%endif +%define %%p3 dst4q +%define %%p2 dst4q+strideq +%define %%p1 dst4q+strideq*2 +%define %%p0 dst4q+stride3q +%define %%q0 dst8q +%define %%q1 dst8q+strideq +%define %%q2 dst8q+strideq*2 +%define %%q3 dst8q+stride3q +%if %2 == 16 +%define %%q4 dst12q +%define %%q5 dst12q+strideq +%define %%q6 
dst12q+strideq*2 +%define %%q7 dst12q+stride3q +%endif +%else ; %1 == h + DEFINE_ARGS dst0, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dst0q+strideq*4] + +%define %%p3 rsp+(%%tspoff+0)*mmsize +%define %%p2 rsp+(%%tspoff+1)*mmsize +%define %%p1 rsp+(%%tspoff+2)*mmsize +%define %%p0 rsp+(%%tspoff+3)*mmsize +%define %%q0 rsp+(%%tspoff+4)*mmsize +%define %%q1 rsp+(%%tspoff+5)*mmsize +%define %%q2 rsp+(%%tspoff+6)*mmsize +%define %%q3 rsp+(%%tspoff+7)*mmsize + +%if %2 < 16 + movu m0, [dst0q+strideq*0-8] + movu m1, [dst0q+strideq*1-8] + movu m2, [dst0q+strideq*2-8] + movu m3, [dst0q+stride3q -8] + movu m4, [dst4q+strideq*0-8] + movu m5, [dst4q+strideq*1-8] + movu m6, [dst4q+strideq*2-8] + movu m7, [dst4q+stride3q -8] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0] +%endif + + mova [%%p3], m0 + mova [%%p2], m1 + mova [%%p1], m2 + mova [%%p0], m3 +%if ARCH_X86_64 + mova [%%q0], m4 +%endif + mova [%%q1], m5 + mova [%%q2], m6 + mova [%%q3], m7 + + ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register + ; order here accordingly +%else ; %2 == 16 + +%define %%p7 rsp+(%%tspoff+ 8)*mmsize +%define %%p6 rsp+(%%tspoff+ 9)*mmsize +%define %%p5 rsp+(%%tspoff+10)*mmsize +%define %%p4 rsp+(%%tspoff+11)*mmsize +%define %%q4 rsp+(%%tspoff+12)*mmsize +%define %%q5 rsp+(%%tspoff+13)*mmsize +%define %%q6 rsp+(%%tspoff+14)*mmsize +%define %%q7 rsp+(%%tspoff+15)*mmsize + + mova m0, [dst0q+strideq*0-16] + mova m1, [dst0q+strideq*1-16] + mova m2, [dst0q+strideq*2-16] + mova m3, [dst0q+stride3q -16] + mova m4, [dst4q+strideq*0-16] + mova m5, [dst4q+strideq*1-16] +%if ARCH_X86_64 + mova m6, [dst4q+strideq*2-16] +%endif + mova m7, [dst4q+stride3q -16] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1 +%endif + + mova [%%p7], m0 + mova [%%p6], m1 + mova [%%p5], m2 + mova [%%p4], m3 +%if 
ARCH_X86_64 + mova [%%p3], m4 +%endif + mova [%%p2], m5 + mova [%%p1], m6 + mova [%%p0], m7 + + mova m0, [dst0q+strideq*0] + mova m1, [dst0q+strideq*1] + mova m2, [dst0q+strideq*2] + mova m3, [dst0q+stride3q ] + mova m4, [dst4q+strideq*0] + mova m5, [dst4q+strideq*1] +%if ARCH_X86_64 + mova m6, [dst4q+strideq*2] +%endif + mova m7, [dst4q+stride3q ] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1 +%endif + + mova [%%q0], m0 + mova [%%q1], m1 + mova [%%q2], m2 + mova [%%q3], m3 +%if ARCH_X86_64 + mova [%%q4], m4 +%endif + mova [%%q5], m5 + mova [%%q6], m6 + mova [%%q7], m7 + + ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register + ; order here accordingly +%endif ; %2 +%endif ; %1 + + ; load q0|q4-7 data + mova m0, [%%q0] +%if %2 == 16 + mova m4, [%%q4] + mova m5, [%%q5] + mova m6, [%%q6] + mova m7, [%%q7] + + ; flat8out q portion + FLAT8OUT_HALF + SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O +%endif + + ; load q1-3 data + mova m1, [%%q1] + mova m2, [%%q2] + mova m3, [%%q3] + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out[q] + ; m12-14=free + ; m0-3=q0-q3 + ; m4-7=free + + ; flat8in|fm|hev q portion + FLAT8IN_HALF %2 + SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV +%if %2 > 4 + SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I +%endif + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out[q] + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; m2=!fm[q] + ; m0,1=q0-q1 + ; m3-7=free + ; m12=free + + ; load p0-1 + mova m3, [%%p0] + mova m4, [%%p1] + + ; fm mb_edge portion + psubw m5, m3, m0 ; p0-q0 (only the absolute value is used) + psubw m6, m4, m1 ; p1-q1 (only the absolute value is used) +%if ARCH_X86_64 + ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1) +%else + ABS1 m5, m7 ; abs(q0-p0) + ABS1 m6, m7 ; abs(q1-p1) +%endif + paddw m5, m5 + psraw m6, 1 + paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1) + pcmpgtw m6, reg_E + por m2, m6 + SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out[q] + 
; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; r12[m12]=!fm[q] + ; m3-4=p0-1 + ; m0-2/5-7=free + + ; load p4-7 data + SWAP 3, 0 ; p0 + SWAP 4, 1 ; p1 +%if %2 == 16 + mova m7, [%%p7] + mova m6, [%%p6] + mova m5, [%%p5] + mova m4, [%%p4] + + ; flat8out p portion + FLAT8OUT_HALF + por m7, reg_F8O + SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O +%endif + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; r12[m12]=!fm[q] + ; m0,1=p0-p1 + ; m2-7=free + + ; load p2-3 data + mova m2, [%%p2] + mova m3, [%%p3] + + ; flat8in|fm|hev p portion + FLAT8IN_HALF %2 + por m7, reg_HEV +%if %2 > 4 + por m4, reg_F8I +%endif + por m2, reg_FM +%if %2 > 4 + por m4, m2 ; !flat8|!fm +%if %2 == 16 + por m5, m4, reg_F8O ; !flat16|!fm + pandn m2, m4 ; filter4_mask + pandn m4, m5 ; filter8_mask + pxor m5, [pw_m1] ; filter16_mask + SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M +%else + pandn m2, m4 ; filter4_mask + pxor m4, [pw_m1] ; filter8_mask +%endif + SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M +%else + pxor m2, [pw_m1] ; filter4_mask +%endif + SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV + SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M + + ; r9[m15]=filter16_mask + ; r10[m13]=hev + ; r11[m14]=filter8_mask + ; r12[m12]=filter4_mask + ; m0,1=p0-p1 + ; m2-7=free + ; m8-11=free + +%if %2 > 4 +%if %2 == 16 + ; filter_14 + mova m2, [%%p7] + mova m3, [%%p6] + mova m6, [%%p5] + mova m7, [%%p4] + PRELOAD 8, %%p3, P3 + PRELOAD 9, %%p2, P2 +%endif + PRELOAD 10, %%q0, Q0 + PRELOAD 11, %%q1, Q1 +%if %2 == 16 + psllw m4, m2, 3 + paddw m5, m3, m3 + paddw m4, m6 + paddw m5, m7 + paddw m4, reg_P3 + paddw m5, reg_P2 + paddw m4, m1 + paddw m5, m0 + paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8 + psubw m5, m2 ; p0+p2+p4+p6*2-p7 + paddw m4, [pw_8] + paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8 + + ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction + ; at the end of the filter + + mova [rsp+0*mmsize], m3 + FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1 
+%endif + mova m3, [%%q2] +%if %2 == 16 + mova [rsp+1*mmsize], m6 + FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3 +%endif + mova m6, [%%q3] +%if %2 == 16 + mova [rsp+2*mmsize], m7 + FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6 + mova m7, [%%q4] +%if ARCH_X86_64 + mova [rsp+3*mmsize], reg_P3 +%else + mova m4, reg_P3 + mova [rsp+3*mmsize], m4 +%endif + FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7 + PRELOAD 8, %%q5, Q5 +%if ARCH_X86_64 + mova [rsp+4*mmsize], reg_P2 +%else + mova m4, reg_P2 + mova [rsp+4*mmsize], m4 +%endif + FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5 + PRELOAD 9, %%q6, Q6 + mova [rsp+5*mmsize], m1 + FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6 + mova m1, [%%q7] + FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1 + FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64 + FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64 + FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1 + FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1 + FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1 + FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1 + FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6 + + mova m7, [%%p1] +%else + SWAP 1, 7 +%endif + + mova m2, [%%p3] + mova m1, [%%p2] + + ; reg_Q0-1 (m10-m11) + ; m0=p0 + ; m1=p2 + ; m2=p3 + ; m3=q2 + ; m4-5=free + ; m6=q3 + ; m7=p1 + ; m8-9 unused + + ; filter_6 + psllw m4, m2, 2 + paddw m5, m1, m1 + paddw m4, m7 + psubw m5, m2 + paddw m4, m0 + paddw m5, reg_Q0 + paddw m4, [pw_4] + paddw m5, m4 + +%if ARCH_X86_64 + mova m8, m1 + mova m9, m7 +%else + mova [rsp+0*mmsize], m1 + mova [rsp+1*mmsize], m7 +%endif +%ifidn %1, v + FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1 +%else + FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1 +%endif + FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1 + FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1 +%if 
ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64 +%else + FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64 +%endif + FILTER_STEP m4, m5, F8M, 3, %%q2, m3 + + UNSCRATCH 2, 10, %%q0 + UNSCRATCH 6, 11, %%q1 +%else + SWAP 1, 7 + mova m2, [%%q0] + mova m6, [%%q1] +%endif + UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV + + ; m0=p0 + ; m1=p2 + ; m2=q0 + ; m3=hev_mask + ; m4-5=free + ; m6=q1 + ; m7=p1 + + ; filter_4 + psubw m4, m7, m6 ; p1-q1 + psubw m5, m2, m0 ; q0-p0 + pand m4, m3 + pminsw m4, [pw_ %+ %%maxsgn] + pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f + paddw m4, m5 + paddw m5, m5 + paddw m4, m5 ; 3*(q0-p0)+f + pminsw m4, [pw_ %+ %%maxsgn] + pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f + pand m4, reg_F4M + paddw m5, m4, [pw_4] + paddw m4, [pw_3] + pminsw m5, [pw_ %+ %%maxsgn] + pminsw m4, [pw_ %+ %%maxsgn] + psraw m5, 3 ; min_intp2(f+4, 9)>>3 -> f1 + psraw m4, 3 ; min_intp2(f+3, 9)>>3 -> f2 + psubw m2, m5 ; q0-f1 + paddw m0, m4 ; p0+f2 + pandn m3, m5 ; f1 & !hev (for p1/q1 adj) + pxor m4, m4 + mova m5, [pw_ %+ %%maxusgn] + pmaxsw m2, m4 + pmaxsw m0, m4 + pminsw m2, m5 + pminsw m0, m5 +%if cpuflag(ssse3) + pmulhrsw m3, [pw_16384] ; (f1+1)>>1 +%else + paddw m3, [pw_1] + psraw m3, 1 +%endif + paddw m7, m3 ; p1+f + psubw m6, m3 ; q1-f + pmaxsw m7, m4 + pmaxsw m6, m4 + pminsw m7, m5 + pminsw m6, m5 + + ; store +%ifidn %1, v + mova [%%p1], m7 + mova [%%p0], m0 + mova [%%q0], m2 + mova [%%q1], m6 +%else ; %1 == h +%if %2 == 4 + TRANSPOSE4x4W 7, 0, 2, 6, 1 + movh [dst0q+strideq*0-4], m7 + movhps [dst0q+strideq*1-4], m7 + movh [dst0q+strideq*2-4], m0 + movhps [dst0q+stride3q -4], m0 + movh [dst4q+strideq*0-4], m2 + movhps [dst4q+strideq*1-4], m2 + movh [dst4q+strideq*2-4], m6 + movhps [dst4q+stride3q -4], m6 +%elif %2 == 8 + mova m3, 
[%%p3] + mova m4, [%%q2] + mova m5, [%%q3] + +%if ARCH_X86_64 + TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8 +%else + TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1 + mova m2, [%%q0] +%endif + + movu [dst0q+strideq*0-8], m3 + movu [dst0q+strideq*1-8], m1 + movu [dst0q+strideq*2-8], m7 + movu [dst0q+stride3q -8], m0 + movu [dst4q+strideq*0-8], m2 + movu [dst4q+strideq*1-8], m6 + movu [dst4q+strideq*2-8], m4 + movu [dst4q+stride3q -8], m5 +%else ; %2 == 16 + SCRATCH 2, 8, %%q0 + SCRATCH 6, 9, %%q1 + mova m2, [%%p7] + mova m3, [%%p6] + mova m4, [%%p5] + mova m5, [%%p4] + mova m6, [%%p3] + +%if ARCH_X86_64 + TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10 +%else + mova [%%p1], m7 + TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1 +%endif + + mova [dst0q+strideq*0-16], m2 + mova [dst0q+strideq*1-16], m3 + mova [dst0q+strideq*2-16], m4 + mova [dst0q+stride3q -16], m5 +%if ARCH_X86_64 + mova [dst4q+strideq*0-16], m6 +%endif + mova [dst4q+strideq*1-16], m1 + mova [dst4q+strideq*2-16], m7 + mova [dst4q+stride3q -16], m0 + + UNSCRATCH 2, 8, %%q0 + UNSCRATCH 6, 9, %%q1 + mova m0, [%%q2] + mova m1, [%%q3] + mova m3, [%%q4] + mova m4, [%%q5] +%if ARCH_X86_64 + mova m5, [%%q6] +%endif + mova m7, [%%q7] + +%if ARCH_X86_64 + TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8 +%else + TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1 +%endif + + mova [dst0q+strideq*0], m2 + mova [dst0q+strideq*1], m6 + mova [dst0q+strideq*2], m0 + mova [dst0q+stride3q ], m1 +%if ARCH_X86_64 + mova [dst4q+strideq*0], m3 +%endif + mova [dst4q+strideq*1], m4 + mova [dst4q+strideq*2], m5 + mova [dst4q+stride3q ], m7 +%endif ; %2 +%endif ; %1 + RET +%endmacro + +%macro LOOP_FILTER_CPUSETS 3 +INIT_XMM sse2 +LOOP_FILTER %1, %2, %3 +INIT_XMM ssse3 +LOOP_FILTER %1, %2, %3 +INIT_XMM avx +LOOP_FILTER %1, %2, %3 +%endmacro + +%macro LOOP_FILTER_WDSETS 2 +LOOP_FILTER_CPUSETS %1, 4, %2 +LOOP_FILTER_CPUSETS %1, 8, %2 +LOOP_FILTER_CPUSETS %1, 16, %2 +%endmacro + +LOOP_FILTER_WDSETS h, 10 
+LOOP_FILTER_WDSETS v, 10 +LOOP_FILTER_WDSETS h, 12 +LOOP_FILTER_WDSETS v, 12 diff --git a/media/ffvpx/libavcodec/x86/vp9mc.asm b/media/ffvpx/libavcodec/x86/vp9mc.asm new file mode 100644 index 0000000000..efc4cfbef1 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9mc.asm @@ -0,0 +1,680 @@ +;****************************************************************************** +;* VP9 motion compensation SIMD optimizations +;* +;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +cextern pw_256 +cextern pw_64 + +%macro F8_SSSE3_TAPS 8 +times 16 db %1, %2 +times 16 db %3, %4 +times 16 db %5, %6 +times 16 db %7, %8 +%endmacro + +%macro F8_SSE2_TAPS 8 +times 8 dw %1 +times 8 dw %2 +times 8 dw %3 +times 8 dw %4 +times 8 dw %5 +times 8 dw %6 +times 8 dw %7 +times 8 dw %8 +%endmacro + +%macro F8_16BPP_TAPS 8 +times 8 dw %1, %2 +times 8 dw %3, %4 +times 8 dw %5, %6 +times 8 dw %7, %8 +%endmacro + +%macro FILTER 1 +const filters_%1 ; smooth + F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 + F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 + F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 + F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 + F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 + F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 + F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 + F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 + F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1 + F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 + F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 + F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 + F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 + F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 + F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 + ; regular + F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 + F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 + F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 + F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 + F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 + F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 + F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 + F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 + F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 + F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 + F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 + F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1 + F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 + F8_TAPS 0, 2, -6, 18, 122, -10, 3, 
-1 + F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 + ; sharp + F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 + F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 + F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 + F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2 + F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 + F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 + F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 + F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 + F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 + F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 + F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 + F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 + F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 + F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 + F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 +%endmacro + +%define F8_TAPS F8_SSSE3_TAPS +; int8_t ff_filters_ssse3[3][15][4][32] +FILTER ssse3 +%define F8_TAPS F8_SSE2_TAPS +; int16_t ff_filters_sse2[3][15][8][8] +FILTER sse2 +%define F8_TAPS F8_16BPP_TAPS +; int16_t ff_filters_16bpp[3][15][4][16] +FILTER 16bpp + +SECTION .text + +%macro filter_sse2_h_fn 1 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery + pxor m5, m5 + mova m6, [pw_64] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 16] + mova m9, [filteryq+ 32] + mova m10, [filteryq+ 48] + mova m11, [filteryq+ 64] + mova m12, [filteryq+ 80] + mova m13, [filteryq+ 96] + mova m14, [filteryq+112] +%endif +.loop: + movh m0, [srcq-3] + movh m1, [srcq-2] + movh m2, [srcq-1] + movh m3, [srcq+0] + movh m4, [srcq+1] + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m8 + pmullw m2, m9 + pmullw m3, m10 + pmullw m4, m11 +%else + pmullw m1, [filteryq+ 16] + pmullw m2, [filteryq+ 32] + pmullw m3, [filteryq+ 48] + pmullw m4, [filteryq+ 64] +%endif + paddw m0, m1 + paddw m2, m3 + paddw m0, m4 + movh m1, [srcq+2] + movh m3, [srcq+3] + movh m4, [srcq+4] + add srcq, sstrideq + punpcklbw m1, m5 + punpcklbw m3, m5 + 
punpcklbw m4, m5 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m12 + pmullw m3, m13 + pmullw m4, m14 +%else + pmullw m1, [filteryq+ 80] + pmullw m3, [filteryq+ 96] + pmullw m4, [filteryq+112] +%endif + paddw m0, m1 + paddw m3, m4 + paddw m0, m6 + paddw m2, m3 + paddsw m0, m2 + psraw m0, 7 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX mmxext +filter_sse2_h_fn put +filter_sse2_h_fn avg + +INIT_XMM sse2 +filter_sse2_h_fn put +filter_sse2_h_fn avg + +%macro filter_h_fn 1 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery + mova m6, [pw_256] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + movh m0, [srcq-3] + movh m1, [srcq-2] + movh m2, [srcq-1] + movh m3, [srcq+0] + movh m4, [srcq+1] + movh m5, [srcq+2] + punpcklbw m0, m1 + punpcklbw m2, m3 + movh m1, [srcq+3] + movh m3, [srcq+4] + add srcq, sstrideq + punpcklbw m4, m5 + punpcklbw m1, m3 + pmaddubsw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddubsw m2, m8 + pmaddubsw m4, m9 + pmaddubsw m1, m10 +%else + pmaddubsw m2, [filteryq+32] + pmaddubsw m4, [filteryq+64] + pmaddubsw m1, [filteryq+96] +%endif + paddw m0, m4 + paddw m2, m1 + paddsw m0, m2 + pmulhrsw m0, m6 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX ssse3 +filter_h_fn put +filter_h_fn avg + +INIT_XMM ssse3 +filter_h_fn put +filter_h_fn avg + +%if ARCH_X86_64 +%macro filter_hx2_fn 1 +%assign %%px mmsize +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery + mova m13, [pw_256] + mova m8, [filteryq+ 0] + mova m9, [filteryq+32] + mova m10, [filteryq+64] + mova m11, [filteryq+96] +.loop: + movu 
m0, [srcq-3] + movu m1, [srcq-2] + movu m2, [srcq-1] + movu m3, [srcq+0] + movu m4, [srcq+1] + movu m5, [srcq+2] + movu m6, [srcq+3] + movu m7, [srcq+4] + add srcq, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + paddsw m0, m2 + paddsw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_hx2_fn put +filter_hx2_fn avg + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_hx2_fn put +filter_hx2_fn avg +%endif + +%endif ; ARCH_X86_64 + +%macro filter_sse2_v_fn 1 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + pxor m5, m5 + mova m6, [pw_64] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 16] + mova m9, [filteryq+ 32] + mova m10, [filteryq+ 48] + mova m11, [filteryq+ 64] + mova m12, [filteryq+ 80] + mova m13, [filteryq+ 96] + mova m14, [filteryq+112] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? 
+ movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + add srcq, sstrideq + movh m4, [src4q] + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m8 + pmullw m2, m9 + pmullw m3, m10 + pmullw m4, m11 +%else + pmullw m1, [filteryq+ 16] + pmullw m2, [filteryq+ 32] + pmullw m3, [filteryq+ 48] + pmullw m4, [filteryq+ 64] +%endif + paddw m0, m1 + paddw m2, m3 + paddw m0, m4 + movh m1, [src4q+sstrideq] + movh m3, [src4q+sstrideq*2] + movh m4, [src4q+sstride3q] + add src4q, sstrideq + punpcklbw m1, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 +%if ARCH_X86_64 && mmsize > 8 + pmullw m1, m12 + pmullw m3, m13 + pmullw m4, m14 +%else + pmullw m1, [filteryq+ 80] + pmullw m3, [filteryq+ 96] + pmullw m4, [filteryq+112] +%endif + paddw m0, m1 + paddw m3, m4 + paddw m0, m6 + paddw m2, m3 + paddsw m0, m2 + psraw m0, 7 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX mmxext +filter_sse2_v_fn put +filter_sse2_v_fn avg + +INIT_XMM sse2 +filter_sse2_v_fn put +filter_sse2_v_fn avg + +%macro filter_v_fn 1 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + mova m6, [pw_256] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just more generally + ; unroll this to prevent multiple loads of the same data? 
+ movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + movh m4, [src4q] + movh m5, [src4q+sstrideq] + punpcklbw m0, m1 + punpcklbw m2, m3 + movh m1, [src4q+sstrideq*2] + movh m3, [src4q+sstride3q] + add srcq, sstrideq + add src4q, sstrideq + punpcklbw m4, m5 + punpcklbw m1, m3 + pmaddubsw m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddubsw m2, m8 + pmaddubsw m4, m9 + pmaddubsw m1, m10 +%else + pmaddubsw m2, [filteryq+32] + pmaddubsw m4, [filteryq+64] + pmaddubsw m1, [filteryq+96] +%endif + paddw m0, m4 + paddw m2, m1 + paddsw m0, m2 + pmulhrsw m0, m6 +%ifidn %1, avg + movh m1, [dstq] +%endif + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_MMX ssse3 +filter_v_fn put +filter_v_fn avg + +INIT_XMM ssse3 +filter_v_fn put +filter_v_fn avg + +%if ARCH_X86_64 + +%macro filter_vx2_fn 1 +%assign %%px mmsize +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3 + mova m13, [pw_256] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m8, [filteryq+ 0] + mova m9, [filteryq+32] + mova m10, [filteryq+64] + mova m11, [filteryq+96] +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? 
+ movu m0, [srcq] + movu m1, [srcq+sstrideq] + movu m2, [srcq+sstrideq*2] + movu m3, [srcq+sstride3q] + movu m4, [src4q] + movu m5, [src4q+sstrideq] + movu m6, [src4q+sstrideq*2] + movu m7, [src4q+sstride3q] + add srcq, sstrideq + add src4q, sstrideq + SBUTTERFLY bw, 0, 1, 12 + SBUTTERFLY bw, 2, 3, 12 + SBUTTERFLY bw, 4, 5, 12 + SBUTTERFLY bw, 6, 7, 12 + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + paddsw m0, m2 + paddsw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 +%ifidn %1, avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter_vx2_fn put +filter_vx2_fn avg + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_vx2_fn put +filter_vx2_fn avg +%endif + +%endif ; ARCH_X86_64 + +%macro fpel_fn 6-8 0, 4 +%if %2 == 4 +%define %%srcfn movh +%define %%dstfn movh +%else +%define %%srcfn movu +%define %%dstfn mova +%endif + +%if %7 == 8 +%define %%pavg pavgb +%define %%szsuf _8 +%elif %7 == 16 +%define %%pavg pavgw +%define %%szsuf _16 +%else +%define %%szsuf +%endif + +%if %2 <= mmsize +cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 + lea sstride3q, [sstrideq*3] + lea dstride3q, [dstrideq*3] +%else +cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h +%endif +.loop: + %%srcfn m0, [srcq] + %%srcfn m1, [srcq+s%3] + %%srcfn m2, [srcq+s%4] + %%srcfn m3, [srcq+s%5] +%if %2/mmsize == 8 + %%srcfn m4, [srcq+mmsize*4] + %%srcfn m5, [srcq+mmsize*5] + %%srcfn m6, [srcq+mmsize*6] + %%srcfn m7, [srcq+mmsize*7] +%endif + lea srcq, [srcq+sstrideq*%6] +%ifidn %1, avg + %%pavg m0, [dstq] + %%pavg m1, [dstq+d%3] + %%pavg m2, [dstq+d%4] +%if %2 == 4 + %%srcfn m4, [dstq+d%5] + %%pavg m3, m4 +%else + %%pavg m3, [dstq+d%5] +%endif +%if %2/mmsize == 8 + %%pavg m4, [dstq+mmsize*4] 
+ %%pavg m5, [dstq+mmsize*5] + %%pavg m6, [dstq+mmsize*6] + %%pavg m7, [dstq+mmsize*7] +%endif +%endif + %%dstfn [dstq], m0 + %%dstfn [dstq+d%3], m1 + %%dstfn [dstq+d%4], m2 + %%dstfn [dstq+d%5], m3 +%if %2/mmsize == 8 + %%dstfn [dstq+mmsize*4], m4 + %%dstfn [dstq+mmsize*5], m5 + %%dstfn [dstq+mmsize*6], m6 + %%dstfn [dstq+mmsize*7], m7 +%endif + lea dstq, [dstq+dstrideq*%6] + sub hd, %6 + jnz .loop + RET +%endmacro + +%define d16 16 +%define s16 16 +%define d32 32 +%define s32 32 +INIT_MMX mmx +fpel_fn put, 4, strideq, strideq*2, stride3q, 4 +fpel_fn put, 8, strideq, strideq*2, stride3q, 4 +INIT_MMX mmxext +fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8 +fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8 +INIT_XMM sse +fpel_fn put, 16, strideq, strideq*2, stride3q, 4 +fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2 +fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1 +fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8 +INIT_XMM sse2 +fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8 +fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8 +fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8 +INIT_YMM avx +fpel_fn put, 32, strideq, strideq*2, stride3q, 4 +fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2 +fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8 +fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8 +%endif +INIT_MMX mmxext +fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16 +INIT_XMM sse2 +fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16 +fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16 +fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16 +fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16 +fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16 +fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16 +%endif +%undef s16 +%undef d16 +%undef s32 +%undef d32 
diff --git a/media/ffvpx/libavcodec/x86/vp9mc_16bpp.asm b/media/ffvpx/libavcodec/x86/vp9mc_16bpp.asm new file mode 100644 index 0000000000..9a462eaf80 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vp9mc_16bpp.asm @@ -0,0 +1,431 @@ +;****************************************************************************** +;* VP9 MC SIMD optimizations +;* +;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pd_64: times 8 dd 64 + +cextern pw_1023 +cextern pw_4095 + +SECTION .text + +%macro filter_h4_fn 1-2 12 +cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m11, m11 +%endif + mova m6, [pd_64] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + movh m0, [srcq-6] + movh m1, [srcq-4] + movh m2, [srcq-2] + movh m3, [srcq+0] + movh m4, [srcq+2] + punpcklwd m0, m1 + punpcklwd m2, m3 + pmaddwd m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 +%else + pmaddwd m2, 
[filteryq+32] +%endif + movu m1, [srcq+4] + movu m3, [srcq+6] + paddd m0, m2 + movu m2, [srcq+8] + add srcq, sstrideq + punpcklwd m4, m1 + punpcklwd m3, m2 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m4, m9 + pmaddwd m3, m10 +%else + pmaddwd m4, [filteryq+64] + pmaddwd m3, [filteryq+96] +%endif + paddd m0, m4 + paddd m0, m3 + paddd m0, m6 + psrad m0, 7 +%if cpuflag(sse4) + packusdw m0, m0 +%else + packssdw m0, m0 +%endif +%ifidn %1, avg + movh m1, [dstq] +%endif + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m11 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_h4_fn put +filter_h4_fn avg + +%macro filter_h_fn 1-2 12 +%assign %%px mmsize/2 +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m11, m11 +%endif + mova m6, [pd_64] + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+32] + mova m9, [filteryq+64] + mova m10, [filteryq+96] +%endif +.loop: + movu m0, [srcq-6] + movu m1, [srcq-4] + movu m2, [srcq-2] + movu m3, [srcq+0] + movu m4, [srcq+2] + pmaddwd m0, m7 + pmaddwd m1, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 + pmaddwd m3, m8 + pmaddwd m4, m9 +%else + pmaddwd m2, [filteryq+32] + pmaddwd m3, [filteryq+32] + pmaddwd m4, [filteryq+64] +%endif + paddd m0, m2 + paddd m1, m3 + paddd m0, m4 + movu m2, [srcq+4] + movu m3, [srcq+6] + movu m4, [srcq+8] + add srcq, sstrideq +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m9 + pmaddwd m3, m10 + pmaddwd m4, m10 +%else + pmaddwd m2, [filteryq+64] + pmaddwd m3, [filteryq+96] + pmaddwd m4, [filteryq+96] +%endif + paddd m1, m2 + paddd m0, m3 + paddd 
m1, m4 + paddd m0, m6 + paddd m1, m6 + psrad m0, 7 + psrad m1, 7 +%if cpuflag(sse4) + packusdw m0, m0 + packusdw m1, m1 +%else + packssdw m0, m0 + packssdw m1, m1 +%endif + punpcklwd m0, m1 + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m11 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_h_fn put +filter_h_fn avg +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_h_fn put +filter_h_fn avg +%endif + +%macro filter_v4_fn 1-2 12 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m11, m11 +%endif + mova m6, [pd_64] + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 8 + mova m8, [filteryq+ 32] + mova m9, [filteryq+ 64] + mova m10, [filteryq+ 96] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? 
+ movh m0, [srcq] + movh m1, [srcq+sstrideq] + movh m2, [srcq+sstrideq*2] + movh m3, [srcq+sstride3q] + add srcq, sstrideq + movh m4, [src4q] + punpcklwd m0, m1 + punpcklwd m2, m3 + pmaddwd m0, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 +%else + pmaddwd m2, [filteryq+ 32] +%endif + movh m1, [src4q+sstrideq] + movh m3, [src4q+sstrideq*2] + paddd m0, m2 + movh m2, [src4q+sstride3q] + add src4q, sstrideq + punpcklwd m4, m1 + punpcklwd m3, m2 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m4, m9 + pmaddwd m3, m10 +%else + pmaddwd m4, [filteryq+ 64] + pmaddwd m3, [filteryq+ 96] +%endif + paddd m0, m4 + paddd m0, m3 + paddd m0, m6 + psrad m0, 7 +%if cpuflag(sse4) + packusdw m0, m0 +%else + packssdw m0, m0 +%endif +%ifidn %1, avg + movh m1, [dstq] +%endif + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m11 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, m1 +%endif + movh [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%endif + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_v4_fn put +filter_v4_fn avg + +%macro filter_v_fn 1-2 13 +%assign %%px mmsize/2 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%define hd r4mp +%endif + mova m5, [pw_1023] +.body: +%if notcpuflag(sse4) && ARCH_X86_64 + pxor m12, m12 +%endif +%if ARCH_X86_64 + mova m11, [pd_64] +%endif + lea sstride3q, [sstrideq*3] + lea src4q, [srcq+sstrideq] + sub srcq, sstride3q + mova m7, [filteryq+ 0] +%if ARCH_X86_64 && mmsize > 
8 + mova m8, [filteryq+ 32] + mova m9, [filteryq+ 64] + mova m10, [filteryq+ 96] +%endif +.loop: + ; FIXME maybe reuse loads from previous rows, or just + ; more generally unroll this to prevent multiple loads of + ; the same data? + movu m0, [srcq] + movu m1, [srcq+sstrideq] + movu m2, [srcq+sstrideq*2] + movu m3, [srcq+sstride3q] + add srcq, sstrideq + movu m4, [src4q] + SBUTTERFLY wd, 0, 1, 6 + SBUTTERFLY wd, 2, 3, 6 + pmaddwd m0, m7 + pmaddwd m1, m7 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m2, m8 + pmaddwd m3, m8 +%else + pmaddwd m2, [filteryq+ 32] + pmaddwd m3, [filteryq+ 32] +%endif + paddd m0, m2 + paddd m1, m3 + movu m2, [src4q+sstrideq] + movu m3, [src4q+sstrideq*2] + SBUTTERFLY wd, 4, 2, 6 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m4, m9 + pmaddwd m2, m9 +%else + pmaddwd m4, [filteryq+ 64] + pmaddwd m2, [filteryq+ 64] +%endif + paddd m0, m4 + paddd m1, m2 + movu m4, [src4q+sstride3q] + add src4q, sstrideq + SBUTTERFLY wd, 3, 4, 6 +%if ARCH_X86_64 && mmsize > 8 + pmaddwd m3, m10 + pmaddwd m4, m10 +%else + pmaddwd m3, [filteryq+ 96] + pmaddwd m4, [filteryq+ 96] +%endif + paddd m0, m3 + paddd m1, m4 +%if ARCH_X86_64 + paddd m0, m11 + paddd m1, m11 +%else + paddd m0, [pd_64] + paddd m1, [pd_64] +%endif + psrad m0, 7 + psrad m1, 7 +%if cpuflag(sse4) + packusdw m0, m1 +%else + packssdw m0, m1 +%endif + pminsw m0, m5 +%if notcpuflag(sse4) +%if ARCH_X86_64 + pmaxsw m0, m12 +%else + pxor m2, m2 + pmaxsw m0, m2 +%endif +%endif +%ifidn %1, avg + pavgw m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dstrideq + dec hd + jg .loop + RET + +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3 + mov filteryq, r5mp +%endif + mova m5, [pw_4095] + jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body +%endmacro + +INIT_XMM sse2 +filter_v_fn put +filter_v_fn avg +%if 
HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +filter_v_fn put +filter_v_fn avg +%endif diff --git a/media/ffvpx/libavcodec/x86/vpx_arith.h b/media/ffvpx/libavcodec/x86/vpx_arith.h new file mode 100644 index 0000000000..d9e4c0dec4 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/vpx_arith.h @@ -0,0 +1,55 @@ +/** + * VP5 and VP6 compatible video decoder (arith decoder) + * + * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> + * Copyright (C) 2010 Eli Friedman + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VPX_ARITH_H +#define AVCODEC_X86_VPX_ARITH_H + +#include "libavutil/x86/asm.h" + +#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS +#include "libavutil/attributes.h" + +#define vpx_rac_get_prob vpx_rac_get_prob +static av_always_inline int vpx_rac_get_prob(VPXRangeCoder *c, uint8_t prob) +{ + unsigned int code_word = vpx_rac_renorm(c); + unsigned int low = 1 + (((c->high - 1) * prob) >> 8); + unsigned int low_shift = low << 16; + int bit = 0; + c->code_word = code_word; + + __asm__( + "subl %4, %1 \n\t" + "subl %3, %2 \n\t" + "setae %b0 \n\t" + "cmovb %4, %1 \n\t" + "cmovb %5, %2 \n\t" + : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word) + : "r"(low_shift), "r"(low), "r"(code_word) + ); + + return bit; +} +#endif + +#endif /* AVCODEC_X86_VPX_ARITH_H */ diff --git a/media/ffvpx/libavcodec/xiph.c b/media/ffvpx/libavcodec/xiph.c new file mode 100644 index 0000000000..218b0813e9 --- /dev/null +++ b/media/ffvpx/libavcodec/xiph.c @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2007 The FFmpeg Project + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <limits.h> +#include "libavutil/error.h" +#include "libavutil/intreadwrite.h" +#include "xiph.h" + +int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size, + int first_header_size, const uint8_t *header_start[3], + int header_len[3]) +{ + int i; + + if (extradata_size >= 6 && AV_RB16(extradata) == first_header_size) { + int overall_len = 6; + for (i=0; i<3; i++) { + header_len[i] = AV_RB16(extradata); + extradata += 2; + header_start[i] = extradata; + extradata += header_len[i]; + if (overall_len > extradata_size - header_len[i]) + return AVERROR_INVALIDDATA; + overall_len += header_len[i]; + } + } else if (extradata_size >= 3 && extradata_size < INT_MAX - 0x1ff && extradata[0] == 2) { + int overall_len = 3; + extradata++; + for (i=0; i<2; i++, extradata++) { + header_len[i] = 0; + for (; overall_len < extradata_size && *extradata==0xff; extradata++) { + header_len[i] += 0xff; + overall_len += 0xff + 1; + } + header_len[i] += *extradata; + overall_len += *extradata; + if (overall_len > extradata_size) + return AVERROR_INVALIDDATA; + } + header_len[2] = extradata_size - overall_len; + header_start[0] = extradata; + header_start[1] = header_start[0] + header_len[0]; + header_start[2] = header_start[1] + header_len[1]; + } else { + return -1; + } + return 0; +} diff --git a/media/ffvpx/libavcodec/xiph.h b/media/ffvpx/libavcodec/xiph.h new file mode 100644 index 0000000000..4ab2469528 --- /dev/null +++ b/media/ffvpx/libavcodec/xiph.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2007 The FFmpeg Project + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_XIPH_H +#define AVCODEC_XIPH_H + +#include <stdint.h> + +/** + * Split a single extradata buffer into the three headers that most + * Xiph codecs use. (e.g. Theora and Vorbis) + * Works both with Matroska's packing and lavc's packing. + * + * @param[in] extradata The single chunk that combines all three headers + * @param[in] extradata_size The size of the extradata buffer + * @param[in] first_header_size The size of the first header, used to + * differentiate between the Matroska packing and lavc packing. + * @param[out] header_start Pointers to the start of the three separate headers. + * @param[out] header_len The sizes of each of the three headers. + * @return On error a negative value is returned, on success zero. + */ +int avpriv_split_xiph_headers(const uint8_t *extradata, int extradata_size, + int first_header_size, const uint8_t *header_start[3], + int header_len[3]); + +#endif /* AVCODEC_XIPH_H */ diff --git a/media/ffvpx/libavcodec/xvididct.h b/media/ffvpx/libavcodec/xvididct.h new file mode 100644 index 0000000000..e0bc1a2b91 --- /dev/null +++ b/media/ffvpx/libavcodec/xvididct.h @@ -0,0 +1,36 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_XVIDIDCT_H +#define AVCODEC_XVIDIDCT_H + +#include <stdint.h> + +#include "avcodec.h" +#include "idctdsp.h" + +void ff_xvid_idct(int16_t *const in); + +void ff_xvid_idct_init(IDCTDSPContext *c, AVCodecContext *avctx); + +void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_xvid_idct_init_mips(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); + +#endif /* AVCODEC_XVIDIDCT_H */ |