Diffstat (limited to 'media/ffvpx/libavcodec/aarch64/h264dsp_neon.S')
-rw-r--r-- | media/ffvpx/libavcodec/aarch64/h264dsp_neon.S | 498
1 file changed, 498 insertions, 0 deletions
diff --git a/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
new file mode 100644
index 0000000000..4ec35f2905
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro  h264_loop_filter_start
+        cmp             w2,  #0
+        ldr             w6,  [x4]
+        ccmp            w3,  #0,  #0,  ne
+        mov             v24.S[0], w6
+        and             w6,  w6,  w6,  lsl #16
+        b.eq            1f
+        ands            w6,  w6,  w6,  lsl #8
+        b.ge            2f
+1:
+        ret
+2:
+.endm
+
+.macro  h264_loop_filter_luma
+        dup             v22.16B, w2                     // alpha
+        uxtl            v24.8H,  v24.8B
+        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
+        uxtl            v24.4S,  v24.4H
+        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
+        sli             v24.8H,  v24.8H,  #8
+        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
+        sli             v24.4S,  v24.4S,  #16
+        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
+        dup             v22.16B, w3                     // beta
+        cmlt            v23.16B, v24.16B, #0
+        cmhi            v28.16B, v22.16B, v28.16B       // < beta
+        cmhi            v30.16B, v22.16B, v30.16B       // < beta
+        bic             v21.16B, v21.16B, v23.16B
+        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
+        and             v21.16B, v21.16B, v28.16B
+        uabd            v19.16B, v4.16B,  v0.16B        // abs(q2 - q0)
+        cmhi            v17.16B, v22.16B, v17.16B       // < beta
+        and             v21.16B, v21.16B, v30.16B
+        cmhi            v19.16B, v22.16B, v19.16B       // < beta
+        and             v17.16B, v17.16B, v21.16B
+        and             v19.16B, v19.16B, v21.16B
+        and             v24.16B, v24.16B, v21.16B
+        urhadd          v28.16B, v16.16B, v0.16B
+        sub             v21.16B, v24.16B, v17.16B
+        uqadd           v23.16B, v18.16B, v24.16B
+        uhadd           v20.16B, v20.16B, v28.16B
+        sub             v21.16B, v21.16B, v19.16B
+        uhadd           v28.16B, v4.16B,  v28.16B
+        umin            v23.16B, v23.16B, v20.16B
+        uqsub           v22.16B, v18.16B, v24.16B
+        uqadd           v4.16B,  v2.16B,  v24.16B
+        umax            v23.16B, v23.16B, v22.16B
+        uqsub           v22.16B, v2.16B,  v24.16B
+        umin            v28.16B, v4.16B,  v28.16B
+        uxtl            v4.8H,   v0.8B
+        umax            v28.16B, v28.16B, v22.16B
+        uxtl2           v20.8H,  v0.16B
+        usubw           v4.8H,   v4.8H,   v16.8B
+        usubw2          v20.8H,  v20.8H,  v16.16B
+        shl             v4.8H,   v4.8H,   #2
+        shl             v20.8H,  v20.8H,  #2
+        uaddw           v4.8H,   v4.8H,   v18.8B
+        uaddw2          v20.8H,  v20.8H,  v18.16B
+        usubw           v4.8H,   v4.8H,   v2.8B
+        usubw2          v20.8H,  v20.8H,  v2.16B
+        rshrn           v4.8B,   v4.8H,   #3
+        rshrn2          v4.16B,  v20.8H,  #3
+        bsl             v17.16B, v23.16B, v18.16B
+        bsl             v19.16B, v28.16B, v2.16B
+        neg             v23.16B, v21.16B
+        uxtl            v28.8H,  v16.8B
+        smin            v4.16B,  v4.16B,  v21.16B
+        uxtl2           v21.8H,  v16.16B
+        smax            v4.16B,  v4.16B,  v23.16B
+        uxtl            v22.8H,  v0.8B
+        uxtl2           v24.8H,  v0.16B
+        saddw           v28.8H,  v28.8H,  v4.8B
+        saddw2          v21.8H,  v21.8H,  v4.16B
+        ssubw           v22.8H,  v22.8H,  v4.8B
+        ssubw2          v24.8H,  v24.8H,  v4.16B
+        sqxtun          v16.8B,  v28.8H
+        sqxtun2         v16.16B, v21.8H
+        sqxtun          v0.8B,   v22.8H
+        sqxtun2         v0.16B,  v24.8H
+.endm
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+        sxtw            x1,  w1
+
+        ld1             {v0.16B},  [x0], x1
+        ld1             {v2.16B},  [x0], x1
+        ld1             {v4.16B},  [x0], x1
+        sub             x0,  x0,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #1
+        ld1             {v20.16B}, [x0], x1
+        ld1             {v18.16B}, [x0], x1
+        ld1             {v16.16B}, [x0], x1
+
+        h264_loop_filter_luma
+
+        sub             x0,  x0,  x1, lsl #1
+        st1             {v17.16B}, [x0], x1
+        st1             {v16.16B}, [x0], x1
+        st1             {v0.16B},  [x0], x1
+        st1             {v19.16B}, [x0]
+
+        ret
+endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  #4
+        ld1             {v6.8B},  [x0], x1
+        ld1             {v20.8B}, [x0], x1
+        ld1             {v18.8B}, [x0], x1
+        ld1             {v16.8B}, [x0], x1
+        ld1             {v0.8B},  [x0], x1
+        ld1             {v2.8B},  [x0], x1
+        ld1             {v4.8B},  [x0], x1
+        ld1             {v26.8B}, [x0], x1
+        ld1             {v6.D}[1],  [x0], x1
+        ld1             {v20.D}[1], [x0], x1
+        ld1             {v18.D}[1], [x0], x1
+        ld1             {v16.D}[1], [x0], x1
+        ld1             {v0.D}[1],  [x0], x1
+        ld1             {v2.D}[1],  [x0], x1
+        ld1             {v4.D}[1],  [x0], x1
+        ld1             {v26.D}[1], [x0], x1
+
+        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+        h264_loop_filter_luma
+
+        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
+
+        sub             x0,  x0,  x1, lsl #4
+        add             x0,  x0,  #2
+        st1             {v17.S}[0], [x0], x1
+        st1             {v16.S}[0], [x0], x1
+        st1             {v0.S}[0],  [x0], x1
+        st1             {v19.S}[0], [x0], x1
+        st1             {v17.S}[1], [x0], x1
+        st1             {v16.S}[1], [x0], x1
+        st1             {v0.S}[1],  [x0], x1
+        st1             {v19.S}[1], [x0], x1
+        st1             {v17.S}[2], [x0], x1
+        st1             {v16.S}[2], [x0], x1
+        st1             {v0.S}[2],  [x0], x1
+        st1             {v19.S}[2], [x0], x1
+        st1             {v17.S}[3], [x0], x1
+        st1             {v16.S}[3], [x0], x1
+        st1             {v0.S}[3],  [x0], x1
+        st1             {v19.S}[3], [x0], x1
+
+        ret
+endfunc
+
+.macro  h264_loop_filter_chroma
+        dup             v22.8B, w2              // alpha
+        uxtl            v24.8H, v24.8B
+        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
+        uxtl            v4.8H,  v0.8B
+        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
+        usubw           v4.8H,  v4.8H,  v16.8B
+        sli             v24.8H, v24.8H, #8
+        shl             v4.8H,  v4.8H,  #2
+        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
+        uaddw           v4.8H,  v4.8H,  v18.8B
+        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
+        usubw           v4.8H,  v4.8H,  v2.8B
+        dup             v22.8B, w3              // beta
+        rshrn           v4.8B,  v4.8H,  #3
+        cmhi            v28.8B, v22.8B, v28.8B  // < beta
+        cmhi            v30.8B, v22.8B, v30.8B  // < beta
+        smin            v4.8B,  v4.8B,  v24.8B
+        neg             v25.8B, v24.8B
+        and             v26.8B, v26.8B, v28.8B
+        smax            v4.8B,  v4.8B,  v25.8B
+        and             v26.8B, v26.8B, v30.8B
+        uxtl            v22.8H, v0.8B
+        and             v4.8B,  v4.8B,  v26.8B
+        uxtl            v28.8H, v16.8B
+        saddw           v28.8H, v28.8H, v4.8B
+        ssubw           v22.8H, v22.8H, v4.8B
+        sqxtun          v16.8B, v28.8H
+        sqxtun          v0.8B,  v22.8H
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  x1, lsl #1
+        ld1             {v18.8B}, [x0], x1
+        ld1             {v16.8B}, [x0], x1
+        ld1             {v0.8B},  [x0], x1
+        ld1             {v2.8B},  [x0]
+
+        h264_loop_filter_chroma
+
+        sub             x0,  x0,  x1, lsl #1
+        st1             {v16.8B}, [x0], x1
+        st1             {v0.8B},  [x0], x1
+
+        ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  #2
+        ld1             {v18.S}[0], [x0], x1
+        ld1             {v16.S}[0], [x0], x1
+        ld1             {v0.S}[0],  [x0], x1
+        ld1             {v2.S}[0],  [x0], x1
+        ld1             {v18.S}[1], [x0], x1
+        ld1             {v16.S}[1], [x0], x1
+        ld1             {v0.S}[1],  [x0], x1
+        ld1             {v2.S}[1],  [x0], x1
+
+        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
+
+        h264_loop_filter_chroma
+
+        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
+
+        sub             x0,  x0,  x1, lsl #3
+        st1             {v18.S}[0], [x0], x1
+        st1             {v16.S}[0], [x0], x1
+        st1             {v0.S}[0],  [x0], x1
+        st1             {v2.S}[0],  [x0], x1
+        st1             {v18.S}[1], [x0], x1
+        st1             {v16.S}[1], [x0], x1
+        st1             {v0.S}[1],  [x0], x1
+        st1             {v2.S}[1],  [x0], x1
+
+        ret
+endfunc
+
+.macro  biweight_16 macs, macd
+        dup             v0.16B,  w5
+        dup             v1.16B,  w6
+        mov             v4.16B,  v16.16B
+        mov             v6.16B,  v16.16B
+1:      subs            w3,  w3,  #2
+        ld1             {v20.16B}, [x0], x2
+        \macd           v4.8H,   v0.8B,  v20.8B
+        \macd\()2       v6.8H,   v0.16B, v20.16B
+        ld1             {v22.16B}, [x1], x2
+        \macs           v4.8H,   v1.8B,  v22.8B
+        \macs\()2       v6.8H,   v1.16B, v22.16B
+        mov             v24.16B, v16.16B
+        ld1             {v28.16B}, [x0], x2
+        mov             v26.16B, v16.16B
+        \macd           v24.8H,  v0.8B,  v28.8B
+        \macd\()2       v26.8H,  v0.16B, v28.16B
+        ld1             {v30.16B}, [x1], x2
+        \macs           v24.8H,  v1.8B,  v30.8B
+        \macs\()2       v26.8H,  v1.16B, v30.16B
+        sshl            v4.8H,   v4.8H,  v18.8H
+        sshl            v6.8H,   v6.8H,  v18.8H
+        sqxtun          v4.8B,   v4.8H
+        sqxtun2         v4.16B,  v6.8H
+        sshl            v24.8H,  v24.8H, v18.8H
+        sshl            v26.8H,  v26.8H, v18.8H
+        sqxtun          v24.8B,  v24.8H
+        sqxtun2         v24.16B, v26.8H
+        mov             v6.16B,  v16.16B
+        st1             {v4.16B},  [x7], x2
+        mov             v4.16B,  v16.16B
+        st1             {v24.16B}, [x7], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  biweight_8 macs, macd
+        dup             v0.8B,  w5
+        dup             v1.8B,  w6
+        mov             v2.16B, v16.16B
+        mov             v20.16B, v16.16B
+1:      subs            w3,  w3,  #2
+        ld1             {v4.8B}, [x0], x2
+        \macd           v2.8H,  v0.8B,  v4.8B
+        ld1             {v5.8B}, [x1], x2
+        \macs           v2.8H,  v1.8B,  v5.8B
+        ld1             {v6.8B}, [x0], x2
+        \macd           v20.8H, v0.8B,  v6.8B
+        ld1             {v7.8B}, [x1], x2
+        \macs           v20.8H, v1.8B,  v7.8B
+        sshl            v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        sshl            v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        mov             v20.16B, v16.16B
+        st1             {v2.8B}, [x7], x2
+        mov             v2.16B, v16.16B
+        st1             {v4.8B}, [x7], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  biweight_4 macs, macd
+        dup             v0.8B,  w5
+        dup             v1.8B,  w6
+        mov             v2.16B, v16.16B
+        mov             v20.16B, v16.16B
+1:      subs            w3,  w3,  #4
+        ld1             {v4.S}[0], [x0], x2
+        ld1             {v4.S}[1], [x0], x2
+        \macd           v2.8H,  v0.8B,  v4.8B
+        ld1             {v5.S}[0], [x1], x2
+        ld1             {v5.S}[1], [x1], x2
+        \macs           v2.8H,  v1.8B,  v5.8B
+        b.lt            2f
+        ld1             {v6.S}[0], [x0], x2
+        ld1             {v6.S}[1], [x0], x2
+        \macd           v20.8H, v0.8B,  v6.8B
+        ld1             {v7.S}[0], [x1], x2
+        ld1             {v7.S}[1], [x1], x2
+        \macs           v20.8H, v1.8B,  v7.8B
+        sshl            v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        sshl            v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        mov             v20.16B, v16.16B
+        st1             {v2.S}[0], [x7], x2
+        st1             {v2.S}[1], [x7], x2
+        mov             v2.16B, v16.16B
+        st1             {v4.S}[0], [x7], x2
+        st1             {v4.S}[1], [x7], x2
+        b.ne            1b
+        ret
+2:      sshl            v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        st1             {v2.S}[0], [x7], x2
+        st1             {v2.S}[1], [x7], x2
+        ret
+.endm
+
+.macro  biweight_func w
+function ff_biweight_h264_pixels_\w\()_neon, export=1
+        sxtw            x2,  w2
+        lsr             w8,  w5,  #31
+        add             w7,  w7,  #1
+        eor             w8,  w8,  w6,  lsr #30
+        orr             w7,  w7,  #1
+        dup             v18.8H,   w4
+        lsl             w7,  w7,  w4
+        not             v18.16B,  v18.16B
+        dup             v16.8H,   w7
+        mov             x7,  x0
+        cbz             w8,  10f
+        subs            w8,  w8,  #1
+        b.eq            20f
+        subs            w8,  w8,  #1
+        b.eq            30f
+        b               40f
+10:     biweight_\w     umlal, umlal
+20:     neg             w5,  w5
+        biweight_\w     umlal, umlsl
+30:     neg             w5,  w5
+        neg             w6,  w6
+        biweight_\w     umlsl, umlsl
+40:     neg             w6,  w6
+        biweight_\w     umlsl, umlal
+endfunc
+.endm
+
+        biweight_func   16
+        biweight_func   8
+        biweight_func   4
+
+.macro  weight_16 add
+        dup             v0.16B,  w4
+1:      subs            w2,  w2,  #2
+        ld1             {v20.16B}, [x0], x1
+        umull           v4.8H,   v0.8B,  v20.8B
+        umull2          v6.8H,   v0.16B, v20.16B
+        ld1             {v28.16B}, [x0], x1
+        umull           v24.8H,  v0.8B,  v28.8B
+        umull2          v26.8H,  v0.16B, v28.16B
+        \add            v4.8H,   v16.8H, v4.8H
+        srshl           v4.8H,   v4.8H,  v18.8H
+        \add            v6.8H,   v16.8H, v6.8H
+        srshl           v6.8H,   v6.8H,  v18.8H
+        sqxtun          v4.8B,   v4.8H
+        sqxtun2         v4.16B,  v6.8H
+        \add            v24.8H,  v16.8H, v24.8H
+        srshl           v24.8H,  v24.8H, v18.8H
+        \add            v26.8H,  v16.8H, v26.8H
+        srshl           v26.8H,  v26.8H, v18.8H
+        sqxtun          v24.8B,  v24.8H
+        sqxtun2         v24.16B, v26.8H
+        st1             {v4.16B},  [x5], x1
+        st1             {v24.16B}, [x5], x1
+        b.ne            1b
+        ret
+.endm
+
+.macro  weight_8 add
+        dup             v0.8B,  w4
+1:      subs            w2,  w2,  #2
+        ld1             {v4.8B}, [x0], x1
+        umull           v2.8H,  v0.8B,  v4.8B
+        ld1             {v6.8B}, [x0], x1
+        umull           v20.8H, v0.8B,  v6.8B
+        \add            v2.8H,  v16.8H, v2.8H
+        srshl           v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        \add            v20.8H, v16.8H, v20.8H
+        srshl           v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        st1             {v2.8B}, [x5], x1
+        st1             {v4.8B}, [x5], x1
+        b.ne            1b
+        ret
+.endm
+
+.macro  weight_4 add
+        dup             v0.8B,  w4
+1:      subs            w2,  w2,  #4
+        ld1             {v4.S}[0], [x0], x1
+        ld1             {v4.S}[1], [x0], x1
+        umull           v2.8H,  v0.8B,  v4.8B
+        b.lt            2f
+        ld1             {v6.S}[0], [x0], x1
+        ld1             {v6.S}[1], [x0], x1
+        umull           v20.8H, v0.8B,  v6.8B
+        \add            v2.8H,  v16.8H, v2.8H
+        srshl           v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        \add            v20.8H, v16.8H, v20.8H
+        srshl           v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        st1             {v2.S}[0], [x5], x1
+        st1             {v2.S}[1], [x5], x1
+        st1             {v4.S}[0], [x5], x1
+        st1             {v4.S}[1], [x5], x1
+        b.ne            1b
+        ret
+2:      \add            v2.8H,  v16.8H, v2.8H
+        srshl           v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        st1             {v2.S}[0], [x5], x1
+        st1             {v2.S}[1], [x5], x1
+        ret
+.endm
+
+.macro  weight_func w
+function ff_weight_h264_pixels_\w\()_neon, export=1
+        sxtw            x1,  w1
+        cmp             w3,  #1
+        mov             w6,  #1
+        lsl             w5,  w5,  w3
+        dup             v16.8H,  w5
+        mov             x5,  x0
+        b.le            20f
+        sub             w6,  w6,  w3
+        dup             v18.8H,  w6
+        cmp             w4,  #0
+        b.lt            10f
+        weight_\w       shadd
+10:     neg             w4,  w4
+        weight_\w       shsub
+20:     neg             w6,  w3
+        dup             v18.8H,  w6
+        cmp             w4,  #0
+        b.lt            10f
+        weight_\w       add
+10:     neg             w4,  w4
+        weight_\w       sub
+endfunc
+.endm
+
+        weight_func     16
+        weight_func     8
+        weight_func     4
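
The h264_loop_filter_luma macro above is a 16-position-wide vectorization of the normal (bS < 4) H.264 luma deblocking filter. For cross-checking the instruction sequence, here is a scalar C sketch of the per-position arithmetic it implements, following the H.264 spec; the function and helper names are illustrative and are not part of the patch.

    #include <stdlib.h>

    static int clip3(int lo, int hi, int x) { return x < lo ? lo : x > hi ? hi : x; }
    static int clip_u8(int x) { return clip3(0, 255, x); }

    /* One luma edge position: p2..p0 on one side of the edge, q0..q2 on the
     * other. alpha/beta are the thresholds dup'ed into v22; tc0 is loaded
     * from [x4] in h264_loop_filter_start. Illustrative model only. */
    static void filter_luma_pos(int p2, int *p1, int *p0, int *q0, int *q1,
                                int q2, int alpha, int beta, int tc0)
    {
        if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta ||
            abs(*q1 - *q0) >= beta || tc0 < 0)
            return;                          /* masked off via v21/v24 */

        int ap = abs(p2 - *p0) < beta;       /* the extra cmhi "< beta" pair */
        int aq = abs(q2 - *q0) < beta;
        int tc = tc0 + ap + aq;              /* sub of the -1 masks in the asm */

        /* rshrn #3 supplies the (+4) >> 3 rounding; smin/smax clamp to +/-tc */
        int d   = clip3(-tc, tc, ((*q0 - *p0) * 4 + (*p1 - *q1) + 4) >> 3);
        int np0 = clip_u8(*p0 + d);          /* saddw + sqxtun */
        int nq0 = clip_u8(*q0 - d);          /* ssubw + sqxtun */

        /* p1/q1 update: urhadd/uhadd build the averages, and the
         * uqadd/uqsub + umin/umax chain clamps to p1 +/- tc0 (resp. q1) */
        if (ap)
            *p1 = clip3(*p1 - tc0, *p1 + tc0, (p2 + ((*p0 + *q0 + 1) >> 1)) >> 1);
        if (aq)
            *q1 = clip3(*q1 - tc0, *q1 + tc0, (q2 + ((*p0 + *q0 + 1) >> 1)) >> 1);
        *p0 = np0;
        *q0 = nq0;
    }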
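Likewise, the weight_*/biweight_* macros implement H.264 explicit weighted prediction. A scalar sketch of the per-sample arithmetic follows; the register-to-argument mapping (w3/w4 = log2_denom, w5/w6 = weights, last register = offset, matching FFmpeg's weight/biweight function pointer prototypes) is inferred from the register use above, and the names are illustrative, not part of the patch.

    static int clip_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* Unidirectional weighting (weight_16/8/4). v16 is preloaded with
     * offset << log2_den; the NEON code reaches the same result via the
     * shadd/shsub halving adds plus srshl rounding shifts, which keep the
     * 8x8-bit products within 16 bits. */
    static int weight_sample(int src, int log2_den, int weight, int offset)
    {
        int bias = offset << log2_den;
        if (log2_den)
            bias += 1 << (log2_den - 1);     /* rounding term */
        return clip_u8((src * weight + bias) >> log2_den);
    }

    /* Bidirectional weighting (biweight_16/8/4). The add/orr/lsl sequence
     * on w7 builds ((offset + 1) | 1) << log2_den: the forced-odd bias
     * carries the rounding bit for the final shift, and v18 = ~log2_den
     * turns sshl into a right shift by log2_den + 1. */
    static int biweight_sample(int dst, int src, int log2_den,
                               int weightd, int weights, int offset)
    {
        int bias = ((offset + 1) | 1) << log2_den;
        return clip_u8((dst * weightd + src * weights + bias) >> (log2_den + 1));
    }

The umlal/umlsl and shadd/shsub macro-argument pairs exist because the NEON code folds the signs of the weights into the choice of multiply-accumulate instruction (see the dispatch on w8 in biweight_func); the scalar form above handles signed weights directly.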