/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]
        ldp             x7,  x8,  [x3]
        ldp             x9,  x3,  [x3, #16]
        ldp             x10, x11, [x4]
        ldp             x12, x4,  [x4, #16]
        mov             x13, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2            // src + stride
        csel            x2,  x1,  x2,  le       // if (h <= 1) x2 = x1
        add             x13, x0,  x13, lsl #1
        movi            v30.8h, #3
        movi            v31.4s, #3
1:
        ld1             {v0.8h, v1.8h}, [x10], #32
        ld1             {v2.8h, v3.8h}, [x11], #32
        ld1             {v4.8h, v5.8h}, [x12], #32
        ld1             {v6.8h, v7.8h}, [x4],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x9], #48
        ld1             {v25.4s, v26.4s, v27.4s}, [x3], #48
2:
        ext             v8.16b,  v0.16b, v1.16b, #2    // [0][1]
        ext             v9.16b,  v2.16b, v3.16b, #2    // [1][1]
        ext             v10.16b, v4.16b, v5.16b, #2    // [2][1]
        ext             v11.16b, v0.16b, v1.16b, #4    // [0][2]
        ext             v12.16b, v2.16b, v3.16b, #4    // [1][2]
        ext             v13.16b, v4.16b, v5.16b, #4    // [2][2]
        add             v14.8h, v2.8h,  v8.8h          // [1][0] + [0][1]
        add             v15.8h, v9.8h,  v10.8h         // [1][1] + [2][1]
        add             v28.8h, v0.8h,  v11.8h         // [0][0] + [0][2]
        add             v14.8h, v14.8h, v12.8h         // () + [1][2]
        add             v29.8h, v4.8h,  v13.8h         // [2][0] + [2][2]
        ext             v8.16b,  v6.16b, v7.16b, #2    // [3][1]
        ext             v11.16b, v6.16b, v7.16b, #4    // [3][2]
        add             v14.8h, v14.8h, v15.8h         // mid
        add             v15.8h, v28.8h, v29.8h         // corners
        add             v28.8h, v4.8h,  v9.8h          // [2][0] + [1][1]
        add             v29.8h, v10.8h, v8.8h          // [2][1] + [3][1]
        add             v2.8h,  v2.8h,  v12.8h         // [1][0] + [1][2]
        add             v28.8h, v28.8h, v13.8h         // () + [2][2]
        add             v4.8h,  v6.8h,  v11.8h         // [3][0] + [3][2]
        add             v0.8h,  v28.8h, v29.8h         // mid
        add             v2.8h,  v2.8h,  v4.8h          // corners
        shl             v4.8h,  v14.8h, #2
        mla             v4.8h,  v15.8h, v30.8h         // * 3 -> a
        shl             v0.8h,  v0.8h,  #2
        mla             v0.8h,  v2.8h,  v30.8h         // * 3 -> a
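        // Now form the same two 3x3 box sums from the 32-bit coefficients in
        // v16-v27: the cross including the centre is weighted by 4 (shl #2)
        // and the four corners by 3, one sum per output row, mirroring the
        // 16-bit sums above.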
        ext             v8.16b,  v16.16b, v17.16b, #4  // [0][1]
        ext             v9.16b,  v17.16b, v18.16b, #4
        ext             v10.16b, v16.16b, v17.16b, #8  // [0][2]
        ext             v11.16b, v17.16b, v18.16b, #8
        ext             v12.16b, v19.16b, v20.16b, #4  // [1][1]
        ext             v13.16b, v20.16b, v21.16b, #4
        add             v8.4s,  v8.4s,  v19.4s         // [0][1] + [1][0]
        add             v9.4s,  v9.4s,  v20.4s
        add             v16.4s, v16.4s, v10.4s         // [0][0] + [0][2]
        add             v17.4s, v17.4s, v11.4s
        ext             v14.16b, v19.16b, v20.16b, #8  // [1][2]
        ext             v15.16b, v20.16b, v21.16b, #8
        add             v16.4s, v16.4s, v22.4s         // () + [2][0]
        add             v17.4s, v17.4s, v23.4s
        add             v28.4s, v12.4s, v14.4s         // [1][1] + [1][2]
        add             v29.4s, v13.4s, v15.4s
        ext             v10.16b, v22.16b, v23.16b, #4  // [2][1]
        ext             v11.16b, v23.16b, v24.16b, #4
        add             v8.4s,  v8.4s,  v28.4s         // mid (incomplete)
        add             v9.4s,  v9.4s,  v29.4s
        add             v19.4s, v19.4s, v14.4s         // [1][0] + [1][2]
        add             v20.4s, v20.4s, v15.4s
        add             v14.4s, v22.4s, v12.4s         // [2][0] + [1][1]
        add             v15.4s, v23.4s, v13.4s
        ext             v12.16b, v22.16b, v23.16b, #8  // [2][2]
        ext             v13.16b, v23.16b, v24.16b, #8
        ext             v28.16b, v25.16b, v26.16b, #4  // [3][1]
        ext             v29.16b, v26.16b, v27.16b, #4
        add             v8.4s,  v8.4s,  v10.4s         // () + [2][1] = mid
        add             v9.4s,  v9.4s,  v11.4s
        add             v14.4s, v14.4s, v10.4s         // () + [2][1]
        add             v15.4s, v15.4s, v11.4s
        ext             v10.16b, v25.16b, v26.16b, #8  // [3][2]
        ext             v11.16b, v26.16b, v27.16b, #8
        add             v16.4s, v16.4s, v12.4s         // () + [2][2] = corner
        add             v17.4s, v17.4s, v13.4s
        add             v12.4s, v12.4s, v28.4s         // [2][2] + [3][1]
        add             v13.4s, v13.4s, v29.4s
        add             v25.4s, v25.4s, v10.4s         // [3][0] + [3][2]
        add             v26.4s, v26.4s, v11.4s
        add             v14.4s, v14.4s, v12.4s         // mid
        add             v15.4s, v15.4s, v13.4s
        add             v19.4s, v19.4s, v25.4s         // corner
        add             v20.4s, v20.4s, v26.4s
.if \bpc == 8
        ld1             {v25.8b}, [x1], #8             // src
        ld1             {v26.8b}, [x2], #8
.else
        ld1             {v25.8h}, [x1], #16            // src
        ld1             {v26.8h}, [x2], #16
.endif
        shl             v8.4s,  v8.4s,  #2
        shl             v9.4s,  v9.4s,  #2
        mla             v8.4s,  v16.4s, v31.4s         // * 3 -> b
        mla             v9.4s,  v17.4s, v31.4s
.if \bpc == 8
        uxtl            v25.8h, v25.8b                 // src
        uxtl            v26.8h, v26.8b
.endif
        shl             v14.4s, v14.4s, #2
        shl             v15.4s, v15.4s, #2
        mla             v14.4s, v19.4s, v31.4s         // * 3 -> b
        mla             v15.4s, v20.4s, v31.4s
        umlal           v8.4s,  v4.4h,  v25.4h         // b + a * src
        umlal2          v9.4s,  v4.8h,  v25.8h
        umlal           v14.4s, v0.4h,  v26.4h         // b + a * src
        umlal2          v15.4s, v0.8h,  v26.8h
        mov             v0.16b, v1.16b
        rshrn           v8.4h,  v8.4s,  #9
        rshrn2          v8.8h,  v9.4s,  #9
        mov             v2.16b, v3.16b
        rshrn           v14.4h, v14.4s, #9
        rshrn2          v14.8h, v15.4s, #9
        subs            w5,  w5,  #8
        mov             v4.16b, v5.16b
        st1             {v8.8h}, [x0], #16
        mov             v6.16b, v7.16b
        st1             {v14.8h}, [x13], #16
        b.le            3f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        mov             v25.16b, v27.16b
        ld1             {v1.8h}, [x10], #16
        ld1             {v3.8h}, [x11], #16
        ld1             {v5.8h}, [x12], #16
        ld1             {v7.8h}, [x4],  #16
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x9], #32
        ld1             {v26.4s, v27.4s}, [x3], #32
        b               2b
3:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
        ldp             x7,  x8,  [x1]
        ldr             x1,  [x1, #16]
        ldp             x9,  x10, [x2]
        ldr             x2,  [x2, #16]
        dup             v31.8h, w4
        dup             v30.8h, w5
        movi            v6.8h,  #3
        movi            v7.4s,  #3
1:
        ld1             {v0.8h, v1.8h}, [x9],  #32
        ld1             {v2.8h, v3.8h}, [x10], #32
        ld1             {v4.8h, v5.8h}, [x2],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x1], #48
2:
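        // Per 8-pixel chunk: form the radius-1 box sums a (16 bit) and
        // b (32 bit), weighting the cross including the centre by 4 and the
        // four corners by 3, then t1 = (a * src + b + (1 << 8)) >> 9 and,
        // in scalar terms, roughly:
        //   u   = src << 4
        //   dst = clip(((u << 7) + w1 * (t1 - u) + (1 << 10)) >> 11)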
        ext             v25.16b, v0.16b, v1.16b, #2    // -stride
        ext             v26.16b, v2.16b, v3.16b, #2    // 0
        ext             v27.16b, v4.16b, v5.16b, #2    // +stride
        ext             v28.16b, v0.16b, v1.16b, #4    // +1-stride
        ext             v29.16b, v2.16b, v3.16b, #4    // +1
        add             v2.8h,  v2.8h,  v25.8h         // -1, -stride
        ext             v25.16b, v4.16b, v5.16b, #4    // +1+stride
        add             v26.8h, v26.8h, v27.8h         // 0, +stride
        add             v0.8h,  v0.8h,  v28.8h         // -1-stride, +1-stride
        add             v2.8h,  v2.8h,  v26.8h
        add             v4.8h,  v4.8h,  v25.8h         // -1+stride, +1+stride
        add             v2.8h,  v2.8h,  v29.8h         // +1
        add             v0.8h,  v0.8h,  v4.8h
        ext             v25.16b, v16.16b, v17.16b, #4  // -stride
        ext             v26.16b, v17.16b, v18.16b, #4
        shl             v2.8h,  v2.8h,  #2
        ext             v27.16b, v16.16b, v17.16b, #8  // +1-stride
        ext             v28.16b, v17.16b, v18.16b, #8
        ext             v29.16b, v19.16b, v20.16b, #4  // 0
        ext             v4.16b,  v20.16b, v21.16b, #4
        mla             v2.8h,  v0.8h,  v6.8h          // * 3 -> a
        add             v25.4s, v25.4s, v19.4s         // -stride, -1
        add             v26.4s, v26.4s, v20.4s
        add             v16.4s, v16.4s, v27.4s         // -1-stride, +1-stride
        add             v17.4s, v17.4s, v28.4s
        ext             v27.16b, v19.16b, v20.16b, #8  // +1
        ext             v28.16b, v20.16b, v21.16b, #8
        add             v16.4s, v16.4s, v22.4s         // -1+stride
        add             v17.4s, v17.4s, v23.4s
        add             v29.4s, v29.4s, v27.4s         // 0, +1
        add             v4.4s,  v4.4s,  v28.4s
        add             v25.4s, v25.4s, v29.4s
        add             v26.4s, v26.4s, v4.4s
        ext             v27.16b, v22.16b, v23.16b, #4  // +stride
        ext             v28.16b, v23.16b, v24.16b, #4
        ext             v29.16b, v22.16b, v23.16b, #8  // +1+stride
        ext             v4.16b,  v23.16b, v24.16b, #8
.if \bpc == 8
        ld1             {v19.8b}, [x0]                 // src
.else
        ld1             {v19.8h}, [x0]                 // src
.endif
        add             v25.4s, v25.4s, v27.4s         // +stride
        add             v26.4s, v26.4s, v28.4s
        add             v16.4s, v16.4s, v29.4s         // +1+stride
        add             v17.4s, v17.4s, v4.4s
        shl             v25.4s, v25.4s, #2
        shl             v26.4s, v26.4s, #2
        mla             v25.4s, v16.4s, v7.4s          // * 3 -> b
        mla             v26.4s, v17.4s, v7.4s
.if \bpc == 8
        uxtl            v19.8h, v19.8b                 // src
.endif
        mov             v0.16b, v1.16b
        umlal           v25.4s, v2.4h,  v19.4h         // b + a * src
        umlal2          v26.4s, v2.8h,  v19.8h
        mov             v2.16b, v3.16b
        rshrn           v25.4h, v25.4s, #9
        rshrn2          v25.8h, v26.4s, #9
        subs            w3,  w3,  #8
        // weighted1
        shl             v19.8h, v19.8h, #4             // u
        mov             v4.16b, v5.16b
        sub             v25.8h, v25.8h, v19.8h         // t1 - u
        ld1             {v1.8h}, [x9],  #16
        ushll           v26.4s, v19.4h, #7             // u << 7
        ushll2          v27.4s, v19.8h, #7             // u << 7
        ld1             {v3.8h}, [x10], #16
        smlal           v26.4s, v25.4h, v31.4h         // v
        smlal2          v27.4s, v25.8h, v31.8h         // v
        ld1             {v5.8h}, [x2],  #16
.if \bpc == 8
        rshrn           v26.4h, v26.4s, #11
        rshrn2          v26.8h, v27.4s, #11
        mov             v16.16b, v18.16b
        sqxtun          v26.8b, v26.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8b}, [x0], #8
.else
        sqrshrun        v26.4h, v26.4s, #11
        sqrshrun2       v26.8h, v27.4s, #11
        mov             v16.16b, v18.16b
        umin            v26.8h, v26.8h, v30.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8h}, [x0], #16
.endif
        b.le            3f
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x1], #32
        b               2b
3:
        ret
endfunc

// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
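// The radius-2 (5x5) pass only has coefficients on every second row, so one
// pair of coefficient rows yields two output rows per iteration; in scalar
// terms this is roughly:
//   tmp[0][x] = (a0 * src[0][x] + b0 + (1 << 8)) >> 9
//   tmp[1][x] = (a1 * src[1][x] + b1 + (1 << 7)) >> 8
// where a0/b0 sum the four diagonals * 5 plus the two vertical neighbours * 6
// over both coefficient rows, and a1/b1 use only the lower coefficient row.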
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]
        ldp             x3,  x7,  [x3]
        ldp             x4,  x8,  [x4]
        mov             x10, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2            // src + stride
        csel            x2,  x1,  x2,  le       // if (h <= 1) x2 = x1
        add             x10, x0,  x10, lsl #1
        movi            v4.8h,  #5
        movi            v5.4s,  #5
        movi            v6.8h,  #6
        movi            v7.4s,  #6
1:
        ld1             {v0.8h, v1.8h}, [x4], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
        ext             v24.16b, v0.16b, v1.16b, #4    // +1-stride
        ext             v25.16b, v2.16b, v3.16b, #4    // +1+stride
        ext             v22.16b, v0.16b, v1.16b, #2    // -stride
        ext             v23.16b, v2.16b, v3.16b, #2    // +stride
        add             v0.8h,  v0.8h,  v24.8h         // -1-stride, +1-stride
        add             v25.8h, v2.8h,  v25.8h         // -1+stride, +1+stride
        add             v2.8h,  v22.8h, v23.8h         // -stride, +stride
        add             v0.8h,  v0.8h,  v25.8h
        mul             v8.8h,  v25.8h, v4.8h          // * 5
        mla             v8.8h,  v23.8h, v6.8h          // * 6
        ext             v22.16b, v16.16b, v17.16b, #4  // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4  // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8  // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8  // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,  v0.8h,  v4.8h          // * 5
        mla             v0.8h,  v2.8h,  v6.8h          // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x1], #8
        ld1             {v30.8b}, [x2], #8
.else
        ld1             {v31.8h}, [x1], #16
        ld1             {v30.8h}, [x2], #16
.endif
        add             v16.4s, v16.4s, v26.4s         // -1-stride, +1-stride
        add             v17.4s, v17.4s, v27.4s
        add             v19.4s, v19.4s, v28.4s         // -1+stride, +1+stride
        add             v20.4s, v20.4s, v29.4s
        add             v16.4s, v16.4s, v19.4s
        add             v17.4s, v17.4s, v20.4s
        mul             v9.4s,  v19.4s, v5.4s          // * 5
        mla             v9.4s,  v24.4s, v7.4s          // * 6
        mul             v10.4s, v20.4s, v5.4s          // * 5
        mla             v10.4s, v25.4s, v7.4s          // * 6
        add             v22.4s, v22.4s, v24.4s         // -stride, +stride
        add             v23.4s, v23.4s, v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s, v16.4s, v5.4s          // * 5
        mla             v16.4s, v22.4s, v7.4s          // * 6
        mul             v17.4s, v17.4s, v5.4s          // * 5
        mla             v17.4s, v23.4s, v7.4s          // * 6
.if \bpc == 8
        uxtl            v31.8h, v31.8b
        uxtl            v30.8h, v30.8b
.endif
        umlal           v16.4s, v0.4h,  v31.4h         // b + a * src
        umlal2          v17.4s, v0.8h,  v31.8h
        umlal           v9.4s,  v8.4h,  v30.4h         // b + a * src
        umlal2          v10.4s, v8.8h,  v30.8h
        mov             v0.16b, v1.16b
        rshrn           v16.4h, v16.4s, #9
        rshrn2          v16.8h, v17.4s, #9
        rshrn           v9.4h,  v9.4s,  #8
        rshrn2          v9.8h,  v10.4s, #8
        subs            w5,  w5,  #8
        mov             v2.16b, v3.16b
        st1             {v16.8h}, [x0],  #16
        st1             {v9.8h},  [x10], #16
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        ld1             {v1.8h}, [x4], #16
        ld1             {v3.8h}, [x8], #16
        ld1             {v17.4s, v18.4s}, [x3], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b
9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
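// Same two-row 5/6-weighted box filter as sgr_finish_filter2_2rows above,
// but instead of storing the intermediate t1 it applies the single weight w1
// in place (as in sgr_finish_weighted1), roughly:
//   u   = dst << 4
//   dst = clip(((u << 7) + w1 * (t1 - u) + (1 << 10)) >> 11)
// with w1 broadcast in v14 and bitdepth_max in v15.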
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        str             d10, [sp, #0x10]
        stp             d14, d15, [sp, #0x20]
        dup             v14.8h, w6
        dup             v15.8h, w7
        ldp             x2,  x7,  [x2]
        ldp             x3,  x8,  [x3]
        cmp             w5,  #1
        add             x1,  x0,  x1            // src + stride
        // if (h <= 1), set the pointer to the second row to any dummy buffer
        // we can clobber (x2 in this case)
        csel            x1,  x2,  x1,  le
        movi            v4.8h,  #5
        movi            v5.4s,  #5
        movi            v6.8h,  #6
        movi            v7.4s,  #6
1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x2], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48
2:
        ext             v24.16b, v0.16b, v1.16b, #4    // +1-stride
        ext             v25.16b, v2.16b, v3.16b, #4    // +1+stride
        ext             v22.16b, v0.16b, v1.16b, #2    // -stride
        ext             v23.16b, v2.16b, v3.16b, #2    // +stride
        add             v0.8h,  v0.8h,  v24.8h         // -1-stride, +1-stride
        add             v25.8h, v2.8h,  v25.8h         // -1+stride, +1+stride
        add             v2.8h,  v22.8h, v23.8h         // -stride, +stride
        add             v0.8h,  v0.8h,  v25.8h
        mul             v8.8h,  v25.8h, v4.8h          // * 5
        mla             v8.8h,  v23.8h, v6.8h          // * 6
        ext             v22.16b, v16.16b, v17.16b, #4  // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4  // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8  // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8  // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,  v0.8h,  v4.8h          // * 5
        mla             v0.8h,  v2.8h,  v6.8h          // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x0]
        ld1             {v30.8b}, [x1]
.else
        ld1             {v31.8h}, [x0]
        ld1             {v30.8h}, [x1]
.endif
        add             v16.4s, v16.4s, v26.4s         // -1-stride, +1-stride
        add             v17.4s, v17.4s, v27.4s
        add             v19.4s, v19.4s, v28.4s         // -1+stride, +1+stride
        add             v20.4s, v20.4s, v29.4s
        add             v16.4s, v16.4s, v19.4s
        add             v17.4s, v17.4s, v20.4s
        mul             v9.4s,  v19.4s, v5.4s          // * 5
        mla             v9.4s,  v24.4s, v7.4s          // * 6
        mul             v10.4s, v20.4s, v5.4s          // * 5
        mla             v10.4s, v25.4s, v7.4s          // * 6
        add             v22.4s, v22.4s, v24.4s         // -stride, +stride
        add             v23.4s, v23.4s, v25.4s
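        // v16/v17 now hold the sums of the four diagonal 32-bit coefficients
        // and v22/v23 the two vertical neighbours; weight them by 5 and 6
        // respectively to form b for the first output row.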
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s, v16.4s, v5.4s          // * 5
        mla             v16.4s, v22.4s, v7.4s          // * 6
        mul             v17.4s, v17.4s, v5.4s          // * 5
        mla             v17.4s, v23.4s, v7.4s          // * 6
.if \bpc == 8
        uxtl            v31.8h, v31.8b
        uxtl            v30.8h, v30.8b
.endif
        umlal           v16.4s, v0.4h,  v31.4h         // b + a * src
        umlal2          v17.4s, v0.8h,  v31.8h
        umlal           v9.4s,  v8.4h,  v30.4h         // b + a * src
        umlal2          v10.4s, v8.8h,  v30.8h
        mov             v0.16b, v1.16b
        rshrn           v16.4h, v16.4s, #9
        rshrn2          v16.8h, v17.4s, #9
        rshrn           v9.4h,  v9.4s,  #8
        rshrn2          v9.8h,  v10.4s, #8
        subs            w4,  w4,  #8
        // weighted1
        shl             v31.8h, v31.8h, #4             // u
        shl             v30.8h, v30.8h, #4
        mov             v2.16b, v3.16b
        sub             v16.8h, v16.8h, v31.8h         // t1 - u
        sub             v9.8h,  v9.8h,  v30.8h
        ld1             {v1.8h}, [x3], #16
        ushll           v22.4s, v31.4h, #7             // u << 7
        ushll2          v23.4s, v31.8h, #7
        ushll           v24.4s, v30.4h, #7
        ushll2          v25.4s, v30.8h, #7
        ld1             {v3.8h}, [x8], #16
        smlal           v22.4s, v16.4h, v14.4h         // v
        smlal2          v23.4s, v16.8h, v14.8h
        mov             v16.16b, v18.16b
        smlal           v24.4s, v9.4h,  v14.4h
        smlal2          v25.4s, v9.8h,  v14.8h
        mov             v19.16b, v21.16b
.if \bpc == 8
        rshrn           v22.4h, v22.4s, #11
        rshrn2          v22.8h, v23.4s, #11
        rshrn           v23.4h, v24.4s, #11
        rshrn2          v23.8h, v25.4s, #11
        sqxtun          v22.8b, v22.8h
        sqxtun          v23.8b, v23.8h
        st1             {v22.8b}, [x0], #8
        st1             {v23.8b}, [x1], #8
.else
        sqrshrun        v22.4h, v22.4s, #11
        sqrshrun2       v22.8h, v23.4s, #11
        sqrshrun        v23.4h, v24.4s, #11
        sqrshrun2       v23.8h, v25.4s, #11
        umin            v22.8h, v22.8h, v15.8h
        umin            v23.8h, v23.8h, v15.8h
        st1             {v22.8h}, [x0], #16
        st1             {v23.8h}, [x1], #16
.endif
        b.le            3f
        ld1             {v17.4s, v18.4s}, [x2], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b
3:
        ldp             d14, d15, [sp, #0x20]
        ldr             d10, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const pixel *src, const ptrdiff_t src_stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
function sgr_weighted2_\bpc\()bpc_neon, export=1
.if \bpc == 8
        ldr             x8,  [sp]
.else
        ldp             x8,  x9,  [sp]
.endif
        cmp             w7,  #2
        add             x10, x0,  x1
        add             x11, x2,  x3
        add             x12, x4,  #2*FILTER_OUT_STRIDE
        add             x13, x5,  #2*FILTER_OUT_STRIDE
        ld2r            {v30.8h, v31.8h}, [x8]  // wt[0], wt[1]
.if \bpc == 16
        dup             v29.8h, w9
.endif
        mov             x8,  #4*FILTER_OUT_STRIDE
        lsl             x1,  x1,  #1
        lsl             x3,  x3,  #1
        add             x9,  x6,  #7
        bic             x9,  x9,  #7            // Aligned width
.if \bpc == 8
        sub             x1,  x1,  x9
        sub             x3,  x3,  x9
.else
        sub             x1,  x1,  x9, lsl #1
        sub             x3,  x3,  x9, lsl #1
.endif
        sub             x8,  x8,  x9, lsl #1
        mov             w9,  w6
        b.lt            2f
1:
.if \bpc == 8
        ld1             {v0.8b},  [x2],  #8
        ld1             {v16.8b}, [x11], #8
.else
        ld1             {v0.8h},  [x2],  #16
        ld1             {v16.8h}, [x11], #16
.endif
        ld1             {v1.8h},  [x4],  #16
        ld1             {v17.8h}, [x12], #16
        ld1             {v2.8h},  [x5],  #16
        ld1             {v18.8h}, [x13], #16
        subs            w6,  w6,  #8
.if \bpc == 8
        ushll           v0.8h,  v0.8b,  #4             // u
        ushll           v16.8h, v16.8b, #4             // u
.else
        shl             v0.8h,  v0.8h,  #4             // u
        shl             v16.8h, v16.8h, #4             // u
.endif
        sub             v1.8h,  v1.8h,  v0.8h          // t1 - u
        sub             v2.8h,  v2.8h,  v0.8h          // t2 - u
        sub             v17.8h, v17.8h, v16.8h         // t1 - u
        sub             v18.8h, v18.8h, v16.8h         // t2 - u
        ushll           v3.4s,  v0.4h,  #7             // u << 7
        ushll2          v4.4s,  v0.8h,  #7             // u << 7
        ushll           v19.4s, v16.4h, #7             // u << 7
        ushll2          v20.4s, v16.8h, #7             // u << 7
        smlal           v3.4s,  v1.4h,  v30.4h         // wt[0] * (t1 - u)
        smlal           v3.4s,  v2.4h,  v31.4h         // wt[1] * (t2 - u)
        smlal2          v4.4s,  v1.8h,  v30.8h         // wt[0] * (t1 - u)
        smlal2          v4.4s,  v2.8h,  v31.8h         // wt[1] * (t2 - u)
        smlal           v19.4s, v17.4h, v30.4h         // wt[0] * (t1 - u)
        smlal           v19.4s, v18.4h, v31.4h         // wt[1] * (t2 - u)
        smlal2          v20.4s, v17.8h, v30.8h         // wt[0] * (t1 - u)
        smlal2          v20.4s, v18.8h, v31.8h         // wt[1] * (t2 - u)
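        // Narrow with an 11-bit rounding shift and clamp to the pixel range:
        // saturating narrow for 8 bpc, umin against bitdepth_max (v29) for
        // 16 bpc.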
.if \bpc == 8
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        rshrn           v19.4h, v19.4s, #11
        rshrn2          v19.8h, v20.4s, #11
        sqxtun          v3.8b,  v3.8h
        sqxtun          v19.8b, v19.8h
        st1             {v3.8b},  [x0],  #8
        st1             {v19.8b}, [x10], #8
.else
        sqrshrun        v3.4h,  v3.4s,  #11
        sqrshrun2       v3.8h,  v4.4s,  #11
        sqrshrun        v19.4h, v19.4s, #11
        sqrshrun2       v19.8h, v20.4s, #11
        umin            v3.8h,  v3.8h,  v29.8h
        umin            v19.8h, v19.8h, v29.8h
        st1             {v3.8h},  [x0],  #16
        st1             {v19.8h}, [x10], #16
.endif
        b.gt            1b
        subs            w7,  w7,  #2
        cmp             w7,  #1
        b.lt            0f
        mov             w6,  w9
        add             x0,  x0,  x1
        add             x10, x10, x1
        add             x2,  x2,  x3
        add             x11, x11, x3
        add             x4,  x4,  x8
        add             x12, x12, x8
        add             x5,  x5,  x8
        add             x13, x13, x8
        b.eq            2f
        b               1b
2:
.if \bpc == 8
        ld1             {v0.8b}, [x2], #8
.else
        ld1             {v0.8h}, [x2], #16
.endif
        ld1             {v1.8h}, [x4], #16
        ld1             {v2.8h}, [x5], #16
        subs            w6,  w6,  #8
.if \bpc == 8
        ushll           v0.8h,  v0.8b,  #4             // u
.else
        shl             v0.8h,  v0.8h,  #4             // u
.endif
        sub             v1.8h,  v1.8h,  v0.8h          // t1 - u
        sub             v2.8h,  v2.8h,  v0.8h          // t2 - u
        ushll           v3.4s,  v0.4h,  #7             // u << 7
        ushll2          v4.4s,  v0.8h,  #7             // u << 7
        smlal           v3.4s,  v1.4h,  v30.4h         // wt[0] * (t1 - u)
        smlal           v3.4s,  v2.4h,  v31.4h         // wt[1] * (t2 - u)
        smlal2          v4.4s,  v1.8h,  v30.8h         // wt[0] * (t1 - u)
        smlal2          v4.4s,  v2.8h,  v31.8h         // wt[1] * (t2 - u)
.if \bpc == 8
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        sqxtun          v3.8b,  v3.8h
        st1             {v3.8b}, [x0], #8
.else
        sqrshrun        v3.4h,  v3.4s,  #11
        sqrshrun2       v3.8h,  v4.4s,  #11
        umin            v3.8h,  v3.8h,  v29.8h
        st1             {v3.8h}, [x0], #16
.endif
        b.gt            2b                      // loop horizontally over the last row
0:
        ret
endfunc
.endm