diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /third_party/dav1d/src/arm/64/ipred16.S | |
parent | Initial commit. (diff) | |
download | firefox-esr-upstream.tar.xz firefox-esr-upstream.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/arm/64/ipred16.S')
-rw-r--r-- | third_party/dav1d/src/arm/64/ipred16.S | 4204 |
1 files changed, 4204 insertions, 0 deletions
diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S new file mode 100644 index 0000000000..c48c48583c --- /dev/null +++ b/third_party/dav1d/src/arm/64/ipred16.S @@ -0,0 +1,4204 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height, +// const int bitdepth_max); +function ipred_dc_128_16bpc_neon, export=1 + ldr w8, [sp] + clz w3, w3 + adr x5, L(ipred_dc_128_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + dup v0.8h, w8 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + urshr v0.8h, v0.8h, #1 + br x5 +4: + AARCH64_VALID_JUMP_TARGET + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_128_tbl): + .hword L(ipred_dc_128_tbl) - 640b + .hword L(ipred_dc_128_tbl) - 320b + .hword L(ipred_dc_128_tbl) - 160b + .hword L(ipred_dc_128_tbl) - 8b + .hword L(ipred_dc_128_tbl) - 4b +endfunc + +// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_v_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #2 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + sub x1, x1, #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_v_tbl): + .hword L(ipred_v_tbl) - 640b + .hword L(ipred_v_tbl) - 320b + .hword L(ipred_v_tbl) - 160b + .hword L(ipred_v_tbl) - 80b + .hword L(ipred_v_tbl) - 40b +endfunc + +// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_h_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + sub x2, x2, #8 + sub x5, x5, w3, uxtw + mov x7, #-8 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +4: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + st1 {v3.4h}, [x0], x1 + st1 {v2.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v1.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +16: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 16b + ret +32: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 32b + ret +64: + AARCH64_VALID_JUMP_TARGET + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + stp q3, q3, [x0, #64] + stp q2, q2, [x6, #64] + stp q3, q3, [x0, #96] + stp q2, q2, [x6, #96] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + stp q1, q1, [x0, #64] + stp q0, q0, [x6, #64] + stp q1, q1, [x0, #96] + stp q0, q0, [x6, #96] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_h_tbl): + .hword L(ipred_h_tbl) - 64b + .hword L(ipred_h_tbl) - 32b + .hword L(ipred_h_tbl) - 16b + .hword L(ipred_h_tbl) - 8b + .hword L(ipred_h_tbl) - 4b +endfunc + +// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_dc_top_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #2 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.4h, v0.h[0] +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 + dup v0.8h, v2.h[0] + dup v1.8h, v2.h[0] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #5 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 + sub x1, x1, #64 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_top_tbl): + .hword L(ipred_dc_top_tbl) - 640b + .hword L(ipred_dc_top_tbl) - 320b + .hword L(ipred_dc_top_tbl) - 160b + .hword L(ipred_dc_top_tbl) - 80b + .hword L(ipred_dc_top_tbl) - 40b +endfunc + +// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 + clz w3, w3 + clz w7, w4 + adr x5, L(ipred_dc_left_tbl) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w7, w7, #25 + ldrh w3, [x5, w3, uxtw #1] + ldrh w7, [x5, w7, uxtw #1] + sub x3, x5, w3, uxtw + sub x5, x5, w7, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_left_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + br x3 +L(ipred_dc_left_w4): + AARCH64_VALID_JUMP_TARGET + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt L(ipred_dc_left_w4) + ret + +L(ipred_dc_left_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x3 +L(ipred_dc_left_w8): + AARCH64_VALID_JUMP_TARGET + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt L(ipred_dc_left_w8) + ret + +L(ipred_dc_left_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 + dup v0.8h, v2.h[0] + dup v1.8h, v2.h[0] + br x3 +L(ipred_dc_left_w16): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b +1: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlp v0.4s, v0.8h + addv s0, v0.4s + rshrn v4.4h, v0.4s, #5 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w32): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h64): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w64): + AARCH64_VALID_JUMP_TARGET + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_tbl): + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) +endfunc + +// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 + add w7, w3, w4 // width + height + clz w3, w3 + clz w6, w4 + dup v16.4s, w7 // width + height + adr x5, L(ipred_dc_tbl) + rbit w7, w7 // rbit(width + height) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w6, w6, #25 + clz w7, w7 // ctz(width + height) + ldrh w3, [x5, w3, uxtw #1] + ldrh w6, [x5, w6, uxtw #1] + neg w7, w7 // -ctz(width + height) + sub x3, x5, w3, uxtw + sub x5, x5, w6, uxtw + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w7 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h + add x2, x2, #2 + br x3 +L(ipred_dc_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.4h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s1, v1.4h + cmp w4, #4 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.4h, v0.h[0] +2: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + add x2, x2, #2 + br x3 +L(ipred_dc_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s1, v1.8h + cmp w4, #8 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] +2: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h}, [x2], #32 + addp v0.8h, v0.8h, v1.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w16): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h, v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + uaddlv s1, v1.8h + cmp w4, #16 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32/64 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w32): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + addp v3.8h, v3.8h, v4.8h + addp v1.8h, v1.8h, v3.8h + uaddlv s1, v1.8h + cmp w4, #32 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16/64 + cmp w4, #8 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h64): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w64): + AARCH64_VALID_JUMP_TARGET + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] + addp v3.8h, v3.8h, v4.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + addp v1.8h, v1.8h, v3.8h + addp v20.8h, v20.8h, v22.8h + addp v1.8h, v1.8h, v20.8h + uaddlv s1, v1.8h + cmp w4, #64 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 16/32 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + sub x1, x1, #64 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_tbl): + .hword L(ipred_dc_tbl) - L(ipred_dc_h64) + .hword L(ipred_dc_tbl) - L(ipred_dc_h32) + .hword L(ipred_dc_tbl) - L(ipred_dc_h16) + .hword L(ipred_dc_tbl) - L(ipred_dc_h8) + .hword L(ipred_dc_tbl) - L(ipred_dc_h4) + .hword L(ipred_dc_tbl) - L(ipred_dc_w64) + .hword L(ipred_dc_tbl) - L(ipred_dc_w32) + .hword L(ipred_dc_tbl) - L(ipred_dc_w16) + .hword L(ipred_dc_tbl) - L(ipred_dc_w8) + .hword L(ipred_dc_tbl) - L(ipred_dc_w4) +endfunc + +// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_16bpc_neon, export=1 + clz w9, w3 + adr x5, L(ipred_paeth_tbl) + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x2] + add x8, x2, #2 + sub x2, x2, #8 + sub x5, x5, w9, uxtw + mov x7, #-8 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v5.2d}, [x8] + sub v6.8h, v5.8h, v4.8h // top - topleft +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 + zip1 v0.2d, v0.2d, v1.2d + zip1 v2.2d, v2.2d, v3.2d + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v2.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v4.8h, v16.8h // tldiff + sabd v23.8h, v4.8h, v17.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v2.8h, v17.8h + umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) + umin v19.8h, v21.8h, v23.8h + cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v23.8h, v21.8h + cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v19.8h, v17.8h + bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v20.16b, v5.16b, v4.16b + bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... + bit v20.16b, v0.16b, v16.16b + st1 {v21.d}[1], [x0], x1 + st1 {v21.d}[0], [x6], x1 + subs w4, w4, #4 + st1 {v20.d}[1], [x0], x1 + st1 {v20.d}[0], [x6], x1 + b.gt 4b + ret +80: +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + ld1 {v5.8h}, [x8], #16 + mov w9, w3 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 +2: + sub v6.8h, v5.8h, v4.8h // top - topleft + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v1.8h + add v18.8h, v6.8h, v2.8h + add v19.8h, v6.8h, v3.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v5.8h, v18.8h + sabd v23.8h, v5.8h, v19.8h + sabd v24.8h, v4.8h, v16.8h // tldiff + sabd v25.8h, v4.8h, v17.8h + sabd v26.8h, v4.8h, v18.8h + sabd v27.8h, v4.8h, v19.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v1.8h, v17.8h + sabd v18.8h, v2.8h, v18.8h + sabd v19.8h, v3.8h, v19.8h + umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) + umin v29.8h, v21.8h, v25.8h + umin v30.8h, v22.8h, v26.8h + umin v31.8h, v23.8h, v27.8h + cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v25.8h, v21.8h + cmge v22.8h, v26.8h, v22.8h + cmge v23.8h, v27.8h, v23.8h + cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v29.8h, v17.8h + cmge v18.8h, v30.8h, v18.8h + cmge v19.8h, v31.8h, v19.8h + bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v22.16b, v5.16b, v4.16b + bsl v21.16b, v5.16b, v4.16b + bsl v20.16b, v5.16b, v4.16b + bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... + bit v22.16b, v2.16b, v18.16b + bit v21.16b, v1.16b, v17.16b + bit v20.16b, v0.16b, v16.16b + st1 {v23.8h}, [x0], #16 + st1 {v22.8h}, [x6], #16 + subs w3, w3, #8 + st1 {v21.8h}, [x5], #16 + st1 {v20.8h}, [x10], #16 + b.le 8f + ld1 {v5.8h}, [x8], #16 + b 2b +8: + subs w4, w4, #4 + b.le 9f + // End of horizontal loop, move pointers to next four rows + sub x8, x8, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + // Load the top row as early as possible + ld1 {v5.8h}, [x8], #16 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_paeth_tbl): + .hword L(ipred_paeth_tbl) - 640b + .hword L(ipred_paeth_tbl) - 320b + .hword L(ipred_paeth_tbl) - 160b + .hword L(ipred_paeth_tbl) - 80b + .hword L(ipred_paeth_tbl) - 40b +endfunc + +// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_16bpc_neon, export=1 + movrel x10, X(sm_weights) + add x11, x10, w4, uxtw + add x10, x10, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_tbl) + sub x12, x2, w4, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x12] // bottom + add x8, x2, #2 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v6.2d}, [x8] // top + ld1r {v7.2s}, [x10] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + dup v5.8h, v6.h[3] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v0.8h, v7.8h + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v18.4h + smlal2 v23.4s, v6.8h, v18.8h + rshrn v20.4h, v20.4s, #9 + rshrn v21.4h, v21.4s, #9 + rshrn v22.4h, v22.4s, #9 + rshrn v23.4h, v23.4s, #9 + st1 {v20.4h}, [x0], x1 + st1 {v21.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.4h}, [x0], x1 + st1 {v23.4h}, [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v6.8h}, [x8] // top + ld1 {v7.8b}, [x10] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + dup v5.8h, v6.h[7] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v3.8h, v7.8h // (left flipped) + smlal v22.4s, v2.4h, v7.4h + smlal2 v23.4s, v2.8h, v7.8h + smlal v24.4s, v1.4h, v7.4h + smlal2 v25.4s, v1.8h, v7.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v17.4h + smlal2 v23.4s, v6.8h, v17.8h + smlal v24.4s, v6.4h, v18.4h + smlal2 v25.4s, v6.8h, v18.8h + smlal v26.4s, v6.4h, v19.4h + smlal2 v27.4s, v6.8h, v19.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 v23.8h, v27.4s, #9 + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + add x12, x2, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + ld1r {v5.8h}, [x12] // right + sub x2, x2, #4 + mov x7, #-4 + mov w9, w3 + add v31.4h, v4.4h, v5.4h // bottom+right + +1: + ld2r {v0.8h, v1.8h}, [x2], x7 // left + ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b +2: + ld1 {v7.16b}, [x10], #16 // weights_hor + ld1 {v2.8h, v3.8h}, [x8], #32 // top + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor + smlal2 v21.4s, v1.8h, v6.8h // (left flipped) + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, v7.8h + smlal v24.4s, v0.4h, v6.4h + smlal2 v25.4s, v0.8h, v6.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v2.8h, v16.8h + smlal v22.4s, v3.4h, v16.4h + smlal2 v23.4s, v3.8h, v16.8h + smlal v24.4s, v2.4h, v17.4h + smlal2 v25.4s, v2.8h, v17.8h + smlal v26.4s, v3.4h, v17.4h + smlal2 v27.4s, v3.8h, v17.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 v23.8h, v27.4s, #9 + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x8, w9, uxtw #1 + sub x10, x10, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_tbl): + .hword L(ipred_smooth_tbl) - 640b + .hword L(ipred_smooth_tbl) - 320b + .hword L(ipred_smooth_tbl) - 160b + .hword L(ipred_smooth_tbl) - 80b + .hword L(ipred_smooth_tbl) - 40b +endfunc + +// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_16bpc_neon, export=1 + movrel x7, X(sm_weights) + add x7, x7, w4, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_v_tbl) + sub x8, x2, w4, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x8] // bottom + add x2, x2, #2 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v6.2d}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +4: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v18.8h, v18.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v18.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v6.8h}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +8: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v17.8h + sqrdmulh v22.8h, v6.8h, v18.8h + sqrdmulh v23.8h, v6.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + // Set up pointers for four rows in parallel; x0, x6, x5, x8 + add x5, x0, x1 + add x8, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 +2: + ld1 {v2.8h, v3.8h}, [x2], #32 // top + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v3.8h, v16.8h + sqrdmulh v22.8h, v2.8h, v17.8h + sqrdmulh v23.8h, v3.8h, v17.8h + sqrdmulh v24.8h, v2.8h, v18.8h + sqrdmulh v25.8h, v3.8h, v18.8h + sqrdmulh v26.8h, v2.8h, v19.8h + sqrdmulh v27.8h, v3.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v4.8h + add v26.8h, v26.8h, v4.8h + add v27.8h, v27.8h, v4.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x8], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x2, x2, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x8, x8, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_v_tbl): + .hword L(ipred_smooth_v_tbl) - 640b + .hword L(ipred_smooth_v_tbl) - 320b + .hword L(ipred_smooth_v_tbl) - 160b + .hword L(ipred_smooth_v_tbl) - 80b + .hword L(ipred_smooth_v_tbl) - 40b +endfunc + +// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_16bpc_neon, export=1 + movrel x8, X(sm_weights) + add x8, x8, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_h_tbl) + add x12, x2, w3, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v5.8h}, [x12] // right + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ld1r {v7.2s}, [x8] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v1.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ld1 {v7.8b}, [x8] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v3.8h, v3.8h, v5.8h // left-right + sub v2.8h, v2.8h, v5.8h + sub v1.8h, v1.8h, v5.8h + sub v0.8h, v0.8h, v5.8h + sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, v1.8h, v7.8h + sqrdmulh v23.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + sub x2, x2, #8 + mov x7, #-8 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h +2: + ld1 {v7.16b}, [x8], #16 // weights_hor + ushll v6.8h, v7.8b, #7 // weights_hor << 7 + ushll2 v7.8h, v7.16b, #7 + sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, v2.8h, v6.8h + sqrdmulh v23.8h, v2.8h, v7.8h + sqrdmulh v24.8h, v1.8h, v6.8h + sqrdmulh v25.8h, v1.8h, v7.8h + sqrdmulh v26.8h, v0.8h, v6.8h + sqrdmulh v27.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + add v24.8h, v24.8h, v5.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v5.8h + add v27.8h, v27.8h, v5.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x10], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x8, x8, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_h_tbl): + .hword L(ipred_smooth_h_tbl) - 640b + .hword L(ipred_smooth_h_tbl) - 320b + .hword L(ipred_smooth_h_tbl) - 160b + .hword L(ipred_smooth_h_tbl) - 80b + .hword L(ipred_smooth_h_tbl) - 40b +endfunc + +const padding_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +padding_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, +// const pixel *const in, const int end, +// const int bitdepth_max); +function ipred_z1_upsample_edge_16bpc_neon, export=1 + dup v30.8h, w4 // bitdepth_max + movrel x4, padding_mask + ld1 {v0.8h, v1.8h}, [x2] // in[] + add x5, x2, w3, uxtw #1 // in[end] + sub x4, x4, w3, uxtw #1 + + ld1r {v2.8h}, [x5] // padding + ld1 {v3.8h, v4.8h}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v2.16b, v3.16b // padded in[] + bit v1.16b, v2.16b, v4.16b + + ext v4.16b, v0.16b, v1.16b, #2 + ext v5.16b, v1.16b, v2.16b, #2 + ext v6.16b, v0.16b, v1.16b, #4 + ext v7.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + + add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] + add v19.8h, v5.8h, v7.8h + add v20.8h, v0.8h, v16.8h + add v21.8h, v1.8h, v17.8h + umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) + umull2 v23.4s, v18.8h, v31.8h + umull v24.4s, v19.4h, v31.4h + umull2 v25.4s, v19.8h, v31.8h + usubw v22.4s, v22.4s, v20.4h + usubw2 v23.4s, v23.4s, v20.8h + usubw v24.4s, v24.4s, v21.4h + usubw2 v25.4s, v25.4s, v21.8h + + sqrshrun v16.4h, v22.4s, #4 + sqrshrun2 v16.8h, v23.4s, #4 + sqrshrun v17.4h, v24.4s, #4 + sqrshrun2 v17.8h, v25.4s, #4 + + smin v16.8h, v16.8h, v30.8h + smin v17.8h, v17.8h, v30.8h + + zip1 v0.8h, v4.8h, v16.8h + zip2 v1.8h, v4.8h, v16.8h + zip1 v2.8h, v5.8h, v17.8h + zip2 v3.8h, v5.8h, v17.8h + + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + + ret +endfunc + +const edge_filter + .short 0, 4, 8, 0 + .short 0, 5, 6, 0 +// Leaving out the coeffs for strength=3 +// .byte 2, 4, 4, 0 +endconst + +// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, +// const pixel *const in, const int end, +// const int strength); +function ipred_z1_filter_edge_16bpc_neon, export=1 + cmp w4, #3 + b.eq L(fivetap) // if (strength == 3) goto fivetap + + movrel x5, edge_filter, -6 + add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) + + ld1 {v31.s}[0], [x5] // kernel[1-2] + + ld1 {v0.8h}, [x2], #16 + + dup v30.8h, v31.h[0] + dup v31.8h, v31.h[1] +1: + // in[end], is the last valid pixel. We produce 16 pixels out by + // using 18 pixels in - the last pixel used is [17] of the ones + // read/buffered. + cmp w3, #17 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + sub w3, w3, #16 + st1 {v16.8h, v17.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask + sub w6, w3, #24 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h}, [x5] // padding_mask + + ld1r {v2.8h}, [x6] + bit v0.16b, v2.16b, v3.16b // Pad v0-v1 + bit v1.16b, v2.16b, v4.16b + + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + st1 {v16.8h, v17.8h}, [x0], #32 + b.le 9f +5: + // After one block, any remaining output would only be filtering + // padding - thus just store the padding. + subs w1, w1, #16 + st1 {v2.16b}, [x0], #16 + b.gt 5b +9: + ret + +L(fivetap): + sub x2, x2, #2 // topleft -= 1 pixel + movi v29.8h, #2 + ld1 {v0.8h}, [x2], #16 + movi v30.8h, #4 + movi v31.8h, #4 + ins v0.h[0], v0.h[1] +1: + // in[end+1], is the last valid pixel. We produce 16 pixels out by + // using 20 pixels in - the last pixel used is [19] of the ones + // read/buffered. + cmp w3, #18 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f // if (end + 1 < 19) + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask, -2 + sub w6, w3, #23 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask + + ld1r {v28.8h}, [x6] + bit v0.16b, v28.16b, v3.16b // Pad v0-v2 + bit v1.16b, v28.16b, v4.16b + bit v2.16b, v28.16b, v5.16b +4: + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + mov v1.16b, v28.16b + mov v2.16b, v28.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.le 9f + // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to + // filter properly once more - aka (w3 >= 0). + cmp w3, #0 + b.ge 4b +5: + // When w3 <= 0, all remaining pixels in v0-v1 are equal to the + // last valid pixel - thus just output that without filtering. + subs w1, w1, #8 + st1 {v28.8h}, [x0], #16 + b.gt 5b +9: + ret +endfunc + +// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px, +// const int n); +function ipred_pixel_set_16bpc_neon, export=1 + dup v0.8h, w1 +1: + subs w2, w2, #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const int width, const int height, +// const int dx, const int max_base_x); +function ipred_z1_fill1_16bpc_neon, export=1 + clz w9, w3 + adr x8, L(ipred_z1_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // top[max_base_x] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + br x8 +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // top[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + + mov w12, w3 + + add x13, x0, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 169f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // xpos += dx +2: + ext v18.16b, v0.16b, v1.16b, #2 // top[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w3, w3, #16 + umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac + umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + st1 {v22.8h, v23.8h}, [x0], #32 + st1 {v24.8h, v25.8h}, [x13], #32 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w4, w4, #2 + b.le 9f + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 1b +9: + ret + +169: + st1 {v31.8h}, [x0], #16 + subs w3, w3, #8 + st1 {v31.8h}, [x13], #16 + b.gt 169b + subs w4, w4, #2 + b.le 9b + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 169b + +L(ipred_z1_fill1_tbl): + .hword L(ipred_z1_fill1_tbl) - 640b + .hword L(ipred_z1_fill1_tbl) - 320b + .hword L(ipred_z1_fill1_tbl) - 160b + .hword L(ipred_z1_fill1_tbl) - 80b + .hword L(ipred_z1_fill1_tbl) - 40b +endfunc + +function ipred_z1_fill2_16bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw // top[max_base_x] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + b.eq 8f + +4: // w == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h, v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +8: // w == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret +endfunc + +// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, +// const int n); +function ipred_reverse_16bpc_neon, export=1 + sub x1, x1, #16 + add x3, x0, #8 + mov x4, #16 +1: + ld1 {v0.8h}, [x1] + subs w2, w2, #8 + rev64 v0.8h, v0.8h + sub x1, x1, #16 + st1 {v0.d}[1], [x0], x4 + st1 {v0.d}[0], [x3], x4 + b.gt 1b + ret +endfunc + +// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const left, +// const int width, const int height, +// const int dy, const int max_base_y); +function ipred_z3_fill1_16bpc_neon, export=1 + clz w9, w4 + adr x8, L(ipred_z3_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // left[max_base_y] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + br x8 + +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // left[base] + ldr q2, [x2, w10, uxtw] + dup v4.8h, w9 // frac + dup v5.8h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // left[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + mov w12, w4 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // ypos += dy + cmp w8, w6 // base >= max_base_y + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // ypos += dy +2: + ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w4, w4, #16 + umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac + umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + zip1 v18.8h, v22.8h, v24.8h + zip2 v19.8h, v22.8h, v24.8h + zip1 v20.8h, v23.8h, v25.8h + zip2 v21.8h, v23.8h, v25.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x13], x1 + st1 {v20.s}[2], [x0], x1 + st1 {v20.s}[3], [x13], x1 + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x13], x1 + st1 {v21.s}[2], [x0], x1 + st1 {v21.s}[3], [x13], x1 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w3, w3, #2 + b.le 9f + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b +9: + ret + +L(ipred_z3_fill1_tbl): + .hword L(ipred_z3_fill1_tbl) - 640b + .hword L(ipred_z3_fill1_tbl) - 320b + .hword L(ipred_z3_fill1_tbl) - 160b + .hword L(ipred_z3_fill1_tbl) - 80b + .hword L(ipred_z3_fill1_tbl) - 40b +endfunc + +function ipred_z3_fill_padding_neon, export=0 + cmp w3, #8 + adr x8, L(ipred_z3_fill_padding_tbl) + b.gt L(ipred_z3_fill_padding_wide) + // w3 = remaining width, w4 = constant height + mov w12, w4 + +1: + // Fill a WxH rectangle with padding. W can be any number; + // this fills the exact width by filling in the largest + // power of two in the remaining width, and repeating. + clz w9, w3 + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + sub x9, x8, w9, uxtw + br x9 + +2: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.s}[0], [x13], x1 + st1 {v31.s}[0], [x0], x1 + st1 {v31.s}[0], [x13], x1 + b.gt 2b + subs w3, w3, #2 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b + +4: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.4h}, [x13], x1 + st1 {v31.4h}, [x0], x1 + st1 {v31.4h}, [x13], x1 + b.gt 4b + subs w3, w3, #4 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #8 + add x13, x13, #8 + mov w4, w12 + b 1b + +8: +16: +32: +64: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.8h}, [x13], x1 + st1 {v31.8h}, [x0], x1 + st1 {v31.8h}, [x13], x1 + b.gt 4b + subs w3, w3, #8 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w4, w12 + b 1b + +9: + ret + +L(ipred_z3_fill_padding_tbl): + .hword L(ipred_z3_fill_padding_tbl) - 64b + .hword L(ipred_z3_fill_padding_tbl) - 32b + .hword L(ipred_z3_fill_padding_tbl) - 16b + .hword L(ipred_z3_fill_padding_tbl) - 8b + .hword L(ipred_z3_fill_padding_tbl) - 4b + .hword L(ipred_z3_fill_padding_tbl) - 2b + +L(ipred_z3_fill_padding_wide): + // Fill a WxH rectangle with padding, with W > 8. + lsr x1, x1, #1 + mov w12, w3 + sub x1, x1, w3, uxtw #1 +1: + ands w5, w3, #7 + b.eq 2f + // If the width isn't aligned to 8, first do one 8 pixel write + // and align the start pointer. + sub w3, w3, w5 + st1 {v31.8h}, [x0] + add x0, x0, w5, uxtw #1 +2: + // Fill the rest of the line with aligned 8 pixel writes. + subs w3, w3, #8 + st1 {v31.8h}, [x0], #16 + b.gt 2b + subs w4, w4, #1 + add x0, x0, x1 + b.le 9f + mov w3, w12 + b 1b +9: + ret +endfunc + +function ipred_z3_fill2_16bpc_neon, export=1 + cmp w4, #8 + add x10, x2, w6, uxtw // left[max_base_y] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + b.eq 8f + +4: // h == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h, v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +8: // h == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret +endfunc + + +// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height, +// const int bitdepth_max); +.macro filter_fn bpc +function ipred_filter_\bpc\()bpc_neon + and w5, w5, #511 + movrel x6, X(filter_intra_taps) + lsl w5, w5, #6 + add x6, x6, w5, uxtw + ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 + clz w9, w3 + adr x5, L(ipred_filter\bpc\()_tbl) + ld1 {v20.8b, v21.8b, v22.8b}, [x6] + sub w9, w9, #26 + ldrh w9, [x5, w9, uxtw #1] + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b + sub x5, x5, w9, uxtw + sxtl v18.8h, v18.8b + sxtl v19.8h, v19.8b + add x6, x0, x1 + lsl x1, x1, #1 + sxtl v20.8h, v20.8b + sxtl v21.8h, v21.8b + sxtl v22.8h, v22.8b + dup v31.8h, w8 +.if \bpc == 10 + movi v30.8h, #0 +.endif + br x5 +40: + AARCH64_VALID_JUMP_TARGET + ldur d0, [x2, #2] // top (0-3) + sub x2, x2, #4 + mov x7, #-4 +4: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 +.endif + smin v2.8h, v2.8h, v31.8h + subs w4, w4, #2 + st1 {v2.d}[0], [x0], x1 + ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] + st1 {v2.d}[1], [x6], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + ldur q0, [x2, #2] // top (0-7) + sub x2, x2, #4 + mov x7, #-4 +8: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) + smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) + smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 + smin v2.8h, v2.8h, v31.8h + smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) + smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0) + smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5) + smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) + smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) + smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + sqrshrun v3.4h, v4.4s, #4 + sqrshrun2 v3.8h, v5.4s, #4 +.endif + smin v3.8h, v3.8h, v31.8h + subs w4, w4, #2 + st2 {v2.d, v3.d}[0], [x0], x1 + zip2 v0.2d, v2.2d, v3.2d + st2 {v2.d, v3.d}[1], [x6], x1 + b.gt 8b + ret +160: +320: + AARCH64_VALID_JUMP_TARGET + add x8, x2, #2 + sub x2, x2, #4 + mov x7, #-4 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) +2: + ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) +.if \bpc == 10 + mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h + smin v3.8h, v3.8h, v31.8h + mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) + mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + srshr v4.8h, v4.8h, #4 + smax v4.8h, v4.8h, v30.8h + smin v4.8h, v4.8h, v31.8h + mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) + mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + srshr v5.8h, v5.8h, #4 + smax v5.8h, v5.8h, v30.8h + smin v5.8h, v5.8h, v31.8h + mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) + mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + srshr v6.8h, v6.8h, #4 + smax v6.8h, v6.8h, v30.8h +.else + smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0) + smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) + smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) + smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) + smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) + smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) + smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4) + smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) + smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) + smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2) + smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) + sqrshrun v3.4h, v3.4s, #4 + sqrshrun2 v3.8h, v4.4s, #4 + smin v3.8h, v3.8h, v31.8h + smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) + smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) + smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) + smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) + smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0) + smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) + smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) + smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) + sqrshrun v4.4h, v5.4s, #4 + sqrshrun2 v4.8h, v6.4s, #4 + smin v4.8h, v4.8h, v31.8h + smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) + smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) + smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) + smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) + smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) + smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) + smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) + smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3) + sqrshrun v5.4h, v24.4s, #4 + sqrshrun2 v5.8h, v25.4s, #4 + smin v5.8h, v5.8h, v31.8h + smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) + smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) + smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) + smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) + smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) + smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + sqrshrun v6.4h, v26.4s, #4 + sqrshrun2 v6.8h, v27.4s, #4 +.endif + smin v6.8h, v6.8h, v31.8h + + ins v0.h[2], v2.h[7] + st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32 + ins v0.h[0], v6.h[7] + st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32 + ins v0.h[1], v6.h[3] + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x6, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_filter\bpc\()_tbl): + .hword L(ipred_filter\bpc\()_tbl) - 320b + .hword L(ipred_filter\bpc\()_tbl) - 160b + .hword L(ipred_filter\bpc\()_tbl) - 80b + .hword L(ipred_filter\bpc\()_tbl) - 40b +endfunc +.endm + +filter_fn 10 +filter_fn 12 + +function ipred_filter_16bpc_neon, export=1 + ldr w8, [sp] + cmp w8, 0x3ff + b.le ipred_filter_10bpc_neon + b ipred_filter_12bpc_neon +endfunc + +// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_16bpc_neon, export=1 + ld1 {v30.8h}, [x2] + clz w9, w4 + adr x6, L(pal_pred_tbl) + sub w9, w9, #25 + ldrh w9, [x6, w9, uxtw #1] + movi v31.8h, #1, lsl #8 + sub x6, x6, w9, uxtw + br x6 +40: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +4: + ld1 {v1.16b}, [x3], #16 + subs w5, w5, #4 + // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... + add v1.16b, v1.16b, v1.16b + zip1 v0.16b, v1.16b, v1.16b + zip2 v1.16b, v1.16b, v1.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + st1 {v0.d}[0], [x0], x1 + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.d}[1], [x2], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x2], x1 + b.gt 4b + ret +80: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +8: + ld1 {v2.16b, v3.16b}, [x3], #32 + subs w5, w5, #4 + add v2.16b, v2.16b, v2.16b + add v3.16b, v3.16b, v3.16b + zip1 v0.16b, v2.16b, v2.16b + zip2 v1.16b, v2.16b, v2.16b + zip1 v2.16b, v3.16b, v3.16b + zip2 v3.16b, v3.16b, v3.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.8h}, [x0], x1 + tbl v2.16b, {v30.16b}, v2.16b + st1 {v1.8h}, [x2], x1 + tbl v3.16b, {v30.16b}, v3.16b + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x2], x1 + b.gt 8b + ret +160: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +16: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #4 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + st1 {v2.8h, v3.8h}, [x2], x1 + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h}, [x0], x1 + st1 {v6.8h, v7.8h}, [x2], x1 + b.gt 16b + ret +320: + AARCH64_VALID_JUMP_TARGET + add x2, x0, x1 + lsl x1, x1, #1 +32: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #2 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 32b + ret +640: + AARCH64_VALID_JUMP_TARGET + add x2, x0, #64 +64: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #1 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 64b + ret + +L(pal_pred_tbl): + .hword L(pal_pred_tbl) - 640b + .hword L(pal_pred_tbl) - 320b + .hword L(pal_pred_tbl) - 160b + .hword L(pal_pred_tbl) - 80b + .hword L(pal_pred_tbl) - 40b +endfunc + +// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_128_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_128_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + urshr v0.8h, v31.8h, #1 + dup v1.8h, w6 // alpha + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +L(ipred_cfl_splat_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #4 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + cmlt v16.4s, v2.4s, #0 // sign + cmlt v17.4s, v3.4s, #0 + cmlt v18.4s, v4.4s, #0 + cmlt v19.4s, v5.4s, #0 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x6], x1 + st1 {v3.d}[0], [x0], x1 + st1 {v3.d}[1], [x6], x1 + b.gt L(ipred_cfl_splat_w4) + ret +L(ipred_cfl_splat_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #2 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + cmlt v16.4s, v2.4s, #0 // sign + cmlt v17.4s, v3.4s, #0 + cmlt v18.4s, v4.4s, #0 + cmlt v19.4s, v5.4s, #0 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x6], x1 + b.gt L(ipred_cfl_splat_w8) + ret +L(ipred_cfl_splat_w16): + AARCH64_VALID_JUMP_TARGET + add x7, x5, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 +1: + ld1 {v2.8h, v3.8h}, [x5], #32 + ld1 {v4.8h, v5.8h}, [x7], #32 + subs w3, w3, #16 + smull v16.4s, v2.4h, v1.4h // diff = ac * alpha + smull2 v17.4s, v2.8h, v1.8h + smull v18.4s, v3.4h, v1.4h + smull2 v19.4s, v3.8h, v1.8h + smull v2.4s, v4.4h, v1.4h + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + cmlt v20.4s, v16.4s, #0 // sign + cmlt v21.4s, v17.4s, #0 + cmlt v22.4s, v18.4s, #0 + cmlt v23.4s, v19.4s, #0 + cmlt v24.4s, v2.4s, #0 + cmlt v25.4s, v3.4s, #0 + cmlt v26.4s, v4.4s, #0 + cmlt v27.4s, v5.4s, #0 + add v16.4s, v16.4s, v20.4s // diff + sign + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v22.4s + add v19.4s, v19.4s, v23.4s + add v2.4s, v2.4s, v24.4s + add v3.4s, v3.4s, v25.4s + add v4.4s, v4.4s, v26.4s + add v5.4s, v5.4s, v27.4s + rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + rshrn v6.4h, v2.4s, #6 + rshrn2 v6.8h, v3.4s, #6 + rshrn v7.4h, v4.4s, #6 + rshrn2 v7.8h, v5.4s, #6 + add v2.8h, v16.8h, v0.8h // dc + apply_sign() + add v3.8h, v17.8h, v0.8h + add v4.8h, v6.8h, v0.8h + add v5.8h, v7.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, v5.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], #32 + st1 {v4.8h, v5.8h}, [x6], #32 + b.gt 1b + subs w4, w4, #2 + add x5, x5, w9, uxtw #1 + add x7, x7, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b.gt 1b + ret + +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) +endfunc + +// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_top_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_top_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + dup v1.8h, w6 // alpha + add x2, x2, #2 + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +4: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) +8: + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) +16: + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) +32: + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_top_tbl): + .hword L(ipred_cfl_top_tbl) - 32b + .hword L(ipred_cfl_top_tbl) - 16b + .hword L(ipred_cfl_top_tbl) - 8b + .hword L(ipred_cfl_top_tbl) - 4b +endfunc + +// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_left_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + clz w9, w3 + clz w8, w4 + adr x10, L(ipred_cfl_splat_tbl) + adr x7, L(ipred_cfl_left_tbl) + sub w9, w9, #26 + sub w8, w8, #26 + ldrh w9, [x10, w9, uxtw #1] + ldrh w8, [x7, w8, uxtw #1] + dup v1.8h, w6 // alpha + sub x9, x10, w9, uxtw + sub x7, x7, w8, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_left_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_tbl): + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) +endfunc + +// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + add w8, w3, w4 // width + height + dup v1.8h, w6 // alpha + clz w9, w3 + clz w6, w4 + dup v16.4s, w8 // width + height + adr x7, L(ipred_cfl_tbl) + rbit w8, w8 // rbit(width + height) + sub w9, w9, #22 // 26 leading bits, minus table offset 4 + sub w6, w6, #26 + clz w8, w8 // ctz(width + height) + ldrh w9, [x7, w9, uxtw #1] + ldrh w6, [x7, w6, uxtw #1] + neg w8, w8 // -ctz(width + height) + sub x9, x7, w9, uxtw + sub x7, x7, w6, uxtw + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w8 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_h4): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h + add x2, x2, #2 + br x9 +L(ipred_cfl_w4): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.4h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.4h + cmp w4, #4 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + AARCH64_VALID_JUMP_TARGET + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + add x2, x2, #2 + br x9 +L(ipred_cfl_w8): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.8h + cmp w4, #8 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2], #32 + addp v0.8h, v2.8h, v3.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w16): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v2.8h, v2.8h, v3.8h + uaddlv s2, v2.8h + cmp w4, #16 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + add x2, x2, #2 + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w32): + AARCH64_VALID_JUMP_TARGET + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + add v0.4s, v0.4s, v16.4s + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v2.8h, v2.8h, v4.8h + cmp w4, #32 + uaddlv s2, v2.8h + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #8 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_tbl): + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) +endfunc + +// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_420_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_420_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + subs w8, w8, #2 + st1 {v0.8h}, [x0], #16 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + b.gt 1b + trn2 v1.2d, v0.2d, v0.2d + trn2 v0.2d, v0.2d, v0.2d +L(ipred_cfl_ac_420_w4_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + add v24.4s, v24.4s, v25.4s + add v26.4s, v26.4s, v27.4s + add v0.4s, v24.4s, v26.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #3 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h}, [x0] + subs w6, w6, #4 + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w8): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v4.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + mov v0.16b, v1.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + dup v1.4h, v0.h[3] + dup v3.4h, v0.h[7] + trn2 v2.2d, v0.2d, v0.2d + subs w8, w8, #2 + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw v25.4s, v25.4s, v1.4h + uaddw v26.4s, v26.4s, v2.4h + uaddw v27.4s, v27.4s, v3.4h + b.gt 1b + trn1 v0.2d, v2.2d, v3.2d + trn1 v1.2d, v2.2d, v3.2d + +L(ipred_cfl_ac_420_w8_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl w6, w6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_420_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_420_w16_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 + add v0.8h, v0.8h, v4.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 + add v2.8h, v2.8h, v6.8h + addp v16.8h, v16.8h, v17.8h + addp v18.8h, v18.8h, v19.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + add v16.8h, v16.8h, v20.8h + add v18.8h, v18.8h, v22.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v2.8h, #1 + shl v2.8h, v16.8h, #1 + shl v3.8h, v18.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q5, [x10, #32] + ld1 {v3.8h, v4.8h}, [x10], x2 + addp v2.8h, v2.8h, v2.8h + addp v0.8h, v0.8h, v1.8h + addp v5.8h, v5.8h, v5.8h + addp v3.8h, v3.8h, v4.8h + ldr q18, [x1, #32] + add v2.4h, v2.4h, v5.4h + ld1 {v16.8h, v17.8h}, [x1], x2 + add v0.8h, v0.8h, v3.8h + ldr q21, [x10, #32] + ld1 {v19.8h, v20.8h}, [x10], x2 + addp v18.8h, v18.8h, v18.8h + addp v16.8h, v16.8h, v17.8h + addp v21.8h, v21.8h, v21.8h + addp v19.8h, v19.8h, v20.8h + add v18.4h, v18.4h, v21.4h + add v16.8h, v16.8h, v19.8h + shl v1.4h, v2.4h, #1 + shl v0.8h, v0.8h, #1 + shl v3.4h, v18.4h, #1 + shl v2.8h, v16.8h, #1 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v2.8h, v4.8h, #1 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + ld1 {v4.8h}, [x1], x2 + ld1 {v6.8h}, [x10], x2 + addp v0.8h, v0.8h, v4.8h + addp v2.8h, v2.8h, v6.8h + add v0.8h, v0.8h, v2.8h + shl v0.8h, v0.8h, #1 + dup v1.8h, v0.h[3] + dup v3.8h, v0.h[7] + trn2 v2.2d, v0.2d, v3.2d + trn1 v0.2d, v0.2d, v1.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + +L(ipred_cfl_ac_420_w16_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl w6, w6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_tbl): + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) + .hword 0 + +L(ipred_cfl_ac_420_w16_tbl): + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) +endfunc + +// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_422_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_422_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v0.h[3] + dup v5.8h, v0.h[7] + dup v6.4h, v2.h[3] + dup v7.8h, v2.h[7] + trn2 v1.2d, v0.2d, v5.2d + trn1 v0.2d, v0.2d, v4.2d + trn2 v3.2d, v2.2d, v7.2d + trn1 v2.2d, v2.2d, v6.2d + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_422_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_422_w16_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q6, [x10, #32] + ld1 {v4.8h, v5.8h}, [x10], x2 + addp v2.8h, v2.8h, v2.8h + addp v0.8h, v0.8h, v1.8h + addp v6.8h, v6.8h, v6.8h + addp v4.8h, v4.8h, v5.8h + shl v1.4h, v2.4h, #2 + shl v0.8h, v0.8h, #2 + shl v3.4h, v6.4h, #2 + shl v2.8h, v4.8h, #2 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): + AARCH64_VALID_JUMP_TARGET +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + addp v0.8h, v0.8h, v0.8h + addp v2.8h, v2.8h, v2.8h + shl v0.4h, v0.4h, #2 + shl v2.4h, v2.4h, #2 + dup v1.8h, v0.h[3] + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_tbl): + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) + .hword 0 + +L(ipred_cfl_ac_422_w16_tbl): + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) +endfunc + +// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_444_tbl) + sub w8, w8, #26 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_444_w4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input + ld1 {v0.4h}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v1.4h}, [x1], x2 + ld1 {v1.d}[1], [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + ld1 {v3.8h}, [x10], x2 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + AARCH64_VALID_JUMP_TARGET + cbnz w3, L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v2.8h, v2.8h, #3 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + AARCH64_VALID_JUMP_TARGET + adr x7, L(ipred_cfl_ac_444_w32_tbl) + ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + lsr x2, x2, #1 // Restore the stride to one line increments + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_444_w32_wpad0): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 8 + ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 + shl v2.8h, v2.8h, #3 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + dup v3.8h, v2.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 16 + ld1 {v0.8h, v1.8h}, [x1], x2 + shl v1.8h, v1.8h, #3 + shl v0.8h, v0.8h, #3 + dup v2.8h, v1.h[7] + dup v3.8h, v1.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): + AARCH64_VALID_JUMP_TARGET +1: // Copy and expand input, padding 24 + ld1 {v0.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + dup v1.8h, v0.h[7] + dup v2.8h, v0.h[7] + dup v3.8h, v0.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + +L(ipred_cfl_ac_444_w32_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl w6, w6, #3 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_444_tbl): + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + +L(ipred_cfl_ac_444_w32_tbl): + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) +endfunc |