From 9e3c08db40b8916968b9f30096c7be3f00ce9647 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 21 Apr 2024 13:44:51 +0200
Subject: Adding upstream version 1:115.7.0.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/arm/64/msac.S | 480 ++++++++++++++++++++++++++++++++++++
 1 file changed, 480 insertions(+)
 create mode 100644 third_party/dav1d/src/arm/64/msac.S

diff --git a/third_party/dav1d/src/arm/64/msac.S b/third_party/dav1d/src/arm/64/msac.S
new file mode 100644
index 0000000000..3a6cf900a9
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 8
+#define DIF 16
+#define RNG 24
+#define CNT 28
+#define ALLOW_UPDATE_CDF 32
+
+const coeffs
+        .short          60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+        .short          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+const bits
+        .short          0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+        .short          0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
+.macro ld1_n d0, d1, src, sz, n
+.if \n <= 8
+        ld1             {\d0\sz}, [\src]
+.else
+        ld1             {\d0\sz, \d1\sz}, [\src]
+.endif
+.endm
+
+.macro st1_n s0, s1, dst, sz, n
+.if \n <= 8
+        st1             {\s0\sz}, [\dst]
+.else
+        st1             {\s0\sz, \s1\sz}, [\dst]
+.endif
+.endm
+
+.macro ushr_n d0, d1, s0, s1, shift, sz, n
+        ushr            \d0\sz, \s0\sz, \shift
+.if \n == 16
+        ushr            \d1\sz, \s1\sz, \shift
+.endif
+.endm
+
+.macro add_n d0, d1, s0, s1, s2, s3, sz, n
+        add             \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+        add             \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
+        sub             \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+        sub             \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro and_n d0, d1, s0, s1, s2, s3, sz, n
+        and             \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+        and             \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
+        cmhs            \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+        cmhs            \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
+        urhadd          \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+        urhadd          \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
+        sshl            \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+        sshl            \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
+        sqdmulh         \d0\sz, \s0\sz, \s2\sz
+.if \n == 16
+        sqdmulh         \d1\sz, \s1\sz, \s3\sz
+.endif
+.endm
+
+.macro str_n idx0, idx1, dstreg, dstoff, n
+        str             \idx0, [\dstreg, \dstoff]
+.if \n == 16
+        str             \idx1, [\dstreg, \dstoff + 16]
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+//                                               size_t n_symbols);
+
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update sz, szb, n
+        sub             sp,  sp,  #48
+        add             x8,  x0,  #RNG
+        ld1_n           v0,  v1,  x1,  \sz, \n            // cdf
+        ld1r            {v4\sz}, [x8]                     // rng
+        movrel          x9,  coeffs, 30
+        movi            v31\sz, #0x7f, lsl #8             // 0x7f00
+        sub             x9,  x9,  x2, lsl #1
+        mvni            v30\sz, #0x3f                     // 0xffc0
+        and             v7\szb, v4\szb, v31\szb           // rng & 0x7f00
+        str             h4,  [sp, #14]                    // store original u = s->rng
+        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n  // cdf & 0xffc0
+
+        ld1_n           v4,  v5,  x9,  \sz, \n            // EC_MIN_PROB * (n_symbols - ret)
+        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             x8,  x0,  #DIF + 6
+
+        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n   // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+        ld1r            {v6.8h}, [x8]                     // dif >> (EC_WIN_SIZE - 16)
+        movrel          x8,  bits
+        str_n           q4,  q5,  sp,  #16, \n            // store v values to allow indexed access
+
+        ld1_n           v16, v17, x8,  .8h, \n
+
+        cmhs_n          v2,  v3,  v6,  v6,  v4,  v5,  .8h, \n   // c >= v
+
+        and_n           v6,  v7,  v2,  v3,  v16, v17, .16b, \n  // One bit per halfword set in the mask
+.if \n == 16
+        add             v6.8h, v6.8h, v7.8h
+.endif
+        addv            h6,  v6.8h                        // Aggregate mask bits
+        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
+        umov            w3,  v6.h[0]
+        rbit            w3,  w3
+        clz             w15, w3                           // ret
+
+        cbz             w4,  L(renorm)
+        // update_cdf
+        ldrh            w3,  [x1, x2, lsl #1]             // count = cdf[n_symbols]
+        movi            v5\szb, #0xff
+.if \n == 16
+        mov             w4,  #-5
+.else
+        mvn             w14, w2
+        mov             w4,  #-4
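+        // w14 = ~n_symbols at this point, so the cmn below sets the
+        // carry flag iff n_symbols <= 2, and the later sbc subtracts
+        // the borrow (1 - carry), which is exactly the (n_symbols > 2)
+        // term of rate = 4 + (count >> 4) + (n_symbols > 2).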
+        cmn             w14, #3                           // set C if n_symbols <= 2
+.endif
+        urhadd_n        v4,  v5,  v5,  v5,  v2,  v3,  \sz, \n   // i >= val ? -1 : 32768
+.if \n == 16
+        sub             w4,  w4,  w3, lsr #4              // -((count >> 4) + 5)
+.else
+        lsr             w14, w3,  #4                      // count >> 4
+        sbc             w4,  w4,  w14                     // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+        sub_n           v4,  v5,  v4,  v5,  v0,  v1,  \sz, \n   // (32768 - cdf[i]) or (-1 - cdf[i])
+        dup             v6\sz,  w4                        // -rate
+
+        sub             w3,  w3,  w3, lsr #5              // count - (count == 32)
+        sub_n           v0,  v1,  v0,  v1,  v2,  v3,  \sz, \n   // cdf + (i >= val ? 1 : 0)
+        sshl_n          v4,  v5,  v4,  v5,  v6,  v6,  \sz, \n   // ({32768,-1} - cdf[i]) >> rate
+        add             w3,  w3,  #1                      // count + (count < 32)
+        add_n           v0,  v1,  v0,  v1,  v4,  v5,  \sz, \n   // cdf + (32768 - cdf[i]) >> rate
+        st1_n           v0,  v1,  x1,  \sz, \n
+        strh            w3,  [x1, x2, lsl #1]
+.endm
+
+        decode_update   .4h, .8b, 4
+
+L(renorm):
+        add             x8,  sp,  #16
+        add             x8,  x8,  w15, uxtw #1
+        ldrh            w3,  [x8]                         // v
+        ldurh           w4,  [x8, #-2]                    // u
+        ldr             w6,  [x0, #CNT]
+        ldr             x7,  [x0, #DIF]
+        sub             w4,  w4,  w3                      // rng = u - v
+        clz             w5,  w4                           // clz(rng)
+        eor             w5,  w5,  #16                     // d = clz(rng) ^ 16
+        mvn             x7,  x7                           // ~dif
+        add             x7,  x7,  x3, lsl #48             // ~dif + (v << 48)
+L(renorm2):
+        lsl             w4,  w4,  w5                      // rng << d
+        subs            w6,  w6,  w5                      // cnt -= d
+        lsl             x7,  x7,  x5                      // (~dif + (v << 48)) << d
+        str             w4,  [x0, #RNG]
+        mvn             x7,  x7                           // ~dif
+        b.hs            9f
+
+        // refill
+        ldp             x3,  x4,  [x0]                    // BUF_POS, BUF_END
+        add             x5,  x3,  #8
+        cmp             x5,  x4
+        b.gt            2f
+
+        ldr             x3,  [x3]                         // next_bits
+        add             w8,  w6,  #23                     // shift_bits = cnt + 23
+        add             w6,  w6,  #16                     // cnt += 16
+        rev             x3,  x3                           // next_bits = bswap(next_bits)
+        sub             x5,  x5,  x8, lsr #3              // buf_pos -= shift_bits >> 3
+        and             w8,  w8,  #24                     // shift_bits &= 24
+        lsr             x3,  x3,  x8                      // next_bits >>= shift_bits
+        sub             w8,  w8,  w6                      // shift_bits -= 16 + cnt
+        str             x5,  [x0, #BUF_POS]
+        lsl             x3,  x3,  x8                      // next_bits <<= shift_bits
+        mov             w4,  #48
+        sub             w6,  w4,  w8                      // cnt = cnt + 64 - shift_bits
+        eor             x7,  x7,  x3                      // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        mov             w14, #40
+        sub             w5,  w14, w6                      // c = 40 - cnt
+3:
+        cmp             x3,  x4
+        b.ge            4f
+        ldrb            w8,  [x3], #1
+        lsl             x8,  x8,  x5
+        eor             x7,  x7,  x8
+        subs            w5,  w5,  #8
+        b.ge            3b
+
+4:      // refill_eob_end
+        str             x3,  [x0, #BUF_POS]
+        sub             w6,  w14, w5                      // cnt = 40 - c
+
+9:
+        str             w6,  [x0, #CNT]
+        str             x7,  [x0, #DIF]
+
+        mov             w0,  w15
+        add             sp,  sp,  #48
+        ret
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+        decode_update   .8h, .16b, 8
+        b               L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+        decode_update   .8h, .16b, 16
+        b               L(renorm)
+endfunc
+
+function msac_decode_hi_tok_neon, export=1
+        ld1             {v0.4h},  [x1]                    // cdf
+        add             x16, x0,  #RNG
+        movi            v31.4h, #0x7f, lsl #8             // 0x7f00
+        movrel          x17, coeffs, 30-2*3
+        mvni            v30.4h, #0x3f                     // 0xffc0
+        ldrh            w9,  [x1, #6]                     // count = cdf[n_symbols]
+        ld1r            {v3.4h},  [x16]                   // rng
+        movrel          x16, bits
+        ld1             {v29.4h}, [x17]                   // EC_MIN_PROB * (n_symbols - ret)
+        add             x17, x0,  #DIF + 6
+        ld1             {v16.8h}, [x16]
+        mov             w13, #-24
+        and             v17.8b, v0.8b, v30.8b             // cdf & 0xffc0
+        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
+        ld1r            {v1.8h},  [x17]                   // dif >> (EC_WIN_SIZE - 16)
+        sub             sp,  sp,  #48
+        ldr             w6,  [x0, #CNT]
+        ldr             x7,  [x0, #DIF]
+1:
+        and             v7.8b,  v3.8b,  v31.8b            // rng & 0x7f00
+        sqdmulh         v6.4h,  v17.4h, v7.4h             // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             v4.4h,  v17.4h, v29.4h            // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+        add             v4.4h,  v6.4h,  v4.4h             // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+        str             h3,  [sp, #14]                    // store original u = s->rng
+        cmhs            v2.8h,  v1.8h,  v4.8h             // c >= v
+        str             q4,  [sp, #16]                    // store v values to allow indexed access
+        and             v6.16b, v2.16b, v16.16b           // One bit per halfword set in the mask
+        addv            h6,  v6.8h                        // Aggregate mask bits
+        umov            w3,  v6.h[0]
+        add             w13, w13, #5
+        rbit            w3,  w3
+        add             x8,  sp,  #16
+        clz             w15, w3                           // ret
+
+        cbz             w10, 2f
+        // update_cdf
+        movi            v5.8b,  #0xff
+        mov             w4,  #-5
+        urhadd          v4.4h,  v5.4h,  v2.4h             // i >= val ? -1 : 32768
+        sub             w4,  w4,  w9, lsr #4              // -((count >> 4) + 5)
+        sub             v4.4h,  v4.4h,  v0.4h             // (32768 - cdf[i]) or (-1 - cdf[i])
+        dup             v6.4h,  w4                        // -rate
+
+        sub             w9,  w9,  w9, lsr #5              // count - (count == 32)
+        sub             v0.4h,  v0.4h,  v2.4h             // cdf + (i >= val ? 1 : 0)
+        sshl            v4.4h,  v4.4h,  v6.4h             // ({32768,-1} - cdf[i]) >> rate
+        add             w9,  w9,  #1                      // count + (count < 32)
+        add             v0.4h,  v0.4h,  v4.4h             // cdf + (32768 - cdf[i]) >> rate
+        st1             {v0.4h},  [x1]
+        and             v17.8b, v0.8b,  v30.8b            // cdf & 0xffc0
+        strh            w9,  [x1, #6]
+
+2:
+        add             x8,  x8,  w15, uxtw #1
+        ldrh            w3,  [x8]                         // v
+        ldurh           w4,  [x8, #-2]                    // u
+        sub             w4,  w4,  w3                      // rng = u - v
+        clz             w5,  w4                           // clz(rng)
+        eor             w5,  w5,  #16                     // d = clz(rng) ^ 16
+        mvn             x7,  x7                           // ~dif
+        add             x7,  x7,  x3, lsl #48             // ~dif + (v << 48)
+        lsl             w4,  w4,  w5                      // rng << d
+        subs            w6,  w6,  w5                      // cnt -= d
+        lsl             x7,  x7,  x5                      // (~dif + (v << 48)) << d
+        str             w4,  [x0, #RNG]
+        dup             v3.4h,  w4
+        mvn             x7,  x7                           // ~dif
+        b.hs            9f
+
+        // refill
+        ldp             x3,  x4,  [x0]                    // BUF_POS, BUF_END
+        add             x5,  x3,  #8
+        cmp             x5,  x4
+        b.gt            2f
+
+        ldr             x3,  [x3]                         // next_bits
+        add             w8,  w6,  #23                     // shift_bits = cnt + 23
+        add             w6,  w6,  #16                     // cnt += 16
+        rev             x3,  x3                           // next_bits = bswap(next_bits)
+        sub             x5,  x5,  x8, lsr #3              // buf_pos -= shift_bits >> 3
+        and             w8,  w8,  #24                     // shift_bits &= 24
+        lsr             x3,  x3,  x8                      // next_bits >>= shift_bits
+        sub             w8,  w8,  w6                      // shift_bits -= 16 + cnt
+        str             x5,  [x0, #BUF_POS]
+        lsl             x3,  x3,  x8                      // next_bits <<= shift_bits
+        mov             w4,  #48
+        sub             w6,  w4,  w8                      // cnt = cnt + 64 - shift_bits
+        eor             x7,  x7,  x3                      // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        mov             w14, #40
+        sub             w5,  w14, w6                      // c = 40 - cnt
+3:
+        cmp             x3,  x4
+        b.ge            4f
+        ldrb            w8,  [x3], #1
+        lsl             x8,  x8,  x5
+        eor             x7,  x7,  x8
+        subs            w5,  w5,  #8
+        b.ge            3b
+
+4:      // refill_eob_end
+        str             x3,  [x0, #BUF_POS]
+        sub             w6,  w14, w5                      // cnt = 40 - c
+
+9:
+        lsl             w15, w15, #1
+        sub             w15, w15, #5
+        lsr             x12, x7,  #48
+        adds            w13, w13, w15                     // carry = tok_br < 3 || tok == 15
+        dup             v1.8h,  w12
+        b.cc            1b                                // loop if !carry
+        add             w13, w13, #30
+        str             w6,  [x0, #CNT]
+        add             sp,  sp,  #48
+        str             x7,  [x0, #DIF]
+        lsr             w0,  w13, #1
+        ret
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+        ldp             w5,  w6,  [x0, #RNG]              // + CNT
+        sub             sp,  sp,  #48
+        ldr             x7,  [x0, #DIF]
+        bic             w4,  w5,  #0xff                   // r &= 0xff00
+        add             w4,  w4,  #8
+        subs            x8,  x7,  x4, lsl #47             // dif - vw
+        lsr             w4,  w4,  #1                      // v
+        sub             w5,  w5,  w4                      // r - v
+        cset            w15, lo
+        csel            w4,  w5,  w4,  hs                 // if (ret) v = r - v;
+        csel            x7,  x8,  x7,  hs                 // if (ret) dif = dif - vw;
+
+        clz             w5,  w4                           // clz(rng)
+        mvn             x7,  x7                           // ~dif
+        eor             w5,  w5,  #16                     // d = clz(rng) ^ 16
+        b               L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+        ldp             w5,  w6,  [x0, #RNG]              // + CNT
+        sub             sp,  sp,  #48
+        ldr             x7,  [x0, #DIF]
+        lsr             w4,  w5,  #8                      // r >> 8
+        bic             w1,  w1,  #0x3f                   // f &= ~63
+        mul             w4,  w4,  w1
+        lsr             w4,  w4,  #7
+        add             w4,  w4,  #4                      // v
+        subs            x8,  x7,  x4, lsl #48             // dif - vw
+        sub             w5,  w5,  w4                      // r - v
+        cset            w15, lo
+        csel            w4,  w5,  w4,  hs                 // if (ret) v = r - v;
+        csel            x7,  x8,  x7,  hs                 // if (ret) dif = dif - vw;
+
+        clz             w5,  w4                           // clz(rng)
+        mvn             x7,  x7                           // ~dif
+        eor             w5,  w5,  #16                     // d = clz(rng) ^ 16
+        b               L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+        ldr             w9,  [x1]                         // cdf[0-1]
+        ldp             w5,  w6,  [x0, #RNG]              // + CNT
+        sub             sp,  sp,  #48
+        ldr             x7,  [x0, #DIF]
+        lsr             w4,  w5,  #8                      // r >> 8
+        and             w2,  w9,  #0xffc0                 // f &= ~63
+        mul             w4,  w4,  w2
+        lsr             w4,  w4,  #7
+        add             w4,  w4,  #4                      // v
+        subs            x8,  x7,  x4, lsl #48             // dif - vw
+        sub             w5,  w5,  w4                      // r - v
+        cset            w15, lo
+        csel            w4,  w5,  w4,  hs                 // if (ret) v = r - v;
+        csel            x7,  x8,  x7,  hs                 // if (ret) dif = dif - vw;
+
+        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
+
+        clz             w5,  w4                           // clz(rng)
+        mvn             x7,  x7                           // ~dif
+        eor             w5,  w5,  #16                     // d = clz(rng) ^ 16
+
+        cbz             w10, L(renorm2)
+
+        lsr             w2,  w9,  #16                     // count = cdf[1]
+        and             w9,  w9,  #0xffff                 // cdf[0]
+
+        sub             w3,  w2,  w2, lsr #5              // count - (count >= 32)
+        lsr             w2,  w2,  #4                      // count >> 4
+        add             w10, w3,  #1                      // count + (count < 32)
+        add             w2,  w2,  #4                      // rate = (count >> 4) | 4
+
+        sub             w9,  w9,  w15                     // cdf[0] -= bit
+        sub             w11, w9,  w15, lsl #15            // {cdf[0], cdf[0] - 32769}
+        asr             w11, w11, w2                      // {cdf[0], cdf[0] - 32769} >> rate
+        sub             w9,  w9,  w11                     // cdf[0]
+
+        strh            w9,  [x1]
+        strh            w10, [x1, #2]
+
+        b               L(renorm2)
+endfunc
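
For readers cross-checking the NEON paths above, here is a scalar C sketch of
what the decode_update macro and the L(renorm)/L(renorm2) path compute. It
paraphrases dav1d's portable implementation (src/msac.c) from the comments in
this file, so treat it as illustrative rather than the shipped code: the
struct fields mirror the BUF_POS..ALLOW_UPDATE_CDF offsets #define'd at the
top, EC_PROB_SHIFT = 6 and EC_MIN_PROB = 4 are the values the coeffs table
implies, and the refill step is only outlined in a comment.

#include <stdint.h>
#include <stddef.h>

#define EC_WIN_SIZE   64  /* dif is one 64-bit window */
#define EC_PROB_SHIFT 6
#define EC_MIN_PROB   4   /* minimum probability per remaining symbol */

typedef struct {
    const uint8_t *buf_pos;  /* BUF_POS (0)  */
    const uint8_t *buf_end;  /* BUF_END (8)  */
    uint64_t dif;            /* DIF (16): window of not-yet-decoded bits */
    unsigned rng;            /* RNG (24): current arithmetic-coder range */
    int cnt;                 /* CNT (28): valid bits remaining in dif */
    int allow_update_cdf;    /* ALLOW_UPDATE_CDF (32) */
} MsacContext;

/* Scalar counterpart of L(renorm2): shift rng back into [0x8000, 0xffff]
 * and shift the same number of bits out of dif. */
static void ctx_norm(MsacContext *s, uint64_t dif, unsigned rng)
{
    const int d = __builtin_clz(rng) ^ 16;  /* eor w5, w5, #16 */
    s->dif = dif << d;
    s->rng = rng << d;
    if ((s->cnt -= d) < 0) {
        /* refill: read up to 8 bytes from [buf_pos, buf_end), byte-swap
         * them into the top of dif and bump cnt; the refill/refill_eob
         * blocks above do this with an XOR into the inverted dif. */
    }
}

/* n_symbols follows dav1d's convention: number of symbols minus one.
 * cdf[0..n_symbols-1] are 15-bit probabilities; cdf[n_symbols] is the
 * adaptation counter (<= 32), which doubles as the loop terminator
 * because count >> EC_PROB_SHIFT == 0. */
static unsigned decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
                                    size_t n_symbols)
{
    const unsigned c = (unsigned)(s->dif >> (EC_WIN_SIZE - 16));
    const unsigned r = s->rng >> 8;
    unsigned u, v = s->rng, val = (unsigned)-1;

    /* The NEON code evaluates all candidate v values at once (sqdmulh +
     * add), compares them against c in a single cmhs, and converts the
     * compare mask into val via the bits table, addv, rbit and clz. */
    do {
        val++;
        u = v;
        v = (r * (cdf[val] >> EC_PROB_SHIFT) >> 1)
          + EC_MIN_PROB * ((unsigned)n_symbols - val);
    } while (c < v);  /* first index with c >= v is the decoded symbol */

    if (s->allow_update_cdf) {  /* the update_cdf block in decode_update */
        const unsigned count = cdf[n_symbols];
        const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
        unsigned i = 0;
        for (; i < val; i++)        /* the "32768 - cdf[i]" lanes */
            cdf[i] += (32768 - cdf[i]) >> rate;
        for (; i < n_symbols; i++)  /* the "-1 - cdf[i]" lanes */
            cdf[i] -= cdf[i] >> rate;
        cdf[n_symbols] = count + (count < 32);  /* saturate the counter */
    }

    ctx_norm(s, s->dif - ((uint64_t)v << (EC_WIN_SIZE - 16)), u - v);
    return val;
}

The bool decoders at the end of the file are the n_symbols == 1 special case
of the same scheme: one multiply produces v, a single compare-and-select pair
(cset/csel) replaces the mask search, and control falls through to L(renorm2).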