/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)

function splat_mv_neon, export=1
        ld1             {v3.16b},  [x1]
        clz             w3,  w3
        adr             x5,  L(splat_tbl)
        sub             w3,  w3,  #26
        ext             v2.16b,  v3.16b,  v3.16b,  #12
        ldrh            w3,  [x5, w3, uxtw #1]
        add             w2,  w2,  w2,  lsl #1
        ext             v0.16b,  v2.16b,  v3.16b,  #4
        sub             x3,  x5,  w3,  uxtw
        ext             v1.16b,  v2.16b,  v3.16b,  #8
        lsl             w2,  w2,  #2
        ext             v2.16b,  v2.16b,  v3.16b,  #12
1:
        ldr             x1,  [x0],  #8
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        br              x3

10:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b}, [x1]
        str             s2,  [x1, #8]
        b.gt            1b
        ret
20:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x1]
        str             d1,  [x1, #16]
        b.gt            1b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
160:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
80:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
40:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1]
        b.gt            1b
        ret

L(splat_tbl):
        .hword L(splat_tbl) - 320b
        .hword L(splat_tbl) - 160b
        .hword L(splat_tbl) -  80b
        .hword L(splat_tbl) -  40b
        .hword L(splat_tbl) -  20b
        .hword L(splat_tbl) -  10b
endfunc
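
// For readability, here is a rough C-style model of what splat_mv_neon does.
// This is only a sketch inferred from the code above, not the dav1d C
// reference, and it treats refmvs_block as an opaque 12-byte struct (the
// block size implied by the 12-byte scaling of bx4 and by the jump table).
//
//    static void splat_mv_model(refmvs_block **rr, const refmvs_block *rmv,
//                               int bx4, int bw4, int bh4)
//    {
//        do {
//            refmvs_block *r = *rr++ + bx4;  // row pointer, offset by bx4 blocks
//            for (int x = 0; x < bw4; x++)
//                r[x] = *rmv;                // replicate the same block bw4 times
//        } while (--bh4);
//    }
//
// The assembly specializes the inner loop by bw4 via L(splat_tbl), storing the
// replicated 12-byte pattern held in v0/v1/v2 in 48-byte chunks where possible.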
const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
function save_tmvs_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        movi            v30.8b,  #0
        ld1             {v31.8b}, [x3]
        adr             x8,  L(save_tmvs_tbl)
        movrel          x16, mask_mult
        movrel          x13, mv_tbls
        ld1             {v29.8b}, [x16]
        ext             v31.8b,  v30.8b,  v31.8b,  #7  // [0, ref_sign]
        mov             w15, #5
        mov             w14, #12*2
        sxtw            x4,  w4
        sxtw            x6,  w6
        mul             w1,  w1,  w15                  // stride *= 5
        sub             w5,  w5,  w7                   // h = row_end8 - row_start8
        lsl             w7,  w7,  #1                   // row_start8 <<= 1
1:
        mov             w15, #5
        and             w9,  w7,  #30                  // (y & 15) * 2
        ldr             x9,  [x2, w9, uxtw #3]         // b = rr[(y & 15) * 2]
        add             x9,  x9,  #12                  // &b[... + 1]
        madd            x10, x4,  x14, x9              // end_cand_b = &b[col_end8*2 + 1]
        madd            x9,  x6,  x14, x9              // cand_b = &b[x*2 + 1]
        madd            x3,  x6,  x15, x0              // &rp[x]
2:
        ldrb            w11, [x9, #10]                 // cand_b->bs
        ld1             {v0.16b}, [x9]                 // cand_b->mv
        add             x11, x8,  w11, uxtw #2
        ldr             h1,  [x9, #8]                  // cand_b->ref
        ldrh            w12, [x11]                     // bw8
        mov             x15, x8
        add             x9,  x9,  w12, uxtw #1         // cand_b += bw8*2
        cmp             x9,  x10
        mov             v2.8b,  v0.8b
        b.ge            3f

        ldrb            w15, [x9, #10]                 // cand_b->bs
        add             x16, x9,  #8
        ld1             {v4.16b}, [x9]                 // cand_b->mv
        add             x15, x8,  w15, uxtw #2
        ld1             {v1.h}[1], [x16]               // cand_b->ref
        ldrh            w12, [x15]                     // bw8
        add             x9,  x9,  w12, uxtw #1         // cand_b += bw8*2
        trn1            v2.2d,  v0.2d,  v4.2d
3:
        abs             v2.8h,  v2.8h                  // abs(mv[].xy)
        tbl             v1.8b,  {v31.16b}, v1.8b       // ref_sign[ref]
        ushr            v2.8h,  v2.8h,  #12            // abs(mv[].xy) >> 12
        umull           v1.8h,  v1.8b,  v29.8b         // ref_sign[ref] * {1, 2}
        cmeq            v2.4s,  v2.4s,  #0             // abs(mv[].xy) < 4096
        xtn             v2.4h,  v2.4s                  // abs() condition to 16 bit
        and             v1.8b,  v1.8b,  v2.8b          // h[0-3] contains conditions for mv[0-1]
        addp            v1.4h,  v1.4h,  v1.4h          // Combine conditions for [1] and [0]
        umov            w16, v1.h[0]                   // Extract case for first block
        umov            w17, v1.h[1]
        ldrh            w11, [x11, #2]                 // Fetch jump table entry
        ldrh            w15, [x15, #2]
        ldr             q1,  [x13, w16, uxtw #4]       // Load permutation table based on case
        ldr             q5,  [x13, w17, uxtw #4]
        sub             x11, x8,  w11, uxtw            // Find jump table target
        sub             x15, x8,  w15, uxtw
        tbl             v0.16b, {v0.16b}, v1.16b       // Permute cand_b to output refmvs_temporal_block
        tbl             v4.16b, {v4.16b}, v5.16b

        // v1 follows on v0, with another 3 full repetitions of the pattern.
        ext             v1.16b,  v0.16b,  v0.16b,  #1
        ext             v5.16b,  v4.16b,  v4.16b,  #1
        // v2 ends with 3 complete repetitions of the pattern.
        ext             v2.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v4.16b,  v5.16b,  #4

        blr             x11
        b.ge            4f                             // if (cand_b >= end)
        mov             v0.16b,  v4.16b
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        cmp             x9,  x10
        blr             x15
        b.lt            2b                             // if (cand_b < end)
4:
        subs            w5,  w5,  #1                   // h--
        add             w7,  w7,  #2                   // y += 2
        add             x0,  x0,  x1                   // rp += stride
        b.gt            1b

        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
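
        // Store handlers, reached via blr through L(save_tmvs_tbl). Each label
        // below writes the selected 5-byte refmvs_temporal_block pattern (held
        // replicated in v0, continued in v1 and v2) for 1, 2, 4, 8 or 16
        // entries respectively, then advances the destination x3 by exactly
        // that many entries. The wider cases build the run out of overlapping
        // 8/16/32-byte stores, all of which stay inside the region written for
        // this candidate.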
10:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3,  x3,  #5
        ret
20:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3,  x3,  #2*5
        ret
40:
        AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1,  [x3, #16]
        add             x3,  x3,  #4*5
        ret
80:
        AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
        stur            q2,  [x3, #(8*5-16)]
        add             x3,  x3,  #8*5
        ret
160:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #6*5
        add             x17, x3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write another 6 full entries, slightly overlapping with the first set
        st1             {v0.16b, v1.16b}, [x16]
        // Write 8 bytes (one full entry) after the first 12
        st1             {v0.8b}, [x17]
        // Write the last 3 entries
        str             q2,  [x3, #(16*5-16)]
        add             x3,  x3,  #16*5
        ret

L(save_tmvs_tbl):
        .hword          16 * 12
        .hword          L(save_tmvs_tbl) - 160b
        .hword          16 * 12
        .hword          L(save_tmvs_tbl) - 160b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
endfunc
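
// For readability, a rough C-style model of the save_tmvs candidate selection
// implemented above. This is a sketch, not the dav1d C reference: the struct
// layouts are assumed only from the offsets used by the assembly, and
// block_width8() is a hypothetical stand-in for the bs -> bw8 mapping baked
// into L(save_tmvs_tbl).
//
//    typedef struct { int16_t y, x; } mv;                         // assumed int16 pair per mv
//    typedef struct {
//        mv      mv[2];                                           // offset 0, 8 bytes
//        int8_t  ref[2];                                          // offset 8
//        uint8_t bs;                                              // offset 10
//        uint8_t pad;                                             // offset 11 (unused here)
//    } refmvs_block;                                              // 12 bytes per block
//    typedef struct { mv mv; int8_t ref; } refmvs_temporal_block; // packed to 5 bytes
//
//    static void save_tmvs_model(refmvs_temporal_block *rp, ptrdiff_t stride,
//                                refmvs_block *const *rr, const uint8_t *ref_sign,
//                                int col_end8, int row_end8,
//                                int col_start8, int row_start8)
//    {
//        for (int y = row_start8; y < row_end8; y++) {
//            const refmvs_block *b = rr[(y & 15) * 2];
//            for (int x = col_start8; x < col_end8;) {
//                const refmvs_block *cand = &b[x * 2 + 1];
//                int bw8 = block_width8(cand->bs);
//                refmvs_temporal_block out = { 0 };     // default: zero mv, ref 0
//                for (int n = 1; n >= 0; n--) {         // mv[1] is preferred over mv[0]
//                    int ref = cand->ref[n];
//                    if (ref > 0 && ref_sign[ref - 1] &&
//                        abs(cand->mv[n].x) < 4096 && abs(cand->mv[n].y) < 4096) {
//                        out.mv  = cand->mv[n];
//                        out.ref = ref;
//                        break;
//                    }
//                }
//                for (int n = 0; n < bw8; n++)
//                    rp[x + n] = out;
//                x += bw8;
//            }
//            rp += stride;
//        }
//    }
//
// The mv_tbls permutation implements this preference: case 0 (no usable
// candidate) selects only zero bytes, case 1 selects mv[0]/ref[0], and cases
// 2 and 3 both select mv[1]/ref[1].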