Diffstat (limited to 'src/VBox/Devices/EFI/Firmware/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S')
-rw-r--r-- | src/VBox/Devices/EFI/Firmware/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S | 236 |
1 files changed, 236 insertions, 0 deletions
diff --git a/src/VBox/Devices/EFI/Firmware/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S b/src/VBox/Devices/EFI/Firmware/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
new file mode 100644
index 00000000..2c6bc4d4
--- /dev/null
+++ b/src/VBox/Devices/EFI/Firmware/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
@@ -0,0 +1,236 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+// SPDX-License-Identifier: BSD-2-Clause-Patent
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses.
+//
+//
+
+#define dstin     x0
+#define src       x1
+#define count     x2
+#define dst       x3
+#define srcend    x4
+#define dstend    x5
+#define A_l       x6
+#define A_lw      w6
+#define A_h       x7
+#define A_hw      w7
+#define B_l       x8
+#define B_lw      w8
+#define B_h       x9
+#define C_l       x10
+#define C_h       x11
+#define D_l       x12
+#define D_h       x13
+#define E_l       x14
+#define E_h       x15
+#define F_l       srcend
+#define F_h       dst
+#define tmp1      x9
+#define tmp2      x3
+
+#define L(l) .L ## l
+
+// Copies are split into 3 main cases: small copies of up to 16 bytes,
+// medium copies of 17..96 bytes which are fully unrolled. Large copies
+// of more than 96 bytes align the destination and use an unrolled loop
+// processing 64 bytes per iteration.
+// Small and medium copies read all data before writing, allowing any
+// kind of overlap, and memmove tailcalls memcpy for these cases as
+// well as non-overlapping copies.
+
+__memcpy:
+    prfm    PLDL1KEEP, [src]
+    add     srcend, src, count
+    add     dstend, dstin, count
+    cmp     count, 16
+    b.ls    L(copy16)
+    cmp     count, 96
+    b.hi    L(copy_long)
+
+    // Medium copies: 17..96 bytes.
+    sub     tmp1, count, 1
+    ldp     A_l, A_h, [src]
+    tbnz    tmp1, 6, L(copy96)
+    ldp     D_l, D_h, [srcend, -16]
+    tbz     tmp1, 5, 1f
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [srcend, -32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstend, -32]
+1:
+    stp     A_l, A_h, [dstin]
+    stp     D_l, D_h, [dstend, -16]
+    ret
+
+    .p2align 4
+    // Small copies: 0..16 bytes.
+L(copy16):
+    cmp     count, 8
+    b.lo    1f
+    ldr     A_l, [src]
+    ldr     A_h, [srcend, -8]
+    str     A_l, [dstin]
+    str     A_h, [dstend, -8]
+    ret
+    .p2align 4
+1:
+    tbz     count, 2, 1f
+    ldr     A_lw, [src]
+    ldr     A_hw, [srcend, -4]
+    str     A_lw, [dstin]
+    str     A_hw, [dstend, -4]
+    ret
+
+    // Copy 0..3 bytes. Use a branchless sequence that copies the same
+    // byte 3 times if count==1, or the 2nd byte twice if count==2.
+1:
+    cbz     count, 2f
+    lsr     tmp1, count, 1
+    ldrb    A_lw, [src]
+    ldrb    A_hw, [srcend, -1]
+    ldrb    B_lw, [src, tmp1]
+    strb    A_lw, [dstin]
+    strb    B_lw, [dstin, tmp1]
+    strb    A_hw, [dstend, -1]
+2:  ret
+
+    .p2align 4
+    // Copy 64..96 bytes. Copy 64 bytes from the start and
+    // 32 bytes from the end.
+L(copy96):
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [src, 32]
+    ldp     D_l, D_h, [src, 48]
+    ldp     E_l, E_h, [srcend, -32]
+    ldp     F_l, F_h, [srcend, -16]
+    stp     A_l, A_h, [dstin]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin, 32]
+    stp     D_l, D_h, [dstin, 48]
+    stp     E_l, E_h, [dstend, -32]
+    stp     F_l, F_h, [dstend, -16]
+    ret
+
+    // Align DST to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are at least 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align. The loop
+    // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+    .p2align 4
+L(copy_long):
+    and     tmp1, dstin, 15
+    bic     dst, dstin, 15
+    ldp     D_l, D_h, [src]
+    sub     src, src, tmp1
+    add     count, count, tmp1      // Count is now 16 too large.
+    ldp     A_l, A_h, [src, 16]
+    stp     D_l, D_h, [dstin]
+    ldp     B_l, B_h, [src, 32]
+    ldp     C_l, C_h, [src, 48]
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 128 + 16  // Test and readjust count.
+    b.ls    2f
+1:
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [src, 16]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [src, 32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [src, 48]
+    stp     D_l, D_h, [dst, 64]!
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes. The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the end even if
+    // there is just 1 byte left.
+2:
+    ldp     E_l, E_h, [srcend, -64]
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [srcend, -48]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [srcend, -16]
+    stp     D_l, D_h, [dst, 64]
+    stp     E_l, E_h, [dstend, -64]
+    stp     A_l, A_h, [dstend, -48]
+    stp     B_l, B_h, [dstend, -32]
+    stp     C_l, C_h, [dstend, -16]
+    ret
+
+
+//
+// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+// Larger backwards copies are also handled by memcpy. The only remaining
+// case is forward large copies. The destination is aligned, and an
+// unrolled loop processes 64 bytes per iteration.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+    sub     tmp2, dstin, src
+    cmp     count, 96
+    ccmp    tmp2, count, 2, hi
+    b.hs    __memcpy
+
+    cbz     tmp2, 3f
+    add     dstend, dstin, count
+    add     srcend, src, count
+
+    // Align dstend to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are at least 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align. The loop
+    // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+    and     tmp2, dstend, 15
+    ldp     D_l, D_h, [srcend, -16]
+    sub     srcend, srcend, tmp2
+    sub     count, count, tmp2
+    ldp     A_l, A_h, [srcend, -16]
+    stp     D_l, D_h, [dstend, -16]
+    ldp     B_l, B_h, [srcend, -32]
+    ldp     C_l, C_h, [srcend, -48]
+    ldp     D_l, D_h, [srcend, -64]!
+    sub     dstend, dstend, tmp2
+    subs    count, count, 128
+    b.ls    2f
+    nop
+1:
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [srcend, -16]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [srcend, -48]
+    stp     D_l, D_h, [dstend, -64]!
+    ldp     D_l, D_h, [srcend, -64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes. The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the start even if
+    // there is just 1 byte left.
+2:
+    ldp     E_l, E_h, [src, 48]
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [src, 32]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [src, 16]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [src]
+    stp     D_l, D_h, [dstend, -64]
+    stp     E_l, E_h, [dstin, 48]
+    stp     A_l, A_h, [dstin, 32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin]
+3:  ret
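Note on the dispatch: the header comment in the file splits copies into three size classes, small (0..16 bytes), medium (17..96 bytes, fully unrolled), and large (more than 96 bytes, destination-aligned 64-byte loop). The following is a minimal C sketch of that dispatch only; the helper name model_memcpy is invented for illustration and is not part of the patch, the byte movement is delegated to the C library, and the large path assumes non-overlapping buffers (the assembly's overlap handling relies on register skew that is not modelled here).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical model of the __memcpy dispatch; only the control flow of
 * the assembly is mirrored, not its exact load/store schedule. */
void model_memcpy(uint8_t *dst, const uint8_t *src, size_t count)
{
  if (count <= 16) {                       /* L(copy16): 0..16 bytes */
    uint8_t tmp[16];
    memcpy(tmp, src, count);               /* read all data first ...       */
    memcpy(dst, tmp, count);               /* ... then write (overlap-safe) */
  } else if (count <= 96) {                /* medium: 17..96 bytes, unrolled */
    uint8_t head[64], tail[32];
    size_t h = count > 64 ? 64 : count;
    memcpy(head, src, h);                  /* up to 64 bytes from the start */
    memcpy(tail, src + h, count - h);      /* remainder, read before writes */
    memcpy(dst, head, h);
    memcpy(dst + h, tail, count - h);
  } else {                                 /* L(copy_long): more than 96 bytes */
    memcpy(dst, src, 16);                  /* one unaligned 16-byte chunk   */
    size_t adj = 16 - ((uintptr_t)dst & 15);
    dst += adj; src += adj; count -= adj;  /* dst is now 16-byte aligned    */
    while (count > 64) {                   /* 64 bytes per iteration        */
      memcpy(dst, src, 64);
      dst += 64; src += 64; count -= 64;
    }
    /* Tail: always copy the last 64 bytes, anchored at the end. */
    memcpy(dst + count - 64, src + count - 64, 64);
  }
}

int main(void)
{
  uint8_t a[200], b[200];
  for (int i = 0; i < 200; i++) a[i] = (uint8_t)i;
  model_memcpy(b, a, 200);                 /* exercises the large path */
  return memcmp(a, b, 200) != 0;
}

The property the sketch preserves is the one the file's comments call out: the small and medium paths read everything before they write, which is what allows InternalMemCopyMem to tail-call __memcpy for any copy of at most 96 bytes regardless of overlap.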
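The entry point folds its direction test into a single conditional compare: "sub tmp2, dstin, src; cmp count, 96; ccmp tmp2, count, 2, hi; b.hs __memcpy". As an illustration of that branch condition only (the helper below is hypothetical, not code from the file), it corresponds to:

#include <stddef.h>
#include <stdint.h>

/* Model of the ccmp-based test in InternalMemCopyMem: when count > 96 the
 * ccmp compares (dstin - src) with count and b.hs takes the forward
 * __memcpy path if the unsigned difference is at least count; when
 * count <= 96 the flags are forced to carry-set, so the branch is always
 * taken (small/medium copies read all data before writing). */
int UseForwardMemcpy(uintptr_t Dst, uintptr_t Src, size_t Count)
{
  return Count <= 96 || (Dst - Src) >= (uintptr_t)Count;
}

Only a large copy whose destination starts strictly inside the source region (Src < Dst < Src + Count) fails this test and falls through to the backward 64-byte loop; a zero distance is caught by "cbz tmp2, 3f" and returns immediately, since there is nothing to move.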