diff options
Diffstat (limited to 'src/runtime/memclr_arm64.s')
-rw-r--r-- | src/runtime/memclr_arm64.s | 182 |
1 file changed, 182 insertions, 0 deletions
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// Also called from assembly in sys_windows_arm64.s without g (but using Go stack convention).
//
// Zeroes n bytes starting at ptr. With the register-based internal ABI the
// arguments arrive as R0 = ptr, R1 = n (see the CMP/STP usage below).
// Strategy, by size:
//   n < 16   : branch-free tail using possibly-overlapping 8/4/2/1-byte stores
//   n == 16  : a single STP
//   n > 16   : 16-byte-align the destination, then either a 64-byte STP loop
//              or, for large n, DC ZVA to zero whole cache lines at a time.
// The cached ZVA block size lives in block_size<> (0 = not probed yet,
// ~0 = ZVA prohibited).
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
	CMP	$16, R1
	// If n is equal to 16 bytes, use zero_exact_16 to zero
	BEQ	zero_exact_16

	// If n is greater than 16 bytes, use zero_by_16 to zero
	BHI	zero_by_16

	// n is less than 16 bytes
	// R7 = one past the end; the tail stores below may overlap, which is
	// harmless when storing zeros and avoids branching on the exact size.
	ADD	R1, R0, R7
	TBZ	$3, R1, less_than_8
	MOVD	ZR, (R0)
	MOVD	ZR, -8(R7)
	RET

less_than_8:
	TBZ	$2, R1, less_than_4
	MOVW	ZR, (R0)
	MOVW	ZR, -4(R7)
	RET

less_than_4:
	CBZ	R1, ending
	MOVB	ZR, (R0)
	TBZ	$1, R1, ending
	MOVH	ZR, -2(R7)

ending:
	RET

zero_exact_16:
	// n is exactly 16 bytes
	STP	(ZR, ZR), (R0)
	RET

zero_by_16:
	// n greater than 16 bytes, check if the start address is aligned
	// R4 = (-ptr) & 15 = bytes needed to reach 16-byte alignment
	NEG	R0, R4
	ANDS	$15, R4, R4
	// Try zeroing using zva if the start address is aligned with 16
	BEQ	try_zva

	// Non-aligned store: zero the first 16 bytes unconditionally, then
	// advance ptr to the next 16-byte boundary (the STP overlaps the
	// region the aligned code will zero again — safe for memset-to-zero).
	STP	(ZR, ZR), (R0)
	// Make the destination aligned
	SUB	R4, R1, R1
	ADD	R4, R0, R0
	B	try_zva

tail_maybe_long:
	// Fell out of the ZVA loop with a sub-block remainder in R1.
	CMP	$64, R1
	BHS	no_zva

tail63:
	// Zero the remaining 0..63 bytes: up to three 16-byte stores selected
	// by bits [5:4] of n, then an overlapping STP for the final 0..15.
	ANDS	$48, R1, R3
	BEQ	last16
	CMPW	$32, R3
	BEQ	last48
	BLT	last32
	STP.P	(ZR, ZR), 16(R0)
last48:
	STP.P	(ZR, ZR), 16(R0)
last32:
	STP.P	(ZR, ZR), 16(R0)
	// The last store length is at most 16, so it is safe to use
	// stp to write last 16 bytes
last16:
	ANDS	$15, R1, R1
	CBZ	R1, last_end
	ADD	R1, R0, R0
	STP	(ZR, ZR), -16(R0)
last_end:
	RET

no_zva:
	// Plain 64-bytes-per-iteration STP loop (no cache-line zeroing).
	// Bias R0 by -16 so the pre-indexed STP.W performs the pointer bump.
	SUB	$16, R0, R0
	SUB	$64, R1, R1

loop_64:
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	STP	(ZR, ZR), 48(R0)
	STP.W	(ZR, ZR), 64(R0)
	SUBS	$64, R1, R1
	BGE	loop_64
	ANDS	$63, R1, ZR
	ADD	$16, R0, R0
	BNE	tail63
	RET

try_zva:
	// Try using the ZVA feature to zero entire cache lines
	// It is not meaningful to use ZVA if the block size is less than 64,
	// so make sure that n is greater than or equal to 64
	CMP	$63, R1
	BLE	tail63

	CMP	$128, R1
	// Ensure n is at least 128 bytes, so that there is enough to copy after
	// alignment.
	BLT	no_zva
	// Check if ZVA is allowed from user code, and if so get the block size
	// block_size<> cache: 0 = unprobed, ~0 (bit 31 set) = ZVA prohibited,
	// otherwise the ZVA block size in bytes.
	MOVW	block_size<>(SB), R5
	TBNZ	$31, R5, no_zva
	CBNZ	R5, zero_by_line
	// DCZID_EL0 bit assignments
	// [63:5] Reserved
	// [4]    DZP, if bit set DC ZVA instruction is prohibited, else permitted
	// [3:0]  log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words
	MRS	DCZID_EL0, R3
	TBZ	$4, R3, init
	// ZVA not available
	MOVW	$~0, R5
	MOVW	R5, block_size<>(SB)
	B	no_zva

init:
	// block size = 4 << DCZID_EL0[3:0] bytes (4-byte words), cached for
	// subsequent calls.
	MOVW	$4, R9
	ANDW	$15, R3, R5
	LSLW	R5, R9, R5
	MOVW	R5, block_size<>(SB)

	ANDS	$63, R5, R9
	// Block size is less than 64.
	BNE	no_zva

zero_by_line:
	CMP	R5, R1
	// Not enough memory to reach alignment
	BLO	no_zva
	// R6 = block_size - 1 (alignment mask); R4 = bytes to next block boundary.
	SUB	$1, R5, R6
	NEG	R0, R4
	ANDS	R6, R4, R4
	// Already aligned
	BEQ	aligned

	// check there is enough to copy after alignment
	SUB	R4, R1, R3

	// Check that the remaining length to ZVA after alignment
	// is greater than 64.
	CMP	$64, R3
	CCMP	GE, R3, R5, $10	// condition code GE, NZCV=0b1010
	BLT	no_zva

	// We now have at least 64 bytes to zero, update n
	MOVD	R3, R1

loop_zva_prolog:
	// Zero 64 bytes at a time up to the block boundary. The loop may
	// overshoot R4; the final ADD of the (negative) remainder in R4
	// backs R0 up to the exact boundary.
	STP	(ZR, ZR), (R0)
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	SUBS	$64, R4, R4
	STP	(ZR, ZR), 48(R0)
	ADD	$64, R0, R0
	BGE	loop_zva_prolog

	ADD	R4, R0, R0

aligned:
	// Bias n down by one block so BHS exits with the true remainder
	// recoverable as R1 & R6.
	SUB	R5, R1, R1

loop_zva:
	WORD	$0xd50b7420	// DC ZVA, R0 — zero one whole block at R0
	ADD	R5, R0, R0
	SUBS	R5, R1, R1
	BHS	loop_zva
	ANDS	R6, R1, R1
	BNE	tail_maybe_long
	RET

// Cached DC ZVA block size (see try_zva): 0 = unprobed, ~0 = prohibited.
GLOBL block_size<>(SB), NOPTR, $8