Diffstat (limited to 'src/internal/bytealg/compare_riscv64.s')
-rw-r--r-- | src/internal/bytealg/compare_riscv64.s | 222
1 file changed, 222 insertions, 0 deletions
diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
new file mode 100644
index 0000000..a4164a2
--- /dev/null
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -0,0 +1,222 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+	// X10 = a_base
+	// X11 = a_len
+	// X12 = a_cap (unused)
+	// X13 = b_base (want in X12)
+	// X14 = b_len (want in X13)
+	// X15 = b_cap (unused)
+	MOV	X13, X12
+	MOV	X14, X13
+	JMP	compare<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// X10 = a_base
+	// X11 = a_len
+	// X12 = b_base
+	// X13 = b_len
+	JMP	compare<>(SB)
+
+// On entry:
+// X10 points to start of a
+// X11 length of a
+// X12 points to start of b
+// X13 length of b
+// for non-regabi X14 points to the address to store the return value (-1/0/1)
+// for regabi the return value in X10
+TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
+	BEQ	X10, X12, cmp_len
+
+	MOV	X11, X5
+	BGE	X13, X5, use_a_len // X5 = min(len(a), len(b))
+	MOV	X13, X5
+use_a_len:
+	BEQZ	X5, cmp_len
+
+	MOV	$32, X6
+	BLT	X5, X6, check8_unaligned
+
+	// Check alignment - if alignment differs we have to do one byte at a time.
+	AND	$7, X10, X7
+	AND	$7, X12, X8
+	BNE	X7, X8, check8_unaligned
+	BEQZ	X7, compare32
+
+	// Check one byte at a time until we reach 8 byte alignment.
+	SUB	X7, X0, X7
+	ADD	$8, X7, X7
+	SUB	X7, X5, X5
+align:
+	ADD	$-1, X7
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	BNE	X8, X9, cmp
+	ADD	$1, X10
+	ADD	$1, X12
+	BNEZ	X7, align
+
+check32:
+	// X6 contains $32
+	BLT	X5, X6, compare16
+compare32:
+	MOV	0(X10), X15
+	MOV	0(X12), X16
+	MOV	8(X10), X17
+	MOV	8(X12), X18
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
+	MOV	16(X10), X15
+	MOV	16(X12), X16
+	MOV	24(X10), X17
+	MOV	24(X12), X18
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
+	ADD	$32, X10
+	ADD	$32, X12
+	ADD	$-32, X5
+	BGE	X5, X6, compare32
+	BEQZ	X5, cmp_len
+
+check16:
+	MOV	$16, X6
+	BLT	X5, X6, check8_unaligned
+compare16:
+	MOV	0(X10), X15
+	MOV	0(X12), X16
+	MOV	8(X10), X17
+	MOV	8(X12), X18
+	BNE	X15, X16, cmp8a
+	BNE	X17, X18, cmp8b
+	ADD	$16, X10
+	ADD	$16, X12
+	ADD	$-16, X5
+	BEQZ	X5, cmp_len
+
+check8_unaligned:
+	MOV	$8, X6
+	BLT	X5, X6, check4_unaligned
+compare8_unaligned:
+	MOVBU	0(X10), X8
+	MOVBU	1(X10), X15
+	MOVBU	2(X10), X17
+	MOVBU	3(X10), X19
+	MOVBU	4(X10), X21
+	MOVBU	5(X10), X23
+	MOVBU	6(X10), X25
+	MOVBU	7(X10), X29
+	MOVBU	0(X12), X9
+	MOVBU	1(X12), X16
+	MOVBU	2(X12), X18
+	MOVBU	3(X12), X20
+	MOVBU	4(X12), X22
+	MOVBU	5(X12), X24
+	MOVBU	6(X12), X28
+	MOVBU	7(X12), X30
+	BNE	X8, X9, cmp1a
+	BNE	X15, X16, cmp1b
+	BNE	X17, X18, cmp1c
+	BNE	X19, X20, cmp1d
+	BNE	X21, X22, cmp1e
+	BNE	X23, X24, cmp1f
+	BNE	X25, X28, cmp1g
+	BNE	X29, X30, cmp1h
+	ADD	$8, X10
+	ADD	$8, X12
+	ADD	$-8, X5
+	BGE	X5, X6, compare8_unaligned
+	BEQZ	X5, cmp_len
+
+check4_unaligned:
+	MOV	$4, X6
+	BLT	X5, X6, compare1
+compare4_unaligned:
+	MOVBU	0(X10), X8
+	MOVBU	1(X10), X15
+	MOVBU	2(X10), X17
+	MOVBU	3(X10), X19
+	MOVBU	0(X12), X9
+	MOVBU	1(X12), X16
+	MOVBU	2(X12), X18
+	MOVBU	3(X12), X20
+	BNE	X8, X9, cmp1a
+	BNE	X15, X16, cmp1b
+	BNE	X17, X18, cmp1c
+	BNE	X19, X20, cmp1d
+	ADD	$4, X10
+	ADD	$4, X12
+	ADD	$-4, X5
+	BGE	X5, X6, compare4_unaligned
+
+compare1:
+	BEQZ	X5, cmp_len
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	BNE	X8, X9, cmp
+	ADD	$1, X10
+	ADD	$1, X12
+	ADD	$-1, X5
+	JMP	compare1
+
+	// Compare 8 bytes of memory in X15/X16 that are known to differ.
+cmp8a:
+	MOV	X15, X17
+	MOV	X16, X18
+
+	// Compare 8 bytes of memory in X17/X18 that are known to differ.
+cmp8b:
+	MOV	$0xff, X19
+cmp8_loop:
+	AND	X17, X19, X8
+	AND	X18, X19, X9
+	BNE	X8, X9, cmp
+	SLLI	$8, X19
+	JMP	cmp8_loop
+
+cmp1a:
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
+	JMP	cmp_ret
+cmp1b:
+	SLTU	X16, X15, X5
+	SLTU	X15, X16, X6
+	JMP	cmp_ret
+cmp1c:
+	SLTU	X18, X17, X5
+	SLTU	X17, X18, X6
+	JMP	cmp_ret
+cmp1d:
+	SLTU	X20, X19, X5
+	SLTU	X19, X20, X6
+	JMP	cmp_ret
+cmp1e:
+	SLTU	X22, X21, X5
+	SLTU	X21, X22, X6
+	JMP	cmp_ret
+cmp1f:
+	SLTU	X24, X23, X5
+	SLTU	X23, X24, X6
+	JMP	cmp_ret
+cmp1g:
+	SLTU	X28, X25, X5
+	SLTU	X25, X28, X6
+	JMP	cmp_ret
+cmp1h:
+	SLTU	X30, X29, X5
+	SLTU	X29, X30, X6
+	JMP	cmp_ret
+
+cmp_len:
+	MOV	X11, X8
+	MOV	X13, X9
+cmp:
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
+cmp_ret:
+	SUB	X5, X6, X10
+	RET
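For reference, the routine implements the usual three-way byte comparison contract (the same contract as bytes.Compare): compare the first min(len(a), len(b)) bytes and, if that shared prefix is equal, order the operands by length, returning -1, 0 or +1 in X10. The sketch below is an illustrative Go equivalent of that result, not the runtime's generic fallback; the helper name compareRef is made up for this example.

// compareRef is a hypothetical Go sketch of the value the assembly above
// computes: -1 if a sorts before b, 0 if they are equal, +1 if a sorts after b.
package main

import "fmt"

func compareRef(a, b []byte) int {
	n := len(a)
	if len(b) < n {
		n = len(b) // n = min(len(a), len(b)), the role X5 plays in the assembly
	}
	for i := 0; i < n; i++ {
		if a[i] != b[i] {
			// First differing byte decides the result (the cmp block).
			if a[i] < b[i] {
				return -1
			}
			return 1
		}
	}
	// Shared prefix is equal: order by length (the cmp_len block).
	switch {
	case len(a) < len(b):
		return -1
	case len(a) > len(b):
		return 1
	default:
		return 0
	}
}

func main() {
	fmt.Println(compareRef([]byte("abc"), []byte("abd")))  // -1
	fmt.Println(compareRef([]byte("abcd"), []byte("abc"))) // 1
	fmt.Println(compareRef([]byte("abc"), []byte("abc")))  // 0
}

Where the Go sketch branches, the assembly produces the final result branchlessly: the cmp/cmp_ret blocks compute (b < a) and (a < b) with two SLTU instructions and subtract them, which yields exactly -1, 0 or 1.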