diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:23:18 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:23:18 +0000 |
commit | 43a123c1ae6613b3efeed291fa552ecd909d3acf (patch) | |
tree | fd92518b7024bc74031f78a1cf9e454b65e73665 /src/internal/bytealg | |
parent | Initial commit. (diff) | |
download | golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.tar.xz golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.zip |
Adding upstream version 1.20.14.upstream/1.20.14upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/internal/bytealg')
58 files changed, 6876 insertions, 0 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go new file mode 100644 index 0000000..ebebce7 --- /dev/null +++ b/src/internal/bytealg/bytealg.go @@ -0,0 +1,150 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import ( + "internal/cpu" + "unsafe" +) + +// Offsets into internal/cpu records for use in assembly. +const ( + offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42) + offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2) + offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT) + + offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX) + + offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) +) + +// MaxLen is the maximum length of the string to be searched for (argument b) in Index. +// If MaxLen is not 0, make sure MaxLen >= 4. +var MaxLen int + +// FIXME: the logic of HashStrBytes, HashStrRevBytes, IndexRabinKarpBytes and HashStr, HashStrRev, +// IndexRabinKarp are exactly the same, except that the types are different. Can we eliminate +// three of them without causing allocation? + +// PrimeRK is the prime base used in Rabin-Karp algorithm. +const PrimeRK = 16777619 + +// HashStrBytes returns the hash and the appropriate multiplicative +// factor for use in Rabin-Karp algorithm. +func HashStrBytes(sep []byte) (uint32, uint32) { + hash := uint32(0) + for i := 0; i < len(sep); i++ { + hash = hash*PrimeRK + uint32(sep[i]) + } + var pow, sq uint32 = 1, PrimeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} + +// HashStr returns the hash and the appropriate multiplicative +// factor for use in Rabin-Karp algorithm. +func HashStr(sep string) (uint32, uint32) { + hash := uint32(0) + for i := 0; i < len(sep); i++ { + hash = hash*PrimeRK + uint32(sep[i]) + } + var pow, sq uint32 = 1, PrimeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} + +// HashStrRevBytes returns the hash of the reverse of sep and the +// appropriate multiplicative factor for use in Rabin-Karp algorithm. +func HashStrRevBytes(sep []byte) (uint32, uint32) { + hash := uint32(0) + for i := len(sep) - 1; i >= 0; i-- { + hash = hash*PrimeRK + uint32(sep[i]) + } + var pow, sq uint32 = 1, PrimeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} + +// HashStrRev returns the hash of the reverse of sep and the +// appropriate multiplicative factor for use in Rabin-Karp algorithm. +func HashStrRev(sep string) (uint32, uint32) { + hash := uint32(0) + for i := len(sep) - 1; i >= 0; i-- { + hash = hash*PrimeRK + uint32(sep[i]) + } + var pow, sq uint32 = 1, PrimeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} + +// IndexRabinKarpBytes uses the Rabin-Karp search algorithm to return the index of the +// first occurrence of substr in s, or -1 if not present. +func IndexRabinKarpBytes(s, sep []byte) int { + // Rabin-Karp search + hashsep, pow := HashStrBytes(sep) + n := len(sep) + var h uint32 + for i := 0; i < n; i++ { + h = h*PrimeRK + uint32(s[i]) + } + if h == hashsep && Equal(s[:n], sep) { + return 0 + } + for i := n; i < len(s); { + h *= PrimeRK + h += uint32(s[i]) + h -= pow * uint32(s[i-n]) + i++ + if h == hashsep && Equal(s[i-n:i], sep) { + return i - n + } + } + return -1 +} + +// IndexRabinKarp uses the Rabin-Karp search algorithm to return the index of the +// first occurrence of substr in s, or -1 if not present. +func IndexRabinKarp(s, substr string) int { + // Rabin-Karp search + hashss, pow := HashStr(substr) + n := len(substr) + var h uint32 + for i := 0; i < n; i++ { + h = h*PrimeRK + uint32(s[i]) + } + if h == hashss && s[:n] == substr { + return 0 + } + for i := n; i < len(s); { + h *= PrimeRK + h += uint32(s[i]) + h -= pow * uint32(s[i-n]) + i++ + if h == hashss && s[i-n:i] == substr { + return i - n + } + } + return -1 +} diff --git a/src/internal/bytealg/compare_386.s b/src/internal/bytealg/compare_386.s new file mode 100644 index 0000000..27b660c --- /dev/null +++ b/src/internal/bytealg/compare_386.s @@ -0,0 +1,144 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB),NOSPLIT,$0-28 + MOVL a_base+0(FP), SI + MOVL a_len+4(FP), BX + MOVL b_base+12(FP), DI + MOVL b_len+16(FP), DX + LEAL ret+24(FP), AX + JMP cmpbody<>(SB) + +TEXT runtime·cmpstring(SB),NOSPLIT,$0-20 + MOVL a_base+0(FP), SI + MOVL a_len+4(FP), BX + MOVL b_base+8(FP), DI + MOVL b_len+12(FP), DX + LEAL ret+16(FP), AX + JMP cmpbody<>(SB) + +// input: +// SI = a +// DI = b +// BX = alen +// DX = blen +// AX = address of return word (set to 1/0/-1) +TEXT cmpbody<>(SB),NOSPLIT,$0-0 + MOVL DX, BP + SUBL BX, DX // DX = blen-alen + JLE 2(PC) + MOVL BX, BP // BP = min(alen, blen) + CMPL SI, DI + JEQ allsame + CMPL BP, $4 + JB small +#ifdef GO386_softfloat + JMP mediumloop +#endif +largeloop: + CMPL BP, $16 + JB mediumloop + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, BX + XORL $0xffff, BX // convert EQ to NE + JNE diff16 // branch if at least one byte is not equal + ADDL $16, SI + ADDL $16, DI + SUBL $16, BP + JMP largeloop + +diff16: + BSFL BX, BX // index of first byte that differs + XORL DX, DX + MOVB (SI)(BX*1), CX + CMPB CX, (DI)(BX*1) + SETHI DX + LEAL -1(DX*2), DX // convert 1/0 to +1/-1 + MOVL DX, (AX) + RET + +mediumloop: + CMPL BP, $4 + JBE _0through4 + MOVL (SI), BX + MOVL (DI), CX + CMPL BX, CX + JNE diff4 + ADDL $4, SI + ADDL $4, DI + SUBL $4, BP + JMP mediumloop + +_0through4: + MOVL -4(SI)(BP*1), BX + MOVL -4(DI)(BP*1), CX + CMPL BX, CX + JEQ allsame + +diff4: + BSWAPL BX // reverse order of bytes + BSWAPL CX + XORL BX, CX // find bit differences + BSRL CX, CX // index of highest bit difference + SHRL CX, BX // move a's bit to bottom + ANDL $1, BX // mask bit + LEAL -1(BX*2), BX // 1/0 => +1/-1 + MOVL BX, (AX) + RET + + // 0-3 bytes in common +small: + LEAL (BP*8), CX + NEGL CX + JEQ allsame + + // load si + CMPB SI, $0xfc + JA si_high + MOVL (SI), SI + JMP si_finish +si_high: + MOVL -4(SI)(BP*1), SI + SHRL CX, SI +si_finish: + SHLL CX, SI + + // same for di + CMPB DI, $0xfc + JA di_high + MOVL (DI), DI + JMP di_finish +di_high: + MOVL -4(DI)(BP*1), DI + SHRL CX, DI +di_finish: + SHLL CX, DI + + BSWAPL SI // reverse order of bytes + BSWAPL DI + XORL SI, DI // find bit differences + JEQ allsame + BSRL DI, CX // index of highest bit difference + SHRL CX, SI // move a's bit to bottom + ANDL $1, SI // mask bit + LEAL -1(SI*2), BX // 1/0 => +1/-1 + MOVL BX, (AX) + RET + + // all the bytes in common are the same, so we just need + // to compare the lengths. +allsame: + XORL BX, BX + XORL CX, CX + TESTL DX, DX + SETLT BX // 1 if alen > blen + SETEQ CX // 1 if alen == blen + LEAL -1(CX)(BX*2), BX // 1,0,-1 result + MOVL BX, (AX) + RET diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s new file mode 100644 index 0000000..fdd015f --- /dev/null +++ b/src/internal/bytealg/compare_amd64.s @@ -0,0 +1,237 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "asm_amd64.h" +#include "textflag.h" + +TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56 + // AX = a_base (want in SI) + // BX = a_len (want in BX) + // CX = a_cap (unused) + // DI = b_base (want in DI) + // SI = b_len (want in DX) + // R8 = b_cap (unused) + MOVQ SI, DX + MOVQ AX, SI + JMP cmpbody<>(SB) + +TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40 + // AX = a_base (want in SI) + // BX = a_len (want in BX) + // CX = b_base (want in DI) + // DI = b_len (want in DX) + MOVQ AX, SI + MOVQ DI, DX + MOVQ CX, DI + JMP cmpbody<>(SB) + +// input: +// SI = a +// DI = b +// BX = alen +// DX = blen +// output: +// AX = output (-1/0/1) +TEXT cmpbody<>(SB),NOSPLIT,$0-0 + CMPQ SI, DI + JEQ allsame + CMPQ BX, DX + MOVQ DX, R8 + CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare + CMPQ R8, $8 + JB small + + CMPQ R8, $63 + JBE loop +#ifndef hasAVX2 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JEQ big_loop_avx2 + JMP big_loop +#else + JMP big_loop_avx2 +#endif +loop: + CMPQ R8, $16 + JBE _0through16 + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX // convert EQ to NE + JNE diff16 // branch if at least one byte is not equal + ADDQ $16, SI + ADDQ $16, DI + SUBQ $16, R8 + JMP loop + +diff64: + ADDQ $48, SI + ADDQ $48, DI + JMP diff16 +diff48: + ADDQ $32, SI + ADDQ $32, DI + JMP diff16 +diff32: + ADDQ $16, SI + ADDQ $16, DI + // AX = bit mask of differences +diff16: + BSFQ AX, BX // index of first byte that differs + XORQ AX, AX + MOVB (SI)(BX*1), CX + CMPB CX, (DI)(BX*1) + SETHI AX + LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 + RET + + // 0 through 16 bytes left, alen>=8, blen>=8 +_0through16: + CMPQ R8, $8 + JBE _0through8 + MOVQ (SI), AX + MOVQ (DI), CX + CMPQ AX, CX + JNE diff8 +_0through8: + MOVQ -8(SI)(R8*1), AX + MOVQ -8(DI)(R8*1), CX + CMPQ AX, CX + JEQ allsame + + // AX and CX contain parts of a and b that differ. +diff8: + BSWAPQ AX // reverse order of bytes + BSWAPQ CX + XORQ AX, CX + BSRQ CX, CX // index of highest bit difference + SHRQ CX, AX // move a's bit to bottom + ANDQ $1, AX // mask bit + LEAQ -1(AX*2), AX // 1/0 => +1/-1 + RET + + // 0-7 bytes in common +small: + LEAQ (R8*8), CX // bytes left -> bits left + NEGQ CX // - bits lift (== 64 - bits left mod 64) + JEQ allsame + + // load bytes of a into high bytes of AX + CMPB SI, $0xf8 + JA si_high + MOVQ (SI), SI + JMP si_finish +si_high: + MOVQ -8(SI)(R8*1), SI + SHRQ CX, SI +si_finish: + SHLQ CX, SI + + // load bytes of b in to high bytes of BX + CMPB DI, $0xf8 + JA di_high + MOVQ (DI), DI + JMP di_finish +di_high: + MOVQ -8(DI)(R8*1), DI + SHRQ CX, DI +di_finish: + SHLQ CX, DI + + BSWAPQ SI // reverse order of bytes + BSWAPQ DI + XORQ SI, DI // find bit differences + JEQ allsame + BSRQ DI, CX // index of highest bit difference + SHRQ CX, SI // move a's bit to bottom + ANDQ $1, SI // mask bit + LEAQ -1(SI*2), AX // 1/0 => +1/-1 + RET + +allsame: + XORQ AX, AX + XORQ CX, CX + CMPQ BX, DX + SETGT AX // 1 if alen > blen + SETEQ CX // 1 if alen == blen + LEAQ -1(CX)(AX*2), AX // 1,0,-1 result + RET + + // this works for >= 64 bytes of data. +#ifndef hasAVX2 +big_loop: + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff16 + + MOVOU 16(SI), X0 + MOVOU 16(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff32 + + MOVOU 32(SI), X0 + MOVOU 32(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff48 + + MOVOU 48(SI), X0 + MOVOU 48(DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX + JNE diff64 + + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, R8 + CMPQ R8, $64 + JBE loop + JMP big_loop +#endif + + // Compare 64-bytes per loop iteration. + // Loop is unrolled and uses AVX2. +big_loop_avx2: + VMOVDQU (SI), Y2 + VMOVDQU (DI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU 32(DI), Y5 + VPCMPEQB Y2, Y3, Y0 + VPMOVMSKB Y0, AX + XORL $0xffffffff, AX + JNE diff32_avx2 + VPCMPEQB Y4, Y5, Y6 + VPMOVMSKB Y6, AX + XORL $0xffffffff, AX + JNE diff64_avx2 + + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, R8 + CMPQ R8, $64 + JB big_loop_avx2_exit + JMP big_loop_avx2 + + // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk. +diff32_avx2: + VZEROUPPER + JMP diff16 + + // Same as diff32_avx2, but for last 32 bytes. +diff64_avx2: + VZEROUPPER + JMP diff48 + + // For <64 bytes remainder jump to normal loop. +big_loop_avx2_exit: + VZEROUPPER + JMP loop diff --git a/src/internal/bytealg/compare_arm.s b/src/internal/bytealg/compare_arm.s new file mode 100644 index 0000000..80d01a2 --- /dev/null +++ b/src/internal/bytealg/compare_arm.s @@ -0,0 +1,86 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-28 + MOVW a_base+0(FP), R2 + MOVW a_len+4(FP), R0 + MOVW b_base+12(FP), R3 + MOVW b_len+16(FP), R1 + ADD $28, R13, R7 + B cmpbody<>(SB) + +TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20 + MOVW a_base+0(FP), R2 + MOVW a_len+4(FP), R0 + MOVW b_base+8(FP), R3 + MOVW b_len+12(FP), R1 + ADD $20, R13, R7 + B cmpbody<>(SB) + +// On entry: +// R0 is the length of a +// R1 is the length of b +// R2 points to the start of a +// R3 points to the start of b +// R7 points to return value (-1/0/1 will be written here) +// +// On exit: +// R4, R5, R6 and R8 are clobbered +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 + CMP R2, R3 + BEQ samebytes + CMP R0, R1 + MOVW R0, R6 + MOVW.LT R1, R6 // R6 is min(R0, R1) + + CMP $0, R6 + BEQ samebytes + CMP $4, R6 + ADD R2, R6 // R2 is current byte in a, R6 is the end of the range to compare + BLT byte_loop // length < 4 + AND $3, R2, R8 + CMP $0, R8 + BNE byte_loop // unaligned a, use byte-wise compare (TODO: try to align a) +aligned_a: + AND $3, R3, R8 + CMP $0, R8 + BNE byte_loop // unaligned b, use byte-wise compare + AND $0xfffffffc, R6, R8 + // length >= 4 +chunk4_loop: + MOVW.P 4(R2), R4 + MOVW.P 4(R3), R5 + CMP R4, R5 + BNE cmp + CMP R2, R8 + BNE chunk4_loop + CMP R2, R6 + BEQ samebytes // all compared bytes were the same; compare lengths +byte_loop: + MOVBU.P 1(R2), R4 + MOVBU.P 1(R3), R5 + CMP R4, R5 + BNE ret + CMP R2, R6 + BNE byte_loop +samebytes: + CMP R0, R1 + MOVW.LT $1, R0 + MOVW.GT $-1, R0 + MOVW.EQ $0, R0 + MOVW R0, (R7) + RET +ret: + // bytes differed + MOVW.LT $1, R0 + MOVW.GT $-1, R0 + MOVW R0, (R7) + RET +cmp: + SUB $4, R2, R2 + SUB $4, R3, R3 + B byte_loop diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s new file mode 100644 index 0000000..cc02c46 --- /dev/null +++ b/src/internal/bytealg/compare_arm64.s @@ -0,0 +1,125 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56 + // R0 = a_base (want in R0) + // R1 = a_len (want in R1) + // R2 = a_cap (unused) + // R3 = b_base (want in R2) + // R4 = b_len (want in R3) + // R5 = b_cap (unused) + MOVD R3, R2 + MOVD R4, R3 + B cmpbody<>(SB) + +TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // R0 = a_base + // R1 = a_len + // R2 = b_base + // R3 = b_len + B cmpbody<>(SB) + +// On entry: +// R0 points to the start of a +// R1 is the length of a +// R2 points to the start of b +// R3 is the length of b +// +// On exit: +// R0 is the result +// R4, R5, R6, R8, R9 and R10 are clobbered +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 + CMP R0, R2 + BEQ samebytes // same starting pointers; compare lengths + CMP R1, R3 + CSEL LT, R3, R1, R6 // R6 is min(R1, R3) + + CBZ R6, samebytes + BIC $0xf, R6, R10 + CBZ R10, small // length < 16 + ADD R0, R10 // end of chunk16 + // length >= 16 +chunk16_loop: + LDP.P 16(R0), (R4, R8) + LDP.P 16(R2), (R5, R9) + CMP R4, R5 + BNE cmp + CMP R8, R9 + BNE cmpnext + CMP R10, R0 + BNE chunk16_loop + AND $0xf, R6, R6 + CBZ R6, samebytes + SUBS $8, R6 + BLT tail + // the length of tail > 8 bytes + MOVD.P 8(R0), R4 + MOVD.P 8(R2), R5 + CMP R4, R5 + BNE cmp + SUB $8, R6 + // compare last 8 bytes +tail: + MOVD (R0)(R6), R4 + MOVD (R2)(R6), R5 + CMP R4, R5 + BEQ samebytes +cmp: + REV R4, R4 + REV R5, R5 + CMP R4, R5 +ret: + MOVD $1, R0 + CNEG HI, R0, R0 + RET +small: + TBZ $3, R6, lt_8 + MOVD (R0), R4 + MOVD (R2), R5 + CMP R4, R5 + BNE cmp + SUBS $8, R6 + BEQ samebytes + ADD $8, R0 + ADD $8, R2 + SUB $8, R6 + B tail +lt_8: + TBZ $2, R6, lt_4 + MOVWU (R0), R4 + MOVWU (R2), R5 + CMPW R4, R5 + BNE cmp + SUBS $4, R6 + BEQ samebytes + ADD $4, R0 + ADD $4, R2 +lt_4: + TBZ $1, R6, lt_2 + MOVHU (R0), R4 + MOVHU (R2), R5 + CMPW R4, R5 + BNE cmp + ADD $2, R0 + ADD $2, R2 +lt_2: + TBZ $0, R6, samebytes +one: + MOVBU (R0), R4 + MOVBU (R2), R5 + CMPW R4, R5 + BNE ret +samebytes: + CMP R3, R1 + CSET NE, R0 + CNEG LO, R0, R0 + RET +cmpnext: + REV R8, R4 + REV R9, R5 + CMP R4, R5 + B ret diff --git a/src/internal/bytealg/compare_generic.go b/src/internal/bytealg/compare_generic.go new file mode 100644 index 0000000..b04e275 --- /dev/null +++ b/src/internal/bytealg/compare_generic.go @@ -0,0 +1,60 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64 + +package bytealg + +import _ "unsafe" // for go:linkname + +func Compare(a, b []byte) int { + l := len(a) + if len(b) < l { + l = len(b) + } + if l == 0 || &a[0] == &b[0] { + goto samebytes + } + for i := 0; i < l; i++ { + c1, c2 := a[i], b[i] + if c1 < c2 { + return -1 + } + if c1 > c2 { + return +1 + } + } +samebytes: + if len(a) < len(b) { + return -1 + } + if len(a) > len(b) { + return +1 + } + return 0 +} + +//go:linkname runtime_cmpstring runtime.cmpstring +func runtime_cmpstring(a, b string) int { + l := len(a) + if len(b) < l { + l = len(b) + } + for i := 0; i < l; i++ { + c1, c2 := a[i], b[i] + if c1 < c2 { + return -1 + } + if c1 > c2 { + return +1 + } + } + if len(a) < len(b) { + return -1 + } + if len(a) > len(b) { + return +1 + } + return 0 +} diff --git a/src/internal/bytealg/compare_loong64.s b/src/internal/bytealg/compare_loong64.s new file mode 100644 index 0000000..54c2dab --- /dev/null +++ b/src/internal/bytealg/compare_loong64.s @@ -0,0 +1,86 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB),NOSPLIT,$0-56 + MOVV a_base+0(FP), R6 + MOVV b_base+24(FP), R7 + MOVV a_len+8(FP), R4 + MOVV b_len+32(FP), R5 + MOVV $ret+48(FP), R13 + JMP cmpbody<>(SB) + +TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 + MOVV a_base+0(FP), R6 + MOVV b_base+16(FP), R7 + MOVV a_len+8(FP), R4 + MOVV b_len+24(FP), R5 + MOVV $ret+32(FP), R13 + JMP cmpbody<>(SB) + +// On entry: +// R4 length of a +// R5 length of b +// R6 points to the start of a +// R7 points to the start of b +// R13 points to the return value (-1/0/1) +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0 + BEQ R6, R7, samebytes // same start of a and b + + SGTU R4, R5, R9 + BNE R0, R9, r2_lt_r1 + MOVV R4, R14 + JMP entry +r2_lt_r1: + MOVV R5, R14 // R14 is min(R4, R5) +entry: + ADDV R6, R14, R12 // R6 start of a, R14 end of a + BEQ R6, R12, samebytes // length is 0 + + SRLV $4, R14 // R14 is number of chunks + BEQ R0, R14, byte_loop + + // make sure both a and b are aligned. + OR R6, R7, R15 + AND $7, R15 + BNE R0, R15, byte_loop + +chunk16_loop: + BEQ R0, R14, byte_loop + MOVV (R6), R8 + MOVV (R7), R9 + BNE R8, R9, byte_loop + MOVV 8(R6), R16 + MOVV 8(R7), R17 + ADDV $16, R6 + ADDV $16, R7 + SUBVU $1, R14 + BEQ R16, R17, chunk16_loop + SUBV $8, R6 + SUBV $8, R7 + +byte_loop: + BEQ R6, R12, samebytes + MOVBU (R6), R8 + ADDVU $1, R6 + MOVBU (R7), R9 + ADDVU $1, R7 + BEQ R8, R9, byte_loop + +byte_cmp: + SGTU R8, R9, R12 // R12 = 1 if (R8 > R9) + BNE R0, R12, ret + MOVV $-1, R12 + JMP ret + +samebytes: + SGTU R4, R5, R8 + SGTU R5, R4, R9 + SUBV R9, R8, R12 + +ret: + MOVV R12, (R13) + RET diff --git a/src/internal/bytealg/compare_mips64x.s b/src/internal/bytealg/compare_mips64x.s new file mode 100644 index 0000000..117a9ef --- /dev/null +++ b/src/internal/bytealg/compare_mips64x.s @@ -0,0 +1,88 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips64 || mips64le + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB),NOSPLIT,$0-56 + MOVV a_base+0(FP), R3 + MOVV b_base+24(FP), R4 + MOVV a_len+8(FP), R1 + MOVV b_len+32(FP), R2 + MOVV $ret+48(FP), R9 + JMP cmpbody<>(SB) + +TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 + MOVV a_base+0(FP), R3 + MOVV b_base+16(FP), R4 + MOVV a_len+8(FP), R1 + MOVV b_len+24(FP), R2 + MOVV $ret+32(FP), R9 + JMP cmpbody<>(SB) + +// On entry: +// R1 length of a +// R2 length of b +// R3 points to the start of a +// R4 points to the start of b +// R9 points to the return value (-1/0/1) +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0 + BEQ R3, R4, samebytes // same start of a and b + + SGTU R1, R2, R7 + BNE R0, R7, r2_lt_r1 + MOVV R1, R10 + JMP entry +r2_lt_r1: + MOVV R2, R10 // R10 is min(R1, R2) +entry: + ADDV R3, R10, R8 // R3 start of a, R8 end of a + BEQ R3, R8, samebytes // length is 0 + + SRLV $4, R10 // R10 is number of chunks + BEQ R0, R10, byte_loop + + // make sure both a and b are aligned. + OR R3, R4, R11 + AND $7, R11 + BNE R0, R11, byte_loop + +chunk16_loop: + BEQ R0, R10, byte_loop + MOVV (R3), R6 + MOVV (R4), R7 + BNE R6, R7, byte_loop + MOVV 8(R3), R13 + MOVV 8(R4), R14 + ADDV $16, R3 + ADDV $16, R4 + SUBVU $1, R10 + BEQ R13, R14, chunk16_loop + SUBV $8, R3 + SUBV $8, R4 + +byte_loop: + BEQ R3, R8, samebytes + MOVBU (R3), R6 + ADDVU $1, R3 + MOVBU (R4), R7 + ADDVU $1, R4 + BEQ R6, R7, byte_loop + +byte_cmp: + SGTU R6, R7, R8 // R8 = 1 if (R6 > R7) + BNE R0, R8, ret + MOVV $-1, R8 + JMP ret + +samebytes: + SGTU R1, R2, R6 + SGTU R2, R1, R7 + SUBV R7, R6, R8 + +ret: + MOVV R8, (R9) + RET diff --git a/src/internal/bytealg/compare_mipsx.s b/src/internal/bytealg/compare_mipsx.s new file mode 100644 index 0000000..857ac13 --- /dev/null +++ b/src/internal/bytealg/compare_mipsx.s @@ -0,0 +1,72 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips || mipsle + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB),NOSPLIT,$0-28 + MOVW a_base+0(FP), R3 + MOVW b_base+12(FP), R4 + MOVW a_len+4(FP), R1 + MOVW b_len+16(FP), R2 + BEQ R3, R4, samebytes + SGTU R1, R2, R7 + MOVW R1, R8 + CMOVN R7, R2, R8 // R8 is min(R1, R2) + + ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare +loop: + BEQ R3, R8, samebytes + + MOVBU (R3), R6 + ADDU $1, R3 + MOVBU (R4), R7 + ADDU $1, R4 + BEQ R6, R7 , loop + + SGTU R6, R7, R8 + MOVW $-1, R6 + CMOVZ R8, R6, R8 + JMP cmp_ret +samebytes: + SGTU R1, R2, R6 + SGTU R2, R1, R7 + SUBU R7, R6, R8 +cmp_ret: + MOVW R8, ret+24(FP) + RET + +TEXT runtime·cmpstring(SB),NOSPLIT,$0-20 + MOVW a_base+0(FP), R3 + MOVW a_len+4(FP), R1 + MOVW b_base+8(FP), R4 + MOVW b_len+12(FP), R2 + BEQ R3, R4, samebytes + SGTU R1, R2, R7 + MOVW R1, R8 + CMOVN R7, R2, R8 // R8 is min(R1, R2) + + ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare +loop: + BEQ R3, R8, samebytes // all compared bytes were the same; compare lengths + + MOVBU (R3), R6 + ADDU $1, R3 + MOVBU (R4), R7 + ADDU $1, R4 + BEQ R6, R7 , loop + // bytes differed + SGTU R6, R7, R8 + MOVW $-1, R6 + CMOVZ R8, R6, R8 + JMP cmp_ret +samebytes: + SGTU R1, R2, R6 + SGTU R2, R1, R7 + SUBU R7, R6, R8 +cmp_ret: + MOVW R8, ret+16(FP) + RET diff --git a/src/internal/bytealg/compare_native.go b/src/internal/bytealg/compare_native.go new file mode 100644 index 0000000..34964e2 --- /dev/null +++ b/src/internal/bytealg/compare_native.go @@ -0,0 +1,19 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build 386 || amd64 || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64 + +package bytealg + +import _ "unsafe" // For go:linkname + +//go:noescape +func Compare(a, b []byte) int + +// The declaration below generates ABI wrappers for functions +// implemented in assembly in this package but declared in another +// package. + +//go:linkname abigen_runtime_cmpstring runtime.cmpstring +func abigen_runtime_cmpstring(a, b string) int diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s new file mode 100644 index 0000000..cbe0525 --- /dev/null +++ b/src/internal/bytealg/compare_ppc64x.s @@ -0,0 +1,502 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64 || ppc64le + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56 + // incoming: + // R3 a addr -> R5 + // R4 a len -> R3 + // R5 a cap unused + // R6 b addr -> R6 + // R7 b len -> R4 + // R8 b cap unused + MOVD R3, R5 + MOVD R4, R3 + MOVD R7, R4 + CMP R5,R6,CR7 + CMP R3,R4,CR6 + BEQ CR7,equal + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + CMP R16,$1 + BNE power8 + BR cmpbodyp9<>(SB) +power8: + BR cmpbody<>(SB) +equal: + BEQ CR6,done + MOVD $1, R8 + BGT CR6,greater + NEG R8 +greater: + MOVD R8, R3 + RET +done: + MOVD $0, R3 + RET + +TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // incoming: + // R3 a addr -> R5 + // R4 a len -> R3 + // R5 b addr -> R6 + // R6 b len -> R4 + MOVD R6, R7 + MOVD R5, R6 + MOVD R3, R5 + MOVD R4, R3 + MOVD R7, R4 + CMP R5,R6,CR7 + CMP R3,R4,CR6 + BEQ CR7,equal + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + CMP R16,$1 + BNE power8 + BR cmpbodyp9<>(SB) +power8: + BR cmpbody<>(SB) +equal: + BEQ CR6,done + MOVD $1, R8 + BGT CR6,greater + NEG R8 +greater: + MOVD R8, R3 + RET + +done: + MOVD $0, R3 + RET + +#ifdef GOARCH_ppc64le +DATA byteswap<>+0(SB)/8, $0x0706050403020100 +DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 +GLOBL byteswap<>+0(SB), RODATA, $16 +#define SWAP V21 +#endif + +// Do an efficient memcmp for ppc64le/ppc64/POWER8 +// R3 = a len +// R4 = b len +// R5 = a addr +// R6 = b addr +// On exit: +// R3 = return value +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 + MOVD R3,R8 // set up length + CMP R3,R4,CR2 // unequal? + BLT CR2,setuplen // BLT CR2 + MOVD R4,R8 // use R4 for comparison len +setuplen: + CMP R8,$32 // optimize >= 32 + MOVD R8,R9 + BLT setup8a // optimize < 32 + MOVD $16,R10 // set offsets to load into vectors + CMP R8,$64 + BLT cmp32 // process size 32-63 + + DCBT (R5) // optimize >= 64 + DCBT (R6) // cache hint + MOVD $32,R11 // set offsets to load into vector + MOVD $48,R12 // set offsets to load into vector + +loop64a:// process size 64 and greater + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different // jump out if its different + + LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector + LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector + LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R12),V3 // load bytes of A at offset 64 into vector + LXVD2X (R6)(R12),V4 // load bytes of B at offset 64 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $-64,R9,R9 // reduce remaining size by 64 + ADD $64,R5,R5 // increment to next 64 bytes of A + ADD $64,R6,R6 // increment to next 64 bytes of B + CMPU R9,$64 + BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining + + CMPU R9,$32 + BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining + CMPU R9,$0 + BNE rem // loop to rem if the remainder is not 0 + + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +cmp32: + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector + LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $-32,R9,R9 // reduce remaining size by 32 + ADD $32,R5,R5 // increment to next 32 bytes of A + ADD $32,R6,R6 // increment to next 32 bytes of B + CMPU R9,$0 + BNE rem // loop to rem if the remainder is not 0 + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +rem: + MOVD R9,R8 + ANDCC $24,R8,R9 // Any 8 byte chunks? + BEQ leftover // and result is 0 + BR setup8a + +different: +#ifdef GOARCH_ppc64le + MOVD $byteswap<>+00(SB), R16 + LXVD2X (R16)(R0),SWAP // Set up swap string + + VPERM V3,V3,SWAP,V3 + VPERM V4,V4,SWAP,V4 +#endif + MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison + MFVSRD VS36,R10 + + CMPU R16,R10 + BEQ lower + BGT greater + MOVD $-1,R3 // return value if A < B + RET +lower: + VSLDOI $8,V3,V3,V3 // move lower doublwords of A and B into GPR for comparison + MFVSRD VS35,R16 + VSLDOI $8,V4,V4,V4 + MFVSRD VS36,R10 + + CMPU R16,R10 + BGT greater + MOVD $-1,R3 // return value if A < B + RET +setup8a: + SRADCC $3,R8,R9 // get the 8 byte count + BEQ leftover // shifted value is 0 + CMPU R8,$8 // optimize 8byte move + BEQ size8 + CMPU R8,$16 + BEQ size16 + MOVD R9,CTR // loop count for doublewords +loop8: +#ifdef GOARCH_ppc64le + MOVDBR (R5+R0),R16 // doublewords to compare + MOVDBR (R6+R0),R10 // LE compare order +#else + MOVD (R5+R0),R16 // doublewords to compare + MOVD (R6+R0),R10 // BE compare order +#endif + ADD $8,R5 + ADD $8,R6 + CMPU R16,R10 // match? + BC 8,2,loop8 // bt ctr <> 0 && cr + BGT greater + BLT less +leftover: + ANDCC $7,R8,R9 // check for leftover bytes + BEQ zeroremainder +simplecheck: + MOVD R0,R14 + CMP R9,$4 // process 4 bytes + BLT halfword +#ifdef GOARCH_ppc64le + MOVWBR (R5)(R14),R10 + MOVWBR (R6)(R14),R11 +#else + MOVWZ (R5)(R14),R10 + MOVWZ (R6)(R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $-4,R9 + ADD $4,R14 + PCALIGN $16 + +halfword: + CMP R9,$2 // process 2 bytes + BLT byte +#ifdef GOARCH_ppc64le + MOVHBR (R5)(R14),R10 + MOVHBR (R6)(R14),R11 +#else + MOVHZ (R5)(R14),R10 + MOVHZ (R6)(R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $-2,R9 + ADD $2,R14 + PCALIGN $16 +byte: + CMP R9,$0 // process 1 byte + BEQ skip + MOVBZ (R5)(R14),R10 + MOVBZ (R6)(R14),R11 + CMPU R10,R11 + BGT greater + BLT less + PCALIGN $16 +skip: + BEQ CR2,equal + BGT CR2,greater + +less: MOVD $-1,R3 // return value if A < B + RET +size16: + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different +zeroremainder: + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +size8: +#ifdef GOARCH_ppc64le + MOVDBR (R5+R0),R16 // doublewords to compare + MOVDBR (R6+R0),R10 // LE compare order +#else + MOVD (R5+R0),R16 // doublewords to compare + MOVD (R6+R0),R10 // BE compare order +#endif + CMPU R16,R10 // match? + BGT greater + BLT less + BGT CR2,greater // 2nd len > 1st len + BLT CR2,less // 2nd len < 1st len +equal: + MOVD $0, R3 // return value if A == B + RET +greater: + MOVD $1,R3 // return value if A > B + RET + +// Do an efficient memcmp for ppc64le/ppc64/POWER9 +// R3 = a len +// R4 = b len +// R5 = a addr +// R6 = b addr +// On exit: +// R3 = return value +TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0 + MOVD R3,R8 // set up length + CMP R3,R4,CR2 // unequal? + BLT CR2,setuplen // BLT CR2 + MOVD R4,R8 // use R4 for comparison len +setuplen: + CMP R8,$16 // optimize for size<16 + MOVD R8,R9 + BLT simplecheck + MOVD $16,R10 // set offsets to load into vectors + CMP R8,$32 // optimize for size 16-31 + BLT cmp16 + CMP R8,$64 + BLT cmp32 // optimize for size 32-63 + DCBT (R5) // optimize for size>=64 + DCBT (R6) // cache hint + + MOVD $32,R11 // set offsets to load into vector + MOVD $48,R12 // set offsets to load into vector + +loop64a:// process size 64 and greater + LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector + LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector + VCMPNEBCC V3,V4,V1 // record comparison into V1 + BNE CR6,different // jump out if its different + + LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector + LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector + LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + ADD $-64,R9,R9 // reduce remaining size by 64 + ADD $64,R5,R5 // increment to next 64 bytes of A + ADD $64,R6,R6 // increment to next 64 bytes of B + CMPU R9,$64 + BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining + + CMPU R9,$32 + BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining + CMPU R9,$16 + BGE cmp16 // loop to cmp16 if there are 16-31 bytes left + CMPU R9,$0 + BNE simplecheck // loop to simplecheck for remaining bytes + + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +cmp32: + LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector + LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector + + VCMPNEBCC V3,V4,V1 // record comparison into V1 + BNE CR6,different // jump out if its different + + LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + ADD $-32,R9,R9 // reduce remaining size by 32 + ADD $32,R5,R5 // increment to next 32 bytes of A + ADD $32,R6,R6 // increment to next 32 bytes of B + CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left + BGE cmp16 + CMPU R9,$0 + BNE simplecheck // loop to simplecheck for remainder bytes + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +different: + + MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison + MFVSRD VS36,R10 + + CMPU R16,R10 + BEQ lower + BGT greater + MOVD $-1,R3 // return value if A < B + RET +lower: + MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison + MFVSRLD VS36,R10 + + CMPU R16,R10 + BGT greater + MOVD $-1,R3 // return value if A < B + RET + +greater: + MOVD $1,R3 // return value if A > B + RET +cmp16: + ANDCC $16,R9,R31 + BEQ tail + + LXVB16X (R0)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R0)(R6),V4 // load bytes of B at offset 16 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $16,R5 + ADD $16,R6 +tail: + ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b) + BEQ end + + ADD R9,R5 + ADD R9,R6 + MOVD $-16,R10 + + LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different +end: + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B) + BR greater // jump to greater otherwise +simplecheck: + MOVD $0,R14 // process 8 bytes + CMP R9,$8 + BLT word +#ifdef GOARCH_ppc64le + MOVDBR (R5+R14),R10 + MOVDBR (R6+R14),R11 +#else + MOVD (R5+R14),R10 + MOVD (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $8,R14 + ADD $-8,R9 + PCALIGN $16 +word: + CMP R9,$4 // process 4 bytes + BLT halfword +#ifdef GOARCH_ppc64le + MOVWBR (R5+R14),R10 + MOVWBR (R6+R14),R11 +#else + MOVWZ (R5+R14),R10 + MOVWZ (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $4,R14 + ADD $-4,R9 + PCALIGN $16 +halfword: + CMP R9,$2 // process 2 bytes + BLT byte +#ifdef GOARCH_ppc64le + MOVHBR (R5+R14),R10 + MOVHBR (R6+R14),R11 +#else + MOVHZ (R5+R14),R10 + MOVHZ (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $2,R14 + ADD $-2,R9 + PCALIGN $16 +byte: + CMP R9,$0 // process 1 byte + BEQ skip + MOVBZ (R5+R14),R10 + MOVBZ (R6+R14),R11 + CMPU R10,R11 + BGT greater + BLT less + PCALIGN $16 +skip: + BEQ CR2,equal + BGT CR2,greater +less: + MOVD $-1,R3 // return value if A < B + RET +equal: + MOVD $0, R3 // return value if A == B + RET diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s new file mode 100644 index 0000000..44a743d --- /dev/null +++ b/src/internal/bytealg/compare_riscv64.s @@ -0,0 +1,187 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56 + // X10 = a_base + // X11 = a_len + // X12 = a_cap (unused) + // X13 = b_base (want in X12) + // X14 = b_len (want in X13) + // X15 = b_cap (unused) + MOV X13, X12 + MOV X14, X13 + JMP compare<>(SB) + +TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // X10 = a_base + // X11 = a_len + // X12 = b_base + // X13 = b_len + JMP compare<>(SB) + +// On entry: +// X10 points to start of a +// X11 length of a +// X12 points to start of b +// X13 length of b +// for non-regabi X14 points to the address to store the return value (-1/0/1) +// for regabi the return value in X10 +TEXT compare<>(SB),NOSPLIT|NOFRAME,$0 + BEQ X10, X12, cmp_len + + MOV X11, X5 + BGE X13, X5, use_a_len // X5 = min(len(a), len(b)) + MOV X13, X5 +use_a_len: + BEQZ X5, cmp_len + + MOV $32, X6 + BLT X5, X6, loop4_check + + // Check alignment - if alignment differs we have to do one byte at a time. + AND $7, X10, X7 + AND $7, X12, X8 + BNE X7, X8, loop4_check + BEQZ X7, loop32_check + + // Check one byte at a time until we reach 8 byte alignment. + SUB X7, X5, X5 +align: + ADD $-1, X7 + MOVBU 0(X10), X8 + MOVBU 0(X12), X9 + BNE X8, X9, cmp + ADD $1, X10 + ADD $1, X12 + BNEZ X7, align + +loop32_check: + MOV $32, X7 + BLT X5, X7, loop16_check +loop32: + MOV 0(X10), X15 + MOV 0(X12), X16 + MOV 8(X10), X17 + MOV 8(X12), X18 + BEQ X15, X16, loop32a + JMP cmp8a +loop32a: + BEQ X17, X18, loop32b + JMP cmp8b +loop32b: + MOV 16(X10), X15 + MOV 16(X12), X16 + MOV 24(X10), X17 + MOV 24(X12), X18 + BEQ X15, X16, loop32c + JMP cmp8a +loop32c: + BEQ X17, X18, loop32d + JMP cmp8b +loop32d: + ADD $32, X10 + ADD $32, X12 + ADD $-32, X5 + BGE X5, X7, loop32 + BEQZ X5, cmp_len + +loop16_check: + MOV $16, X6 + BLT X5, X6, loop4_check +loop16: + MOV 0(X10), X15 + MOV 0(X12), X16 + MOV 8(X10), X17 + MOV 8(X12), X18 + BEQ X15, X16, loop16a + JMP cmp8a +loop16a: + BEQ X17, X18, loop16b + JMP cmp8b +loop16b: + ADD $16, X10 + ADD $16, X12 + ADD $-16, X5 + BGE X5, X6, loop16 + BEQZ X5, cmp_len + +loop4_check: + MOV $4, X6 + BLT X5, X6, loop1 +loop4: + MOVBU 0(X10), X8 + MOVBU 0(X12), X9 + MOVBU 1(X10), X15 + MOVBU 1(X12), X16 + BEQ X8, X9, loop4a + SLTU X9, X8, X5 + SLTU X8, X9, X6 + JMP cmp_ret +loop4a: + BEQ X15, X16, loop4b + SLTU X16, X15, X5 + SLTU X15, X16, X6 + JMP cmp_ret +loop4b: + MOVBU 2(X10), X21 + MOVBU 2(X12), X22 + MOVBU 3(X10), X23 + MOVBU 3(X12), X24 + BEQ X21, X22, loop4c + SLTU X22, X21, X5 + SLTU X21, X22, X6 + JMP cmp_ret +loop4c: + BEQ X23, X24, loop4d + SLTU X24, X23, X5 + SLTU X23, X24, X6 + JMP cmp_ret +loop4d: + ADD $4, X10 + ADD $4, X12 + ADD $-4, X5 + BGE X5, X6, loop4 + +loop1: + BEQZ X5, cmp_len + MOVBU 0(X10), X8 + MOVBU 0(X12), X9 + BNE X8, X9, cmp + ADD $1, X10 + ADD $1, X12 + ADD $-1, X5 + JMP loop1 + + // Compare 8 bytes of memory in X15/X16 that are known to differ. +cmp8a: + MOV $0xff, X19 +cmp8a_loop: + AND X15, X19, X8 + AND X16, X19, X9 + BNE X8, X9, cmp + SLLI $8, X19 + JMP cmp8a_loop + + // Compare 8 bytes of memory in X17/X18 that are known to differ. +cmp8b: + MOV $0xff, X19 +cmp8b_loop: + AND X17, X19, X8 + AND X18, X19, X9 + BNE X8, X9, cmp + SLLI $8, X19 + JMP cmp8b_loop + +cmp_len: + MOV X11, X8 + MOV X13, X9 +cmp: + SLTU X9, X8, X5 + SLTU X8, X9, X6 +cmp_ret: + SUB X5, X6, X10 + RET diff --git a/src/internal/bytealg/compare_s390x.s b/src/internal/bytealg/compare_s390x.s new file mode 100644 index 0000000..5394548 --- /dev/null +++ b/src/internal/bytealg/compare_s390x.s @@ -0,0 +1,69 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56 + MOVD a_base+0(FP), R3 + MOVD a_len+8(FP), R4 + MOVD b_base+24(FP), R5 + MOVD b_len+32(FP), R6 + LA ret+48(FP), R7 + BR cmpbody<>(SB) + +TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 + MOVD a_base+0(FP), R3 + MOVD a_len+8(FP), R4 + MOVD b_base+16(FP), R5 + MOVD b_len+24(FP), R6 + LA ret+32(FP), R7 + BR cmpbody<>(SB) + +// input: +// R3 = a +// R4 = alen +// R5 = b +// R6 = blen +// R7 = address of output word (stores -1/0/1 here) +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 + CMPBEQ R3, R5, cmplengths + MOVD R4, R8 + CMPBLE R4, R6, amin + MOVD R6, R8 +amin: + CMPBEQ R8, $0, cmplengths + CMP R8, $256 + BLE tail +loop: + CLC $256, 0(R3), 0(R5) + BGT gt + BLT lt + SUB $256, R8 + MOVD $256(R3), R3 + MOVD $256(R5), R5 + CMP R8, $256 + BGT loop +tail: + SUB $1, R8 + EXRL $cmpbodyclc<>(SB), R8 + BGT gt + BLT lt +cmplengths: + CMP R4, R6 + BEQ eq + BLT lt +gt: + MOVD $1, 0(R7) + RET +lt: + MOVD $-1, 0(R7) + RET +eq: + MOVD $0, 0(R7) + RET + +TEXT cmpbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0 + CLC $1, 0(R3), 0(R5) + RET diff --git a/src/internal/bytealg/compare_wasm.s b/src/internal/bytealg/compare_wasm.s new file mode 100644 index 0000000..dc8fb33 --- /dev/null +++ b/src/internal/bytealg/compare_wasm.s @@ -0,0 +1,115 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB), NOSPLIT, $0-56 + Get SP + I64Load a_base+0(FP) + I64Load a_len+8(FP) + I64Load b_base+24(FP) + I64Load b_len+32(FP) + Call cmpbody<>(SB) + I64Store ret+48(FP) + RET + +TEXT runtime·cmpstring(SB), NOSPLIT, $0-40 + Get SP + I64Load a_base+0(FP) + I64Load a_len+8(FP) + I64Load b_base+16(FP) + I64Load b_len+24(FP) + Call cmpbody<>(SB) + I64Store ret+32(FP) + RET + +// params: a, alen, b, blen +// ret: -1/0/1 +TEXT cmpbody<>(SB), NOSPLIT, $0-0 + // len = min(alen, blen) + Get R1 + Get R3 + Get R1 + Get R3 + I64LtU + Select + Set R4 + + Get R0 + I32WrapI64 + Get R2 + I32WrapI64 + Get R4 + I32WrapI64 + Call memcmp<>(SB) + I64ExtendI32S + Tee R5 + + I64Eqz + If + // check length + Get R1 + Get R3 + I64Sub + Set R5 + End + + I64Const $0 + I64Const $-1 + I64Const $1 + Get R5 + I64Const $0 + I64LtS + Select + Get R5 + I64Eqz + Select + Return + +// compiled with emscripten +// params: a, b, len +// ret: <0/0/>0 +TEXT memcmp<>(SB), NOSPLIT, $0-0 + Get R2 + If $1 + Loop + Get R0 + I32Load8S $0 + Tee R3 + Get R1 + I32Load8S $0 + Tee R4 + I32Eq + If + Get R0 + I32Const $1 + I32Add + Set R0 + Get R1 + I32Const $1 + I32Add + Set R1 + I32Const $0 + Get R2 + I32Const $-1 + I32Add + Tee R2 + I32Eqz + BrIf $3 + Drop + Br $1 + End + End + Get R3 + I32Const $255 + I32And + Get R4 + I32Const $255 + I32And + I32Sub + Else + I32Const $0 + End + Return diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s new file mode 100644 index 0000000..efb17f8 --- /dev/null +++ b/src/internal/bytealg/count_amd64.s @@ -0,0 +1,208 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "asm_amd64.h" +#include "textflag.h" + +TEXT ·Count(SB),NOSPLIT,$0-40 +#ifndef hasPOPCNT + CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 + JEQ 2(PC) + JMP ·countGeneric(SB) +#endif + MOVQ b_base+0(FP), SI + MOVQ b_len+8(FP), BX + MOVB c+24(FP), AL + LEAQ ret+32(FP), R8 + JMP countbody<>(SB) + +TEXT ·CountString(SB),NOSPLIT,$0-32 +#ifndef hasPOPCNT + CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 + JEQ 2(PC) + JMP ·countGenericString(SB) +#endif + MOVQ s_base+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+16(FP), AL + LEAQ ret+24(FP), R8 + JMP countbody<>(SB) + +// input: +// SI: data +// BX: data len +// AL: byte sought +// R8: address to put result +// This function requires the POPCNT instruction. +TEXT countbody<>(SB),NOSPLIT,$0 + // Shuffle X0 around so that each byte contains + // the character we're looking for. + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + + CMPQ BX, $16 + JLT small + + MOVQ $0, R12 // Accumulator + + MOVQ SI, DI + + CMPQ BX, $32 + JA avx2 +sse: + LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes + JMP sseloopentry + +sseloop: + // Move the next 16-byte chunk of the data into X1. + MOVOU (DI), X1 + // Compare bytes in X0 to X1. + PCMPEQB X0, X1 + // Take the top bit of each byte in X1 and put the result in DX. + PMOVMSKB X1, DX + // Count number of matching bytes + POPCNTL DX, DX + // Accumulate into R12 + ADDQ DX, R12 + // Advance to next block. + ADDQ $16, DI +sseloopentry: + CMPQ DI, AX + JBE sseloop + + // Get the number of bytes to consider in the last 16 bytes + ANDQ $15, BX + JZ end + + // Create mask to ignore overlap between previous 16 byte block + // and the next. + MOVQ $16,CX + SUBQ BX, CX + MOVQ $0xFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + + // Process the last 16-byte chunk. This chunk may overlap with the + // chunks we've already searched so we need to mask part of it. + MOVOU (AX), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + ADDQ DX, R12 +end: + MOVQ R12, (R8) + RET + +// handle for lengths < 16 +small: + TESTQ BX, BX + JEQ endzero + + // Check if we'll load across a page boundary. + LEAQ 16(SI), AX + TESTW $0xff0, AX + JEQ endofpage + + // We must ignore high bytes as they aren't part of our slice. + // Create mask. + MOVB BX, CX + MOVQ $1, R10 + SALQ CL, R10 + SUBQ $1, R10 + + // Load data + MOVOU (SI), X1 + // Compare target byte with each byte in data. + PCMPEQB X0, X1 + // Move result bits to integer register. + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + // Directly return DX, we don't need to accumulate + // since we have <16 bytes. + MOVQ DX, (R8) + RET +endzero: + MOVQ $0, (R8) + RET + +endofpage: + // We must ignore low bytes as they aren't part of our slice. + MOVQ $16,CX + SUBQ BX, CX + MOVQ $0xFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + + // Load data into the high end of X1. + MOVOU -16(SI)(BX*1), X1 + // Compare target byte with each byte in data. + PCMPEQB X0, X1 + // Move result bits to integer register. + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + // Directly return DX, we don't need to accumulate + // since we have <16 bytes. + POPCNTL DX, DX + MOVQ DX, (R8) + RET + +avx2: +#ifndef hasAVX2 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JNE sse +#endif + MOVD AX, X0 + LEAQ -32(SI)(BX*1), R11 + VPBROADCASTB X0, Y1 +avx2_loop: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, DX + POPCNTL DX, DX + ADDQ DX, R12 + ADDQ $32, DI + CMPQ DI, R11 + JLE avx2_loop + + // If last block is already processed, + // skip to the end. + CMPQ DI, R11 + JEQ endavx + + // Load address of the last 32 bytes. + // There is an overlap with the previous block. + MOVQ R11, DI + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, DX + // Exit AVX mode. + VZEROUPPER + + // Create mask to ignore overlap between previous 32 byte block + // and the next. + ANDQ $31, BX + MOVQ $32,CX + SUBQ BX, CX + MOVQ $0xFFFFFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + ADDQ DX, R12 + MOVQ R12, (R8) + RET +endavx: + // Exit AVX mode. + VZEROUPPER + MOVQ R12, (R8) + RET diff --git a/src/internal/bytealg/count_arm.s b/src/internal/bytealg/count_arm.s new file mode 100644 index 0000000..f704ea0 --- /dev/null +++ b/src/internal/bytealg/count_arm.s @@ -0,0 +1,43 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count(SB),NOSPLIT,$0-20 + MOVW b_base+0(FP), R0 + MOVW b_len+4(FP), R1 + MOVBU c+12(FP), R2 + MOVW $ret+16(FP), R7 + B countbytebody<>(SB) + +TEXT ·CountString(SB),NOSPLIT,$0-16 + MOVW s_base+0(FP), R0 + MOVW s_len+4(FP), R1 + MOVBU c+8(FP), R2 + MOVW $ret+12(FP), R7 + B countbytebody<>(SB) + +// Input: +// R0: data +// R1: data length +// R2: byte to find +// R7: address to put result +// +// On exit: +// R4 and R8 are clobbered +TEXT countbytebody<>(SB),NOSPLIT,$0 + MOVW $0, R8 // R8 = count of byte to search + CMP $0, R1 + B.EQ done // short path to handle 0-byte case + ADD R0, R1 // R1 is the end of the range +byte_loop: + MOVBU.P 1(R0), R4 + CMP R4, R2 + ADD.EQ $1, R8 + CMP R0, R1 + B.NE byte_loop +done: + MOVW R8, (R7) + RET diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s new file mode 100644 index 0000000..8cd703d --- /dev/null +++ b/src/internal/bytealg/count_arm64.s @@ -0,0 +1,90 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count(SB),NOSPLIT,$0-40 + MOVD b_base+0(FP), R0 + MOVD b_len+8(FP), R2 + MOVBU c+24(FP), R1 + MOVD $ret+32(FP), R8 + B countbytebody<>(SB) + +TEXT ·CountString(SB),NOSPLIT,$0-32 + MOVD s_base+0(FP), R0 + MOVD s_len+8(FP), R2 + MOVBU c+16(FP), R1 + MOVD $ret+24(FP), R8 + B countbytebody<>(SB) + +// input: +// R0: data +// R2: data len +// R1: byte to find +// R8: address to put result +TEXT countbytebody<>(SB),NOSPLIT,$0 + // R11 = count of byte to search + MOVD $0, R11 + // short path to handle 0-byte case + CBZ R2, done + CMP $0x20, R2 + // jump directly to tail if length < 32 + BLO tail + ANDS $0x1f, R0, R9 + BEQ chunk + // Work with not 32-byte aligned head + BIC $0x1f, R0, R3 + ADD $0x20, R3 +head_loop: + MOVBU.P 1(R0), R5 + CMP R5, R1 + CINC EQ, R11, R11 + SUB $1, R2, R2 + CMP R0, R3 + BNE head_loop + // Work with 32-byte aligned chunks +chunk: + BIC $0x1f, R2, R9 + // The first chunk can also be the last + CBZ R9, tail + // R3 = end of 32-byte chunks + ADD R0, R9, R3 + MOVD $1, R5 + VMOV R5, V5.B16 + // R2 = length of tail + SUB R9, R2, R2 + // Duplicate R1 (byte to search) to 16 1-byte elements of V0 + VMOV R1, V0.B16 + // Clear the low 64-bit element of V7 and V8 + VEOR V7.B8, V7.B8, V7.B8 + VEOR V8.B8, V8.B8, V8.B8 + // Count the target byte in 32-byte chunk +chunk_loop: + VLD1.P (R0), [V1.B16, V2.B16] + CMP R0, R3 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + // Clear the higher 7 bits + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + // Count lanes match the requested byte + VADDP V4.B16, V3.B16, V6.B16 // 32B->16B + VUADDLV V6.B16, V7 + // Accumulate the count in low 64-bit element of V8 when inside the loop + VADD V7, V8 + BNE chunk_loop + VMOV V8.D[0], R6 + ADD R6, R11, R11 + CBZ R2, done +tail: + // Work with tail shorter than 32 bytes + MOVBU.P 1(R0), R5 + SUB $1, R2, R2 + CMP R5, R1 + CINC EQ, R11, R11 + CBNZ R2, tail +done: + MOVD R11, (R8) + RET diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go new file mode 100644 index 0000000..932a7c5 --- /dev/null +++ b/src/internal/bytealg/count_generic.go @@ -0,0 +1,27 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 && !arm && !arm64 && !ppc64le && !ppc64 && !riscv64 && !s390x + +package bytealg + +func Count(b []byte, c byte) int { + n := 0 + for _, x := range b { + if x == c { + n++ + } + } + return n +} + +func CountString(s string, c byte) int { + n := 0 + for i := 0; i < len(s); i++ { + if s[i] == c { + n++ + } + } + return n +} diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go new file mode 100644 index 0000000..90189c9 --- /dev/null +++ b/src/internal/bytealg/count_native.go @@ -0,0 +1,33 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 || arm || arm64 || ppc64le || ppc64 || riscv64 || s390x + +package bytealg + +//go:noescape +func Count(b []byte, c byte) int + +//go:noescape +func CountString(s string, c byte) int + +// A backup implementation to use by assembly. +func countGeneric(b []byte, c byte) int { + n := 0 + for _, x := range b { + if x == c { + n++ + } + } + return n +} +func countGenericString(s string, c byte) int { + n := 0 + for i := 0; i < len(s); i++ { + if s[i] == c { + n++ + } + } + return n +} diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s new file mode 100644 index 0000000..2d2490b --- /dev/null +++ b/src/internal/bytealg/count_ppc64x.s @@ -0,0 +1,96 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64le || ppc64 + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // R3 = byte array pointer + // R4 = length + MOVBZ R6, R5 // R5 = byte + BR countbytebody<>(SB) + +TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32 + // R3 = byte array pointer + // R4 = length + MOVBZ R5, R5 // R5 = byte + BR countbytebody<>(SB) + +// R3: addr of string +// R4: len of string +// R5: byte to count +// On exit: +// R3: return value +// endianness shouldn't matter since we are just counting and order +// is irrelevant +TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0 + DCBT (R3) // Prepare cache line. + MOVD R0, R18 // byte count + MOVD R3, R19 // Save base address for calculating the index later. + MOVD R4, R16 + + MOVD R5, R6 + RLDIMI $8, R6, $48, R6 + RLDIMI $16, R6, $32, R6 + RLDIMI $32, R6, $0, R6 // fill reg with the byte to count + + VSPLTISW $3, V4 // used for shift + MTVRD R6, V1 // move compare byte + VSPLTB $7, V1, V1 // replicate byte across V1 + + CMPU R4, $32 // Check if it's a small string (<32 bytes) + BLT tail // Jump to the small string case + XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator + +cmploop: + LXVW4X (R3), VS32 // load bytes from string + + // when the bytes match, the corresponding byte contains all 1s + VCMPEQUB V1, V0, V2 // compare bytes + VPOPCNTD V2, V3 // each double word contains its count + VADDUDM V3, V5, V5 // accumulate bit count in each double word + ADD $16, R3, R3 // increment pointer + SUB $16, R16, R16 // remaining bytes + CMP R16, $16 // at least 16 remaining? + BGE cmploop + VSRD V5, V4, V5 // shift by 3 to convert bits to bytes + VSLDOI $8, V5, V5, V6 // get the double word values from vector + MFVSRD V5, R9 + MFVSRD V6, R10 + ADD R9, R10, R9 + ADD R9, R18, R18 + +tail: + CMP R16, $8 // 8 bytes left? + BLT small + + MOVD (R3), R12 // load 8 bytes + CMPB R12, R6, R17 // compare bytes + POPCNTD R17, R15 // bit count + SRD $3, R15, R15 // byte count + ADD R15, R18, R18 // add to byte count + +next1: + ADD $8, R3, R3 + SUB $8, R16, R16 // remaining bytes + BR tail + +small: + CMP $0, R16 // any remaining + BEQ done + MOVBZ (R3), R12 // check each remaining byte + CMP R12, R5 + BNE next2 + ADD $1, R18 + +next2: + SUB $1, R16 + ADD $1, R3 // inc address + BR small + +done: + MOVD R18, R3 // return count + RET diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s new file mode 100644 index 0000000..d123cbd --- /dev/null +++ b/src/internal/bytealg/count_riscv64.s @@ -0,0 +1,47 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40 + // X10 = b_base + // X11 = b_len + // X12 = b_cap (unused) + // X13 = byte to count (want in X12) + AND $0xff, X13, X12 + MOV ZERO, X14 // count + ADD X10, X11 // end + +loop: + BEQ X10, X11, done + MOVBU (X10), X15 + ADD $1, X10 + BNE X12, X15, loop + ADD $1, X14 + JMP loop + +done: + MOV X14, X10 + RET + +TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32 + // X10 = s_base + // X11 = s_len + // X12 = byte to count + AND $0xff, X12 + MOV ZERO, X14 // count + ADD X10, X11 // end + +loop: + BEQ X10, X11, done + MOVBU (X10), X15 + ADD $1, X10 + BNE X12, X15, loop + ADD $1, X14 + JMP loop + +done: + MOV X14, X10 + RET diff --git a/src/internal/bytealg/count_s390x.s b/src/internal/bytealg/count_s390x.s new file mode 100644 index 0000000..2a3b5c0 --- /dev/null +++ b/src/internal/bytealg/count_s390x.s @@ -0,0 +1,169 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// condition code masks +#define EQ 8 +#define NE 7 + +// register assignments +#define R_ZERO R0 +#define R_VAL R1 +#define R_TMP R2 +#define R_PTR R3 +#define R_LEN R4 +#define R_CHAR R5 +#define R_RET R6 +#define R_ITER R7 +#define R_CNT R8 +#define R_MPTR R9 + +// vector register assignments +#define V_ZERO V0 +#define V_CHAR V1 +#define V_MASK V2 +#define V_VAL V3 +#define V_CNT V4 + +// mask for trailing bytes in vector implementation +GLOBL countbytemask<>(SB), RODATA, $16 +DATA countbytemask<>+0(SB)/8, $0x0101010101010101 +DATA countbytemask<>+8(SB)/8, $0x0101010101010101 + +// func Count(b []byte, c byte) int +TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40 + LMG b+0(FP), R_PTR, R_LEN + MOVBZ c+24(FP), R_CHAR + MOVD $ret+32(FP), R_RET + BR countbytebody<>(SB) + +// func CountString(s string, c byte) int +TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32 + LMG s+0(FP), R_PTR, R_LEN + MOVBZ c+16(FP), R_CHAR + MOVD $ret+24(FP), R_RET + BR countbytebody<>(SB) + +// input: +// R_PTR = address of array of bytes +// R_LEN = number of bytes in array +// R_CHAR = byte value to count zero (extended to register width) +// R_RET = address of return value +TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0 + MOVD $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP + MOVD $countbytemask<>(SB), R_MPTR + CGIJ $EQ, R_LEN, $0, ret0 // return if length is 0. + SRD $4, R_LEN, R_ITER // R_ITER is the number of 16-byte chunks + MOVBZ (R_TMP), R_TMP // load bool indicating support for vector facility + CGIJ $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available + + // Start of vector code (have vector facility). + // + // Set R_LEN to be the length mod 16 minus 1 to use as an index for + // vector 'load with length' (VLL). It will be in the range [-1,14]. + // Also replicate c across a 16-byte vector and initialize V_ZERO. + ANDW $0xf, R_LEN + VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0} + VZERO V_ZERO // V_ZERO = [1]uint128{0} + ADDW $-1, R_LEN + VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c} + + // Jump to loop if we have more than 15 bytes to process. + CGIJ $NE, R_ITER, $0, vxchunks + + // Load 1-15 bytes and corresponding mask. + // Note: only the low 32-bits of R_LEN are used for the index. + VLL R_LEN, (R_PTR), V_VAL + VLL R_LEN, (R_MPTR), V_MASK + + // Compare each byte in input chunk against byte to be counted. + // Each byte element will be set to either 0 (no match) or 1 (match). + VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00 + VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits + + // Accumulate matched byte count in 128-bit integer value. + VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} + VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} + + // Return rightmost (lowest) 64-bit part of accumulator. + VSTEG $1, V_CNT, (R_RET) + RET + +vxchunks: + // Load 0x01 into every byte element in the 16-byte mask vector. + VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1} + VZERO V_CNT // initial uint128 count of 0 + +vxloop: + // Load input bytes in 16-byte chunks. + VL (R_PTR), V_VAL + + // Compare each byte in input chunk against byte to be counted. + // Each byte element will be set to either 0 (no match) or 1 (match). + VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00 + VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits + + // Increment input string address. + MOVD $16(R_PTR), R_PTR + + // Accumulate matched byte count in 128-bit integer value. + VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} + VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} + VAQ V_VAL, V_CNT, V_CNT // accumulate + + // Repeat until all 16-byte chunks are done. + BRCTG R_ITER, vxloop + + // Skip to end if there are no trailing bytes. + CIJ $EQ, R_LEN, $-1, vxret + + // Load 1-15 bytes and corresponding mask. + // Note: only the low 32-bits of R_LEN are used for the index. + VLL R_LEN, (R_PTR), V_VAL + VLL R_LEN, (R_MPTR), V_MASK + + // Compare each byte in input chunk against byte to be counted. + // Each byte element will be set to either 0 (no match) or 1 (match). + VCEQB V_CHAR, V_VAL, V_VAL + VN V_MASK, V_VAL, V_VAL + + // Accumulate matched byte count in 128-bit integer value. + VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15} + VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3} + VAQ V_VAL, V_CNT, V_CNT // accumulate + +vxret: + // Return rightmost (lowest) 64-bit part of accumulator. + VSTEG $1, V_CNT, (R_RET) + RET + +novx: + // Start of non-vector code (the vector facility not available). + // + // Initialise counter and constant zero. + MOVD $0, R_CNT + MOVD $0, R_ZERO + +loop: + // Read 1-byte from input and compare. + // Note: avoid putting LOCGR in critical path. + MOVBZ (R_PTR), R_VAL + MOVD $1, R_TMP + MOVD $1(R_PTR), R_PTR + CMPW R_VAL, R_CHAR + LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match) + ADD R_TMP, R_CNT // accumulate 64-bit result + + // Repeat until all bytes have been checked. + BRCTG R_LEN, loop + +ret: + MOVD R_CNT, (R_RET) + RET + +ret0: + MOVD $0, (R_RET) + RET diff --git a/src/internal/bytealg/equal_386.s b/src/internal/bytealg/equal_386.s new file mode 100644 index 0000000..58b3cbe --- /dev/null +++ b/src/internal/bytealg/equal_386.s @@ -0,0 +1,130 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB),NOSPLIT,$0-13 + MOVL a+0(FP), SI + MOVL b+4(FP), DI + CMPL SI, DI + JEQ eq + MOVL size+8(FP), BX + LEAL ret+12(FP), AX + JMP memeqbody<>(SB) +eq: + MOVB $1, ret+12(FP) + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9 + MOVL a+0(FP), SI + MOVL b+4(FP), DI + CMPL SI, DI + JEQ eq + MOVL 4(DX), BX // compiler stores size at offset 4 in the closure + LEAL ret+8(FP), AX + JMP memeqbody<>(SB) +eq: + MOVB $1, ret+8(FP) + RET + +// a in SI +// b in DI +// count in BX +// address of result byte in AX +TEXT memeqbody<>(SB),NOSPLIT,$0-0 + CMPL BX, $4 + JB small + + // 64 bytes at a time using xmm registers +hugeloop: + CMPL BX, $64 + JB bigloop +#ifdef GO386_softfloat + JMP bigloop +#endif + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDL $64, SI + ADDL $64, DI + SUBL $64, BX + CMPL DX, $0xffff + JEQ hugeloop + MOVB $0, (AX) + RET + + // 4 bytes at a time using 32-bit register +bigloop: + CMPL BX, $4 + JBE leftover + MOVL (SI), CX + MOVL (DI), DX + ADDL $4, SI + ADDL $4, DI + SUBL $4, BX + CMPL CX, DX + JEQ bigloop + MOVB $0, (AX) + RET + + // remaining 0-4 bytes +leftover: + MOVL -4(SI)(BX*1), CX + MOVL -4(DI)(BX*1), DX + CMPL CX, DX + SETEQ (AX) + RET + +small: + CMPL BX, $0 + JEQ equal + + LEAL 0(BX*8), CX + NEGL CX + + MOVL SI, DX + CMPB DX, $0xfc + JA si_high + + // load at SI won't cross a page boundary. + MOVL (SI), SI + JMP si_finish +si_high: + // address ends in 111111xx. Load up to bytes we want, move to correct position. + MOVL -4(SI)(BX*1), SI + SHRL CX, SI +si_finish: + + // same for DI. + MOVL DI, DX + CMPB DX, $0xfc + JA di_high + MOVL (DI), DI + JMP di_finish +di_high: + MOVL -4(DI)(BX*1), DI + SHRL CX, DI +di_finish: + + SUBL SI, DI + SHLL CX, DI +equal: + SETEQ (AX) + RET diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s new file mode 100644 index 0000000..d178a33 --- /dev/null +++ b/src/internal/bytealg/equal_amd64.s @@ -0,0 +1,162 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "asm_amd64.h" +#include "textflag.h" + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25 + // AX = a (want in SI) + // BX = b (want in DI) + // CX = size (want in BX) + CMPQ AX, BX + JNE neq + MOVQ $1, AX // return 1 + RET +neq: + MOVQ AX, SI + MOVQ BX, DI + MOVQ CX, BX + JMP memeqbody<>(SB) + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17 + // AX = a (want in SI) + // BX = b (want in DI) + // 8(DX) = size (want in BX) + CMPQ AX, BX + JNE neq + MOVQ $1, AX // return 1 + RET +neq: + MOVQ AX, SI + MOVQ BX, DI + MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure + JMP memeqbody<>(SB) + +// Input: +// a in SI +// b in DI +// count in BX +// Output: +// result in AX +TEXT memeqbody<>(SB),NOSPLIT,$0-0 + CMPQ BX, $8 + JB small + CMPQ BX, $64 + JB bigloop +#ifndef hasAVX2 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JE hugeloop_avx2 + + // 64 bytes at a time using xmm registers +hugeloop: + CMPQ BX, $64 + JB bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffff + JEQ hugeloop + XORQ AX, AX // return 0 + RET +#endif + + // 64 bytes at a time using ymm registers +hugeloop_avx2: + CMPQ BX, $64 + JB bigloop_avx2 + VMOVDQU (SI), Y0 + VMOVDQU (DI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU 32(DI), Y3 + VPCMPEQB Y1, Y0, Y4 + VPCMPEQB Y2, Y3, Y5 + VPAND Y4, Y5, Y6 + VPMOVMSKB Y6, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffffffff + JEQ hugeloop_avx2 + VZEROUPPER + XORQ AX, AX // return 0 + RET + +bigloop_avx2: + VZEROUPPER + + // 8 bytes at a time using 64-bit register +bigloop: + CMPQ BX, $8 + JBE leftover + MOVQ (SI), CX + MOVQ (DI), DX + ADDQ $8, SI + ADDQ $8, DI + SUBQ $8, BX + CMPQ CX, DX + JEQ bigloop + XORQ AX, AX // return 0 + RET + + // remaining 0-8 bytes +leftover: + MOVQ -8(SI)(BX*1), CX + MOVQ -8(DI)(BX*1), DX + CMPQ CX, DX + SETEQ AX + RET + +small: + CMPQ BX, $0 + JEQ equal + + LEAQ 0(BX*8), CX + NEGQ CX + + CMPB SI, $0xf8 + JA si_high + + // load at SI won't cross a page boundary. + MOVQ (SI), SI + JMP si_finish +si_high: + // address ends in 11111xxx. Load up to bytes we want, move to correct position. + MOVQ -8(SI)(BX*1), SI + SHRQ CX, SI +si_finish: + + // same for DI. + CMPB DI, $0xf8 + JA di_high + MOVQ (DI), DI + JMP di_finish +di_high: + MOVQ -8(DI)(BX*1), DI + SHRQ CX, DI +di_finish: + + SUBQ SI, DI + SHLQ CX, DI +equal: + SETEQ AX + RET diff --git a/src/internal/bytealg/equal_arm.s b/src/internal/bytealg/equal_arm.s new file mode 100644 index 0000000..a6c4369 --- /dev/null +++ b/src/internal/bytealg/equal_arm.s @@ -0,0 +1,91 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-13 + MOVW a+0(FP), R0 + MOVW b+4(FP), R2 + CMP R0, R2 + B.EQ eq + MOVW size+8(FP), R1 + CMP $0, R1 + B.EQ eq // short path to handle 0-byte case + MOVW $ret+12(FP), R7 + B memeqbody<>(SB) +eq: + MOVW $1, R0 + MOVB R0, ret+12(FP) + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-9 + MOVW a+0(FP), R0 + MOVW b+4(FP), R2 + CMP R0, R2 + B.EQ eq + MOVW 4(R7), R1 // compiler stores size at offset 4 in the closure + CMP $0, R1 + B.EQ eq // short path to handle 0-byte case + MOVW $ret+8(FP), R7 + B memeqbody<>(SB) +eq: + MOVW $1, R0 + MOVB R0, ret+8(FP) + RET + +// Input: +// R0: data of a +// R1: length +// R2: data of b +// R7: points to return value +// +// On exit: +// R4, R5 and R6 are clobbered +TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 + CMP $1, R1 + B.EQ one // 1-byte special case for better performance + + CMP $4, R1 + ADD R0, R1 // R1 is the end of the range to compare + B.LT byte_loop // length < 4 + AND $3, R0, R6 + CMP $0, R6 + B.NE byte_loop // unaligned a, use byte-wise compare (TODO: try to align a) + AND $3, R2, R6 + CMP $0, R6 + B.NE byte_loop // unaligned b, use byte-wise compare + AND $0xfffffffc, R1, R6 + // length >= 4 +chunk4_loop: + MOVW.P 4(R0), R4 + MOVW.P 4(R2), R5 + CMP R4, R5 + B.NE notequal + CMP R0, R6 + B.NE chunk4_loop + CMP R0, R1 + B.EQ equal // reached the end +byte_loop: + MOVBU.P 1(R0), R4 + MOVBU.P 1(R2), R5 + CMP R4, R5 + B.NE notequal + CMP R0, R1 + B.NE byte_loop +equal: + MOVW $1, R0 + MOVB R0, (R7) + RET +one: + MOVBU (R0), R4 + MOVBU (R2), R5 + CMP R4, R5 + B.EQ equal +notequal: + MOVW $0, R0 + MOVB R0, (R7) + RET diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s new file mode 100644 index 0000000..d3aabba --- /dev/null +++ b/src/internal/bytealg/equal_arm64.s @@ -0,0 +1,121 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25 + // short path to handle 0-byte case + CBZ R2, equal + B memeqbody<>(SB) +equal: + MOVD $1, R0 + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17 + CMP R0, R1 + BEQ eq + MOVD 8(R26), R2 // compiler stores size at offset 8 in the closure + CBZ R2, eq + B memeqbody<>(SB) +eq: + MOVD $1, R0 + RET + +// input: +// R0: pointer a +// R1: pointer b +// R2: data len +// at return: result in R0 +TEXT memeqbody<>(SB),NOSPLIT,$0 + CMP $1, R2 + // handle 1-byte special case for better performance + BEQ one + CMP $16, R2 + // handle specially if length < 16 + BLO tail + BIC $0x3f, R2, R3 + CBZ R3, chunk16 + // work with 64-byte chunks + ADD R3, R0, R6 // end of chunks +chunk64_loop: + VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2] + VLD1.P (R1), [V4.D2, V5.D2, V6.D2, V7.D2] + VCMEQ V0.D2, V4.D2, V8.D2 + VCMEQ V1.D2, V5.D2, V9.D2 + VCMEQ V2.D2, V6.D2, V10.D2 + VCMEQ V3.D2, V7.D2, V11.D2 + VAND V8.B16, V9.B16, V8.B16 + VAND V8.B16, V10.B16, V8.B16 + VAND V8.B16, V11.B16, V8.B16 + CMP R0, R6 + VMOV V8.D[0], R4 + VMOV V8.D[1], R5 + CBZ R4, not_equal + CBZ R5, not_equal + BNE chunk64_loop + AND $0x3f, R2, R2 + CBZ R2, equal +chunk16: + // work with 16-byte chunks + BIC $0xf, R2, R3 + CBZ R3, tail + ADD R3, R0, R6 // end of chunks +chunk16_loop: + LDP.P 16(R0), (R4, R5) + LDP.P 16(R1), (R7, R9) + EOR R4, R7 + CBNZ R7, not_equal + EOR R5, R9 + CBNZ R9, not_equal + CMP R0, R6 + BNE chunk16_loop + AND $0xf, R2, R2 + CBZ R2, equal +tail: + // special compare of tail with length < 16 + TBZ $3, R2, lt_8 + MOVD (R0), R4 + MOVD (R1), R5 + EOR R4, R5 + CBNZ R5, not_equal + SUB $8, R2, R6 // offset of the last 8 bytes + MOVD (R0)(R6), R4 + MOVD (R1)(R6), R5 + EOR R4, R5 + CBNZ R5, not_equal + B equal +lt_8: + TBZ $2, R2, lt_4 + MOVWU (R0), R4 + MOVWU (R1), R5 + EOR R4, R5 + CBNZ R5, not_equal + SUB $4, R2, R6 // offset of the last 4 bytes + MOVWU (R0)(R6), R4 + MOVWU (R1)(R6), R5 + EOR R4, R5 + CBNZ R5, not_equal + B equal +lt_4: + TBZ $1, R2, lt_2 + MOVHU.P 2(R0), R4 + MOVHU.P 2(R1), R5 + CMP R4, R5 + BNE not_equal +lt_2: + TBZ $0, R2, equal +one: + MOVBU (R0), R4 + MOVBU (R1), R5 + CMP R4, R5 + BNE not_equal +equal: + MOVD $1, R0 + RET +not_equal: + MOVB ZR, R0 + RET diff --git a/src/internal/bytealg/equal_generic.go b/src/internal/bytealg/equal_generic.go new file mode 100644 index 0000000..59bdf8f --- /dev/null +++ b/src/internal/bytealg/equal_generic.go @@ -0,0 +1,18 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +// Equal reports whether a and b +// are the same length and contain the same bytes. +// A nil argument is equivalent to an empty slice. +// +// Equal is equivalent to bytes.Equal. +// It is provided here for convenience, +// because some packages cannot depend on bytes. +func Equal(a, b []byte) bool { + // Neither cmd/compile nor gccgo allocates for these string conversions. + // There is a test for this in package bytes. + return string(a) == string(b) +} diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s new file mode 100644 index 0000000..dcdde89 --- /dev/null +++ b/src/internal/bytealg/equal_loong64.s @@ -0,0 +1,52 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +#define REGCTXT R29 + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 + MOVV a+0(FP), R4 + MOVV b+8(FP), R5 + BEQ R4, R5, eq + MOVV size+16(FP), R6 + ADDV R4, R6, R7 +loop: + BNE R4, R7, test + MOVV $1, R4 + MOVB R4, ret+24(FP) + RET +test: + MOVBU (R4), R9 + ADDV $1, R4 + MOVBU (R5), R10 + ADDV $1, R5 + BEQ R9, R10, loop + + MOVB R0, ret+24(FP) + RET +eq: + MOVV $1, R4 + MOVB R4, ret+24(FP) + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17 + MOVV a+0(FP), R4 + MOVV b+8(FP), R5 + BEQ R4, R5, eq + MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure + MOVV R4, 8(R3) + MOVV R5, 16(R3) + MOVV R6, 24(R3) + JAL runtime·memequal(SB) + MOVBU 32(R3), R4 + MOVB R4, ret+16(FP) + RET +eq: + MOVV $1, R4 + MOVB R4, ret+16(FP) + RET diff --git a/src/internal/bytealg/equal_mips64x.s b/src/internal/bytealg/equal_mips64x.s new file mode 100644 index 0000000..d92f225 --- /dev/null +++ b/src/internal/bytealg/equal_mips64x.s @@ -0,0 +1,118 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips64 || mips64le + +#include "go_asm.h" +#include "textflag.h" + +#define REGCTXT R22 + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 + MOVV a+0(FP), R1 + MOVV b+8(FP), R2 + BEQ R1, R2, eq + MOVV size+16(FP), R3 + ADDV R1, R3, R4 + + // chunk size is 16 + SGTU $16, R3, R8 + BEQ R0, R8, chunk_entry + +byte_loop: + BNE R1, R4, byte_test + MOVV $1, R1 + MOVB R1, ret+24(FP) + RET +byte_test: + MOVBU (R1), R6 + ADDV $1, R1 + MOVBU (R2), R7 + ADDV $1, R2 + BEQ R6, R7, byte_loop + JMP not_eq + +chunk_entry: + // make sure both a and b are aligned + OR R1, R2, R9 + AND $0x7, R9 + BNE R0, R9, byte_loop + JMP chunk_loop_1 + +chunk_loop: + // chunk size is 16 + SGTU $16, R3, R8 + BNE R0, R8, chunk_tail_8 +chunk_loop_1: + MOVV (R1), R6 + MOVV (R2), R7 + BNE R6, R7, not_eq + MOVV 8(R1), R12 + MOVV 8(R2), R13 + ADDV $16, R1 + ADDV $16, R2 + SUBV $16, R3 + BEQ R12, R13, chunk_loop + JMP not_eq + +chunk_tail_8: + AND $8, R3, R14 + BEQ R0, R14, chunk_tail_4 + MOVV (R1), R6 + MOVV (R2), R7 + BNE R6, R7, not_eq + ADDV $8, R1 + ADDV $8, R2 + +chunk_tail_4: + AND $4, R3, R14 + BEQ R0, R14, chunk_tail_2 + MOVWU (R1), R6 + MOVWU (R2), R7 + BNE R6, R7, not_eq + ADDV $4, R1 + ADDV $4, R2 + +chunk_tail_2: + AND $2, R3, R14 + BEQ R0, R14, chunk_tail_1 + MOVHU (R1), R6 + MOVHU (R2), R7 + BNE R6, R7, not_eq + ADDV $2, R1 + ADDV $2, R2 + +chunk_tail_1: + AND $1, R3, R14 + BEQ R0, R14, eq + MOVBU (R1), R6 + MOVBU (R2), R7 + BEQ R6, R7, eq + +not_eq: + MOVB R0, ret+24(FP) + RET +eq: + MOVV $1, R1 + MOVB R1, ret+24(FP) + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17 + MOVV a+0(FP), R1 + MOVV b+8(FP), R2 + BEQ R1, R2, eq + MOVV 8(REGCTXT), R3 // compiler stores size at offset 8 in the closure + MOVV R1, 8(R29) + MOVV R2, 16(R29) + MOVV R3, 24(R29) + JAL runtime·memequal(SB) + MOVBU 32(R29), R1 + MOVB R1, ret+16(FP) + RET +eq: + MOVV $1, R1 + MOVB R1, ret+16(FP) + RET diff --git a/src/internal/bytealg/equal_mipsx.s b/src/internal/bytealg/equal_mipsx.s new file mode 100644 index 0000000..4c46dd4 --- /dev/null +++ b/src/internal/bytealg/equal_mipsx.s @@ -0,0 +1,62 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips || mipsle + +#include "go_asm.h" +#include "textflag.h" + +#define REGCTXT R22 + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB),NOSPLIT,$0-13 + MOVW a+0(FP), R1 + MOVW b+4(FP), R2 + BEQ R1, R2, eq + MOVW size+8(FP), R3 + ADDU R1, R3, R4 +loop: + BNE R1, R4, test + MOVW $1, R1 + MOVB R1, ret+12(FP) + RET +test: + MOVBU (R1), R6 + ADDU $1, R1 + MOVBU (R2), R7 + ADDU $1, R2 + BEQ R6, R7, loop + + MOVB R0, ret+12(FP) + RET +eq: + MOVW $1, R1 + MOVB R1, ret+12(FP) + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9 + MOVW a+0(FP), R1 + MOVW b+4(FP), R2 + BEQ R1, R2, eq + MOVW 4(REGCTXT), R3 // compiler stores size at offset 4 in the closure + ADDU R1, R3, R4 +loop: + BNE R1, R4, test + MOVW $1, R1 + MOVB R1, ret+8(FP) + RET +test: + MOVBU (R1), R6 + ADDU $1, R1 + MOVBU (R2), R7 + ADDU $1, R2 + BEQ R6, R7, loop + + MOVB R0, ret+8(FP) + RET +eq: + MOVW $1, R1 + MOVB R1, ret+8(FP) + RET diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go new file mode 100644 index 0000000..cf3a245 --- /dev/null +++ b/src/internal/bytealg/equal_native.go @@ -0,0 +1,21 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import "unsafe" + +// The declarations below generate ABI wrappers for functions +// implemented in assembly in this package but declared in another +// package. + +// The compiler generates calls to runtime.memequal and runtime.memequal_varlen. +// In addition, the runtime calls runtime.memequal explicitly. +// Those functions are implemented in this package. + +//go:linkname abigen_runtime_memequal runtime.memequal +func abigen_runtime_memequal(a, b unsafe.Pointer, size uintptr) bool + +//go:linkname abigen_runtime_memequal_varlen runtime.memequal_varlen +func abigen_runtime_memequal_varlen(a, b unsafe.Pointer) bool diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s new file mode 100644 index 0000000..f2c7cc1 --- /dev/null +++ b/src/internal/bytealg/equal_ppc64x.s @@ -0,0 +1,205 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64 || ppc64le + +#include "go_asm.h" +#include "textflag.h" + +// 4K (smallest case) page size offset mask for PPC64. +#define PAGE_OFFSET 4095 + +// TODO: At writing, ISEL and BC do not support CR bit type arguments, +// define them here for readability. +#define CR0LT 4*0+0 +#define CR0EQ 4*0+2 +#define CR1LT 4*1+0 +#define CR6LT 4*6+0 + +// Likewise, the BC opcode is hard to read, and no extended +// mnemonics are offered for these forms. +#define BGELR_CR6 BC 4, CR6LT, (LR) +#define BEQLR BC 12, CR0EQ, (LR) + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25 + // R3 = a + // R4 = b + // R5 = size + BR memeqbody<>(SB) + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17 + // R3 = a + // R4 = b + CMP R3, R4 + BEQ eq + MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure + BR memeqbody<>(SB) +eq: + MOVD $1, R3 + RET + +// Do an efficient memequal for ppc64 +// R3 = s1 +// R4 = s2 +// R5 = len +// On exit: +// R3 = return value +TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 + MOVD R3, R8 // Move s1 into R8 + ADD R5, R3, R9 // &s1[len(s1)] + ADD R5, R4, R10 // &s2[len(s2)] + MOVD $1, R11 + CMP R5, $16 // Use GPR checks for check for len <= 16 + BLE check0_16 + MOVD $0, R3 // Assume no-match in case BGELR CR6 returns + CMP R5, $32 // Use overlapping VSX loads for len <= 32 + BLE check17_32 // Do a pair of overlapping VSR compares + CMP R5, $64 + BLE check33_64 // Hybrid check + overlap compare. + +setup64: + SRD $6, R5, R6 // number of 64 byte chunks to compare + MOVD R6, CTR + MOVD $16, R14 // index for VSX loads and stores + MOVD $32, R15 + MOVD $48, R16 + ANDCC $0x3F, R5, R5 // len%64==0? + + PCALIGN $32 +loop64: + LXVD2X (R8+R0), V0 + LXVD2X (R4+R0), V1 + VCMPEQUBCC V0, V1, V2 // compare, setting CR6 + BGELR_CR6 + LXVD2X (R8+R14), V0 + LXVD2X (R4+R14), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + LXVD2X (R8+R15), V0 + LXVD2X (R4+R15), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + LXVD2X (R8+R16), V0 + LXVD2X (R4+R16), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + ADD $64,R8 // bump up to next 64 + ADD $64,R4 + BDNZ loop64 + + ISEL $CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0. + BEQLR // return if no tail. + + ADD $-64, R9, R8 + ADD $-64, R10, R4 + LXVD2X (R8+R0), V0 + LXVD2X (R4+R0), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + LXVD2X (R8+R14), V0 + LXVD2X (R4+R14), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + LXVD2X (R8+R15), V0 + LXVD2X (R4+R15), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + LXVD2X (R8+R16), V0 + LXVD2X (R4+R16), V1 + VCMPEQUBCC V0, V1, V2 + ISEL $CR6LT, R11, R0, R3 + RET + +check33_64: + // Bytes 0-15 + LXVD2X (R8+R0), V0 + LXVD2X (R4+R0), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + ADD $16, R8 + ADD $16, R4 + + // Bytes 16-31 + LXVD2X (R8+R0), V0 + LXVD2X (R4+R0), V1 + VCMPEQUBCC V0, V1, V2 + BGELR_CR6 + + // A little tricky, but point R4,R8 to &sx[len-32], + // and reuse check17_32 to check the next 1-31 bytes (with some overlap) + ADD $-32, R9, R8 + ADD $-32, R10, R4 + // Fallthrough + +check17_32: + LXVD2X (R8+R0), V0 + LXVD2X (R4+R0), V1 + VCMPEQUBCC V0, V1, V2 + ISEL $CR6LT, R11, R0, R5 + + // Load sX[len(sX)-16:len(sX)] and compare. + ADD $-16, R9 + ADD $-16, R10 + LXVD2X (R9+R0), V0 + LXVD2X (R10+R0), V1 + VCMPEQUBCC V0, V1, V2 + ISEL $CR6LT, R5, R0, R3 + RET + +check0_16: + CMP R5, $8 + BLT check0_7 + // Load sX[0:7] and compare. + MOVD (R8), R6 + MOVD (R4), R7 + CMP R6, R7 + ISEL $CR0EQ, R11, R0, R5 + // Load sX[len(sX)-8:len(sX)] and compare. + MOVD -8(R9), R6 + MOVD -8(R10), R7 + CMP R6, R7 + ISEL $CR0EQ, R5, R0, R3 + RET + +check0_7: + CMP R5,$0 + MOVD $1, R3 + BEQLR // return if len == 0 + + // Check < 8B loads with a single compare, but select the load address + // such that it cannot cross a page boundary. Load a few bytes from the + // lower address if that does not cross the lower page. Or, load a few + // extra bytes from the higher addresses. And align those values + // consistently in register as either address may have differing + // alignment requirements. + ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET + ANDCC $PAGE_OFFSET, R4, R9 + SUBC R5, $8, R12 // 8-len + SLD $3, R12, R14 // (8-len)*8 + CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower? + CMPU R9, R12, CR0 + SUB R12, R8, R6 // compute lower load address + SUB R12, R4, R9 + ISEL $CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len)) + ISEL $CR0LT, R4, R9, R4 // Similar for s2 + MOVD (R8), R15 + MOVD (R4), R16 + SLD R14, R15, R7 + SLD R14, R16, R17 + SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts) + SRD R14, R17, R17 + SRD R14, R15, R6 // Clear the lower (8-len) bytes + SRD R14, R16, R9 +#ifdef GOARCH_ppc64le + ISEL $CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment + ISEL $CR0LT, R17, R9, R4 +#else + ISEL $CR1LT, R6, R7, R8 + ISEL $CR0LT, R9, R17, R4 +#endif + CMP R4, R8 + ISEL $CR0EQ, R11, R0, R3 + RET diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s new file mode 100644 index 0000000..3834083 --- /dev/null +++ b/src/internal/bytealg/equal_riscv64.s @@ -0,0 +1,124 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +#define CTXT S10 + +// func memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25 + // X10 = a_base + // X11 = b_base + // X12 = size + JMP memequal<>(SB) + +// func memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17 + MOV 8(CTXT), X12 // compiler stores size at offset 8 in the closure + // X10 = a_base + // X11 = b_base + JMP memequal<>(SB) + +// On entry X10 and X11 contain pointers, X12 contains length. +// For non-regabi X13 contains address for return value. +// For regabi return value in X10. +TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0 + BEQ X10, X11, eq + + MOV $32, X23 + BLT X12, X23, loop4_check + + // Check alignment - if alignment differs we have to do one byte at a time. + AND $7, X10, X9 + AND $7, X11, X19 + BNE X9, X19, loop4_check + BEQZ X9, loop32_check + + // Check one byte at a time until we reach 8 byte alignment. + SUB X9, X12, X12 +align: + ADD $-1, X9 + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 + BNE X19, X20, not_eq + ADD $1, X10 + ADD $1, X11 + BNEZ X9, align + +loop32_check: + MOV $32, X9 + BLT X12, X9, loop16_check +loop32: + MOV 0(X10), X19 + MOV 0(X11), X20 + MOV 8(X10), X21 + MOV 8(X11), X22 + BNE X19, X20, not_eq + BNE X21, X22, not_eq + MOV 16(X10), X14 + MOV 16(X11), X15 + MOV 24(X10), X16 + MOV 24(X11), X17 + BNE X14, X15, not_eq + BNE X16, X17, not_eq + ADD $32, X10 + ADD $32, X11 + ADD $-32, X12 + BGE X12, X9, loop32 + BEQZ X12, eq + +loop16_check: + MOV $16, X23 + BLT X12, X23, loop4_check +loop16: + MOV 0(X10), X19 + MOV 0(X11), X20 + MOV 8(X10), X21 + MOV 8(X11), X22 + BNE X19, X20, not_eq + BNE X21, X22, not_eq + ADD $16, X10 + ADD $16, X11 + ADD $-16, X12 + BGE X12, X23, loop16 + BEQZ X12, eq + +loop4_check: + MOV $4, X23 + BLT X12, X23, loop1 +loop4: + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 + MOVBU 1(X10), X21 + MOVBU 1(X11), X22 + BNE X19, X20, not_eq + BNE X21, X22, not_eq + MOVBU 2(X10), X14 + MOVBU 2(X11), X15 + MOVBU 3(X10), X16 + MOVBU 3(X11), X17 + BNE X14, X15, not_eq + BNE X16, X17, not_eq + ADD $4, X10 + ADD $4, X11 + ADD $-4, X12 + BGE X12, X23, loop4 + +loop1: + BEQZ X12, eq + MOVBU 0(X10), X19 + MOVBU 0(X11), X20 + BNE X19, X20, not_eq + ADD $1, X10 + ADD $1, X11 + ADD $-1, X12 + JMP loop1 + +not_eq: + MOVB ZERO, X10 + RET +eq: + MOV $1, X10 + RET diff --git a/src/internal/bytealg/equal_s390x.s b/src/internal/bytealg/equal_s390x.s new file mode 100644 index 0000000..67f814d --- /dev/null +++ b/src/internal/bytealg/equal_s390x.s @@ -0,0 +1,92 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 + MOVD a+0(FP), R3 + MOVD b+8(FP), R5 + MOVD size+16(FP), R6 + LA ret+24(FP), R7 + BR memeqbody<>(SB) + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17 + MOVD a+0(FP), R3 + MOVD b+8(FP), R5 + MOVD 8(R12), R6 // compiler stores size at offset 8 in the closure + LA ret+16(FP), R7 + BR memeqbody<>(SB) + +// input: +// R3 = a +// R5 = b +// R6 = len +// R7 = address of output byte (stores 0 or 1 here) +// a and b have the same length +TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0 + CMPBEQ R3, R5, equal +loop: + CMPBEQ R6, $0, equal + CMPBLT R6, $32, tiny + CMP R6, $256 + BLT tail + CLC $256, 0(R3), 0(R5) + BNE notequal + SUB $256, R6 + LA 256(R3), R3 + LA 256(R5), R5 + BR loop +tail: + SUB $1, R6, R8 + EXRL $memeqbodyclc<>(SB), R8 + BEQ equal +notequal: + MOVB $0, 0(R7) + RET +equal: + MOVB $1, 0(R7) + RET +tiny: + MOVD $0, R2 + CMPBLT R6, $16, lt16 + MOVD 0(R3), R8 + MOVD 0(R5), R9 + CMPBNE R8, R9, notequal + MOVD 8(R3), R8 + MOVD 8(R5), R9 + CMPBNE R8, R9, notequal + LA 16(R2), R2 + SUB $16, R6 +lt16: + CMPBLT R6, $8, lt8 + MOVD 0(R3)(R2*1), R8 + MOVD 0(R5)(R2*1), R9 + CMPBNE R8, R9, notequal + LA 8(R2), R2 + SUB $8, R6 +lt8: + CMPBLT R6, $4, lt4 + MOVWZ 0(R3)(R2*1), R8 + MOVWZ 0(R5)(R2*1), R9 + CMPBNE R8, R9, notequal + LA 4(R2), R2 + SUB $4, R6 +lt4: +#define CHECK(n) \ + CMPBEQ R6, $n, equal \ + MOVB n(R3)(R2*1), R8 \ + MOVB n(R5)(R2*1), R9 \ + CMPBNE R8, R9, notequal + CHECK(0) + CHECK(1) + CHECK(2) + CHECK(3) + BR equal + +TEXT memeqbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0 + CLC $1, 0(R3), 0(R5) + RET diff --git a/src/internal/bytealg/equal_wasm.s b/src/internal/bytealg/equal_wasm.s new file mode 100644 index 0000000..a2b76c1 --- /dev/null +++ b/src/internal/bytealg/equal_wasm.s @@ -0,0 +1,77 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// memequal(p, q unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB), NOSPLIT, $0-25 + Get SP + I64Load a+0(FP) + I64Load b+8(FP) + I64Load size+16(FP) + Call memeqbody<>(SB) + I64Store8 ret+24(FP) + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB), NOSPLIT, $0-17 + Get SP + I64Load a+0(FP) + I64Load b+8(FP) + I64Load 8(CTXT) // compiler stores size at offset 8 in the closure + Call memeqbody<>(SB) + I64Store8 ret+16(FP) + RET + +// params: a, b, len +// ret: 0/1 +TEXT memeqbody<>(SB), NOSPLIT, $0-0 + Get R0 + Get R1 + I64Eq + If + I64Const $1 + Return + End + +loop: + Loop + Get R2 + I64Eqz + If + I64Const $1 + Return + End + + Get R0 + I32WrapI64 + I64Load8U $0 + Get R1 + I32WrapI64 + I64Load8U $0 + I64Ne + If + I64Const $0 + Return + End + + Get R0 + I64Const $1 + I64Add + Set R0 + + Get R1 + I64Const $1 + I64Add + Set R1 + + Get R2 + I64Const $1 + I64Sub + Set R2 + + Br loop + End + UNDEF diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go new file mode 100644 index 0000000..c7a1941 --- /dev/null +++ b/src/internal/bytealg/index_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import "internal/cpu" + +const MaxBruteForce = 64 + +func init() { + if cpu.X86.HasAVX2 { + MaxLen = 63 + } else { + MaxLen = 31 + } +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s new file mode 100644 index 0000000..0431491 --- /dev/null +++ b/src/internal/bytealg/index_amd64.s @@ -0,0 +1,276 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+24(FP), R8 + MOVQ b_len+32(FP), AX + MOVQ DI, R10 + LEAQ ret+48(FP), R11 + JMP indexbody<>(SB) + +TEXT ·IndexString(SB),NOSPLIT,$0-40 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+16(FP), R8 + MOVQ b_len+24(FP), AX + MOVQ DI, R10 + LEAQ ret+32(FP), R11 + JMP indexbody<>(SB) + +// AX: length of string, that we are searching for +// DX: length of string, in which we are searching +// DI: pointer to string, in which we are searching +// R8: pointer to string, that we are searching for +// R11: address, where to put return value +// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them +TEXT indexbody<>(SB),NOSPLIT,$0 + CMPQ AX, DX + JA fail + CMPQ DX, $16 + JAE sse42 +no_sse42: + CMPQ AX, $2 + JA _3_or_more + MOVW (R8), R8 + LEAQ -1(DI)(DX*1), DX +loop2: + MOVW (DI), SI + CMPW SI,R8 + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop2 + JMP fail +_3_or_more: + CMPQ AX, $3 + JA _4_or_more + MOVW 1(R8), BX + MOVW (R8), R8 + LEAQ -2(DI)(DX*1), DX +loop3: + MOVW (DI), SI + CMPW SI,R8 + JZ partial_success3 + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +partial_success3: + MOVW 1(DI), SI + CMPW SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +_4_or_more: + CMPQ AX, $4 + JA _5_or_more + MOVL (R8), R8 + LEAQ -3(DI)(DX*1), DX +loop4: + MOVL (DI), SI + CMPL SI,R8 + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop4 + JMP fail +_5_or_more: + CMPQ AX, $7 + JA _8_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVL -4(R8)(AX*1), BX + MOVL (R8), R8 +loop5to7: + MOVL (DI), SI + CMPL SI,R8 + JZ partial_success5to7 + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +partial_success5to7: + MOVL -4(AX)(DI*1), SI + CMPL SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +_8_or_more: + CMPQ AX, $8 + JA _9_or_more + MOVQ (R8), R8 + LEAQ -7(DI)(DX*1), DX +loop8: + MOVQ (DI), SI + CMPQ SI,R8 + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop8 + JMP fail +_9_or_more: + CMPQ AX, $15 + JA _16_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVQ -8(R8)(AX*1), BX + MOVQ (R8), R8 +loop9to15: + MOVQ (DI), SI + CMPQ SI,R8 + JZ partial_success9to15 + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +partial_success9to15: + MOVQ -8(AX)(DI*1), SI + CMPQ SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +_16_or_more: + CMPQ AX, $16 + JA _17_or_more + MOVOU (R8), X1 + LEAQ -15(DI)(DX*1), DX +loop16: + MOVOU (DI), X2 + PCMPEQB X1, X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop16 + JMP fail +_17_or_more: + CMPQ AX, $31 + JA _32_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVOU -16(R8)(AX*1), X0 + MOVOU (R8), X1 +loop17to31: + MOVOU (DI), X2 + PCMPEQB X1,X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE partial_success17to31 + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +partial_success17to31: + MOVOU -16(AX)(DI*1), X3 + PCMPEQB X0, X3 + PMOVMSKB X3, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 +// So no need to check cpuid +_32_or_more: + CMPQ AX, $32 + JA _33_to_63 + VMOVDQU (R8), Y1 + LEAQ -31(DI)(DX*1), DX +loop32: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop32 + JMP fail_avx2 +_33_to_63: + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + VMOVDQU -32(R8)(AX*1), Y0 + VMOVDQU (R8), Y1 +loop33to63: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE partial_success33to63 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 + JMP fail_avx2 +partial_success33to63: + VMOVDQU -32(AX)(DI*1), Y3 + VPCMPEQB Y0, Y3, Y4 + VPMOVMSKB Y4, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 +fail_avx2: + VZEROUPPER +fail: + MOVQ $-1, (R11) + RET +success_avx2: + VZEROUPPER + JMP success +sse42: +#ifndef hasSSE42 + CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1 + JNE no_sse42 +#endif + CMPQ AX, $12 + // PCMPESTRI is slower than normal compare, + // so using it makes sense only if we advance 4+ bytes per compare + // This value was determined experimentally and is the ~same + // on Nehalem (first with SSE42) and Haswell. + JAE _9_or_more + LEAQ 16(R8), SI + TESTW $0xff0, SI + JEQ no_sse42 + MOVOU (R8), X1 + LEAQ -15(DI)(DX*1), SI + MOVQ $16, R9 + SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 +loop_sse42: + // 0x0c means: unsigned byte compare (bits 0,1 are 00) + // for equality (bits 2,3 are 11) + // result is not masked or inverted (bits 4,5 are 00) + // and corresponds to first matching byte (bit 6 is 0) + PCMPESTRI $0x0c, (DI), X1 + // CX == 16 means no match, + // CX > R9 means partial match at the end of the string, + // otherwise sep is at offset CX from X1 start + CMPQ CX, R9 + JBE sse42_success + ADDQ R9, DI + CMPQ DI, SI + JB loop_sse42 + PCMPESTRI $0x0c, -1(SI), X1 + CMPQ CX, R9 + JA fail + LEAQ -1(SI), DI +sse42_success: + ADDQ CX, DI +success: + SUBQ R10, DI + MOVQ DI, (R11) + RET diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go new file mode 100644 index 0000000..e87c109 --- /dev/null +++ b/src/internal/bytealg/index_arm64.go @@ -0,0 +1,23 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +// Empirical data shows that using Index can get better +// performance when len(s) <= 16. +const MaxBruteForce = 16 + +func init() { + // Optimize cases where the length of the substring is less than 32 bytes + MaxLen = 32 +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 16 characters, plus a few slop to start. + return 4 + n>>4 +} diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s new file mode 100644 index 0000000..3a551a7 --- /dev/null +++ b/src/internal/bytealg/index_arm64.s @@ -0,0 +1,206 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+24(FP), R2 + MOVD b_len+32(FP), R3 + MOVD $ret+48(FP), R9 + B indexbody<>(SB) + +TEXT ·IndexString(SB),NOSPLIT,$0-40 + MOVD a_base+0(FP), R0 + MOVD a_len+8(FP), R1 + MOVD b_base+16(FP), R2 + MOVD b_len+24(FP), R3 + MOVD $ret+32(FP), R9 + B indexbody<>(SB) + +// input: +// R0: haystack +// R1: length of haystack +// R2: needle +// R3: length of needle (2 <= len <= 32) +// R9: address to put result +TEXT indexbody<>(SB),NOSPLIT,$0-56 + // main idea is to load 'sep' into separate register(s) + // to avoid repeatedly re-load it again and again + // for sebsequent substring comparisons + SUB R3, R1, R4 + // R4 contains the start of last substring for comparison + ADD R0, R4, R4 + ADD $1, R0, R8 + + CMP $8, R3 + BHI greater_8 + TBZ $3, R3, len_2_7 +len_8: + // R5 contains 8-byte of sep + MOVD (R2), R5 +loop_8: + // R6 contains substring for comparison + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R6 + CMP R5, R6 + BNE loop_8 + B found +len_2_7: + TBZ $2, R3, len_2_3 + TBZ $1, R3, len_4_5 + TBZ $0, R3, len_6 +len_7: + // R5 and R6 contain 7-byte of sep + MOVWU (R2), R5 + // 1-byte overlap with R5 + MOVWU 3(R2), R6 +loop_7: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE loop_7 + MOVWU 2(R0), R3 + CMP R6, R3 + BNE loop_7 + B found +len_6: + // R5 and R6 contain 6-byte of sep + MOVWU (R2), R5 + MOVHU 4(R2), R6 +loop_6: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE loop_6 + MOVHU 3(R0), R3 + CMP R6, R3 + BNE loop_6 + B found +len_4_5: + TBZ $0, R3, len_4 +len_5: + // R5 and R7 contain 5-byte of sep + MOVWU (R2), R5 + MOVBU 4(R2), R7 +loop_5: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R3 + CMP R5, R3 + BNE loop_5 + MOVBU 3(R0), R3 + CMP R7, R3 + BNE loop_5 + B found +len_4: + // R5 contains 4-byte of sep + MOVWU (R2), R5 +loop_4: + CMP R4, R0 + BHI not_found + MOVWU.P 1(R0), R6 + CMP R5, R6 + BNE loop_4 + B found +len_2_3: + TBZ $0, R3, len_2 +len_3: + // R6 and R7 contain 3-byte of sep + MOVHU (R2), R6 + MOVBU 2(R2), R7 +loop_3: + CMP R4, R0 + BHI not_found + MOVHU.P 1(R0), R3 + CMP R6, R3 + BNE loop_3 + MOVBU 1(R0), R3 + CMP R7, R3 + BNE loop_3 + B found +len_2: + // R5 contains 2-byte of sep + MOVHU (R2), R5 +loop_2: + CMP R4, R0 + BHI not_found + MOVHU.P 1(R0), R6 + CMP R5, R6 + BNE loop_2 +found: + SUB R8, R0, R0 + MOVD R0, (R9) + RET +not_found: + MOVD $-1, R0 + MOVD R0, (R9) + RET +greater_8: + SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes + CMP $16, R3 + BHI greater_16 +len_9_16: + MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep + SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes + MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep +loop_9_16: + // search the first 8 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R7 + CMP R5, R7 + BNE loop_9_16 + MOVD (R0)(R11), R7 + CMP R6, R7 // compare the last 8 bytes + BNE loop_9_16 + B found +greater_16: + CMP $24, R3 + BHI len_25_32 +len_17_24: + LDP.P 16(R2), (R5, R6) // R5 and R6 contain the first 16-byte of sep + SUB $24, R3, R10 // len(sep) - 24 + MOVD (R2)(R10), R7 // R7 contains the last 8-byte of sep +loop_17_24: + // search the first 16 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R10 + CMP R5, R10 + BNE loop_17_24 + MOVD 7(R0), R10 + CMP R6, R10 + BNE loop_17_24 + MOVD (R0)(R11), R10 + CMP R7, R10 // compare the last 8 bytes + BNE loop_17_24 + B found +len_25_32: + LDP.P 16(R2), (R5, R6) + MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep + SUB $32, R3, R12 // len(sep) - 32 + MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep +loop_25_32: + // search the first 24 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R12 + CMP R5, R12 + BNE loop_25_32 + MOVD 7(R0), R12 + CMP R6, R12 + BNE loop_25_32 + MOVD 15(R0), R12 + CMP R7, R12 + BNE loop_25_32 + MOVD (R0)(R11), R12 + CMP R10, R12 // compare the last 8 bytes + BNE loop_25_32 + B found diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go new file mode 100644 index 0000000..a59e329 --- /dev/null +++ b/src/internal/bytealg/index_generic.go @@ -0,0 +1,29 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 + +package bytealg + +const MaxBruteForce = 0 + +// Index returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func Index(a, b []byte) int { + panic("unimplemented") +} + +// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func IndexString(a, b string) int { + panic("unimplemented") +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + panic("unimplemented") +} diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go new file mode 100644 index 0000000..6e4a2f3 --- /dev/null +++ b/src/internal/bytealg/index_native.go @@ -0,0 +1,19 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 || arm64 || s390x || ppc64le || ppc64 + +package bytealg + +//go:noescape + +// Index returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func Index(a, b []byte) int + +//go:noescape + +// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a. +// Requires 2 <= len(b) <= MaxLen. +func IndexString(a, b string) int diff --git a/src/internal/bytealg/index_ppc64x.go b/src/internal/bytealg/index_ppc64x.go new file mode 100644 index 0000000..ab3cbe5 --- /dev/null +++ b/src/internal/bytealg/index_ppc64x.go @@ -0,0 +1,26 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (aix || linux) && (ppc64 || ppc64le) + +package bytealg + +import "internal/cpu" + +const MaxBruteForce = 16 + +var SupportsPower9 = cpu.PPC64.IsPOWER9 + +func init() { + MaxLen = 32 +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s new file mode 100644 index 0000000..26205ce --- /dev/null +++ b/src/internal/bytealg/index_ppc64x.s @@ -0,0 +1,802 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is an implementation based on the s390x +// implementation. + +// Find a separator with 2 <= len <= 32 within a string. +// Separators with lengths of 2, 3 or 4 are handled +// specially. + +// This works on power8 and above. The loads and +// compares are done in big endian order +// since that allows the used of VCLZD, and allows +// the same implementation to work on big and little +// endian platforms with minimal conditional changes. + +// NOTE: There is a power9 implementation that +// improves performance by 10-15% on little +// endian for some of the benchmarks. +// Unrolled index2to16 loop by 4 on ppc64le/power9 +// Work is still needed for a big endian +// implementation on power9. + +//go:build ppc64 || ppc64le + +#include "go_asm.h" +#include "textflag.h" + +// Needed to swap LXVD2X loads to the correct +// byte order to work on POWER8. + +#ifdef GOARCH_ppc64 +DATA byteswap<>+0(SB)/8, $0x0001020304050607 +DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f +#else +DATA byteswap<>+0(SB)/8, $0x0706050403020100 +DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 +#endif + +// Load bytes in big endian order. Address +// alignment does not need checking. +#define VLOADSWAP(base, index, vreg, vsreg) \ + LXVD2X (base)(index), vsreg; \ + VPERM vreg, vreg, SWAP, vreg + +GLOBL byteswap<>+0(SB), RODATA, $16 + +TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56 + // R3 = byte array pointer + // R4 = length + MOVD R6, R5 // R5 = separator pointer + MOVD R7, R6 // R6 = separator length + +#ifdef GOARCH_ppc64le + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 + CMP R7, $1 + BNE power8 + BR indexbodyp9<>(SB) +#endif +power8: + BR indexbody<>(SB) + +TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // R3 = string + // R4 = length + // R5 = separator pointer + // R6 = separator length + +#ifdef GOARCH_ppc64le + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7 + CMP R7, $1 + BNE power8 + BR indexbodyp9<>(SB) + +#endif +power8: + BR indexbody<>(SB) + + // s: string we are searching + // sep: string to search for + // R3=&s[0], R4=len(s) + // R5=&sep[0], R6=len(sep) + // R14=&ret (index where sep found) + // R7=working addr of string + // R16=index value 16 + // R17=index value 17 + // R18=index value 18 + // R19=index value 1 + // R26=LASTBYTE of string + // R27=LASTSTR last start byte to compare with sep + // R8, R9 scratch + // V0=sep left justified zero fill + // CR4=sep length >= 16 + +#define SEPMASK V17 +#define LASTBYTE R26 +#define LASTSTR R27 +#define ONES V20 +#define SWAP V21 +#define SWAP_ VS53 +TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0 + CMP R6, R4 // Compare lengths + BGT notfound // If sep len is > string, notfound + ADD R4, R3, LASTBYTE // find last byte addr + SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index) + CMP R6, $0 // Check sep len + BEQ notfound // sep len 0 -- not found + MOVD R3, R7 // Copy of string addr + MOVD $16, R16 // Index value 16 + MOVD $17, R17 // Index value 17 + MOVD $18, R18 // Index value 18 + MOVD $1, R19 // Index value 1 + MOVD $byteswap<>+00(SB), R8 + VSPLTISB $0xFF, ONES // splat all 1s + LXVD2X (R8)(R0), SWAP_ // Set up swap string + + CMP R6, $16, CR4 // CR4 for len(sep) >= 16 + VOR ONES, ONES, SEPMASK // Set up full SEPMASK + BGE CR4, loadge16 // Load for len(sep) >= 16 + SUB R6, R16, R9 // 16-len of sep + SLD $3, R9 // Set up for VSLO + MTVSRD R9, V9 // Set up for VSLO + VSLDOI $8, V9, V9, V9 // Set up for VSLO + VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16 + +loadge16: + ANDCC $15, R5, R9 // Find byte offset of sep + ADD R9, R6, R10 // Add sep len + CMP R10, $16 // Check if sep len+offset > 16 + BGT sepcross16 // Sep crosses 16 byte boundary + + RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container + VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0 + SLD $3, R9 // Set up shift count for VSLO + MTVSRD R9, V8 // Set up shift count for VSLO + VSLDOI $8, V8, V8, V8 + VSLO V0, V8, V0 // Shift by start byte + + VAND V0, SEPMASK, V0 // Mask separator (< 16) + BR index2plus + +sepcross16: + VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0 + + VAND V0, SEPMASK, V0 // mask out separator + BLE CR4, index2to16 + BR index17plus // Handle sep > 16 + +index2plus: + CMP R6, $2 // Check length of sep + BNE index3plus // If not 2, check for 3 + ADD $16, R7, R9 // Check if next 16 bytes past last + CMP R9, LASTBYTE // compare with last + BGE index2to16 // 2 <= len(string) <= 16 + MOVD $0xff00, R21 // Mask for later + MTVSRD R21, V25 // Move to Vreg + VSPLTH $3, V25, V31 // Splat mask + VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep + VSPLTISB $0, V10 // Clear V10 + + // First case: 2 byte separator + // V1: 2 byte separator splatted + // V2: 16 bytes at addr + // V4: 16 bytes at addr+1 + // Compare 2 byte separator at start + // and at start+1. Use VSEL to combine + // those results to find the first + // matching start byte, returning + // that value when found. Loop as + // long as len(string) > 16 +index2loop2: + VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3 + +index2loop: + VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2 + VCMPEQUH V1, V2, V5 // Search for sep + VCMPEQUH V1, V3, V6 // Search for sep offset by 1 + VSEL V6, V5, V31, V7 // merge even and odd indices + VCLZD V7, V18 // find index of first match + MFVSRD V18, R25 // get first value + CMP R25, $64 // Found if < 64 + BLT foundR25 // Return byte index where found + VSLDOI $8, V18, V18, V18 // Adjust 2nd value + MFVSRD V18, R25 // get second value + CMP R25, $64 // Found if < 64 + ADD $64, R25 // Update byte offset + BLT foundR25 // Return value + ADD $16, R7 // R7+=16 Update string pointer + ADD $17, R7, R9 // R9=F7+17 since loop unrolled + CMP R9, LASTBYTE // Compare addr+17 against last byte + BLT index2loop2 // If < last, continue loop + CMP R7, LASTBYTE // Compare addr+16 against last byte + BLT index2to16 // If < 16 handle specially + VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3 + VSLDOI $1, V3, V10, V3 // Shift left by 1 byte + BR index2loop + +index3plus: + CMP R6, $3 // Check if sep == 3 + BNE index4plus // If not check larger + ADD $19, R7, R9 // Find bytes for use in this loop + CMP R9, LASTBYTE // Compare against last byte + BGE index2to16 // Remaining string 2<=len<=16 + MOVD $0xff00, R21 // Set up mask for upcoming loop + MTVSRD R21, V25 // Move mask to Vreg + VSPLTH $3, V25, V31 // Splat mask + VSPLTH $0, V0, V1 // Splat 1st two bytes of sep + VSPLTB $2, V0, V8 // Splat 3rd byte of sep + + // Loop to process 3 byte separator. + // string[0:16] is in V2 + // string[2:18] is in V3 + // sep[0:2] splatted in V1 + // sec[3] splatted in v8 + // Load vectors at string, string+1 + // and string+2. Compare string, string+1 + // against first 2 bytes of separator + // splatted, and string+2 against 3rd + // byte splatted. Merge the results with + // VSEL to find the first byte of a match. + + // Special handling for last 16 bytes if the + // string fits in 16 byte multiple. +index3loop2: + MOVD $2, R21 // Set up index for 2 + VSPLTISB $0, V10 // Clear V10 + VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3 + VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes + +index3loop: + VLOADSWAP(R7, R0, V2, V2) // Load with correct order + VSLDOI $1, V2, V3, V4 // string[1:17] + VSLDOI $2, V2, V3, V9 // string[2:18] + VCMPEQUH V1, V2, V5 // compare hw even indices + VCMPEQUH V1, V4, V6 // compare hw odd indices + VCMPEQUB V8, V9, V10 // compare 3rd to last byte + VSEL V6, V5, V31, V7 // Find 1st matching byte using mask + VAND V7, V10, V7 // AND matched bytes with matched 3rd byte + VCLZD V7, V18 // Find first nonzero indexes + MFVSRD V18, R25 // Move 1st doubleword + CMP R25, $64 // If < 64 found + BLT foundR25 // Return matching index + VSLDOI $8, V18, V18, V18 // Move value + MFVSRD V18, R25 // Move 2nd doubleword + CMP R25, $64 // If < 64 found + ADD $64, R25 // Update byte index + BLT foundR25 // Return matching index + ADD $16, R7 // R7+=16 string ptr + ADD $19, R7, R9 // Number of string bytes for loop + CMP R9, LASTBYTE // Compare against last byte of string + BLT index3loop2 // If within, continue this loop + CMP R7, LASTSTR // Compare against last start byte + BLT index2to16 // Process remainder + VSPLTISB $0, V3 // Special case for last 16 bytes + BR index3loop // Continue this loop + + // Loop to process 4 byte separator + // string[0:16] in V2 + // string[3:16] in V3 + // sep[0:4] splatted in V1 + // Set up vectors with strings at offsets + // 0, 1, 2, 3 and compare against the 4 byte + // separator also splatted. Use VSEL with the + // compare results to find the first byte where + // a separator match is found. +index4plus: + CMP R6, $4 // Check if 4 byte separator + BNE index5plus // If not next higher + ADD $20, R7, R9 // Check string size to load + CMP R9, LASTBYTE // Verify string length + BGE index2to16 // If not large enough, process remaining + MOVD $2, R15 // Set up index + + // Set up masks for use with VSEL + MOVD $0xff, R21 // Set up mask 0xff000000ff000000... + SLD $24, R21 + MTVSRD R21, V10 + VSPLTW $1, V10, V29 + VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00... + MOVD $0xffff, R21 + SLD $16, R21 + MTVSRD R21, V10 + VSPLTW $1, V10, V31 // Mask 0xffff0000ffff0000... + VSPLTW $0, V0, V1 // Splat 1st word of separator + +index4loop: + VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2 + +next4: + VSPLTISB $0, V10 // Clear + MOVD $3, R9 // Number of bytes beyond 16 + VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+3 into V3 + VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes + VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1 + VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2 + VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3 + VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep + VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep + VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep + VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep + VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask + VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask + VSEL V14, V13, V31, V7 // final merge + VCLZD V7, V18 // Find first index for each half + MFVSRD V18, R25 // Isolate value + CMP R25, $64 // If < 64, found + BLT foundR25 // Return found index + VSLDOI $8, V18, V18, V18 // Move for MFVSRD + MFVSRD V18, R25 // Isolate other value + CMP R25, $64 // If < 64, found + ADD $64, R25 // Update index for high doubleword + BLT foundR25 // Return found index + ADD $16, R7 // R7+=16 for next string + ADD $20, R7, R9 // R+20 for all bytes to load + CMP R9, LASTBYTE // Past end? Maybe check for extra? + BLT index4loop // If not, continue loop + CMP R7, LASTSTR // Check remainder + BLE index2to16 // Process remainder + BR notfound // Not found + +index5plus: + CMP R6, $16 // Check for sep > 16 + BGT index17plus // Handle large sep + + // Assumption is that the separator is smaller than the string at this point +index2to16: + CMP R7, LASTSTR // Compare last start byte + BGT notfound // last takes len(sep) into account + + ADD $16, R7, R9 // Check for last byte of string + CMP R9, LASTBYTE + BGT index2to16tail + + // At least 16 bytes of string left + // Mask the number of bytes in sep +index2to16loop: + VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1 + +compare: + VAND V1, SEPMASK, V2 // Mask out sep size + VCMPEQUBCC V0, V2, V3 // Compare masked string + BLT CR6, found // All equal + ADD $1, R7 // Update ptr to next byte + CMP R7, LASTSTR // Still less than last start byte + BGT notfound // Not found + ADD $16, R7, R9 // Verify remaining bytes + CMP R9, LASTBYTE // At least 16 + BLT index2to16loop // Try again + + // Less than 16 bytes remaining in string + // Separator >= 2 +index2to16tail: + ADD R3, R4, R9 // End of string + SUB R7, R9, R9 // Number of bytes left + ANDCC $15, R7, R10 // 16 byte offset + ADD R10, R9, R11 // offset + len + CMP R11, $16 // >= 16? + BLE short // Does not cross 16 bytes + VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1 + BR index2to16next // Continue on + +short: + RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container + VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1 + SLD $3, R10 // Set up shift + MTVSRD R10, V8 // Set up shift + VSLDOI $8, V8, V8, V8 + VSLO V1, V8, V1 // Shift by start byte + VSPLTISB $0, V25 // Clear for later use + +index2to16next: + VAND V1, SEPMASK, V2 // Just compare size of sep + VCMPEQUBCC V0, V2, V3 // Compare sep and partial string + BLT CR6, found // Found + ADD $1, R7 // Not found, try next partial string + CMP R7, LASTSTR // Check for end of string + BGT notfound // If at end, then not found + VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte + BR index2to16next // Check the next partial string + +index17plus: + CMP R6, $32 // Check if 17 < len(sep) <= 32 + BGT index33plus + SUB $16, R6, R9 // Extra > 16 + SLD $56, R9, R10 // Shift to use in VSLO + MTVSRD R10, V9 // Set up for VSLO + VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1 + VSLO V1, V9, V1 // Shift left + VSPLTISB $0xff, V7 // Splat 1s + VSPLTISB $0, V27 // Splat 0 + +index17to32loop: + VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2 + +next17: + VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3 + VSLO V3, V9, V3 // Shift left + VCMPEQUB V0, V2, V4 // Compare first 16 bytes + VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes + VAND V4, V5, V6 // Check if both equal + VCMPEQUBCC V6, V7, V8 // All equal? + BLT CR6, found // Yes + ADD $1, R7 // On to next byte + CMP R7, LASTSTR // Check if last start byte + BGT notfound // If too high, not found + BR index17to32loop // Continue + +notfound: + MOVD $-1, R3 // Return -1 if not found + RET + +index33plus: + MOVD $0, (R0) // Case not implemented + RET // Crash before return + +foundR25: + SRD $3, R25 // Convert from bits to bytes + ADD R25, R7 // Add to current string address + SUB R3, R7 // Subtract from start of string + MOVD R7, R3 // Return byte where found + RET + +found: + SUB R3, R7 // Return byte where found + MOVD R7, R3 + RET + +TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0 + CMP R6, R4 // Compare lengths + BGT notfound // If sep len is > string, notfound + ADD R4, R3, LASTBYTE // find last byte addr + SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index) + CMP R6, $0 // Check sep len + BEQ notfound // sep len 0 -- not found + MOVD R3, R7 // Copy of string addr + MOVD $16, R16 // Index value 16 + MOVD $17, R17 // Index value 17 + MOVD $18, R18 // Index value 18 + MOVD $1, R19 // Index value 1 + VSPLTISB $0xFF, ONES // splat all 1s + + CMP R6, $16, CR4 // CR4 for len(sep) >= 16 + VOR ONES, ONES, SEPMASK // Set up full SEPMASK + BGE CR4, loadge16 // Load for len(sep) >= 16 + SUB R6, R16, R9 // 16-len of sep + SLD $3, R9 // Set up for VSLO + MTVSRD R9, V9 // Set up for VSLO + VSLDOI $8, V9, V9, V9 // Set up for VSLO + VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16 + +loadge16: + ANDCC $15, R5, R9 // Find byte offset of sep + ADD R9, R6, R10 // Add sep len + CMP R10, $16 // Check if sep len+offset > 16 + BGT sepcross16 // Sep crosses 16 byte boundary + + RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container + LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0 + SLD $3, R9 // Set up shift count for VSLO + MTVSRD R9, V8 // Set up shift count for VSLO + VSLDOI $8, V8, V8, V8 + VSLO V0, V8, V0 // Shift by start byte + + VAND V0, SEPMASK, V0 // Mask separator (< 16) + BR index2plus + +sepcross16: + LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0 + + VAND V0, SEPMASK, V0 // mask out separator + BLE CR4, index2to16 + BR index17plus // Handle sep > 16 + +index2plus: + CMP R6, $2 // Check length of sep + BNE index3plus // If not 2, check for 3 + ADD $16, R7, R9 // Check if next 16 bytes past last + CMP R9, LASTBYTE // compare with last + BGE index2to16 // 2 <= len(string) <= 16 + MOVD $0xff00, R21 // Mask for later + MTVSRD R21, V25 // Move to Vreg + VSPLTH $3, V25, V31 // Splat mask + VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep + VSPLTISB $0, V10 // Clear V10 + + // First case: 2 byte separator + // V1: 2 byte separator splatted + // V2: 16 bytes at addr + // V4: 16 bytes at addr+1 + // Compare 2 byte separator at start + // and at start+1. Use VSEL to combine + // those results to find the first + // matching start byte, returning + // that value when found. Loop as + // long as len(string) > 16 +index2loop2: + LXVB16X (R7)(R19), V3 // Load 16 bytes @R7+1 into V3 + +index2loop: + LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2 + VCMPEQUH V1, V2, V5 // Search for sep + VCMPEQUH V1, V3, V6 // Search for sep offset by 1 + VSEL V6, V5, V31, V7 // merge even and odd indices + VCLZD V7, V18 // find index of first match + MFVSRD V18, R25 // get first value + CMP R25, $64 // Found if < 64 + BLT foundR25 // Return byte index where found + + MFVSRLD V18, R25 // get second value + CMP R25, $64 // Found if < 64 + ADD $64, R25 // Update byte offset + BLT foundR25 // Return value + ADD $16, R7 // R7+=16 Update string pointer + ADD $17, R7, R9 // R9=F7+17 since loop unrolled + CMP R9, LASTBYTE // Compare addr+17 against last byte + BLT index2loop2 // If < last, continue loop + CMP R7, LASTBYTE // Compare addr+16 against last byte + BLT index2to16 // If < 16 handle specially + LXVB16X (R7)(R0), V3 // Load 16 bytes @R7 into V3 + VSLDOI $1, V3, V10, V3 // Shift left by 1 byte + BR index2loop + +index3plus: + CMP R6, $3 // Check if sep == 3 + BNE index4plus // If not check larger + ADD $19, R7, R9 // Find bytes for use in this loop + CMP R9, LASTBYTE // Compare against last byte + BGE index2to16 // Remaining string 2<=len<=16 + MOVD $0xff00, R21 // Set up mask for upcoming loop + MTVSRD R21, V25 // Move mask to Vreg + VSPLTH $3, V25, V31 // Splat mask + VSPLTH $0, V0, V1 // Splat 1st two bytes of sep + VSPLTB $2, V0, V8 // Splat 3rd byte of sep + + // Loop to process 3 byte separator. + // string[0:16] is in V2 + // string[2:18] is in V3 + // sep[0:2] splatted in V1 + // sec[3] splatted in v8 + // Load vectors at string, string+1 + // and string+2. Compare string, string+1 + // against first 2 bytes of separator + // splatted, and string+2 against 3rd + // byte splatted. Merge the results with + // VSEL to find the first byte of a match. + + // Special handling for last 16 bytes if the + // string fits in 16 byte multiple. +index3loop2: + MOVD $2, R21 // Set up index for 2 + VSPLTISB $0, V10 // Clear V10 + LXVB16X (R7)(R21), V3 // Load 16 bytes @R7+2 into V3 + VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes + +index3loop: + LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 + VSLDOI $1, V2, V3, V4 // string[1:17] + VSLDOI $2, V2, V3, V9 // string[2:18] + VCMPEQUH V1, V2, V5 // compare hw even indices + VCMPEQUH V1, V4, V6 // compare hw odd indices + VCMPEQUB V8, V9, V10 // compare 3rd to last byte + VSEL V6, V5, V31, V7 // Find 1st matching byte using mask + VAND V7, V10, V7 // AND matched bytes with matched 3rd byte + VCLZD V7, V18 // Find first nonzero indexes + MFVSRD V18, R25 // Move 1st doubleword + CMP R25, $64 // If < 64 found + BLT foundR25 // Return matching index + + MFVSRLD V18, R25 // Move 2nd doubleword + CMP R25, $64 // If < 64 found + ADD $64, R25 // Update byte index + BLT foundR25 // Return matching index + ADD $16, R7 // R7+=16 string ptr + ADD $19, R7, R9 // Number of string bytes for loop + CMP R9, LASTBYTE // Compare against last byte of string + BLT index3loop2 // If within, continue this loop + CMP R7, LASTSTR // Compare against last start byte + BLT index2to16 // Process remainder + VSPLTISB $0, V3 // Special case for last 16 bytes + BR index3loop // Continue this loop + + // Loop to process 4 byte separator + // string[0:16] in V2 + // string[3:16] in V3 + // sep[0:4] splatted in V1 + // Set up vectors with strings at offsets + // 0, 1, 2, 3 and compare against the 4 byte + // separator also splatted. Use VSEL with the + // compare results to find the first byte where + // a separator match is found. +index4plus: + CMP R6, $4 // Check if 4 byte separator + BNE index5plus // If not next higher + ADD $20, R7, R9 // Check string size to load + CMP R9, LASTBYTE // Verify string length + BGE index2to16 // If not large enough, process remaining + + // Set up masks for use with VSEL + MOVD $0xff, R21 // Set up mask 0xff000000ff000000... + SLD $24, R21 + MTVSRWS R21, V29 + + VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00... + MOVD $0xffff, R21 + SLD $16, R21 + MTVSRWS R21, V31 + + VSPLTW $0, V0, V1 // Splat 1st word of separator + +index4loop: + LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2 + +next4: + VSPLTISB $0, V10 // Clear + MOVD $3, R9 // Number of bytes beyond 16 + LXVB16X (R7)(R9), V3 // Load 16 bytes @R7 into V3 + VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes + VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1 + VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2 + VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3 + VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep + VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep + VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep + VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep + VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask + VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask + VSEL V14, V13, V31, V7 // final merge + VCLZD V7, V18 // Find first index for each half + MFVSRD V18, R25 // Isolate value + CMP R25, $64 // If < 64, found + BLT foundR25 // Return found index + + MFVSRLD V18, R25 // Isolate other value + CMP R25, $64 // If < 64, found + ADD $64, R25 // Update index for high doubleword + BLT foundR25 // Return found index + ADD $16, R7 // R7+=16 for next string + ADD $20, R7, R9 // R+20 for all bytes to load + CMP R9, LASTBYTE // Past end? Maybe check for extra? + BLT index4loop // If not, continue loop + CMP R7, LASTSTR // Check remainder + BLE index2to16 // Process remainder + BR notfound // Not found + +index5plus: + CMP R6, $16 // Check for sep > 16 + BGT index17plus // Handle large sep + + // Assumption is that the separator is smaller than the string at this point +index2to16: + CMP R7, LASTSTR // Compare last start byte + BGT notfound // last takes len(sep) into account + + ADD $19, R7, R9 // To check 4 indices per iteration, need at least 16+3 bytes + CMP R9, LASTBYTE + // At least 16 bytes of string left + // Mask the number of bytes in sep + VSPLTISB $0, V10 // Clear + BGT index2to16tail + + MOVD $3, R17 // Number of bytes beyond 16 + PCALIGN $32 +index2to16loop: + LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7 + LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3 + + VSLDOI $13, V5, V10, V2 // Shift left last 3 bytes + VSLDOI $1, V1, V2, V3 // V3=(V1:V2)<<1 + VSLDOI $2, V1, V2, V4 // V4=(V1:V2)<<2 + VAND V1, SEPMASK, V8 // Mask out sep size 0th index + VAND V3, SEPMASK, V9 // Mask out sep size 1st index + VAND V4, SEPMASK, V11 // Mask out sep size 2nd index + VAND V5, SEPMASK, V12 // Mask out sep size 3rd index + VCMPEQUBCC V0, V8, V8 // compare masked string + BLT CR6, found // All equal while comparing 0th index + VCMPEQUBCC V0, V9, V9 // compare masked string + BLT CR6, found2 // All equal while comparing 1st index + VCMPEQUBCC V0, V11, V11 // compare masked string + BLT CR6, found3 // All equal while comparing 2nd index + VCMPEQUBCC V0, V12, V12 // compare masked string + BLT CR6, found4 // All equal while comparing 3rd index + + ADD $4, R7 // Update ptr to next 4 bytes + CMP R7, LASTSTR // Still less than last start byte + BGT notfound // Not found + ADD $19, R7, R9 // Verify remaining bytes + CMP R9, LASTBYTE // length of string at least 19 + BLE index2to16loop // Try again, else do post processing and jump to index2to16next + + // <19 bytes left, post process the remaining string +index2to16tail: + ADD R3, R4, R9 // End of string + SUB R7, R9, R9 // Number of bytes left + ANDCC $15, R7, R10 // 16 byte offset + ADD R10, R9, R11 // offset + len + CMP R11, $16 // >= 16? + BLE short // Does not cross 16 bytes + LXVB16X (R7)(R0), V1 // Load 16 bytes @R7 into V1 + CMP R9, $16 // Post-processing of unrolled loop + BLE index2to16next // continue to index2to16next if <= 16 bytes + SUB R16, R9, R10 // R9 should be 18 or 17 hence R10 is 1 or 2 + LXVB16X (R7)(R10), V9 + CMP R10, $1 // string length is 17, compare 1 more byte + BNE extra2 // string length is 18, compare 2 more bytes + VSLDOI $15, V9, V10, V25 + VAND V1, SEPMASK, V2 // Just compare size of sep + VCMPEQUBCC V0, V2, V3 // Compare sep and partial string + BLT CR6, found // Found + ADD $1, R7 // Not found, try next partial string + CMP R7, LASTSTR // Check for end of string + BGT notfound // If at end, then not found + VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte + BR index2to16next // go to remainder loop +extra2: + VSLDOI $14, V9, V10, V25 + VAND V1, SEPMASK, V2 // Just compare size of sep + VCMPEQUBCC V0, V2, V3 // Compare sep and partial string + BLT CR6, found // Found + ADD $1, R7 // Not found, try next partial string + CMP R7, LASTSTR // Check for end of string + BGT notfound // If at end, then not found + VOR V1, V1, V4 // save remaining string + VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte for 17th byte + VAND V1, SEPMASK, V2 // Just compare size of sep + VCMPEQUBCC V0, V2, V3 // Compare sep and partial string + BLT CR6, found // Found + ADD $1, R7 // Not found, try next partial string + CMP R7, LASTSTR // Check for end of string + BGT notfound // If at end, then not found + VSLDOI $2, V4, V25, V1 // Shift saved string left by 2 bytes for 18th byte + BR index2to16next // Check the remaining partial string in index2to16next + +short: + RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container + LXVB16X (R9)(R0), V1 // Load 16 bytes @R9 into V1 + SLD $3, R10 // Set up shift + MTVSRD R10, V8 // Set up shift + VSLDOI $8, V8, V8, V8 + VSLO V1, V8, V1 // Shift by start byte + PCALIGN $32 +index2to16next: + VAND V1, SEPMASK, V2 // Just compare size of sep + VCMPEQUBCC V0, V2, V3 // Compare sep and partial string + BLT CR6, found // Found + ADD $1, R7 // Not found, try next partial string + CMP R7, LASTSTR // Check for end of string + BGT notfound // If at end, then not found + VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte + BR index2to16next // Check the next partial string + +index17plus: + CMP R6, $32 // Check if 17 < len(sep) <= 32 + BGT index33plus + SUB $16, R6, R9 // Extra > 16 + SLD $56, R9, R10 // Shift to use in VSLO + MTVSRD R10, V9 // Set up for VSLO + LXVB16X (R5)(R9), V1 // Load 16 bytes @R5+R9 into V1 + VSLO V1, V9, V1 // Shift left + VSPLTISB $0xff, V7 // Splat 1s + VSPLTISB $0, V27 // Splat 0 + +index17to32loop: + LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2 + +next17: + LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+R9 into V3 + VSLO V3, V9, V3 // Shift left + VCMPEQUB V0, V2, V4 // Compare first 16 bytes + VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes + VAND V4, V5, V6 // Check if both equal + VCMPEQUBCC V6, V7, V8 // All equal? + BLT CR6, found // Yes + ADD $1, R7 // On to next byte + CMP R7, LASTSTR // Check if last start byte + BGT notfound // If too high, not found + BR index17to32loop // Continue + +notfound: + MOVD $-1, R3 // Return -1 if not found + RET + +index33plus: + MOVD $0, (R0) // Case not implemented + RET // Crash before return + +foundR25: + SRD $3, R25 // Convert from bits to bytes + ADD R25, R7 // Add to current string address + SUB R3, R7 // Subtract from start of string + MOVD R7, R3 // Return byte where found + RET +found4: + ADD $1, R7 // found from unrolled loop at index 3 +found3: + ADD $1, R7 // found from unrolled loop at index 2 +found2: + ADD $1, R7 // found from unrolled loop at index 1 +found: // found at index 0 + SUB R3, R7 // Return byte where found + MOVD R7, R3 + RET diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go new file mode 100644 index 0000000..9340cf1 --- /dev/null +++ b/src/internal/bytealg/index_s390x.go @@ -0,0 +1,31 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytealg + +import "internal/cpu" + +const MaxBruteForce = 64 + +func init() { + // Note: we're kind of lucky that this flag is available at this point. + // The runtime sets HasVX when processing auxv records, and that happens + // to happen *before* running the init functions of packages that + // the runtime depends on. + // TODO: it would really be nicer for internal/cpu to figure out this + // flag by itself. Then we wouldn't need to depend on quirks of + // early startup initialization order. + if cpu.S390X.HasVX { + MaxLen = 64 + } +} + +// Cutover reports the number of failures of IndexByte we should tolerate +// before switching over to Index. +// n is the number of bytes processed so far. +// See the bytes.Index implementation for details. +func Cutover(n int) int { + // 1 error per 8 characters, plus a few slop to start. + return (n + 16) / 8 +} diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s new file mode 100644 index 0000000..491d5bc --- /dev/null +++ b/src/internal/bytealg/index_s390x.s @@ -0,0 +1,216 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// Caller must confirm availability of vx facility before calling. +TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56 + LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s) + LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep) + MOVD $ret+48(FP), R5 + BR indexbody<>(SB) + +// Caller must confirm availability of vx facility before calling. +TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40 + LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s) + LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep) + MOVD $ret+32(FP), R5 + BR indexbody<>(SB) + +// s: string we are searching +// sep: string to search for +// R1=&s[0], R2=len(s) +// R3=&sep[0], R4=len(sep) +// R5=&ret (int) +// Caller must confirm availability of vx facility before calling. +TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0 + CMPBGT R4, R2, notfound + ADD R1, R2 + SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index) + CMPBEQ R4, $0, notfound + SUB $1, R4 // R4=len(sep)-1 for use as VLL index + VLL R4, (R3), V0 // contains first 16 bytes of sep + MOVD R1, R7 +index2plus: + CMPBNE R4, $1, index3plus + MOVD $15(R7), R9 + CMPBGE R9, R2, index2to16 + VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... + VONE V16 + VREPH $0, V0, V1 + CMPBGE R9, R2, index2to16 +index2loop: + VL 0(R7), V2 // 16 bytes, even indices + VL 1(R7), V4 // 16 bytes, odd indices + VCEQH V1, V2, V5 // compare even indices + VCEQH V1, V4, V6 // compare odd indices + VSEL V5, V6, V31, V7 // merge even and odd indices + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index3plus: + CMPBNE R4, $2, index4plus + ADD $15, R7, R9 + CMPBGE R9, R2, index2to16 + MOVD $1, R0 + VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00... + VONE V16 + VREPH $0, V0, V1 + VREPB $2, V0, V8 +index3loop: + VL (R7), V2 // load 16-bytes into V2 + VLL R0, 16(R7), V3 // load 2-bytes into V3 + VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1 + VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2 + VCEQH V1, V2, V5 // compare 2-byte even indices + VCEQH V1, V4, V6 // compare 2-byte odd indices + VCEQB V8, V9, V10 // compare last bytes + VSEL V5, V6, V31, V7 // merge even and odd indices + VN V7, V10, V7 // AND indices with last byte + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index4plus: + CMPBNE R4, $3, index5plus + ADD $15, R7, R9 + CMPBGE R9, R2, index2to16 + MOVD $2, R0 + VGBM $0x8888, V29 // 0xff000000ff000000... + VGBM $0x2222, V30 // 0x0000ff000000ff00... + VGBM $0xcccc, V31 // 0xffff0000ffff0000... + VONE V16 + VREPF $0, V0, V1 +index4loop: + VL (R7), V2 // load 16-bytes into V2 + VLL R0, 16(R7), V3 // load 3-bytes into V3 + VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1 + VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1 + VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1 + VCEQF V1, V2, V5 // compare index 0, 4, ... + VCEQF V1, V4, V6 // compare index 1, 5, ... + VCEQF V1, V9, V11 // compare index 2, 6, ... + VCEQF V1, V10, V12 // compare index 3, 7, ... + VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ... + VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ... + VSEL V13, V14, V31, V7 // final merge + VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found + BLT foundV17 + MOVD $16(R7), R7 // R7+=16 + ADD $15, R7, R9 + CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search) + CMPBLE R7, R2, index2to16 + BR notfound + +index5plus: + CMPBGT R4, $15, index17plus +index2to16: + CMPBGT R7, R2, notfound + MOVD $1(R7), R8 + CMPBGT R8, R2, index2to16tail +index2to16loop: + // unrolled 2x + VLL R4, (R7), V1 + VLL R4, 1(R7), V2 + VCEQGS V0, V1, V3 + BEQ found + MOVD $1(R7), R7 + VCEQGS V0, V2, V4 + BEQ found + MOVD $1(R7), R7 + CMPBLT R7, R2, index2to16loop + CMPBGT R7, R2, notfound +index2to16tail: + VLL R4, (R7), V1 + VCEQGS V0, V1, V2 + BEQ found + BR notfound + +index17plus: + CMPBGT R4, $31, index33plus + SUB $16, R4, R0 + VLL R0, 16(R3), V1 + VONE V7 +index17to32loop: + VL (R7), V2 + VLL R0, 16(R7), V3 + VCEQG V0, V2, V4 + VCEQG V1, V3, V5 + VN V4, V5, V6 + VCEQGS V6, V7, V8 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index17to32loop + BR notfound + +index33plus: + CMPBGT R4, $47, index49plus + SUB $32, R4, R0 + VL 16(R3), V1 + VLL R0, 32(R3), V2 + VONE V11 +index33to48loop: + VL (R7), V3 + VL 16(R7), V4 + VLL R0, 32(R7), V5 + VCEQG V0, V3, V6 + VCEQG V1, V4, V7 + VCEQG V2, V5, V8 + VN V6, V7, V9 + VN V8, V9, V10 + VCEQGS V10, V11, V12 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index33to48loop + BR notfound + +index49plus: + CMPBGT R4, $63, index65plus + SUB $48, R4, R0 + VL 16(R3), V1 + VL 32(R3), V2 + VLL R0, 48(R3), V3 + VONE V15 +index49to64loop: + VL (R7), V4 + VL 16(R7), V5 + VL 32(R7), V6 + VLL R0, 48(R7), V7 + VCEQG V0, V4, V8 + VCEQG V1, V5, V9 + VCEQG V2, V6, V10 + VCEQG V3, V7, V11 + VN V8, V9, V12 + VN V10, V11, V13 + VN V12, V13, V14 + VCEQGS V14, V15, V16 + BEQ found + MOVD $1(R7), R7 + CMPBLE R7, R2, index49to64loop +notfound: + MOVD $-1, (R5) + RET + +index65plus: + // not implemented + MOVD $0, (R0) + RET + +foundV17: // index is in doubleword V17[0] + VLGVG $0, V17, R8 + ADD R8, R7 +found: + SUB R1, R7 + MOVD R7, (R5) + RET diff --git a/src/internal/bytealg/indexbyte_386.s b/src/internal/bytealg/indexbyte_386.s new file mode 100644 index 0000000..8a03054 --- /dev/null +++ b/src/internal/bytealg/indexbyte_386.s @@ -0,0 +1,34 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB),NOSPLIT,$0-20 + MOVL b_base+0(FP), SI + MOVL b_len+4(FP), CX + MOVB c+12(FP), AL + MOVL SI, DI + CLD; REPN; SCASB + JZ 3(PC) + MOVL $-1, ret+16(FP) + RET + SUBL SI, DI + SUBL $1, DI + MOVL DI, ret+16(FP) + RET + +TEXT ·IndexByteString(SB),NOSPLIT,$0-16 + MOVL s_base+0(FP), SI + MOVL s_len+4(FP), CX + MOVB c+8(FP), AL + MOVL SI, DI + CLD; REPN; SCASB + JZ 3(PC) + MOVL $-1, ret+12(FP) + RET + SUBL SI, DI + SUBL $1, DI + MOVL DI, ret+12(FP) + RET diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s new file mode 100644 index 0000000..1ca70e3 --- /dev/null +++ b/src/internal/bytealg/indexbyte_amd64.s @@ -0,0 +1,149 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB), NOSPLIT, $0-40 + MOVQ b_base+0(FP), SI + MOVQ b_len+8(FP), BX + MOVB c+24(FP), AL + LEAQ ret+32(FP), R8 + JMP indexbytebody<>(SB) + +TEXT ·IndexByteString(SB), NOSPLIT, $0-32 + MOVQ s_base+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+16(FP), AL + LEAQ ret+24(FP), R8 + JMP indexbytebody<>(SB) + +// input: +// SI: data +// BX: data len +// AL: byte sought +// R8: address to put result +TEXT indexbytebody<>(SB), NOSPLIT, $0 + // Shuffle X0 around so that each byte contains + // the character we're looking for. + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + + CMPQ BX, $16 + JLT small + + MOVQ SI, DI + + CMPQ BX, $32 + JA avx2 +sse: + LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes + JMP sseloopentry + +sseloop: + // Move the next 16-byte chunk of the data into X1. + MOVOU (DI), X1 + // Compare bytes in X0 to X1. + PCMPEQB X0, X1 + // Take the top bit of each byte in X1 and put the result in DX. + PMOVMSKB X1, DX + // Find first set bit, if any. + BSFL DX, DX + JNZ ssesuccess + // Advance to next block. + ADDQ $16, DI +sseloopentry: + CMPQ DI, AX + JB sseloop + + // Search the last 16-byte chunk. This chunk may overlap with the + // chunks we've already searched, but that's ok. + MOVQ AX, DI + MOVOU (AX), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, DX + BSFL DX, DX + JNZ ssesuccess + +failure: + MOVQ $-1, (R8) + RET + +// We've found a chunk containing the byte. +// The chunk was loaded from DI. +// The index of the matching byte in the chunk is DX. +// The start of the data is SI. +ssesuccess: + SUBQ SI, DI // Compute offset of chunk within data. + ADDQ DX, DI // Add offset of byte within chunk. + MOVQ DI, (R8) + RET + +// handle for lengths < 16 +small: + TESTQ BX, BX + JEQ failure + + // Check if we'll load across a page boundary. + LEAQ 16(SI), AX + TESTW $0xff0, AX + JEQ endofpage + + MOVOU (SI), X1 // Load data + PCMPEQB X0, X1 // Compare target byte with each byte in data. + PMOVMSKB X1, DX // Move result bits to integer register. + BSFL DX, DX // Find first set bit. + JZ failure // No set bit, failure. + CMPL DX, BX + JAE failure // Match is past end of data. + MOVQ DX, (R8) + RET + +endofpage: + MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1. + PCMPEQB X0, X1 // Compare target byte with each byte in data. + PMOVMSKB X1, DX // Move result bits to integer register. + MOVL BX, CX + SHLL CX, DX + SHRL $16, DX // Shift desired bits down to bottom of register. + BSFL DX, DX // Find first set bit. + JZ failure // No set bit, failure. + MOVQ DX, (R8) + RET + +avx2: +#ifndef hasAVX2 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JNE sse +#endif + MOVD AX, X0 + LEAQ -32(SI)(BX*1), R11 + VPBROADCASTB X0, Y1 +avx2_loop: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPTEST Y3, Y3 + JNZ avx2success + ADDQ $32, DI + CMPQ DI, R11 + JLT avx2_loop + MOVQ R11, DI + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPTEST Y3, Y3 + JNZ avx2success + VZEROUPPER + MOVQ $-1, (R8) + RET + +avx2success: + VPMOVMSKB Y3, DX + BSFL DX, DX + SUBQ SI, DI + ADDQ DI, DX + MOVQ DX, (R8) + VZEROUPPER + RET diff --git a/src/internal/bytealg/indexbyte_arm.s b/src/internal/bytealg/indexbyte_arm.s new file mode 100644 index 0000000..faf9797 --- /dev/null +++ b/src/internal/bytealg/indexbyte_arm.s @@ -0,0 +1,46 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB),NOSPLIT,$0-20 + MOVW b_base+0(FP), R0 + MOVW b_len+4(FP), R1 + MOVBU c+12(FP), R2 // byte to find + MOVW $ret+16(FP), R5 + B indexbytebody<>(SB) + +TEXT ·IndexByteString(SB),NOSPLIT,$0-16 + MOVW s_base+0(FP), R0 + MOVW s_len+4(FP), R1 + MOVBU c+8(FP), R2 // byte to find + MOVW $ret+12(FP), R5 + B indexbytebody<>(SB) + +// input: +// R0: data +// R1: data length +// R2: byte to find +// R5: address to put result +TEXT indexbytebody<>(SB),NOSPLIT,$0-0 + MOVW R0, R4 // store base for later + ADD R0, R1 // end + +loop: + CMP R0, R1 + B.EQ notfound + MOVBU.P 1(R0), R3 + CMP R2, R3 + B.NE loop + + SUB $1, R0 // R0 will be one beyond the position we want + SUB R4, R0 // remove base + MOVW R0, (R5) + RET + +notfound: + MOVW $-1, R0 + MOVW R0, (R5) + RET diff --git a/src/internal/bytealg/indexbyte_arm64.s b/src/internal/bytealg/indexbyte_arm64.s new file mode 100644 index 0000000..40843fb --- /dev/null +++ b/src/internal/bytealg/indexbyte_arm64.s @@ -0,0 +1,126 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT ·IndexByte(SB),NOSPLIT,$0-40 + MOVD b_base+0(FP), R0 + MOVD b_len+8(FP), R2 + MOVBU c+24(FP), R1 + MOVD $ret+32(FP), R8 + B indexbytebody<>(SB) + +TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + MOVD s_base+0(FP), R0 + MOVD s_len+8(FP), R2 + MOVBU c+16(FP), R1 + MOVD $ret+24(FP), R8 + B indexbytebody<>(SB) + +// input: +// R0: data +// R1: byte to search +// R2: data len +// R8: address to put result +TEXT indexbytebody<>(SB),NOSPLIT,$0 + // Core algorithm: + // For each 32-byte chunk we calculate a 64-bit syndrome value, + // with two bits per byte. For each tuple, bit 0 is set if the + // relevant byte matched the requested character and bit 1 is + // not used (faster than using a 32bit syndrome). Since the bits + // in the syndrome reflect exactly the order in which things occur + // in the original string, counting trailing zeros allows to + // identify exactly which byte has matched. + + CBZ R2, fail + MOVD R0, R11 + // Magic constant 0x40100401 allows us to identify + // which lane matches the requested byte. + // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24)) + // Different bytes have different bit masks (i.e: 1, 4, 16, 64) + MOVD $0x40100401, R5 + VMOV R1, V0.B16 + // Work with aligned 32-byte chunks + BIC $0x1f, R0, R3 + VMOV R5, V5.S4 + ANDS $0x1f, R0, R9 + AND $0x1f, R2, R10 + BEQ loop + + // Input string is not 32-byte aligned. We calculate the + // syndrome value for the aligned 32 bytes block containing + // the first bytes and mask off the irrelevant part. + VLD1.P (R3), [V1.B16, V2.B16] + SUB $0x20, R9, R4 + ADDS R4, R2, R2 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + VADDP V4.B16, V3.B16, V6.B16 // 256->128 + VADDP V6.B16, V6.B16, V6.B16 // 128->64 + VMOV V6.D[0], R6 + // Clear the irrelevant lower bits + LSL $1, R9, R4 + LSR R4, R6, R6 + LSL R4, R6, R6 + // The first block can also be the last + BLS masklast + // Have we found something already? + CBNZ R6, tail + +loop: + VLD1.P (R3), [V1.B16, V2.B16] + SUBS $0x20, R2, R2 + VCMEQ V0.B16, V1.B16, V3.B16 + VCMEQ V0.B16, V2.B16, V4.B16 + // If we're out of data we finish regardless of the result + BLS end + // Use a fast check for the termination condition + VORR V4.B16, V3.B16, V6.B16 + VADDP V6.D2, V6.D2, V6.D2 + VMOV V6.D[0], R6 + // We're not out of data, loop if we haven't found the character + CBZ R6, loop + +end: + // Termination condition found, let's calculate the syndrome value + VAND V5.B16, V3.B16, V3.B16 + VAND V5.B16, V4.B16, V4.B16 + VADDP V4.B16, V3.B16, V6.B16 + VADDP V6.B16, V6.B16, V6.B16 + VMOV V6.D[0], R6 + // Only do the clear for the last possible block with less than 32 bytes + // Condition flags come from SUBS in the loop + BHS tail + +masklast: + // Clear the irrelevant upper bits + ADD R9, R10, R4 + AND $0x1f, R4, R4 + SUB $0x20, R4, R4 + NEG R4<<1, R4 + LSL R4, R6, R6 + LSR R4, R6, R6 + +tail: + // Check that we have found a character + CBZ R6, fail + // Count the trailing zeros using bit reversing + RBIT R6, R6 + // Compensate the last post-increment + SUB $0x20, R3, R3 + // And count the leading zeros + CLZ R6, R6 + // R6 is twice the offset into the fragment + ADD R6>>1, R3, R0 + // Compute the offset result + SUB R11, R0, R0 + MOVD R0, (R8) + RET + +fail: + MOVD $-1, R0 + MOVD R0, (R8) + RET diff --git a/src/internal/bytealg/indexbyte_generic.go b/src/internal/bytealg/indexbyte_generic.go new file mode 100644 index 0000000..b89d34f --- /dev/null +++ b/src/internal/bytealg/indexbyte_generic.go @@ -0,0 +1,25 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !mips64 && !mips64le && !riscv64 && !wasm + +package bytealg + +func IndexByte(b []byte, c byte) int { + for i, x := range b { + if x == c { + return i + } + } + return -1 +} + +func IndexByteString(s string, c byte) int { + for i := 0; i < len(s); i++ { + if s[i] == c { + return i + } + } + return -1 +} diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s new file mode 100644 index 0000000..baa9c86 --- /dev/null +++ b/src/internal/bytealg/indexbyte_loong64.s @@ -0,0 +1,52 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB),NOSPLIT,$0-40 + MOVV b_base+0(FP), R4 + MOVV b_len+8(FP), R5 + MOVBU c+24(FP), R6 // byte to find + MOVV R4, R7 // store base for later + ADDV R4, R5 // end + ADDV $-1, R4 + +loop: + ADDV $1, R4 + BEQ R4, R5, notfound + MOVBU (R4), R8 + BNE R6, R8, loop + + SUBV R7, R4 // remove base + MOVV R4, ret+32(FP) + RET + +notfound: + MOVV $-1, R4 + MOVV R4, ret+32(FP) + RET + +TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + MOVV s_base+0(FP), R4 + MOVV s_len+8(FP), R5 + MOVBU c+16(FP), R6 // byte to find + MOVV R4, R7 // store base for later + ADDV R4, R5 // end + ADDV $-1, R4 + +loop: + ADDV $1, R4 + BEQ R4, R5, notfound + MOVBU (R4), R8 + BNE R6, R8, loop + + SUBV R7, R4 // remove base + MOVV R4, ret+24(FP) + RET + +notfound: + MOVV $-1, R4 + MOVV R4, ret+24(FP) + RET diff --git a/src/internal/bytealg/indexbyte_mips64x.s b/src/internal/bytealg/indexbyte_mips64x.s new file mode 100644 index 0000000..5689f84 --- /dev/null +++ b/src/internal/bytealg/indexbyte_mips64x.s @@ -0,0 +1,54 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips64 || mips64le + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB),NOSPLIT,$0-40 + MOVV b_base+0(FP), R1 + MOVV b_len+8(FP), R2 + MOVBU c+24(FP), R3 // byte to find + MOVV R1, R4 // store base for later + ADDV R1, R2 // end + ADDV $-1, R1 + +loop: + ADDV $1, R1 + BEQ R1, R2, notfound + MOVBU (R1), R5 + BNE R3, R5, loop + + SUBV R4, R1 // remove base + MOVV R1, ret+32(FP) + RET + +notfound: + MOVV $-1, R1 + MOVV R1, ret+32(FP) + RET + +TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + MOVV s_base+0(FP), R1 + MOVV s_len+8(FP), R2 + MOVBU c+16(FP), R3 // byte to find + MOVV R1, R4 // store base for later + ADDV R1, R2 // end + ADDV $-1, R1 + +loop: + ADDV $1, R1 + BEQ R1, R2, notfound + MOVBU (R1), R5 + BNE R3, R5, loop + + SUBV R4, R1 // remove base + MOVV R1, ret+24(FP) + RET + +notfound: + MOVV $-1, R1 + MOVV R1, ret+24(FP) + RET diff --git a/src/internal/bytealg/indexbyte_mipsx.s b/src/internal/bytealg/indexbyte_mipsx.s new file mode 100644 index 0000000..1c2b104 --- /dev/null +++ b/src/internal/bytealg/indexbyte_mipsx.s @@ -0,0 +1,52 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build mips || mipsle + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB),NOSPLIT,$0-20 + MOVW b_base+0(FP), R1 + MOVW b_len+4(FP), R2 + MOVBU c+12(FP), R3 // byte to find + ADDU $1, R1, R4 // store base+1 for later + ADDU R1, R2 // end + +loop: + BEQ R1, R2, notfound + MOVBU (R1), R5 + ADDU $1, R1 + BNE R3, R5, loop + + SUBU R4, R1 // R1 will be one beyond the position we want so remove (base+1) + MOVW R1, ret+16(FP) + RET + +notfound: + MOVW $-1, R1 + MOVW R1, ret+16(FP) + RET + +TEXT ·IndexByteString(SB),NOSPLIT,$0-16 + MOVW s_base+0(FP), R1 + MOVW s_len+4(FP), R2 + MOVBU c+8(FP), R3 // byte to find + ADDU $1, R1, R4 // store base+1 for later + ADDU R1, R2 // end + +loop: + BEQ R1, R2, notfound + MOVBU (R1), R5 + ADDU $1, R1 + BNE R3, R5, loop + + SUBU R4, R1 // remove (base+1) + MOVW R1, ret+12(FP) + RET + +notfound: + MOVW $-1, R1 + MOVW R1, ret+12(FP) + RET diff --git a/src/internal/bytealg/indexbyte_native.go b/src/internal/bytealg/indexbyte_native.go new file mode 100644 index 0000000..c5bb2df --- /dev/null +++ b/src/internal/bytealg/indexbyte_native.go @@ -0,0 +1,13 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build 386 || amd64 || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || mips64 || mips64le || riscv64 || wasm + +package bytealg + +//go:noescape +func IndexByte(b []byte, c byte) int + +//go:noescape +func IndexByteString(s string, c byte) int diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s new file mode 100644 index 0000000..1a6e852 --- /dev/null +++ b/src/internal/bytealg/indexbyte_ppc64x.s @@ -0,0 +1,391 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64 || ppc64le + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // R3 = byte array pointer + // R4 = length + MOVD R6, R5 // R5 = byte + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + BR indexbytebody<>(SB) + +TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32 + // R3 = string + // R4 = length + // R5 = byte + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + BR indexbytebody<>(SB) + +// R3 = addr of string +// R4 = len of string +// R5 = byte to find +// R16 = 1 if running on a POWER9 system, 0 otherwise +// On exit: +// R3 = return value +TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 + MOVD R3,R17 // Save base address for calculating the index later. + RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8. + RLDIMI $8,R5,$48,R5 // Replicating the byte across the register. + ADD R4,R3,R7 // Last acceptable address in R7. + + RLDIMI $16,R5,$32,R5 + CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently. + MOVD $-1,R9 + RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8 + RLDIMI $32,R5,$0,R5 + MOVD R7,R10 // Save last acceptable address in R10 for later. + ADD $-1,R7,R7 +#ifdef GOARCH_ppc64le + SLD R6,R9,R9 // Prepare mask for Little Endian +#else + SRD R6,R9,R9 // Same for Big Endian +#endif + BLT small_string // Jump to the small string case if it's <32 bytes. + CMP R16,$1 // optimize for power8 v power9 + BNE power8 + VSPLTISB $3,V10 // Use V10 as control for VBPERMQ + MTVRD R5,V1 + LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0} + VSLB V11,V10,V10 // to extract the first bit of match result into GPR + VSPLTB $7,V1,V1 // Replicate byte across V1 + CMP R4,$64 + MOVD $16,R11 + MOVD R3,R8 + BLT cmp32 + MOVD $32,R12 + MOVD $48,R6 + +loop64: + LXVB16X (R0)(R8),V2 // scan 64 bytes at a time + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat0 // match found at R8, jump out + + LXVB16X (R8)(R11),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat1 // match found at R8+16 bytes, jump out + + LXVB16X (R8)(R12),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat2 // match found at R8+32 bytes, jump out + + LXVB16X (R8)(R6),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat3 // match found at R8+48 bytes, jump out + ADD $64,R8 + ADD $-64,R4 + CMP R4,$64 // >=64 bytes left to scan? + BGE loop64 + CMP R4,$32 + BLT rem // jump to rem if there are < 32 bytes left +cmp32: + LXVB16X (R0)(R8),V2 // 32-63 bytes left + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat0 // match found at R8 + + LXVB16X (R11)(R8),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat1 // match found at R8+16 + + ADD $32,R8 + ADD $-32,R4 +rem: + RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing + BR small_string + +foundat3: + ADD $16,R8 +foundat2: + ADD $16,R8 +foundat1: + ADD $16,R8 +foundat0: + // Compress the result into a single doubleword and + // move it to a GPR for the final calculation. + VBPERMQ V6,V10,V6 + MFVRD V6,R3 + // count leading zeroes upto the match that ends up in low 16 bits + // in both endian modes, compute index by subtracting the number by 16 + CNTLZW R3,R11 + ADD $-16,R11 + ADD R8,R11,R3 // Calculate byte address + SUB R17,R3 + RET +power8: + // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values + // in V0, V1 and V10, then branch to the preloop. + ANDCC $63,R3,R11 + BEQ CR0,qw_align + RLDICL $0,R3,$61,R11 + + MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. + CMPB R12,R5,R3 // Check for a match. + AND R9,R3,R3 // Mask bytes below s_base + RLDICR $0,R7,$60,R7 // Last doubleword in R7 + CMPU R3,$0,CR7 // If we have a match, jump to the final computation + BNE CR7,done + ADD $8,R8,R8 + ADD $-8,R4,R4 + ADD R4,R11,R4 + + // Check for quadword alignment + ANDCC $15,R8,R11 + BEQ CR0,qw_align + + // Not aligned, so handle the next doubleword + MOVD 0(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR7 + BNE CR7,done + ADD $8,R8,R8 + ADD $-8,R4,R4 + + // Either quadword aligned or 64-byte at this point. We can use LVX. +qw_align: + + // Set up auxiliary data for the vectorized algorithm. + VSPLTISB $0,V0 // Replicate 0 across V0 + VSPLTISB $3,V10 // Use V10 as control for VBPERMQ + MTVRD R5,V1 + LVSL (R0+R0),V11 + VSLB V11,V10,V10 + VSPLTB $7,V1,V1 // Replicate byte across V1 + CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop + BLE tail + + // We will load 4 quardwords per iteration in the loop, so check for + // 64-byte alignment. If 64-byte aligned, then branch to the preloop. + ANDCC $63,R8,R11 + BEQ CR0,preloop + + // Not 64-byte aligned. Load one quadword at a time until aligned. + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 // Check for byte in V4 + BNE CR6,found_qw_align + ADD $16,R8,R8 + ADD $-16,R4,R4 + + ANDCC $63,R8,R11 + BEQ CR0,preloop + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 // Check for byte in V4 + BNE CR6,found_qw_align + ADD $16,R8,R8 + ADD $-16,R4,R4 + + ANDCC $63,R8,R11 + BEQ CR0,preloop + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 // Check for byte in V4 + BNE CR6,found_qw_align + ADD $-16,R4,R4 + ADD $16,R8,R8 + + // 64-byte aligned. Prepare for the main loop. +preloop: + CMPU R4,$64 + BLE tail // If len ≤ 64, don't use the vectorized loop + + // We are now aligned to a 64-byte boundary. We will load 4 quadwords + // per loop iteration. The last doubleword is in R10, so our loop counter + // starts at (R10-R8)/64. + SUB R8,R10,R6 + SRD $6,R6,R9 // Loop counter in R9 + MOVD R9,CTR + + ADD $-64,R8,R8 // Adjust index for loop entry + MOVD $16,R11 // Load offsets for the vector loads + MOVD $32,R9 + MOVD $48,R7 + + // Main loop we will load 64 bytes per iteration +loop: + ADD $64,R8,R8 // Fuse addi+lvx for performance + LVX (R8+R0),V2 // Load 4 16-byte vectors + LVX (R8+R11),V3 + VCMPEQUB V1,V2,V6 // Look for byte in each vector + VCMPEQUB V1,V3,V7 + + LVX (R8+R9),V4 + LVX (R8+R7),V5 + VCMPEQUB V1,V4,V8 + VCMPEQUB V1,V5,V9 + + VOR V6,V7,V11 // Compress the result in a single vector + VOR V8,V9,V12 + VOR V11,V12,V13 + VCMPEQUBCC V0,V13,V14 // Check for byte + BGE CR6,found + BC 16,0,loop // bdnz loop + + // Handle the tailing bytes or R4 ≤ 64 + RLDICL $0,R6,$58,R4 + ADD $64,R8,R8 +tail: + CMPU R4,$0 + BEQ notfound + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align + ADD $16,R8,R8 + CMPU R4,$16,CR6 + BLE CR6,notfound + ADD $-16,R4,R4 + + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align + ADD $16,R8,R8 + CMPU R4,$16,CR6 + BLE CR6,notfound + ADD $-16,R4,R4 + + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align + ADD $16,R8,R8 + CMPU R4,$16,CR6 + BLE CR6,notfound + ADD $-16,R4,R4 + + LVX (R8+R0),V4 + VCMPEQUBCC V1,V4,V6 + BNE CR6,found_qw_align + +notfound: + MOVD $-1, R3 + RET + +found: + // We will now compress the results into a single doubleword, + // so it can be moved to a GPR for the final index calculation. + + // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the + // first bit of each byte into bits 48-63. + VBPERMQ V6,V10,V6 + VBPERMQ V7,V10,V7 + VBPERMQ V8,V10,V8 + VBPERMQ V9,V10,V9 + + // Shift each 16-bit component into its correct position for + // merging into a single doubleword. +#ifdef GOARCH_ppc64le + VSLDOI $2,V7,V7,V7 + VSLDOI $4,V8,V8,V8 + VSLDOI $6,V9,V9,V9 +#else + VSLDOI $6,V6,V6,V6 + VSLDOI $4,V7,V7,V7 + VSLDOI $2,V8,V8,V8 +#endif + + // Merge V6-V9 into a single doubleword and move to a GPR. + VOR V6,V7,V11 + VOR V8,V9,V4 + VOR V4,V11,V4 + MFVRD V4,R3 + +#ifdef GOARCH_ppc64le + ADD $-1,R3,R11 + ANDN R3,R11,R11 + POPCNTD R11,R11 // Count trailing zeros (Little Endian). +#else + CNTLZD R3,R11 // Count leading zeros (Big Endian). +#endif + ADD R8,R11,R3 // Calculate byte address + +return: + SUB R17, R3 + RET + +found_qw_align: + // Use the same algorithm as above. Compress the result into + // a single doubleword and move it to a GPR for the final + // calculation. + VBPERMQ V6,V10,V6 + +#ifdef GOARCH_ppc64le + MFVRD V6,R3 + ADD $-1,R3,R11 + ANDN R3,R11,R11 + POPCNTD R11,R11 +#else + VSLDOI $6,V6,V6,V6 + MFVRD V6,R3 + CNTLZD R3,R11 +#endif + ADD R8,R11,R3 + CMPU R11,R4 + BLT return + BR notfound + PCALIGN $16 + +done: + ADD $-1,R10,R6 + // Offset of last index for the final + // doubleword comparison + RLDICL $0,R6,$61,R6 + // At this point, R3 has 0xFF in the same position as the byte we are + // looking for in the doubleword. Use that to calculate the exact index + // of the byte. +#ifdef GOARCH_ppc64le + ADD $-1,R3,R11 + ANDN R3,R11,R11 + POPCNTD R11,R11 // Count trailing zeros (Little Endian). +#else + CNTLZD R3,R11 // Count leading zeros (Big Endian). +#endif + CMPU R8,R7 // Check if we are at the last doubleword. + SRD $3,R11 // Convert trailing zeros to bytes. + ADD R11,R8,R3 + CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset. + BNE return + BLE CR7,return + BR notfound + +small_string: + // process string of length < 32 bytes + // We unroll this loop for better performance. + CMPU R4,$0 // Check for length=0 + BEQ notfound + + MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. + CMPB R12,R5,R3 // Check for a match. + AND R9,R3,R3 // Mask bytes below s_base. + CMPU R3,$0,CR7 // If we have a match, jump to the final computation. + RLDICR $0,R7,$60,R7 // Last doubleword in R7. + CMPU R8,R7 + BNE CR7,done + BEQ notfound // Hit length. + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + CMPU R8,R7 + BNE CR6,done + BEQ notfound + + MOVDU 8(R8),R12 + CMPB R12,R5,R3 + CMPU R3,$0,CR6 + BNE CR6,done + BR notfound + diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s new file mode 100644 index 0000000..8be78ed --- /dev/null +++ b/src/internal/bytealg/indexbyte_riscv64.s @@ -0,0 +1,51 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40 + // X10 = b_base + // X11 = b_len + // X12 = b_cap (unused) + // X13 = byte to find + AND $0xff, X13 + MOV X10, X12 // store base for later + ADD X10, X11 // end + ADD $-1, X10 + +loop: + ADD $1, X10 + BEQ X10, X11, notfound + MOVBU (X10), X14 + BNE X13, X14, loop + + SUB X12, X10 // remove base + RET + +notfound: + MOV $-1, X10 + RET + +TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32 + // X10 = b_base + // X11 = b_len + // X12 = byte to find + AND $0xff, X12 + MOV X10, X13 // store base for later + ADD X10, X11 // end + ADD $-1, X10 + +loop: + ADD $1, X10 + BEQ X10, X11, notfound + MOVBU (X10), X14 + BNE X12, X14, loop + + SUB X13, X10 // remove base + RET + +notfound: + MOV $-1, X10 + RET diff --git a/src/internal/bytealg/indexbyte_s390x.s b/src/internal/bytealg/indexbyte_s390x.s new file mode 100644 index 0000000..cf88d92 --- /dev/null +++ b/src/internal/bytealg/indexbyte_s390x.s @@ -0,0 +1,108 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40 + MOVD b_base+0(FP), R3// b_base => R3 + MOVD b_len+8(FP), R4 // b_len => R4 + MOVBZ c+24(FP), R5 // c => R5 + MOVD $ret+32(FP), R2 // &ret => R9 + BR indexbytebody<>(SB) + +TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32 + MOVD s_base+0(FP), R3// s_base => R3 + MOVD s_len+8(FP), R4 // s_len => R4 + MOVBZ c+16(FP), R5 // c => R5 + MOVD $ret+24(FP), R2 // &ret => R9 + BR indexbytebody<>(SB) + +// input: +// R3: s +// R4: s_len +// R5: c -- byte sought +// R2: &ret -- address to put index into +TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0 + CMPBEQ R4, $0, notfound + MOVD R3, R6 // store base for later + ADD R3, R4, R8 // the address after the end of the string + //if the length is small, use loop; otherwise, use vector or srst search + CMPBGE R4, $16, large + +residual: + CMPBEQ R3, R8, notfound + MOVBZ 0(R3), R7 + LA 1(R3), R3 + CMPBNE R7, R5, residual + +found: + SUB R6, R3 + SUB $1, R3 + MOVD R3, 0(R2) + RET + +notfound: + MOVD $-1, 0(R2) + RET + +large: + MOVBZ internal∕cpu·S390X+const_offsetS390xHasVX(SB), R1 + CMPBNE R1, $0, vectorimpl + +srstimpl: // no vector facility + MOVBZ R5, R0 // c needs to be in R0, leave until last minute as currently R0 is expected to be 0 +srstloop: + WORD $0xB25E0083 // srst %r8, %r3 (search the range [R3, R8)) + BVS srstloop // interrupted - continue + BGT notfoundr0 +foundr0: + XOR R0, R0 // reset R0 + SUB R6, R8 // remove base + MOVD R8, 0(R2) + RET +notfoundr0: + XOR R0, R0 // reset R0 + MOVD $-1, 0(R2) + RET + +vectorimpl: + //if the address is not 16byte aligned, use loop for the header + MOVD R3, R8 + AND $15, R8 + CMPBGT R8, $0, notaligned + +aligned: + ADD R6, R4, R8 + MOVD R8, R7 + AND $-16, R7 + // replicate c across V17 + VLVGB $0, R5, V19 + VREPB $0, V19, V17 + +vectorloop: + CMPBGE R3, R7, residual + VL 0(R3), V16 // load string to be searched into V16 + ADD $16, R3 + VFEEBS V16, V17, V18 // search V17 in V16 and set conditional code accordingly + BVS vectorloop + + // when vector search found c in the string + VLGVB $7, V18, R7 // load 7th element of V18 containing index into R7 + SUB $16, R3 + SUB R6, R3 + ADD R3, R7 + MOVD R7, 0(R2) + RET + +notaligned: + MOVD R3, R8 + AND $-16, R8 + ADD $16, R8 +notalignedloop: + CMPBEQ R3, R8, aligned + MOVBZ 0(R3), R7 + LA 1(R3), R3 + CMPBNE R7, R5, notalignedloop + BR found diff --git a/src/internal/bytealg/indexbyte_wasm.s b/src/internal/bytealg/indexbyte_wasm.s new file mode 100644 index 0000000..ef4bd93 --- /dev/null +++ b/src/internal/bytealg/indexbyte_wasm.s @@ -0,0 +1,195 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB), NOSPLIT, $0-40 + I64Load b_base+0(FP) + I32WrapI64 + I32Load8U c+24(FP) + I64Load b_len+8(FP) + I32WrapI64 + Call memchr<>(SB) + I64ExtendI32S + Set R0 + + Get SP + I64Const $-1 + Get R0 + I64Load b_base+0(FP) + I64Sub + Get R0 + I64Eqz $0 + Select + I64Store ret+32(FP) + + RET + +TEXT ·IndexByteString(SB), NOSPLIT, $0-32 + Get SP + I64Load s_base+0(FP) + I32WrapI64 + I32Load8U c+16(FP) + I64Load s_len+8(FP) + I32WrapI64 + Call memchr<>(SB) + I64ExtendI32S + Set R0 + + I64Const $-1 + Get R0 + I64Load s_base+0(FP) + I64Sub + Get R0 + I64Eqz $0 + Select + I64Store ret+24(FP) + + RET + +// initially compiled with emscripten and then modified over time. +// params: +// R0: s +// R1: c +// R2: len +// ret: index +TEXT memchr<>(SB), NOSPLIT, $0 + Get R1 + Set R4 + Block + Block + Get R2 + I32Const $0 + I32Ne + Tee R3 + Get R0 + I32Const $3 + I32And + I32Const $0 + I32Ne + I32And + If + Loop + Get R0 + I32Load8U $0 + Get R1 + I32Eq + BrIf $2 + Get R2 + I32Const $-1 + I32Add + Tee R2 + I32Const $0 + I32Ne + Tee R3 + Get R0 + I32Const $1 + I32Add + Tee R0 + I32Const $3 + I32And + I32Const $0 + I32Ne + I32And + BrIf $0 + End + End + Get R3 + BrIf $0 + I32Const $0 + Set R1 + Br $1 + End + Get R0 + I32Load8U $0 + Get R4 + Tee R3 + I32Eq + If + Get R2 + Set R1 + Else + Get R4 + I32Const $16843009 + I32Mul + Set R4 + Block + Block + Get R2 + I32Const $3 + I32GtU + If + Get R2 + Set R1 + Loop + Get R0 + I32Load $0 + Get R4 + I32Xor + Tee R2 + I32Const $-2139062144 + I32And + I32Const $-2139062144 + I32Xor + Get R2 + I32Const $-16843009 + I32Add + I32And + I32Eqz + If + Get R0 + I32Const $4 + I32Add + Set R0 + Get R1 + I32Const $-4 + I32Add + Tee R1 + I32Const $3 + I32GtU + BrIf $1 + Br $3 + End + End + Else + Get R2 + Set R1 + Br $1 + End + Br $1 + End + Get R1 + I32Eqz + If + I32Const $0 + Set R1 + Br $3 + End + End + Loop + Get R0 + I32Load8U $0 + Get R3 + I32Eq + BrIf $2 + Get R0 + I32Const $1 + I32Add + Set R0 + Get R1 + I32Const $-1 + I32Add + Tee R1 + BrIf $0 + I32Const $0 + Set R1 + End + End + End + Get R0 + I32Const $0 + Get R1 + Select + Return |