summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:23:18 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:23:18 +0000
commit43a123c1ae6613b3efeed291fa552ecd909d3acf (patch)
treefd92518b7024bc74031f78a1cf9e454b65e73665 /src/internal/bytealg
parentInitial commit. (diff)
downloadgolang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.tar.xz
golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.zip
Adding upstream version 1.20.14.upstream/1.20.14upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/internal/bytealg')
-rw-r--r--src/internal/bytealg/bytealg.go150
-rw-r--r--src/internal/bytealg/compare_386.s144
-rw-r--r--src/internal/bytealg/compare_amd64.s237
-rw-r--r--src/internal/bytealg/compare_arm.s86
-rw-r--r--src/internal/bytealg/compare_arm64.s125
-rw-r--r--src/internal/bytealg/compare_generic.go60
-rw-r--r--src/internal/bytealg/compare_loong64.s86
-rw-r--r--src/internal/bytealg/compare_mips64x.s88
-rw-r--r--src/internal/bytealg/compare_mipsx.s72
-rw-r--r--src/internal/bytealg/compare_native.go19
-rw-r--r--src/internal/bytealg/compare_ppc64x.s502
-rw-r--r--src/internal/bytealg/compare_riscv64.s187
-rw-r--r--src/internal/bytealg/compare_s390x.s69
-rw-r--r--src/internal/bytealg/compare_wasm.s115
-rw-r--r--src/internal/bytealg/count_amd64.s208
-rw-r--r--src/internal/bytealg/count_arm.s43
-rw-r--r--src/internal/bytealg/count_arm64.s90
-rw-r--r--src/internal/bytealg/count_generic.go27
-rw-r--r--src/internal/bytealg/count_native.go33
-rw-r--r--src/internal/bytealg/count_ppc64x.s96
-rw-r--r--src/internal/bytealg/count_riscv64.s47
-rw-r--r--src/internal/bytealg/count_s390x.s169
-rw-r--r--src/internal/bytealg/equal_386.s130
-rw-r--r--src/internal/bytealg/equal_amd64.s162
-rw-r--r--src/internal/bytealg/equal_arm.s91
-rw-r--r--src/internal/bytealg/equal_arm64.s121
-rw-r--r--src/internal/bytealg/equal_generic.go18
-rw-r--r--src/internal/bytealg/equal_loong64.s52
-rw-r--r--src/internal/bytealg/equal_mips64x.s118
-rw-r--r--src/internal/bytealg/equal_mipsx.s62
-rw-r--r--src/internal/bytealg/equal_native.go21
-rw-r--r--src/internal/bytealg/equal_ppc64x.s205
-rw-r--r--src/internal/bytealg/equal_riscv64.s124
-rw-r--r--src/internal/bytealg/equal_s390x.s92
-rw-r--r--src/internal/bytealg/equal_wasm.s77
-rw-r--r--src/internal/bytealg/index_amd64.go26
-rw-r--r--src/internal/bytealg/index_amd64.s276
-rw-r--r--src/internal/bytealg/index_arm64.go23
-rw-r--r--src/internal/bytealg/index_arm64.s206
-rw-r--r--src/internal/bytealg/index_generic.go29
-rw-r--r--src/internal/bytealg/index_native.go19
-rw-r--r--src/internal/bytealg/index_ppc64x.go26
-rw-r--r--src/internal/bytealg/index_ppc64x.s802
-rw-r--r--src/internal/bytealg/index_s390x.go31
-rw-r--r--src/internal/bytealg/index_s390x.s216
-rw-r--r--src/internal/bytealg/indexbyte_386.s34
-rw-r--r--src/internal/bytealg/indexbyte_amd64.s149
-rw-r--r--src/internal/bytealg/indexbyte_arm.s46
-rw-r--r--src/internal/bytealg/indexbyte_arm64.s126
-rw-r--r--src/internal/bytealg/indexbyte_generic.go25
-rw-r--r--src/internal/bytealg/indexbyte_loong64.s52
-rw-r--r--src/internal/bytealg/indexbyte_mips64x.s54
-rw-r--r--src/internal/bytealg/indexbyte_mipsx.s52
-rw-r--r--src/internal/bytealg/indexbyte_native.go13
-rw-r--r--src/internal/bytealg/indexbyte_ppc64x.s391
-rw-r--r--src/internal/bytealg/indexbyte_riscv64.s51
-rw-r--r--src/internal/bytealg/indexbyte_s390x.s108
-rw-r--r--src/internal/bytealg/indexbyte_wasm.s195
58 files changed, 6876 insertions, 0 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
new file mode 100644
index 0000000..ebebce7
--- /dev/null
+++ b/src/internal/bytealg/bytealg.go
@@ -0,0 +1,150 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+ "internal/cpu"
+ "unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+ offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
+ offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
+ offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+
+ offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+
+ offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+// If MaxLen is not 0, make sure MaxLen >= 4.
+var MaxLen int
+
+// FIXME: the logic of HashStrBytes, HashStrRevBytes, IndexRabinKarpBytes and HashStr, HashStrRev,
+// IndexRabinKarp are exactly the same, except that the types are different. Can we eliminate
+// three of them without causing allocation?
+
+// PrimeRK is the prime base used in Rabin-Karp algorithm.
+const PrimeRK = 16777619
+
+// HashStrBytes returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStrBytes(sep []byte) (uint32, uint32) {
+ hash := uint32(0)
+ for i := 0; i < len(sep); i++ {
+ hash = hash*PrimeRK + uint32(sep[i])
+ }
+ var pow, sq uint32 = 1, PrimeRK
+ for i := len(sep); i > 0; i >>= 1 {
+ if i&1 != 0 {
+ pow *= sq
+ }
+ sq *= sq
+ }
+ return hash, pow
+}
+
+// HashStr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStr(sep string) (uint32, uint32) {
+ hash := uint32(0)
+ for i := 0; i < len(sep); i++ {
+ hash = hash*PrimeRK + uint32(sep[i])
+ }
+ var pow, sq uint32 = 1, PrimeRK
+ for i := len(sep); i > 0; i >>= 1 {
+ if i&1 != 0 {
+ pow *= sq
+ }
+ sq *= sq
+ }
+ return hash, pow
+}
+
+// HashStrRevBytes returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRevBytes(sep []byte) (uint32, uint32) {
+ hash := uint32(0)
+ for i := len(sep) - 1; i >= 0; i-- {
+ hash = hash*PrimeRK + uint32(sep[i])
+ }
+ var pow, sq uint32 = 1, PrimeRK
+ for i := len(sep); i > 0; i >>= 1 {
+ if i&1 != 0 {
+ pow *= sq
+ }
+ sq *= sq
+ }
+ return hash, pow
+}
+
+// HashStrRev returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRev(sep string) (uint32, uint32) {
+ hash := uint32(0)
+ for i := len(sep) - 1; i >= 0; i-- {
+ hash = hash*PrimeRK + uint32(sep[i])
+ }
+ var pow, sq uint32 = 1, PrimeRK
+ for i := len(sep); i > 0; i >>= 1 {
+ if i&1 != 0 {
+ pow *= sq
+ }
+ sq *= sq
+ }
+ return hash, pow
+}
+
+// IndexRabinKarpBytes uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarpBytes(s, sep []byte) int {
+ // Rabin-Karp search
+ hashsep, pow := HashStrBytes(sep)
+ n := len(sep)
+ var h uint32
+ for i := 0; i < n; i++ {
+ h = h*PrimeRK + uint32(s[i])
+ }
+ if h == hashsep && Equal(s[:n], sep) {
+ return 0
+ }
+ for i := n; i < len(s); {
+ h *= PrimeRK
+ h += uint32(s[i])
+ h -= pow * uint32(s[i-n])
+ i++
+ if h == hashsep && Equal(s[i-n:i], sep) {
+ return i - n
+ }
+ }
+ return -1
+}
+
+// IndexRabinKarp uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarp(s, substr string) int {
+ // Rabin-Karp search
+ hashss, pow := HashStr(substr)
+ n := len(substr)
+ var h uint32
+ for i := 0; i < n; i++ {
+ h = h*PrimeRK + uint32(s[i])
+ }
+ if h == hashss && s[:n] == substr {
+ return 0
+ }
+ for i := n; i < len(s); {
+ h *= PrimeRK
+ h += uint32(s[i])
+ h -= pow * uint32(s[i-n])
+ i++
+ if h == hashss && s[i-n:i] == substr {
+ return i - n
+ }
+ }
+ return -1
+}
diff --git a/src/internal/bytealg/compare_386.s b/src/internal/bytealg/compare_386.s
new file mode 100644
index 0000000..27b660c
--- /dev/null
+++ b/src/internal/bytealg/compare_386.s
@@ -0,0 +1,144 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+12(FP), DI
+ MOVL b_len+16(FP), DX
+ LEAL ret+24(FP), AX
+ JMP cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+ MOVL a_base+0(FP), SI
+ MOVL a_len+4(FP), BX
+ MOVL b_base+8(FP), DI
+ MOVL b_len+12(FP), DX
+ LEAL ret+16(FP), AX
+ JMP cmpbody<>(SB)
+
+// input:
+// SI = a
+// DI = b
+// BX = alen
+// DX = blen
+// AX = address of return word (set to 1/0/-1)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+ MOVL DX, BP
+ SUBL BX, DX // DX = blen-alen
+ JLE 2(PC)
+ MOVL BX, BP // BP = min(alen, blen)
+ CMPL SI, DI
+ JEQ allsame
+ CMPL BP, $4
+ JB small
+#ifdef GO386_softfloat
+ JMP mediumloop
+#endif
+largeloop:
+ CMPL BP, $16
+ JB mediumloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, BX
+ XORL $0xffff, BX // convert EQ to NE
+ JNE diff16 // branch if at least one byte is not equal
+ ADDL $16, SI
+ ADDL $16, DI
+ SUBL $16, BP
+ JMP largeloop
+
+diff16:
+ BSFL BX, BX // index of first byte that differs
+ XORL DX, DX
+ MOVB (SI)(BX*1), CX
+ CMPB CX, (DI)(BX*1)
+ SETHI DX
+ LEAL -1(DX*2), DX // convert 1/0 to +1/-1
+ MOVL DX, (AX)
+ RET
+
+mediumloop:
+ CMPL BP, $4
+ JBE _0through4
+ MOVL (SI), BX
+ MOVL (DI), CX
+ CMPL BX, CX
+ JNE diff4
+ ADDL $4, SI
+ ADDL $4, DI
+ SUBL $4, BP
+ JMP mediumloop
+
+_0through4:
+ MOVL -4(SI)(BP*1), BX
+ MOVL -4(DI)(BP*1), CX
+ CMPL BX, CX
+ JEQ allsame
+
+diff4:
+ BSWAPL BX // reverse order of bytes
+ BSWAPL CX
+ XORL BX, CX // find bit differences
+ BSRL CX, CX // index of highest bit difference
+ SHRL CX, BX // move a's bit to bottom
+ ANDL $1, BX // mask bit
+ LEAL -1(BX*2), BX // 1/0 => +1/-1
+ MOVL BX, (AX)
+ RET
+
+ // 0-3 bytes in common
+small:
+ LEAL (BP*8), CX
+ NEGL CX
+ JEQ allsame
+
+ // load si
+ CMPB SI, $0xfc
+ JA si_high
+ MOVL (SI), SI
+ JMP si_finish
+si_high:
+ MOVL -4(SI)(BP*1), SI
+ SHRL CX, SI
+si_finish:
+ SHLL CX, SI
+
+ // same for di
+ CMPB DI, $0xfc
+ JA di_high
+ MOVL (DI), DI
+ JMP di_finish
+di_high:
+ MOVL -4(DI)(BP*1), DI
+ SHRL CX, DI
+di_finish:
+ SHLL CX, DI
+
+ BSWAPL SI // reverse order of bytes
+ BSWAPL DI
+ XORL SI, DI // find bit differences
+ JEQ allsame
+ BSRL DI, CX // index of highest bit difference
+ SHRL CX, SI // move a's bit to bottom
+ ANDL $1, SI // mask bit
+ LEAL -1(SI*2), BX // 1/0 => +1/-1
+ MOVL BX, (AX)
+ RET
+
+ // all the bytes in common are the same, so we just need
+ // to compare the lengths.
+allsame:
+ XORL BX, BX
+ XORL CX, CX
+ TESTL DX, DX
+ SETLT BX // 1 if alen > blen
+ SETEQ CX // 1 if alen == blen
+ LEAL -1(CX)(BX*2), BX // 1,0,-1 result
+ MOVL BX, (AX)
+ RET
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
new file mode 100644
index 0000000..fdd015f
--- /dev/null
+++ b/src/internal/bytealg/compare_amd64.s
@@ -0,0 +1,237 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
+ // AX = a_base (want in SI)
+ // BX = a_len (want in BX)
+ // CX = a_cap (unused)
+ // DI = b_base (want in DI)
+ // SI = b_len (want in DX)
+ // R8 = b_cap (unused)
+ MOVQ SI, DX
+ MOVQ AX, SI
+ JMP cmpbody<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
+ // AX = a_base (want in SI)
+ // BX = a_len (want in BX)
+ // CX = b_base (want in DI)
+ // DI = b_len (want in DX)
+ MOVQ AX, SI
+ MOVQ DI, DX
+ MOVQ CX, DI
+ JMP cmpbody<>(SB)
+
+// input:
+// SI = a
+// DI = b
+// BX = alen
+// DX = blen
+// output:
+// AX = output (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+ CMPQ SI, DI
+ JEQ allsame
+ CMPQ BX, DX
+ MOVQ DX, R8
+ CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
+ CMPQ R8, $8
+ JB small
+
+ CMPQ R8, $63
+ JBE loop
+#ifndef hasAVX2
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+ JEQ big_loop_avx2
+ JMP big_loop
+#else
+ JMP big_loop_avx2
+#endif
+loop:
+ CMPQ R8, $16
+ JBE _0through16
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX // convert EQ to NE
+ JNE diff16 // branch if at least one byte is not equal
+ ADDQ $16, SI
+ ADDQ $16, DI
+ SUBQ $16, R8
+ JMP loop
+
+diff64:
+ ADDQ $48, SI
+ ADDQ $48, DI
+ JMP diff16
+diff48:
+ ADDQ $32, SI
+ ADDQ $32, DI
+ JMP diff16
+diff32:
+ ADDQ $16, SI
+ ADDQ $16, DI
+ // AX = bit mask of differences
+diff16:
+ BSFQ AX, BX // index of first byte that differs
+ XORQ AX, AX
+ MOVB (SI)(BX*1), CX
+ CMPB CX, (DI)(BX*1)
+ SETHI AX
+ LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
+ RET
+
+ // 0 through 16 bytes left, alen>=8, blen>=8
+_0through16:
+ CMPQ R8, $8
+ JBE _0through8
+ MOVQ (SI), AX
+ MOVQ (DI), CX
+ CMPQ AX, CX
+ JNE diff8
+_0through8:
+ MOVQ -8(SI)(R8*1), AX
+ MOVQ -8(DI)(R8*1), CX
+ CMPQ AX, CX
+ JEQ allsame
+
+ // AX and CX contain parts of a and b that differ.
+diff8:
+ BSWAPQ AX // reverse order of bytes
+ BSWAPQ CX
+ XORQ AX, CX
+ BSRQ CX, CX // index of highest bit difference
+ SHRQ CX, AX // move a's bit to bottom
+ ANDQ $1, AX // mask bit
+ LEAQ -1(AX*2), AX // 1/0 => +1/-1
+ RET
+
+ // 0-7 bytes in common
+small:
+ LEAQ (R8*8), CX // bytes left -> bits left
+ NEGQ CX // - bits lift (== 64 - bits left mod 64)
+ JEQ allsame
+
+ // load bytes of a into high bytes of AX
+ CMPB SI, $0xf8
+ JA si_high
+ MOVQ (SI), SI
+ JMP si_finish
+si_high:
+ MOVQ -8(SI)(R8*1), SI
+ SHRQ CX, SI
+si_finish:
+ SHLQ CX, SI
+
+ // load bytes of b in to high bytes of BX
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ MOVQ -8(DI)(R8*1), DI
+ SHRQ CX, DI
+di_finish:
+ SHLQ CX, DI
+
+ BSWAPQ SI // reverse order of bytes
+ BSWAPQ DI
+ XORQ SI, DI // find bit differences
+ JEQ allsame
+ BSRQ DI, CX // index of highest bit difference
+ SHRQ CX, SI // move a's bit to bottom
+ ANDQ $1, SI // mask bit
+ LEAQ -1(SI*2), AX // 1/0 => +1/-1
+ RET
+
+allsame:
+ XORQ AX, AX
+ XORQ CX, CX
+ CMPQ BX, DX
+ SETGT AX // 1 if alen > blen
+ SETEQ CX // 1 if alen == blen
+ LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
+ RET
+
+ // this works for >= 64 bytes of data.
+#ifndef hasAVX2
+big_loop:
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff16
+
+ MOVOU 16(SI), X0
+ MOVOU 16(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff32
+
+ MOVOU 32(SI), X0
+ MOVOU 32(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff48
+
+ MOVOU 48(SI), X0
+ MOVOU 48(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff64
+
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, R8
+ CMPQ R8, $64
+ JBE loop
+ JMP big_loop
+#endif
+
+ // Compare 64-bytes per loop iteration.
+ // Loop is unrolled and uses AVX2.
+big_loop_avx2:
+ VMOVDQU (SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(SI), Y4
+ VMOVDQU 32(DI), Y5
+ VPCMPEQB Y2, Y3, Y0
+ VPMOVMSKB Y0, AX
+ XORL $0xffffffff, AX
+ JNE diff32_avx2
+ VPCMPEQB Y4, Y5, Y6
+ VPMOVMSKB Y6, AX
+ XORL $0xffffffff, AX
+ JNE diff64_avx2
+
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, R8
+ CMPQ R8, $64
+ JB big_loop_avx2_exit
+ JMP big_loop_avx2
+
+ // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+ VZEROUPPER
+ JMP diff16
+
+ // Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+ VZEROUPPER
+ JMP diff48
+
+ // For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+ VZEROUPPER
+ JMP loop
diff --git a/src/internal/bytealg/compare_arm.s b/src/internal/bytealg/compare_arm.s
new file mode 100644
index 0000000..80d01a2
--- /dev/null
+++ b/src/internal/bytealg/compare_arm.s
@@ -0,0 +1,86 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-28
+ MOVW a_base+0(FP), R2
+ MOVW a_len+4(FP), R0
+ MOVW b_base+12(FP), R3
+ MOVW b_len+16(FP), R1
+ ADD $28, R13, R7
+ B cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20
+ MOVW a_base+0(FP), R2
+ MOVW a_len+4(FP), R0
+ MOVW b_base+8(FP), R3
+ MOVW b_len+12(FP), R1
+ ADD $20, R13, R7
+ B cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, R6 and R8 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMP R2, R3
+ BEQ samebytes
+ CMP R0, R1
+ MOVW R0, R6
+ MOVW.LT R1, R6 // R6 is min(R0, R1)
+
+ CMP $0, R6
+ BEQ samebytes
+ CMP $4, R6
+ ADD R2, R6 // R2 is current byte in a, R6 is the end of the range to compare
+ BLT byte_loop // length < 4
+ AND $3, R2, R8
+ CMP $0, R8
+ BNE byte_loop // unaligned a, use byte-wise compare (TODO: try to align a)
+aligned_a:
+ AND $3, R3, R8
+ CMP $0, R8
+ BNE byte_loop // unaligned b, use byte-wise compare
+ AND $0xfffffffc, R6, R8
+ // length >= 4
+chunk4_loop:
+ MOVW.P 4(R2), R4
+ MOVW.P 4(R3), R5
+ CMP R4, R5
+ BNE cmp
+ CMP R2, R8
+ BNE chunk4_loop
+ CMP R2, R6
+ BEQ samebytes // all compared bytes were the same; compare lengths
+byte_loop:
+ MOVBU.P 1(R2), R4
+ MOVBU.P 1(R3), R5
+ CMP R4, R5
+ BNE ret
+ CMP R2, R6
+ BNE byte_loop
+samebytes:
+ CMP R0, R1
+ MOVW.LT $1, R0
+ MOVW.GT $-1, R0
+ MOVW.EQ $0, R0
+ MOVW R0, (R7)
+ RET
+ret:
+ // bytes differed
+ MOVW.LT $1, R0
+ MOVW.GT $-1, R0
+ MOVW R0, (R7)
+ RET
+cmp:
+ SUB $4, R2, R2
+ SUB $4, R3, R3
+ B byte_loop
diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s
new file mode 100644
index 0000000..cc02c46
--- /dev/null
+++ b/src/internal/bytealg/compare_arm64.s
@@ -0,0 +1,125 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+ // R0 = a_base (want in R0)
+ // R1 = a_len (want in R1)
+ // R2 = a_cap (unused)
+ // R3 = b_base (want in R2)
+ // R4 = b_len (want in R3)
+ // R5 = b_cap (unused)
+ MOVD R3, R2
+ MOVD R4, R3
+ B cmpbody<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // R0 = a_base
+ // R1 = a_len
+ // R2 = b_base
+ // R3 = b_len
+ B cmpbody<>(SB)
+
+// On entry:
+// R0 points to the start of a
+// R1 is the length of a
+// R2 points to the start of b
+// R3 is the length of b
+//
+// On exit:
+// R0 is the result
+// R4, R5, R6, R8, R9 and R10 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMP R0, R2
+ BEQ samebytes // same starting pointers; compare lengths
+ CMP R1, R3
+ CSEL LT, R3, R1, R6 // R6 is min(R1, R3)
+
+ CBZ R6, samebytes
+ BIC $0xf, R6, R10
+ CBZ R10, small // length < 16
+ ADD R0, R10 // end of chunk16
+ // length >= 16
+chunk16_loop:
+ LDP.P 16(R0), (R4, R8)
+ LDP.P 16(R2), (R5, R9)
+ CMP R4, R5
+ BNE cmp
+ CMP R8, R9
+ BNE cmpnext
+ CMP R10, R0
+ BNE chunk16_loop
+ AND $0xf, R6, R6
+ CBZ R6, samebytes
+ SUBS $8, R6
+ BLT tail
+ // the length of tail > 8 bytes
+ MOVD.P 8(R0), R4
+ MOVD.P 8(R2), R5
+ CMP R4, R5
+ BNE cmp
+ SUB $8, R6
+ // compare last 8 bytes
+tail:
+ MOVD (R0)(R6), R4
+ MOVD (R2)(R6), R5
+ CMP R4, R5
+ BEQ samebytes
+cmp:
+ REV R4, R4
+ REV R5, R5
+ CMP R4, R5
+ret:
+ MOVD $1, R0
+ CNEG HI, R0, R0
+ RET
+small:
+ TBZ $3, R6, lt_8
+ MOVD (R0), R4
+ MOVD (R2), R5
+ CMP R4, R5
+ BNE cmp
+ SUBS $8, R6
+ BEQ samebytes
+ ADD $8, R0
+ ADD $8, R2
+ SUB $8, R6
+ B tail
+lt_8:
+ TBZ $2, R6, lt_4
+ MOVWU (R0), R4
+ MOVWU (R2), R5
+ CMPW R4, R5
+ BNE cmp
+ SUBS $4, R6
+ BEQ samebytes
+ ADD $4, R0
+ ADD $4, R2
+lt_4:
+ TBZ $1, R6, lt_2
+ MOVHU (R0), R4
+ MOVHU (R2), R5
+ CMPW R4, R5
+ BNE cmp
+ ADD $2, R0
+ ADD $2, R2
+lt_2:
+ TBZ $0, R6, samebytes
+one:
+ MOVBU (R0), R4
+ MOVBU (R2), R5
+ CMPW R4, R5
+ BNE ret
+samebytes:
+ CMP R3, R1
+ CSET NE, R0
+ CNEG LO, R0, R0
+ RET
+cmpnext:
+ REV R8, R4
+ REV R9, R5
+ CMP R4, R5
+ B ret
diff --git a/src/internal/bytealg/compare_generic.go b/src/internal/bytealg/compare_generic.go
new file mode 100644
index 0000000..b04e275
--- /dev/null
+++ b/src/internal/bytealg/compare_generic.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64
+
+package bytealg
+
+import _ "unsafe" // for go:linkname
+
+func Compare(a, b []byte) int {
+ l := len(a)
+ if len(b) < l {
+ l = len(b)
+ }
+ if l == 0 || &a[0] == &b[0] {
+ goto samebytes
+ }
+ for i := 0; i < l; i++ {
+ c1, c2 := a[i], b[i]
+ if c1 < c2 {
+ return -1
+ }
+ if c1 > c2 {
+ return +1
+ }
+ }
+samebytes:
+ if len(a) < len(b) {
+ return -1
+ }
+ if len(a) > len(b) {
+ return +1
+ }
+ return 0
+}
+
+//go:linkname runtime_cmpstring runtime.cmpstring
+func runtime_cmpstring(a, b string) int {
+ l := len(a)
+ if len(b) < l {
+ l = len(b)
+ }
+ for i := 0; i < l; i++ {
+ c1, c2 := a[i], b[i]
+ if c1 < c2 {
+ return -1
+ }
+ if c1 > c2 {
+ return +1
+ }
+ }
+ if len(a) < len(b) {
+ return -1
+ }
+ if len(a) > len(b) {
+ return +1
+ }
+ return 0
+}
diff --git a/src/internal/bytealg/compare_loong64.s b/src/internal/bytealg/compare_loong64.s
new file mode 100644
index 0000000..54c2dab
--- /dev/null
+++ b/src/internal/bytealg/compare_loong64.s
@@ -0,0 +1,86 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+ MOVV a_base+0(FP), R6
+ MOVV b_base+24(FP), R7
+ MOVV a_len+8(FP), R4
+ MOVV b_len+32(FP), R5
+ MOVV $ret+48(FP), R13
+ JMP cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+ MOVV a_base+0(FP), R6
+ MOVV b_base+16(FP), R7
+ MOVV a_len+8(FP), R4
+ MOVV b_len+24(FP), R5
+ MOVV $ret+32(FP), R13
+ JMP cmpbody<>(SB)
+
+// On entry:
+// R4 length of a
+// R5 length of b
+// R6 points to the start of a
+// R7 points to the start of b
+// R13 points to the return value (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
+ BEQ R6, R7, samebytes // same start of a and b
+
+ SGTU R4, R5, R9
+ BNE R0, R9, r2_lt_r1
+ MOVV R4, R14
+ JMP entry
+r2_lt_r1:
+ MOVV R5, R14 // R14 is min(R4, R5)
+entry:
+ ADDV R6, R14, R12 // R6 start of a, R14 end of a
+ BEQ R6, R12, samebytes // length is 0
+
+ SRLV $4, R14 // R14 is number of chunks
+ BEQ R0, R14, byte_loop
+
+ // make sure both a and b are aligned.
+ OR R6, R7, R15
+ AND $7, R15
+ BNE R0, R15, byte_loop
+
+chunk16_loop:
+ BEQ R0, R14, byte_loop
+ MOVV (R6), R8
+ MOVV (R7), R9
+ BNE R8, R9, byte_loop
+ MOVV 8(R6), R16
+ MOVV 8(R7), R17
+ ADDV $16, R6
+ ADDV $16, R7
+ SUBVU $1, R14
+ BEQ R16, R17, chunk16_loop
+ SUBV $8, R6
+ SUBV $8, R7
+
+byte_loop:
+ BEQ R6, R12, samebytes
+ MOVBU (R6), R8
+ ADDVU $1, R6
+ MOVBU (R7), R9
+ ADDVU $1, R7
+ BEQ R8, R9, byte_loop
+
+byte_cmp:
+ SGTU R8, R9, R12 // R12 = 1 if (R8 > R9)
+ BNE R0, R12, ret
+ MOVV $-1, R12
+ JMP ret
+
+samebytes:
+ SGTU R4, R5, R8
+ SGTU R5, R4, R9
+ SUBV R9, R8, R12
+
+ret:
+ MOVV R12, (R13)
+ RET
diff --git a/src/internal/bytealg/compare_mips64x.s b/src/internal/bytealg/compare_mips64x.s
new file mode 100644
index 0000000..117a9ef
--- /dev/null
+++ b/src/internal/bytealg/compare_mips64x.s
@@ -0,0 +1,88 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+ MOVV a_base+0(FP), R3
+ MOVV b_base+24(FP), R4
+ MOVV a_len+8(FP), R1
+ MOVV b_len+32(FP), R2
+ MOVV $ret+48(FP), R9
+ JMP cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+ MOVV a_base+0(FP), R3
+ MOVV b_base+16(FP), R4
+ MOVV a_len+8(FP), R1
+ MOVV b_len+24(FP), R2
+ MOVV $ret+32(FP), R9
+ JMP cmpbody<>(SB)
+
+// On entry:
+// R1 length of a
+// R2 length of b
+// R3 points to the start of a
+// R4 points to the start of b
+// R9 points to the return value (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
+ BEQ R3, R4, samebytes // same start of a and b
+
+ SGTU R1, R2, R7
+ BNE R0, R7, r2_lt_r1
+ MOVV R1, R10
+ JMP entry
+r2_lt_r1:
+ MOVV R2, R10 // R10 is min(R1, R2)
+entry:
+ ADDV R3, R10, R8 // R3 start of a, R8 end of a
+ BEQ R3, R8, samebytes // length is 0
+
+ SRLV $4, R10 // R10 is number of chunks
+ BEQ R0, R10, byte_loop
+
+ // make sure both a and b are aligned.
+ OR R3, R4, R11
+ AND $7, R11
+ BNE R0, R11, byte_loop
+
+chunk16_loop:
+ BEQ R0, R10, byte_loop
+ MOVV (R3), R6
+ MOVV (R4), R7
+ BNE R6, R7, byte_loop
+ MOVV 8(R3), R13
+ MOVV 8(R4), R14
+ ADDV $16, R3
+ ADDV $16, R4
+ SUBVU $1, R10
+ BEQ R13, R14, chunk16_loop
+ SUBV $8, R3
+ SUBV $8, R4
+
+byte_loop:
+ BEQ R3, R8, samebytes
+ MOVBU (R3), R6
+ ADDVU $1, R3
+ MOVBU (R4), R7
+ ADDVU $1, R4
+ BEQ R6, R7, byte_loop
+
+byte_cmp:
+ SGTU R6, R7, R8 // R8 = 1 if (R6 > R7)
+ BNE R0, R8, ret
+ MOVV $-1, R8
+ JMP ret
+
+samebytes:
+ SGTU R1, R2, R6
+ SGTU R2, R1, R7
+ SUBV R7, R6, R8
+
+ret:
+ MOVV R8, (R9)
+ RET
diff --git a/src/internal/bytealg/compare_mipsx.s b/src/internal/bytealg/compare_mipsx.s
new file mode 100644
index 0000000..857ac13
--- /dev/null
+++ b/src/internal/bytealg/compare_mipsx.s
@@ -0,0 +1,72 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+ MOVW a_base+0(FP), R3
+ MOVW b_base+12(FP), R4
+ MOVW a_len+4(FP), R1
+ MOVW b_len+16(FP), R2
+ BEQ R3, R4, samebytes
+ SGTU R1, R2, R7
+ MOVW R1, R8
+ CMOVN R7, R2, R8 // R8 is min(R1, R2)
+
+ ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare
+loop:
+ BEQ R3, R8, samebytes
+
+ MOVBU (R3), R6
+ ADDU $1, R3
+ MOVBU (R4), R7
+ ADDU $1, R4
+ BEQ R6, R7 , loop
+
+ SGTU R6, R7, R8
+ MOVW $-1, R6
+ CMOVZ R8, R6, R8
+ JMP cmp_ret
+samebytes:
+ SGTU R1, R2, R6
+ SGTU R2, R1, R7
+ SUBU R7, R6, R8
+cmp_ret:
+ MOVW R8, ret+24(FP)
+ RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+ MOVW a_base+0(FP), R3
+ MOVW a_len+4(FP), R1
+ MOVW b_base+8(FP), R4
+ MOVW b_len+12(FP), R2
+ BEQ R3, R4, samebytes
+ SGTU R1, R2, R7
+ MOVW R1, R8
+ CMOVN R7, R2, R8 // R8 is min(R1, R2)
+
+ ADDU R3, R8 // R3 is current byte in a, R8 is last byte in a to compare
+loop:
+ BEQ R3, R8, samebytes // all compared bytes were the same; compare lengths
+
+ MOVBU (R3), R6
+ ADDU $1, R3
+ MOVBU (R4), R7
+ ADDU $1, R4
+ BEQ R6, R7 , loop
+ // bytes differed
+ SGTU R6, R7, R8
+ MOVW $-1, R6
+ CMOVZ R8, R6, R8
+ JMP cmp_ret
+samebytes:
+ SGTU R1, R2, R6
+ SGTU R2, R1, R7
+ SUBU R7, R6, R8
+cmp_ret:
+ MOVW R8, ret+16(FP)
+ RET
diff --git a/src/internal/bytealg/compare_native.go b/src/internal/bytealg/compare_native.go
new file mode 100644
index 0000000..34964e2
--- /dev/null
+++ b/src/internal/bytealg/compare_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64 || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64
+
+package bytealg
+
+import _ "unsafe" // For go:linkname
+
+//go:noescape
+func Compare(a, b []byte) int
+
+// The declaration below generates ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+//go:linkname abigen_runtime_cmpstring runtime.cmpstring
+func abigen_runtime_cmpstring(a, b string) int
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
new file mode 100644
index 0000000..cbe0525
--- /dev/null
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -0,0 +1,502 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+ // incoming:
+ // R3 a addr -> R5
+ // R4 a len -> R3
+ // R5 a cap unused
+ // R6 b addr -> R6
+ // R7 b len -> R4
+ // R8 b cap unused
+ MOVD R3, R5
+ MOVD R4, R3
+ MOVD R7, R4
+ CMP R5,R6,CR7
+ CMP R3,R4,CR6
+ BEQ CR7,equal
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ CMP R16,$1
+ BNE power8
+ BR cmpbodyp9<>(SB)
+power8:
+ BR cmpbody<>(SB)
+equal:
+ BEQ CR6,done
+ MOVD $1, R8
+ BGT CR6,greater
+ NEG R8
+greater:
+ MOVD R8, R3
+ RET
+done:
+ MOVD $0, R3
+ RET
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // incoming:
+ // R3 a addr -> R5
+ // R4 a len -> R3
+ // R5 b addr -> R6
+ // R6 b len -> R4
+ MOVD R6, R7
+ MOVD R5, R6
+ MOVD R3, R5
+ MOVD R4, R3
+ MOVD R7, R4
+ CMP R5,R6,CR7
+ CMP R3,R4,CR6
+ BEQ CR7,equal
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ CMP R16,$1
+ BNE power8
+ BR cmpbodyp9<>(SB)
+power8:
+ BR cmpbody<>(SB)
+equal:
+ BEQ CR6,done
+ MOVD $1, R8
+ BGT CR6,greater
+ NEG R8
+greater:
+ MOVD R8, R3
+ RET
+
+done:
+ MOVD $0, R3
+ RET
+
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER8
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// On exit:
+// R3 = return value
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ MOVD R3,R8 // set up length
+ CMP R3,R4,CR2 // unequal?
+ BLT CR2,setuplen // BLT CR2
+ MOVD R4,R8 // use R4 for comparison len
+setuplen:
+ CMP R8,$32 // optimize >= 32
+ MOVD R8,R9
+ BLT setup8a // optimize < 32
+ MOVD $16,R10 // set offsets to load into vectors
+ CMP R8,$64
+ BLT cmp32 // process size 32-63
+
+ DCBT (R5) // optimize >= 64
+ DCBT (R6) // cache hint
+ MOVD $32,R11 // set offsets to load into vector
+ MOVD $48,R12 // set offsets to load into vector
+
+loop64a:// process size 64 and greater
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different // jump out if its different
+
+ LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
+ LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
+ LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R12),V3 // load bytes of A at offset 64 into vector
+ LXVD2X (R6)(R12),V4 // load bytes of B at offset 64 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $-64,R9,R9 // reduce remaining size by 64
+ ADD $64,R5,R5 // increment to next 64 bytes of A
+ ADD $64,R6,R6 // increment to next 64 bytes of B
+ CMPU R9,$64
+ BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
+
+ CMPU R9,$32
+ BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining
+ CMPU R9,$0
+ BNE rem // loop to rem if the remainder is not 0
+
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+cmp32:
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
+ LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $-32,R9,R9 // reduce remaining size by 32
+ ADD $32,R5,R5 // increment to next 32 bytes of A
+ ADD $32,R6,R6 // increment to next 32 bytes of B
+ CMPU R9,$0
+ BNE rem // loop to rem if the remainder is not 0
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+rem:
+ MOVD R9,R8
+ ANDCC $24,R8,R9 // Any 8 byte chunks?
+ BEQ leftover // and result is 0
+ BR setup8a
+
+different:
+#ifdef GOARCH_ppc64le
+ MOVD $byteswap<>+00(SB), R16
+ LXVD2X (R16)(R0),SWAP // Set up swap string
+
+ VPERM V3,V3,SWAP,V3
+ VPERM V4,V4,SWAP,V4
+#endif
+ MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BEQ lower
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+lower:
+ VSLDOI $8,V3,V3,V3 // move lower doublwords of A and B into GPR for comparison
+ MFVSRD VS35,R16
+ VSLDOI $8,V4,V4,V4
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+setup8a:
+ SRADCC $3,R8,R9 // get the 8 byte count
+ BEQ leftover // shifted value is 0
+ CMPU R8,$8 // optimize 8byte move
+ BEQ size8
+ CMPU R8,$16
+ BEQ size16
+ MOVD R9,CTR // loop count for doublewords
+loop8:
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R0),R16 // doublewords to compare
+ MOVDBR (R6+R0),R10 // LE compare order
+#else
+ MOVD (R5+R0),R16 // doublewords to compare
+ MOVD (R6+R0),R10 // BE compare order
+#endif
+ ADD $8,R5
+ ADD $8,R6
+ CMPU R16,R10 // match?
+ BC 8,2,loop8 // bt ctr <> 0 && cr
+ BGT greater
+ BLT less
+leftover:
+ ANDCC $7,R8,R9 // check for leftover bytes
+ BEQ zeroremainder
+simplecheck:
+ MOVD R0,R14
+ CMP R9,$4 // process 4 bytes
+ BLT halfword
+#ifdef GOARCH_ppc64le
+ MOVWBR (R5)(R14),R10
+ MOVWBR (R6)(R14),R11
+#else
+ MOVWZ (R5)(R14),R10
+ MOVWZ (R6)(R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $-4,R9
+ ADD $4,R14
+ PCALIGN $16
+
+halfword:
+ CMP R9,$2 // process 2 bytes
+ BLT byte
+#ifdef GOARCH_ppc64le
+ MOVHBR (R5)(R14),R10
+ MOVHBR (R6)(R14),R11
+#else
+ MOVHZ (R5)(R14),R10
+ MOVHZ (R6)(R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $-2,R9
+ ADD $2,R14
+ PCALIGN $16
+byte:
+ CMP R9,$0 // process 1 byte
+ BEQ skip
+ MOVBZ (R5)(R14),R10
+ MOVBZ (R6)(R14),R11
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ PCALIGN $16
+skip:
+ BEQ CR2,equal
+ BGT CR2,greater
+
+less: MOVD $-1,R3 // return value if A < B
+ RET
+size16:
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+zeroremainder:
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+size8:
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R0),R16 // doublewords to compare
+ MOVDBR (R6+R0),R10 // LE compare order
+#else
+ MOVD (R5+R0),R16 // doublewords to compare
+ MOVD (R6+R0),R10 // BE compare order
+#endif
+ CMPU R16,R10 // match?
+ BGT greater
+ BLT less
+ BGT CR2,greater // 2nd len > 1st len
+ BLT CR2,less // 2nd len < 1st len
+equal:
+ MOVD $0, R3 // return value if A == B
+ RET
+greater:
+ MOVD $1,R3 // return value if A > B
+ RET
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER9
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// On exit:
+// R3 = return value
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
+ MOVD R3,R8 // set up length
+ CMP R3,R4,CR2 // unequal?
+ BLT CR2,setuplen // BLT CR2
+ MOVD R4,R8 // use R4 for comparison len
+setuplen:
+ CMP R8,$16 // optimize for size<16
+ MOVD R8,R9
+ BLT simplecheck
+ MOVD $16,R10 // set offsets to load into vectors
+ CMP R8,$32 // optimize for size 16-31
+ BLT cmp16
+ CMP R8,$64
+ BLT cmp32 // optimize for size 32-63
+ DCBT (R5) // optimize for size>=64
+ DCBT (R6) // cache hint
+
+ MOVD $32,R11 // set offsets to load into vector
+ MOVD $48,R12 // set offsets to load into vector
+
+loop64a:// process size 64 and greater
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+ VCMPNEBCC V3,V4,V1 // record comparison into V1
+ BNE CR6,different // jump out if its different
+
+ LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
+ LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
+ LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ ADD $-64,R9,R9 // reduce remaining size by 64
+ ADD $64,R5,R5 // increment to next 64 bytes of A
+ ADD $64,R6,R6 // increment to next 64 bytes of B
+ CMPU R9,$64
+ BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
+
+ CMPU R9,$32
+ BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining
+ CMPU R9,$16
+ BGE cmp16 // loop to cmp16 if there are 16-31 bytes left
+ CMPU R9,$0
+ BNE simplecheck // loop to simplecheck for remaining bytes
+
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+cmp32:
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+
+ VCMPNEBCC V3,V4,V1 // record comparison into V1
+ BNE CR6,different // jump out if its different
+
+ LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ ADD $-32,R9,R9 // reduce remaining size by 32
+ ADD $32,R5,R5 // increment to next 32 bytes of A
+ ADD $32,R6,R6 // increment to next 32 bytes of B
+ CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left
+ BGE cmp16
+ CMPU R9,$0
+ BNE simplecheck // loop to simplecheck for remainder bytes
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+different:
+
+ MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BEQ lower
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+lower:
+ MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison
+ MFVSRLD VS36,R10
+
+ CMPU R16,R10
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+
+greater:
+ MOVD $1,R3 // return value if A > B
+ RET
+cmp16:
+ ANDCC $16,R9,R31
+ BEQ tail
+
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $16,R5
+ ADD $16,R6
+tail:
+ ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b)
+ BEQ end
+
+ ADD R9,R5
+ ADD R9,R6
+ MOVD $-16,R10
+
+ LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+end:
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B)
+ BR greater // jump to greater otherwise
+simplecheck:
+ MOVD $0,R14 // process 8 bytes
+ CMP R9,$8
+ BLT word
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R14),R10
+ MOVDBR (R6+R14),R11
+#else
+ MOVD (R5+R14),R10
+ MOVD (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $8,R14
+ ADD $-8,R9
+ PCALIGN $16
+word:
+ CMP R9,$4 // process 4 bytes
+ BLT halfword
+#ifdef GOARCH_ppc64le
+ MOVWBR (R5+R14),R10
+ MOVWBR (R6+R14),R11
+#else
+ MOVWZ (R5+R14),R10
+ MOVWZ (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $4,R14
+ ADD $-4,R9
+ PCALIGN $16
+halfword:
+ CMP R9,$2 // process 2 bytes
+ BLT byte
+#ifdef GOARCH_ppc64le
+ MOVHBR (R5+R14),R10
+ MOVHBR (R6+R14),R11
+#else
+ MOVHZ (R5+R14),R10
+ MOVHZ (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $2,R14
+ ADD $-2,R9
+ PCALIGN $16
+byte:
+ CMP R9,$0 // process 1 byte
+ BEQ skip
+ MOVBZ (R5+R14),R10
+ MOVBZ (R6+R14),R11
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ PCALIGN $16
+skip:
+ BEQ CR2,equal
+ BGT CR2,greater
+less:
+ MOVD $-1,R3 // return value if A < B
+ RET
+equal:
+ MOVD $0, R3 // return value if A == B
+ RET
diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
new file mode 100644
index 0000000..44a743d
--- /dev/null
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -0,0 +1,187 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+ // X10 = a_base
+ // X11 = a_len
+ // X12 = a_cap (unused)
+ // X13 = b_base (want in X12)
+ // X14 = b_len (want in X13)
+ // X15 = b_cap (unused)
+ MOV X13, X12
+ MOV X14, X13
+ JMP compare<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // X10 = a_base
+ // X11 = a_len
+ // X12 = b_base
+ // X13 = b_len
+ JMP compare<>(SB)
+
+// On entry:
+// X10 points to start of a
+// X11 length of a
+// X12 points to start of b
+// X13 length of b
+// for non-regabi X14 points to the address to store the return value (-1/0/1)
+// for regabi the return value in X10
+TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
+ BEQ X10, X12, cmp_len
+
+ MOV X11, X5
+ BGE X13, X5, use_a_len // X5 = min(len(a), len(b))
+ MOV X13, X5
+use_a_len:
+ BEQZ X5, cmp_len
+
+ MOV $32, X6
+ BLT X5, X6, loop4_check
+
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X7
+ AND $7, X12, X8
+ BNE X7, X8, loop4_check
+ BEQZ X7, loop32_check
+
+ // Check one byte at a time until we reach 8 byte alignment.
+ SUB X7, X5, X5
+align:
+ ADD $-1, X7
+ MOVBU 0(X10), X8
+ MOVBU 0(X12), X9
+ BNE X8, X9, cmp
+ ADD $1, X10
+ ADD $1, X12
+ BNEZ X7, align
+
+loop32_check:
+ MOV $32, X7
+ BLT X5, X7, loop16_check
+loop32:
+ MOV 0(X10), X15
+ MOV 0(X12), X16
+ MOV 8(X10), X17
+ MOV 8(X12), X18
+ BEQ X15, X16, loop32a
+ JMP cmp8a
+loop32a:
+ BEQ X17, X18, loop32b
+ JMP cmp8b
+loop32b:
+ MOV 16(X10), X15
+ MOV 16(X12), X16
+ MOV 24(X10), X17
+ MOV 24(X12), X18
+ BEQ X15, X16, loop32c
+ JMP cmp8a
+loop32c:
+ BEQ X17, X18, loop32d
+ JMP cmp8b
+loop32d:
+ ADD $32, X10
+ ADD $32, X12
+ ADD $-32, X5
+ BGE X5, X7, loop32
+ BEQZ X5, cmp_len
+
+loop16_check:
+ MOV $16, X6
+ BLT X5, X6, loop4_check
+loop16:
+ MOV 0(X10), X15
+ MOV 0(X12), X16
+ MOV 8(X10), X17
+ MOV 8(X12), X18
+ BEQ X15, X16, loop16a
+ JMP cmp8a
+loop16a:
+ BEQ X17, X18, loop16b
+ JMP cmp8b
+loop16b:
+ ADD $16, X10
+ ADD $16, X12
+ ADD $-16, X5
+ BGE X5, X6, loop16
+ BEQZ X5, cmp_len
+
+loop4_check:
+ MOV $4, X6
+ BLT X5, X6, loop1
+loop4:
+ MOVBU 0(X10), X8
+ MOVBU 0(X12), X9
+ MOVBU 1(X10), X15
+ MOVBU 1(X12), X16
+ BEQ X8, X9, loop4a
+ SLTU X9, X8, X5
+ SLTU X8, X9, X6
+ JMP cmp_ret
+loop4a:
+ BEQ X15, X16, loop4b
+ SLTU X16, X15, X5
+ SLTU X15, X16, X6
+ JMP cmp_ret
+loop4b:
+ MOVBU 2(X10), X21
+ MOVBU 2(X12), X22
+ MOVBU 3(X10), X23
+ MOVBU 3(X12), X24
+ BEQ X21, X22, loop4c
+ SLTU X22, X21, X5
+ SLTU X21, X22, X6
+ JMP cmp_ret
+loop4c:
+ BEQ X23, X24, loop4d
+ SLTU X24, X23, X5
+ SLTU X23, X24, X6
+ JMP cmp_ret
+loop4d:
+ ADD $4, X10
+ ADD $4, X12
+ ADD $-4, X5
+ BGE X5, X6, loop4
+
+loop1:
+ BEQZ X5, cmp_len
+ MOVBU 0(X10), X8
+ MOVBU 0(X12), X9
+ BNE X8, X9, cmp
+ ADD $1, X10
+ ADD $1, X12
+ ADD $-1, X5
+ JMP loop1
+
+ // Compare 8 bytes of memory in X15/X16 that are known to differ.
+cmp8a:
+ MOV $0xff, X19
+cmp8a_loop:
+ AND X15, X19, X8
+ AND X16, X19, X9
+ BNE X8, X9, cmp
+ SLLI $8, X19
+ JMP cmp8a_loop
+
+ // Compare 8 bytes of memory in X17/X18 that are known to differ.
+cmp8b:
+ MOV $0xff, X19
+cmp8b_loop:
+ AND X17, X19, X8
+ AND X18, X19, X9
+ BNE X8, X9, cmp
+ SLLI $8, X19
+ JMP cmp8b_loop
+
+cmp_len:
+ MOV X11, X8
+ MOV X13, X9
+cmp:
+ SLTU X9, X8, X5
+ SLTU X8, X9, X6
+cmp_ret:
+ SUB X5, X6, X10
+ RET
diff --git a/src/internal/bytealg/compare_s390x.s b/src/internal/bytealg/compare_s390x.s
new file mode 100644
index 0000000..5394548
--- /dev/null
+++ b/src/internal/bytealg/compare_s390x.s
@@ -0,0 +1,69 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+ MOVD a_base+0(FP), R3
+ MOVD a_len+8(FP), R4
+ MOVD b_base+24(FP), R5
+ MOVD b_len+32(FP), R6
+ LA ret+48(FP), R7
+ BR cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD a_base+0(FP), R3
+ MOVD a_len+8(FP), R4
+ MOVD b_base+16(FP), R5
+ MOVD b_len+24(FP), R6
+ LA ret+32(FP), R7
+ BR cmpbody<>(SB)
+
+// input:
+// R3 = a
+// R4 = alen
+// R5 = b
+// R6 = blen
+// R7 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMPBEQ R3, R5, cmplengths
+ MOVD R4, R8
+ CMPBLE R4, R6, amin
+ MOVD R6, R8
+amin:
+ CMPBEQ R8, $0, cmplengths
+ CMP R8, $256
+ BLE tail
+loop:
+ CLC $256, 0(R3), 0(R5)
+ BGT gt
+ BLT lt
+ SUB $256, R8
+ MOVD $256(R3), R3
+ MOVD $256(R5), R5
+ CMP R8, $256
+ BGT loop
+tail:
+ SUB $1, R8
+ EXRL $cmpbodyclc<>(SB), R8
+ BGT gt
+ BLT lt
+cmplengths:
+ CMP R4, R6
+ BEQ eq
+ BLT lt
+gt:
+ MOVD $1, 0(R7)
+ RET
+lt:
+ MOVD $-1, 0(R7)
+ RET
+eq:
+ MOVD $0, 0(R7)
+ RET
+
+TEXT cmpbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+ CLC $1, 0(R3), 0(R5)
+ RET
diff --git a/src/internal/bytealg/compare_wasm.s b/src/internal/bytealg/compare_wasm.s
new file mode 100644
index 0000000..dc8fb33
--- /dev/null
+++ b/src/internal/bytealg/compare_wasm.s
@@ -0,0 +1,115 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB), NOSPLIT, $0-56
+ Get SP
+ I64Load a_base+0(FP)
+ I64Load a_len+8(FP)
+ I64Load b_base+24(FP)
+ I64Load b_len+32(FP)
+ Call cmpbody<>(SB)
+ I64Store ret+48(FP)
+ RET
+
+TEXT runtime·cmpstring(SB), NOSPLIT, $0-40
+ Get SP
+ I64Load a_base+0(FP)
+ I64Load a_len+8(FP)
+ I64Load b_base+16(FP)
+ I64Load b_len+24(FP)
+ Call cmpbody<>(SB)
+ I64Store ret+32(FP)
+ RET
+
+// params: a, alen, b, blen
+// ret: -1/0/1
+TEXT cmpbody<>(SB), NOSPLIT, $0-0
+ // len = min(alen, blen)
+ Get R1
+ Get R3
+ Get R1
+ Get R3
+ I64LtU
+ Select
+ Set R4
+
+ Get R0
+ I32WrapI64
+ Get R2
+ I32WrapI64
+ Get R4
+ I32WrapI64
+ Call memcmp<>(SB)
+ I64ExtendI32S
+ Tee R5
+
+ I64Eqz
+ If
+ // check length
+ Get R1
+ Get R3
+ I64Sub
+ Set R5
+ End
+
+ I64Const $0
+ I64Const $-1
+ I64Const $1
+ Get R5
+ I64Const $0
+ I64LtS
+ Select
+ Get R5
+ I64Eqz
+ Select
+ Return
+
+// compiled with emscripten
+// params: a, b, len
+// ret: <0/0/>0
+TEXT memcmp<>(SB), NOSPLIT, $0-0
+ Get R2
+ If $1
+ Loop
+ Get R0
+ I32Load8S $0
+ Tee R3
+ Get R1
+ I32Load8S $0
+ Tee R4
+ I32Eq
+ If
+ Get R0
+ I32Const $1
+ I32Add
+ Set R0
+ Get R1
+ I32Const $1
+ I32Add
+ Set R1
+ I32Const $0
+ Get R2
+ I32Const $-1
+ I32Add
+ Tee R2
+ I32Eqz
+ BrIf $3
+ Drop
+ Br $1
+ End
+ End
+ Get R3
+ I32Const $255
+ I32And
+ Get R4
+ I32Const $255
+ I32And
+ I32Sub
+ Else
+ I32Const $0
+ End
+ Return
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
new file mode 100644
index 0000000..efb17f8
--- /dev/null
+++ b/src/internal/bytealg/count_amd64.s
@@ -0,0 +1,208 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
+ CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+ JEQ 2(PC)
+ JMP ·countGeneric(SB)
+#endif
+ MOVQ b_base+0(FP), SI
+ MOVQ b_len+8(FP), BX
+ MOVB c+24(FP), AL
+ LEAQ ret+32(FP), R8
+ JMP countbody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
+ CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+ JEQ 2(PC)
+ JMP ·countGenericString(SB)
+#endif
+ MOVQ s_base+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+16(FP), AL
+ LEAQ ret+24(FP), R8
+ JMP countbody<>(SB)
+
+// input:
+// SI: data
+// BX: data len
+// AL: byte sought
+// R8: address to put result
+// This function requires the POPCNT instruction.
+TEXT countbody<>(SB),NOSPLIT,$0
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
+ MOVD AX, X0
+ PUNPCKLBW X0, X0
+ PUNPCKLBW X0, X0
+ PSHUFL $0, X0, X0
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ $0, R12 // Accumulator
+
+ MOVQ SI, DI
+
+ CMPQ BX, $32
+ JA avx2
+sse:
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
+ PMOVMSKB X1, DX
+ // Count number of matching bytes
+ POPCNTL DX, DX
+ // Accumulate into R12
+ ADDQ DX, R12
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JBE sseloop
+
+ // Get the number of bytes to consider in the last 16 bytes
+ ANDQ $15, BX
+ JZ end
+
+ // Create mask to ignore overlap between previous 16 byte block
+ // and the next.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Process the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched so we need to mask part of it.
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+end:
+ MOVQ R12, (R8)
+ RET
+
+// handle for lengths < 16
+small:
+ TESTQ BX, BX
+ JEQ endzero
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ // We must ignore high bytes as they aren't part of our slice.
+ // Create mask.
+ MOVB BX, CX
+ MOVQ $1, R10
+ SALQ CL, R10
+ SUBQ $1, R10
+
+ // Load data
+ MOVOU (SI), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ MOVQ DX, (R8)
+ RET
+endzero:
+ MOVQ $0, (R8)
+ RET
+
+endofpage:
+ // We must ignore low bytes as they aren't part of our slice.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Load data into the high end of X1.
+ MOVOU -16(SI)(BX*1), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ POPCNTL DX, DX
+ MOVQ DX, (R8)
+ RET
+
+avx2:
+#ifndef hasAVX2
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+ JNE sse
+#endif
+ MOVD AX, X0
+ LEAQ -32(SI)(BX*1), R11
+ VPBROADCASTB X0, Y1
+avx2_loop:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ ADDQ $32, DI
+ CMPQ DI, R11
+ JLE avx2_loop
+
+ // If last block is already processed,
+ // skip to the end.
+ CMPQ DI, R11
+ JEQ endavx
+
+ // Load address of the last 32 bytes.
+ // There is an overlap with the previous block.
+ MOVQ R11, DI
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ // Exit AVX mode.
+ VZEROUPPER
+
+ // Create mask to ignore overlap between previous 32 byte block
+ // and the next.
+ ANDQ $31, BX
+ MOVQ $32,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFFFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ MOVQ R12, (R8)
+ RET
+endavx:
+ // Exit AVX mode.
+ VZEROUPPER
+ MOVQ R12, (R8)
+ RET
diff --git a/src/internal/bytealg/count_arm.s b/src/internal/bytealg/count_arm.s
new file mode 100644
index 0000000..f704ea0
--- /dev/null
+++ b/src/internal/bytealg/count_arm.s
@@ -0,0 +1,43 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-20
+ MOVW b_base+0(FP), R0
+ MOVW b_len+4(FP), R1
+ MOVBU c+12(FP), R2
+ MOVW $ret+16(FP), R7
+ B countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-16
+ MOVW s_base+0(FP), R0
+ MOVW s_len+4(FP), R1
+ MOVBU c+8(FP), R2
+ MOVW $ret+12(FP), R7
+ B countbytebody<>(SB)
+
+// Input:
+// R0: data
+// R1: data length
+// R2: byte to find
+// R7: address to put result
+//
+// On exit:
+// R4 and R8 are clobbered
+TEXT countbytebody<>(SB),NOSPLIT,$0
+ MOVW $0, R8 // R8 = count of byte to search
+ CMP $0, R1
+ B.EQ done // short path to handle 0-byte case
+ ADD R0, R1 // R1 is the end of the range
+byte_loop:
+ MOVBU.P 1(R0), R4
+ CMP R4, R2
+ ADD.EQ $1, R8
+ CMP R0, R1
+ B.NE byte_loop
+done:
+ MOVW R8, (R7)
+ RET
diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s
new file mode 100644
index 0000000..8cd703d
--- /dev/null
+++ b/src/internal/bytealg/count_arm64.s
@@ -0,0 +1,90 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+ MOVD b_base+0(FP), R0
+ MOVD b_len+8(FP), R2
+ MOVBU c+24(FP), R1
+ MOVD $ret+32(FP), R8
+ B countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+ MOVD s_base+0(FP), R0
+ MOVD s_len+8(FP), R2
+ MOVBU c+16(FP), R1
+ MOVD $ret+24(FP), R8
+ B countbytebody<>(SB)
+
+// input:
+// R0: data
+// R2: data len
+// R1: byte to find
+// R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+ // R11 = count of byte to search
+ MOVD $0, R11
+ // short path to handle 0-byte case
+ CBZ R2, done
+ CMP $0x20, R2
+ // jump directly to tail if length < 32
+ BLO tail
+ ANDS $0x1f, R0, R9
+ BEQ chunk
+ // Work with not 32-byte aligned head
+ BIC $0x1f, R0, R3
+ ADD $0x20, R3
+head_loop:
+ MOVBU.P 1(R0), R5
+ CMP R5, R1
+ CINC EQ, R11, R11
+ SUB $1, R2, R2
+ CMP R0, R3
+ BNE head_loop
+ // Work with 32-byte aligned chunks
+chunk:
+ BIC $0x1f, R2, R9
+ // The first chunk can also be the last
+ CBZ R9, tail
+ // R3 = end of 32-byte chunks
+ ADD R0, R9, R3
+ MOVD $1, R5
+ VMOV R5, V5.B16
+ // R2 = length of tail
+ SUB R9, R2, R2
+ // Duplicate R1 (byte to search) to 16 1-byte elements of V0
+ VMOV R1, V0.B16
+ // Clear the low 64-bit element of V7 and V8
+ VEOR V7.B8, V7.B8, V7.B8
+ VEOR V8.B8, V8.B8, V8.B8
+ // Count the target byte in 32-byte chunk
+chunk_loop:
+ VLD1.P (R0), [V1.B16, V2.B16]
+ CMP R0, R3
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ // Clear the higher 7 bits
+ VAND V5.B16, V3.B16, V3.B16
+ VAND V5.B16, V4.B16, V4.B16
+ // Count lanes match the requested byte
+ VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
+ VUADDLV V6.B16, V7
+ // Accumulate the count in low 64-bit element of V8 when inside the loop
+ VADD V7, V8
+ BNE chunk_loop
+ VMOV V8.D[0], R6
+ ADD R6, R11, R11
+ CBZ R2, done
+tail:
+ // Work with tail shorter than 32 bytes
+ MOVBU.P 1(R0), R5
+ SUB $1, R2, R2
+ CMP R5, R1
+ CINC EQ, R11, R11
+ CBNZ R2, tail
+done:
+ MOVD R11, (R8)
+ RET
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
new file mode 100644
index 0000000..932a7c5
--- /dev/null
+++ b/src/internal/bytealg/count_generic.go
@@ -0,0 +1,27 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !arm && !arm64 && !ppc64le && !ppc64 && !riscv64 && !s390x
+
+package bytealg
+
+func Count(b []byte, c byte) int {
+ n := 0
+ for _, x := range b {
+ if x == c {
+ n++
+ }
+ }
+ return n
+}
+
+func CountString(s string, c byte) int {
+ n := 0
+ for i := 0; i < len(s); i++ {
+ if s[i] == c {
+ n++
+ }
+ }
+ return n
+}
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
new file mode 100644
index 0000000..90189c9
--- /dev/null
+++ b/src/internal/bytealg/count_native.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 || arm || arm64 || ppc64le || ppc64 || riscv64 || s390x
+
+package bytealg
+
+//go:noescape
+func Count(b []byte, c byte) int
+
+//go:noescape
+func CountString(s string, c byte) int
+
+// A backup implementation to use by assembly.
+func countGeneric(b []byte, c byte) int {
+ n := 0
+ for _, x := range b {
+ if x == c {
+ n++
+ }
+ }
+ return n
+}
+func countGenericString(s string, c byte) int {
+ n := 0
+ for i := 0; i < len(s); i++ {
+ if s[i] == c {
+ n++
+ }
+ }
+ return n
+}
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
new file mode 100644
index 0000000..2d2490b
--- /dev/null
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -0,0 +1,96 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64le || ppc64
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // R3 = byte array pointer
+ // R4 = length
+ MOVBZ R6, R5 // R5 = byte
+ BR countbytebody<>(SB)
+
+TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
+ // R3 = byte array pointer
+ // R4 = length
+ MOVBZ R5, R5 // R5 = byte
+ BR countbytebody<>(SB)
+
+// R3: addr of string
+// R4: len of string
+// R5: byte to count
+// On exit:
+// R3: return value
+// endianness shouldn't matter since we are just counting and order
+// is irrelevant
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+ DCBT (R3) // Prepare cache line.
+ MOVD R0, R18 // byte count
+ MOVD R3, R19 // Save base address for calculating the index later.
+ MOVD R4, R16
+
+ MOVD R5, R6
+ RLDIMI $8, R6, $48, R6
+ RLDIMI $16, R6, $32, R6
+ RLDIMI $32, R6, $0, R6 // fill reg with the byte to count
+
+ VSPLTISW $3, V4 // used for shift
+ MTVRD R6, V1 // move compare byte
+ VSPLTB $7, V1, V1 // replicate byte across V1
+
+ CMPU R4, $32 // Check if it's a small string (<32 bytes)
+ BLT tail // Jump to the small string case
+ XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
+
+cmploop:
+ LXVW4X (R3), VS32 // load bytes from string
+
+ // when the bytes match, the corresponding byte contains all 1s
+ VCMPEQUB V1, V0, V2 // compare bytes
+ VPOPCNTD V2, V3 // each double word contains its count
+ VADDUDM V3, V5, V5 // accumulate bit count in each double word
+ ADD $16, R3, R3 // increment pointer
+ SUB $16, R16, R16 // remaining bytes
+ CMP R16, $16 // at least 16 remaining?
+ BGE cmploop
+ VSRD V5, V4, V5 // shift by 3 to convert bits to bytes
+ VSLDOI $8, V5, V5, V6 // get the double word values from vector
+ MFVSRD V5, R9
+ MFVSRD V6, R10
+ ADD R9, R10, R9
+ ADD R9, R18, R18
+
+tail:
+ CMP R16, $8 // 8 bytes left?
+ BLT small
+
+ MOVD (R3), R12 // load 8 bytes
+ CMPB R12, R6, R17 // compare bytes
+ POPCNTD R17, R15 // bit count
+ SRD $3, R15, R15 // byte count
+ ADD R15, R18, R18 // add to byte count
+
+next1:
+ ADD $8, R3, R3
+ SUB $8, R16, R16 // remaining bytes
+ BR tail
+
+small:
+ CMP $0, R16 // any remaining
+ BEQ done
+ MOVBZ (R3), R12 // check each remaining byte
+ CMP R12, R5
+ BNE next2
+ ADD $1, R18
+
+next2:
+ SUB $1, R16
+ ADD $1, R3 // inc address
+ BR small
+
+done:
+ MOVD R18, R3 // return count
+ RET
diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s
new file mode 100644
index 0000000..d123cbd
--- /dev/null
+++ b/src/internal/bytealg/count_riscv64.s
@@ -0,0 +1,47 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
+ // X10 = b_base
+ // X11 = b_len
+ // X12 = b_cap (unused)
+ // X13 = byte to count (want in X12)
+ AND $0xff, X13, X12
+ MOV ZERO, X14 // count
+ ADD X10, X11 // end
+
+loop:
+ BEQ X10, X11, done
+ MOVBU (X10), X15
+ ADD $1, X10
+ BNE X12, X15, loop
+ ADD $1, X14
+ JMP loop
+
+done:
+ MOV X14, X10
+ RET
+
+TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
+ // X10 = s_base
+ // X11 = s_len
+ // X12 = byte to count
+ AND $0xff, X12
+ MOV ZERO, X14 // count
+ ADD X10, X11 // end
+
+loop:
+ BEQ X10, X11, done
+ MOVBU (X10), X15
+ ADD $1, X10
+ BNE X12, X15, loop
+ ADD $1, X14
+ JMP loop
+
+done:
+ MOV X14, X10
+ RET
diff --git a/src/internal/bytealg/count_s390x.s b/src/internal/bytealg/count_s390x.s
new file mode 100644
index 0000000..2a3b5c0
--- /dev/null
+++ b/src/internal/bytealg/count_s390x.s
@@ -0,0 +1,169 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// condition code masks
+#define EQ 8
+#define NE 7
+
+// register assignments
+#define R_ZERO R0
+#define R_VAL R1
+#define R_TMP R2
+#define R_PTR R3
+#define R_LEN R4
+#define R_CHAR R5
+#define R_RET R6
+#define R_ITER R7
+#define R_CNT R8
+#define R_MPTR R9
+
+// vector register assignments
+#define V_ZERO V0
+#define V_CHAR V1
+#define V_MASK V2
+#define V_VAL V3
+#define V_CNT V4
+
+// mask for trailing bytes in vector implementation
+GLOBL countbytemask<>(SB), RODATA, $16
+DATA countbytemask<>+0(SB)/8, $0x0101010101010101
+DATA countbytemask<>+8(SB)/8, $0x0101010101010101
+
+// func Count(b []byte, c byte) int
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+ LMG b+0(FP), R_PTR, R_LEN
+ MOVBZ c+24(FP), R_CHAR
+ MOVD $ret+32(FP), R_RET
+ BR countbytebody<>(SB)
+
+// func CountString(s string, c byte) int
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+ LMG s+0(FP), R_PTR, R_LEN
+ MOVBZ c+16(FP), R_CHAR
+ MOVD $ret+24(FP), R_RET
+ BR countbytebody<>(SB)
+
+// input:
+// R_PTR = address of array of bytes
+// R_LEN = number of bytes in array
+// R_CHAR = byte value to count zero (extended to register width)
+// R_RET = address of return value
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+ MOVD $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
+ MOVD $countbytemask<>(SB), R_MPTR
+ CGIJ $EQ, R_LEN, $0, ret0 // return if length is 0.
+ SRD $4, R_LEN, R_ITER // R_ITER is the number of 16-byte chunks
+ MOVBZ (R_TMP), R_TMP // load bool indicating support for vector facility
+ CGIJ $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
+
+ // Start of vector code (have vector facility).
+ //
+ // Set R_LEN to be the length mod 16 minus 1 to use as an index for
+ // vector 'load with length' (VLL). It will be in the range [-1,14].
+ // Also replicate c across a 16-byte vector and initialize V_ZERO.
+ ANDW $0xf, R_LEN
+ VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
+ VZERO V_ZERO // V_ZERO = [1]uint128{0}
+ ADDW $-1, R_LEN
+ VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
+
+ // Jump to loop if we have more than 15 bytes to process.
+ CGIJ $NE, R_ITER, $0, vxchunks
+
+ // Load 1-15 bytes and corresponding mask.
+ // Note: only the low 32-bits of R_LEN are used for the index.
+ VLL R_LEN, (R_PTR), V_VAL
+ VLL R_LEN, (R_MPTR), V_MASK
+
+ // Compare each byte in input chunk against byte to be counted.
+ // Each byte element will be set to either 0 (no match) or 1 (match).
+ VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+ VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+ // Accumulate matched byte count in 128-bit integer value.
+ VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+ VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+
+ // Return rightmost (lowest) 64-bit part of accumulator.
+ VSTEG $1, V_CNT, (R_RET)
+ RET
+
+vxchunks:
+ // Load 0x01 into every byte element in the 16-byte mask vector.
+ VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
+ VZERO V_CNT // initial uint128 count of 0
+
+vxloop:
+ // Load input bytes in 16-byte chunks.
+ VL (R_PTR), V_VAL
+
+ // Compare each byte in input chunk against byte to be counted.
+ // Each byte element will be set to either 0 (no match) or 1 (match).
+ VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+ VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+ // Increment input string address.
+ MOVD $16(R_PTR), R_PTR
+
+ // Accumulate matched byte count in 128-bit integer value.
+ VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+ VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+ VAQ V_VAL, V_CNT, V_CNT // accumulate
+
+ // Repeat until all 16-byte chunks are done.
+ BRCTG R_ITER, vxloop
+
+ // Skip to end if there are no trailing bytes.
+ CIJ $EQ, R_LEN, $-1, vxret
+
+ // Load 1-15 bytes and corresponding mask.
+ // Note: only the low 32-bits of R_LEN are used for the index.
+ VLL R_LEN, (R_PTR), V_VAL
+ VLL R_LEN, (R_MPTR), V_MASK
+
+ // Compare each byte in input chunk against byte to be counted.
+ // Each byte element will be set to either 0 (no match) or 1 (match).
+ VCEQB V_CHAR, V_VAL, V_VAL
+ VN V_MASK, V_VAL, V_VAL
+
+ // Accumulate matched byte count in 128-bit integer value.
+ VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+ VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+ VAQ V_VAL, V_CNT, V_CNT // accumulate
+
+vxret:
+ // Return rightmost (lowest) 64-bit part of accumulator.
+ VSTEG $1, V_CNT, (R_RET)
+ RET
+
+novx:
+ // Start of non-vector code (the vector facility not available).
+ //
+ // Initialise counter and constant zero.
+ MOVD $0, R_CNT
+ MOVD $0, R_ZERO
+
+loop:
+ // Read 1-byte from input and compare.
+ // Note: avoid putting LOCGR in critical path.
+ MOVBZ (R_PTR), R_VAL
+ MOVD $1, R_TMP
+ MOVD $1(R_PTR), R_PTR
+ CMPW R_VAL, R_CHAR
+ LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
+ ADD R_TMP, R_CNT // accumulate 64-bit result
+
+ // Repeat until all bytes have been checked.
+ BRCTG R_LEN, loop
+
+ret:
+ MOVD R_CNT, (R_RET)
+ RET
+
+ret0:
+ MOVD $0, (R_RET)
+ RET
diff --git a/src/internal/bytealg/equal_386.s b/src/internal/bytealg/equal_386.s
new file mode 100644
index 0000000..58b3cbe
--- /dev/null
+++ b/src/internal/bytealg/equal_386.s
@@ -0,0 +1,130 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+ MOVL a+0(FP), SI
+ MOVL b+4(FP), DI
+ CMPL SI, DI
+ JEQ eq
+ MOVL size+8(FP), BX
+ LEAL ret+12(FP), AX
+ JMP memeqbody<>(SB)
+eq:
+ MOVB $1, ret+12(FP)
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+ MOVL a+0(FP), SI
+ MOVL b+4(FP), DI
+ CMPL SI, DI
+ JEQ eq
+ MOVL 4(DX), BX // compiler stores size at offset 4 in the closure
+ LEAL ret+8(FP), AX
+ JMP memeqbody<>(SB)
+eq:
+ MOVB $1, ret+8(FP)
+ RET
+
+// a in SI
+// b in DI
+// count in BX
+// address of result byte in AX
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+ CMPL BX, $4
+ JB small
+
+ // 64 bytes at a time using xmm registers
+hugeloop:
+ CMPL BX, $64
+ JB bigloop
+#ifdef GO386_softfloat
+ JMP bigloop
+#endif
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
+ PCMPEQB X1, X0
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX
+ ADDL $64, SI
+ ADDL $64, DI
+ SUBL $64, BX
+ CMPL DX, $0xffff
+ JEQ hugeloop
+ MOVB $0, (AX)
+ RET
+
+ // 4 bytes at a time using 32-bit register
+bigloop:
+ CMPL BX, $4
+ JBE leftover
+ MOVL (SI), CX
+ MOVL (DI), DX
+ ADDL $4, SI
+ ADDL $4, DI
+ SUBL $4, BX
+ CMPL CX, DX
+ JEQ bigloop
+ MOVB $0, (AX)
+ RET
+
+ // remaining 0-4 bytes
+leftover:
+ MOVL -4(SI)(BX*1), CX
+ MOVL -4(DI)(BX*1), DX
+ CMPL CX, DX
+ SETEQ (AX)
+ RET
+
+small:
+ CMPL BX, $0
+ JEQ equal
+
+ LEAL 0(BX*8), CX
+ NEGL CX
+
+ MOVL SI, DX
+ CMPB DX, $0xfc
+ JA si_high
+
+ // load at SI won't cross a page boundary.
+ MOVL (SI), SI
+ JMP si_finish
+si_high:
+ // address ends in 111111xx. Load up to bytes we want, move to correct position.
+ MOVL -4(SI)(BX*1), SI
+ SHRL CX, SI
+si_finish:
+
+ // same for DI.
+ MOVL DI, DX
+ CMPB DX, $0xfc
+ JA di_high
+ MOVL (DI), DI
+ JMP di_finish
+di_high:
+ MOVL -4(DI)(BX*1), DI
+ SHRL CX, DI
+di_finish:
+
+ SUBL SI, DI
+ SHLL CX, DI
+equal:
+ SETEQ (AX)
+ RET
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
new file mode 100644
index 0000000..d178a33
--- /dev/null
+++ b/src/internal/bytealg/equal_amd64.s
@@ -0,0 +1,162 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
+ // AX = a (want in SI)
+ // BX = b (want in DI)
+ // CX = size (want in BX)
+ CMPQ AX, BX
+ JNE neq
+ MOVQ $1, AX // return 1
+ RET
+neq:
+ MOVQ AX, SI
+ MOVQ BX, DI
+ MOVQ CX, BX
+ JMP memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+ // AX = a (want in SI)
+ // BX = b (want in DI)
+ // 8(DX) = size (want in BX)
+ CMPQ AX, BX
+ JNE neq
+ MOVQ $1, AX // return 1
+ RET
+neq:
+ MOVQ AX, SI
+ MOVQ BX, DI
+ MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure
+ JMP memeqbody<>(SB)
+
+// Input:
+// a in SI
+// b in DI
+// count in BX
+// Output:
+// result in AX
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+ CMPQ BX, $8
+ JB small
+ CMPQ BX, $64
+ JB bigloop
+#ifndef hasAVX2
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+ JE hugeloop_avx2
+
+ // 64 bytes at a time using xmm registers
+hugeloop:
+ CMPQ BX, $64
+ JB bigloop
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
+ PCMPEQB X1, X0
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, BX
+ CMPL DX, $0xffff
+ JEQ hugeloop
+ XORQ AX, AX // return 0
+ RET
+#endif
+
+ // 64 bytes at a time using ymm registers
+hugeloop_avx2:
+ CMPQ BX, $64
+ JB bigloop_avx2
+ VMOVDQU (SI), Y0
+ VMOVDQU (DI), Y1
+ VMOVDQU 32(SI), Y2
+ VMOVDQU 32(DI), Y3
+ VPCMPEQB Y1, Y0, Y4
+ VPCMPEQB Y2, Y3, Y5
+ VPAND Y4, Y5, Y6
+ VPMOVMSKB Y6, DX
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, BX
+ CMPL DX, $0xffffffff
+ JEQ hugeloop_avx2
+ VZEROUPPER
+ XORQ AX, AX // return 0
+ RET
+
+bigloop_avx2:
+ VZEROUPPER
+
+ // 8 bytes at a time using 64-bit register
+bigloop:
+ CMPQ BX, $8
+ JBE leftover
+ MOVQ (SI), CX
+ MOVQ (DI), DX
+ ADDQ $8, SI
+ ADDQ $8, DI
+ SUBQ $8, BX
+ CMPQ CX, DX
+ JEQ bigloop
+ XORQ AX, AX // return 0
+ RET
+
+ // remaining 0-8 bytes
+leftover:
+ MOVQ -8(SI)(BX*1), CX
+ MOVQ -8(DI)(BX*1), DX
+ CMPQ CX, DX
+ SETEQ AX
+ RET
+
+small:
+ CMPQ BX, $0
+ JEQ equal
+
+ LEAQ 0(BX*8), CX
+ NEGQ CX
+
+ CMPB SI, $0xf8
+ JA si_high
+
+ // load at SI won't cross a page boundary.
+ MOVQ (SI), SI
+ JMP si_finish
+si_high:
+ // address ends in 11111xxx. Load up to bytes we want, move to correct position.
+ MOVQ -8(SI)(BX*1), SI
+ SHRQ CX, SI
+si_finish:
+
+ // same for DI.
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ MOVQ -8(DI)(BX*1), DI
+ SHRQ CX, DI
+di_finish:
+
+ SUBQ SI, DI
+ SHLQ CX, DI
+equal:
+ SETEQ AX
+ RET
diff --git a/src/internal/bytealg/equal_arm.s b/src/internal/bytealg/equal_arm.s
new file mode 100644
index 0000000..a6c4369
--- /dev/null
+++ b/src/internal/bytealg/equal_arm.s
@@ -0,0 +1,91 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-13
+ MOVW a+0(FP), R0
+ MOVW b+4(FP), R2
+ CMP R0, R2
+ B.EQ eq
+ MOVW size+8(FP), R1
+ CMP $0, R1
+ B.EQ eq // short path to handle 0-byte case
+ MOVW $ret+12(FP), R7
+ B memeqbody<>(SB)
+eq:
+ MOVW $1, R0
+ MOVB R0, ret+12(FP)
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-9
+ MOVW a+0(FP), R0
+ MOVW b+4(FP), R2
+ CMP R0, R2
+ B.EQ eq
+ MOVW 4(R7), R1 // compiler stores size at offset 4 in the closure
+ CMP $0, R1
+ B.EQ eq // short path to handle 0-byte case
+ MOVW $ret+8(FP), R7
+ B memeqbody<>(SB)
+eq:
+ MOVW $1, R0
+ MOVB R0, ret+8(FP)
+ RET
+
+// Input:
+// R0: data of a
+// R1: length
+// R2: data of b
+// R7: points to return value
+//
+// On exit:
+// R4, R5 and R6 are clobbered
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMP $1, R1
+ B.EQ one // 1-byte special case for better performance
+
+ CMP $4, R1
+ ADD R0, R1 // R1 is the end of the range to compare
+ B.LT byte_loop // length < 4
+ AND $3, R0, R6
+ CMP $0, R6
+ B.NE byte_loop // unaligned a, use byte-wise compare (TODO: try to align a)
+ AND $3, R2, R6
+ CMP $0, R6
+ B.NE byte_loop // unaligned b, use byte-wise compare
+ AND $0xfffffffc, R1, R6
+ // length >= 4
+chunk4_loop:
+ MOVW.P 4(R0), R4
+ MOVW.P 4(R2), R5
+ CMP R4, R5
+ B.NE notequal
+ CMP R0, R6
+ B.NE chunk4_loop
+ CMP R0, R1
+ B.EQ equal // reached the end
+byte_loop:
+ MOVBU.P 1(R0), R4
+ MOVBU.P 1(R2), R5
+ CMP R4, R5
+ B.NE notequal
+ CMP R0, R1
+ B.NE byte_loop
+equal:
+ MOVW $1, R0
+ MOVB R0, (R7)
+ RET
+one:
+ MOVBU (R0), R4
+ MOVBU (R2), R5
+ CMP R4, R5
+ B.EQ equal
+notequal:
+ MOVW $0, R0
+ MOVB R0, (R7)
+ RET
diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s
new file mode 100644
index 0000000..d3aabba
--- /dev/null
+++ b/src/internal/bytealg/equal_arm64.s
@@ -0,0 +1,121 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+ // short path to handle 0-byte case
+ CBZ R2, equal
+ B memeqbody<>(SB)
+equal:
+ MOVD $1, R0
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+ CMP R0, R1
+ BEQ eq
+ MOVD 8(R26), R2 // compiler stores size at offset 8 in the closure
+ CBZ R2, eq
+ B memeqbody<>(SB)
+eq:
+ MOVD $1, R0
+ RET
+
+// input:
+// R0: pointer a
+// R1: pointer b
+// R2: data len
+// at return: result in R0
+TEXT memeqbody<>(SB),NOSPLIT,$0
+ CMP $1, R2
+ // handle 1-byte special case for better performance
+ BEQ one
+ CMP $16, R2
+ // handle specially if length < 16
+ BLO tail
+ BIC $0x3f, R2, R3
+ CBZ R3, chunk16
+ // work with 64-byte chunks
+ ADD R3, R0, R6 // end of chunks
+chunk64_loop:
+ VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2]
+ VLD1.P (R1), [V4.D2, V5.D2, V6.D2, V7.D2]
+ VCMEQ V0.D2, V4.D2, V8.D2
+ VCMEQ V1.D2, V5.D2, V9.D2
+ VCMEQ V2.D2, V6.D2, V10.D2
+ VCMEQ V3.D2, V7.D2, V11.D2
+ VAND V8.B16, V9.B16, V8.B16
+ VAND V8.B16, V10.B16, V8.B16
+ VAND V8.B16, V11.B16, V8.B16
+ CMP R0, R6
+ VMOV V8.D[0], R4
+ VMOV V8.D[1], R5
+ CBZ R4, not_equal
+ CBZ R5, not_equal
+ BNE chunk64_loop
+ AND $0x3f, R2, R2
+ CBZ R2, equal
+chunk16:
+ // work with 16-byte chunks
+ BIC $0xf, R2, R3
+ CBZ R3, tail
+ ADD R3, R0, R6 // end of chunks
+chunk16_loop:
+ LDP.P 16(R0), (R4, R5)
+ LDP.P 16(R1), (R7, R9)
+ EOR R4, R7
+ CBNZ R7, not_equal
+ EOR R5, R9
+ CBNZ R9, not_equal
+ CMP R0, R6
+ BNE chunk16_loop
+ AND $0xf, R2, R2
+ CBZ R2, equal
+tail:
+ // special compare of tail with length < 16
+ TBZ $3, R2, lt_8
+ MOVD (R0), R4
+ MOVD (R1), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ SUB $8, R2, R6 // offset of the last 8 bytes
+ MOVD (R0)(R6), R4
+ MOVD (R1)(R6), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ B equal
+lt_8:
+ TBZ $2, R2, lt_4
+ MOVWU (R0), R4
+ MOVWU (R1), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ SUB $4, R2, R6 // offset of the last 4 bytes
+ MOVWU (R0)(R6), R4
+ MOVWU (R1)(R6), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ B equal
+lt_4:
+ TBZ $1, R2, lt_2
+ MOVHU.P 2(R0), R4
+ MOVHU.P 2(R1), R5
+ CMP R4, R5
+ BNE not_equal
+lt_2:
+ TBZ $0, R2, equal
+one:
+ MOVBU (R0), R4
+ MOVBU (R1), R5
+ CMP R4, R5
+ BNE not_equal
+equal:
+ MOVD $1, R0
+ RET
+not_equal:
+ MOVB ZR, R0
+ RET
diff --git a/src/internal/bytealg/equal_generic.go b/src/internal/bytealg/equal_generic.go
new file mode 100644
index 0000000..59bdf8f
--- /dev/null
+++ b/src/internal/bytealg/equal_generic.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Equal reports whether a and b
+// are the same length and contain the same bytes.
+// A nil argument is equivalent to an empty slice.
+//
+// Equal is equivalent to bytes.Equal.
+// It is provided here for convenience,
+// because some packages cannot depend on bytes.
+func Equal(a, b []byte) bool {
+ // Neither cmd/compile nor gccgo allocates for these string conversions.
+ // There is a test for this in package bytes.
+ return string(a) == string(b)
+}
diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s
new file mode 100644
index 0000000..dcdde89
--- /dev/null
+++ b/src/internal/bytealg/equal_loong64.s
@@ -0,0 +1,52 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define REGCTXT R29
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+ MOVV a+0(FP), R4
+ MOVV b+8(FP), R5
+ BEQ R4, R5, eq
+ MOVV size+16(FP), R6
+ ADDV R4, R6, R7
+loop:
+ BNE R4, R7, test
+ MOVV $1, R4
+ MOVB R4, ret+24(FP)
+ RET
+test:
+ MOVBU (R4), R9
+ ADDV $1, R4
+ MOVBU (R5), R10
+ ADDV $1, R5
+ BEQ R9, R10, loop
+
+ MOVB R0, ret+24(FP)
+ RET
+eq:
+ MOVV $1, R4
+ MOVB R4, ret+24(FP)
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+ MOVV a+0(FP), R4
+ MOVV b+8(FP), R5
+ BEQ R4, R5, eq
+ MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure
+ MOVV R4, 8(R3)
+ MOVV R5, 16(R3)
+ MOVV R6, 24(R3)
+ JAL runtime·memequal(SB)
+ MOVBU 32(R3), R4
+ MOVB R4, ret+16(FP)
+ RET
+eq:
+ MOVV $1, R4
+ MOVB R4, ret+16(FP)
+ RET
diff --git a/src/internal/bytealg/equal_mips64x.s b/src/internal/bytealg/equal_mips64x.s
new file mode 100644
index 0000000..d92f225
--- /dev/null
+++ b/src/internal/bytealg/equal_mips64x.s
@@ -0,0 +1,118 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define REGCTXT R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+ MOVV a+0(FP), R1
+ MOVV b+8(FP), R2
+ BEQ R1, R2, eq
+ MOVV size+16(FP), R3
+ ADDV R1, R3, R4
+
+ // chunk size is 16
+ SGTU $16, R3, R8
+ BEQ R0, R8, chunk_entry
+
+byte_loop:
+ BNE R1, R4, byte_test
+ MOVV $1, R1
+ MOVB R1, ret+24(FP)
+ RET
+byte_test:
+ MOVBU (R1), R6
+ ADDV $1, R1
+ MOVBU (R2), R7
+ ADDV $1, R2
+ BEQ R6, R7, byte_loop
+ JMP not_eq
+
+chunk_entry:
+ // make sure both a and b are aligned
+ OR R1, R2, R9
+ AND $0x7, R9
+ BNE R0, R9, byte_loop
+ JMP chunk_loop_1
+
+chunk_loop:
+ // chunk size is 16
+ SGTU $16, R3, R8
+ BNE R0, R8, chunk_tail_8
+chunk_loop_1:
+ MOVV (R1), R6
+ MOVV (R2), R7
+ BNE R6, R7, not_eq
+ MOVV 8(R1), R12
+ MOVV 8(R2), R13
+ ADDV $16, R1
+ ADDV $16, R2
+ SUBV $16, R3
+ BEQ R12, R13, chunk_loop
+ JMP not_eq
+
+chunk_tail_8:
+ AND $8, R3, R14
+ BEQ R0, R14, chunk_tail_4
+ MOVV (R1), R6
+ MOVV (R2), R7
+ BNE R6, R7, not_eq
+ ADDV $8, R1
+ ADDV $8, R2
+
+chunk_tail_4:
+ AND $4, R3, R14
+ BEQ R0, R14, chunk_tail_2
+ MOVWU (R1), R6
+ MOVWU (R2), R7
+ BNE R6, R7, not_eq
+ ADDV $4, R1
+ ADDV $4, R2
+
+chunk_tail_2:
+ AND $2, R3, R14
+ BEQ R0, R14, chunk_tail_1
+ MOVHU (R1), R6
+ MOVHU (R2), R7
+ BNE R6, R7, not_eq
+ ADDV $2, R1
+ ADDV $2, R2
+
+chunk_tail_1:
+ AND $1, R3, R14
+ BEQ R0, R14, eq
+ MOVBU (R1), R6
+ MOVBU (R2), R7
+ BEQ R6, R7, eq
+
+not_eq:
+ MOVB R0, ret+24(FP)
+ RET
+eq:
+ MOVV $1, R1
+ MOVB R1, ret+24(FP)
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+ MOVV a+0(FP), R1
+ MOVV b+8(FP), R2
+ BEQ R1, R2, eq
+ MOVV 8(REGCTXT), R3 // compiler stores size at offset 8 in the closure
+ MOVV R1, 8(R29)
+ MOVV R2, 16(R29)
+ MOVV R3, 24(R29)
+ JAL runtime·memequal(SB)
+ MOVBU 32(R29), R1
+ MOVB R1, ret+16(FP)
+ RET
+eq:
+ MOVV $1, R1
+ MOVB R1, ret+16(FP)
+ RET
diff --git a/src/internal/bytealg/equal_mipsx.s b/src/internal/bytealg/equal_mipsx.s
new file mode 100644
index 0000000..4c46dd4
--- /dev/null
+++ b/src/internal/bytealg/equal_mipsx.s
@@ -0,0 +1,62 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define REGCTXT R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+ MOVW a+0(FP), R1
+ MOVW b+4(FP), R2
+ BEQ R1, R2, eq
+ MOVW size+8(FP), R3
+ ADDU R1, R3, R4
+loop:
+ BNE R1, R4, test
+ MOVW $1, R1
+ MOVB R1, ret+12(FP)
+ RET
+test:
+ MOVBU (R1), R6
+ ADDU $1, R1
+ MOVBU (R2), R7
+ ADDU $1, R2
+ BEQ R6, R7, loop
+
+ MOVB R0, ret+12(FP)
+ RET
+eq:
+ MOVW $1, R1
+ MOVB R1, ret+12(FP)
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+ MOVW a+0(FP), R1
+ MOVW b+4(FP), R2
+ BEQ R1, R2, eq
+ MOVW 4(REGCTXT), R3 // compiler stores size at offset 4 in the closure
+ ADDU R1, R3, R4
+loop:
+ BNE R1, R4, test
+ MOVW $1, R1
+ MOVB R1, ret+8(FP)
+ RET
+test:
+ MOVBU (R1), R6
+ ADDU $1, R1
+ MOVBU (R2), R7
+ ADDU $1, R2
+ BEQ R6, R7, loop
+
+ MOVB R0, ret+8(FP)
+ RET
+eq:
+ MOVW $1, R1
+ MOVB R1, ret+8(FP)
+ RET
diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go
new file mode 100644
index 0000000..cf3a245
--- /dev/null
+++ b/src/internal/bytealg/equal_native.go
@@ -0,0 +1,21 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "unsafe"
+
+// The declarations below generate ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+// The compiler generates calls to runtime.memequal and runtime.memequal_varlen.
+// In addition, the runtime calls runtime.memequal explicitly.
+// Those functions are implemented in this package.
+
+//go:linkname abigen_runtime_memequal runtime.memequal
+func abigen_runtime_memequal(a, b unsafe.Pointer, size uintptr) bool
+
+//go:linkname abigen_runtime_memequal_varlen runtime.memequal_varlen
+func abigen_runtime_memequal_varlen(a, b unsafe.Pointer) bool
diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s
new file mode 100644
index 0000000..f2c7cc1
--- /dev/null
+++ b/src/internal/bytealg/equal_ppc64x.s
@@ -0,0 +1,205 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// 4K (smallest case) page size offset mask for PPC64.
+#define PAGE_OFFSET 4095
+
+// TODO: At writing, ISEL and BC do not support CR bit type arguments,
+// define them here for readability.
+#define CR0LT 4*0+0
+#define CR0EQ 4*0+2
+#define CR1LT 4*1+0
+#define CR6LT 4*6+0
+
+// Likewise, the BC opcode is hard to read, and no extended
+// mnemonics are offered for these forms.
+#define BGELR_CR6 BC 4, CR6LT, (LR)
+#define BEQLR BC 12, CR0EQ, (LR)
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+ // R3 = a
+ // R4 = b
+ // R5 = size
+ BR memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
+ // R3 = a
+ // R4 = b
+ CMP R3, R4
+ BEQ eq
+ MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure
+ BR memeqbody<>(SB)
+eq:
+ MOVD $1, R3
+ RET
+
+// Do an efficient memequal for ppc64
+// R3 = s1
+// R4 = s2
+// R5 = len
+// On exit:
+// R3 = return value
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ MOVD R3, R8 // Move s1 into R8
+ ADD R5, R3, R9 // &s1[len(s1)]
+ ADD R5, R4, R10 // &s2[len(s2)]
+ MOVD $1, R11
+ CMP R5, $16 // Use GPR checks for check for len <= 16
+ BLE check0_16
+ MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
+ CMP R5, $32 // Use overlapping VSX loads for len <= 32
+ BLE check17_32 // Do a pair of overlapping VSR compares
+ CMP R5, $64
+ BLE check33_64 // Hybrid check + overlap compare.
+
+setup64:
+ SRD $6, R5, R6 // number of 64 byte chunks to compare
+ MOVD R6, CTR
+ MOVD $16, R14 // index for VSX loads and stores
+ MOVD $32, R15
+ MOVD $48, R16
+ ANDCC $0x3F, R5, R5 // len%64==0?
+
+ PCALIGN $32
+loop64:
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2 // compare, setting CR6
+ BGELR_CR6
+ LXVD2X (R8+R14), V0
+ LXVD2X (R4+R14), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R15), V0
+ LXVD2X (R4+R15), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R16), V0
+ LXVD2X (R4+R16), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ ADD $64,R8 // bump up to next 64
+ ADD $64,R4
+ BDNZ loop64
+
+ ISEL $CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
+ BEQLR // return if no tail.
+
+ ADD $-64, R9, R8
+ ADD $-64, R10, R4
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R14), V0
+ LXVD2X (R4+R14), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R15), V0
+ LXVD2X (R4+R15), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R16), V0
+ LXVD2X (R4+R16), V1
+ VCMPEQUBCC V0, V1, V2
+ ISEL $CR6LT, R11, R0, R3
+ RET
+
+check33_64:
+ // Bytes 0-15
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ ADD $16, R8
+ ADD $16, R4
+
+ // Bytes 16-31
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+
+ // A little tricky, but point R4,R8 to &sx[len-32],
+ // and reuse check17_32 to check the next 1-31 bytes (with some overlap)
+ ADD $-32, R9, R8
+ ADD $-32, R10, R4
+ // Fallthrough
+
+check17_32:
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ ISEL $CR6LT, R11, R0, R5
+
+ // Load sX[len(sX)-16:len(sX)] and compare.
+ ADD $-16, R9
+ ADD $-16, R10
+ LXVD2X (R9+R0), V0
+ LXVD2X (R10+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ ISEL $CR6LT, R5, R0, R3
+ RET
+
+check0_16:
+ CMP R5, $8
+ BLT check0_7
+ // Load sX[0:7] and compare.
+ MOVD (R8), R6
+ MOVD (R4), R7
+ CMP R6, R7
+ ISEL $CR0EQ, R11, R0, R5
+ // Load sX[len(sX)-8:len(sX)] and compare.
+ MOVD -8(R9), R6
+ MOVD -8(R10), R7
+ CMP R6, R7
+ ISEL $CR0EQ, R5, R0, R3
+ RET
+
+check0_7:
+ CMP R5,$0
+ MOVD $1, R3
+ BEQLR // return if len == 0
+
+ // Check < 8B loads with a single compare, but select the load address
+ // such that it cannot cross a page boundary. Load a few bytes from the
+ // lower address if that does not cross the lower page. Or, load a few
+ // extra bytes from the higher addresses. And align those values
+ // consistently in register as either address may have differing
+ // alignment requirements.
+ ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
+ ANDCC $PAGE_OFFSET, R4, R9
+ SUBC R5, $8, R12 // 8-len
+ SLD $3, R12, R14 // (8-len)*8
+ CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
+ CMPU R9, R12, CR0
+ SUB R12, R8, R6 // compute lower load address
+ SUB R12, R4, R9
+ ISEL $CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
+ ISEL $CR0LT, R4, R9, R4 // Similar for s2
+ MOVD (R8), R15
+ MOVD (R4), R16
+ SLD R14, R15, R7
+ SLD R14, R16, R17
+ SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
+ SRD R14, R17, R17
+ SRD R14, R15, R6 // Clear the lower (8-len) bytes
+ SRD R14, R16, R9
+#ifdef GOARCH_ppc64le
+ ISEL $CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
+ ISEL $CR0LT, R17, R9, R4
+#else
+ ISEL $CR1LT, R6, R7, R8
+ ISEL $CR0LT, R9, R17, R4
+#endif
+ CMP R4, R8
+ ISEL $CR0EQ, R11, R0, R3
+ RET
diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s
new file mode 100644
index 0000000..3834083
--- /dev/null
+++ b/src/internal/bytealg/equal_riscv64.s
@@ -0,0 +1,124 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define CTXT S10
+
+// func memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+ // X10 = a_base
+ // X11 = b_base
+ // X12 = size
+ JMP memequal<>(SB)
+
+// func memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
+ MOV 8(CTXT), X12 // compiler stores size at offset 8 in the closure
+ // X10 = a_base
+ // X11 = b_base
+ JMP memequal<>(SB)
+
+// On entry X10 and X11 contain pointers, X12 contains length.
+// For non-regabi X13 contains address for return value.
+// For regabi return value in X10.
+TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0
+ BEQ X10, X11, eq
+
+ MOV $32, X23
+ BLT X12, X23, loop4_check
+
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X9
+ AND $7, X11, X19
+ BNE X9, X19, loop4_check
+ BEQZ X9, loop32_check
+
+ // Check one byte at a time until we reach 8 byte alignment.
+ SUB X9, X12, X12
+align:
+ ADD $-1, X9
+ MOVBU 0(X10), X19
+ MOVBU 0(X11), X20
+ BNE X19, X20, not_eq
+ ADD $1, X10
+ ADD $1, X11
+ BNEZ X9, align
+
+loop32_check:
+ MOV $32, X9
+ BLT X12, X9, loop16_check
+loop32:
+ MOV 0(X10), X19
+ MOV 0(X11), X20
+ MOV 8(X10), X21
+ MOV 8(X11), X22
+ BNE X19, X20, not_eq
+ BNE X21, X22, not_eq
+ MOV 16(X10), X14
+ MOV 16(X11), X15
+ MOV 24(X10), X16
+ MOV 24(X11), X17
+ BNE X14, X15, not_eq
+ BNE X16, X17, not_eq
+ ADD $32, X10
+ ADD $32, X11
+ ADD $-32, X12
+ BGE X12, X9, loop32
+ BEQZ X12, eq
+
+loop16_check:
+ MOV $16, X23
+ BLT X12, X23, loop4_check
+loop16:
+ MOV 0(X10), X19
+ MOV 0(X11), X20
+ MOV 8(X10), X21
+ MOV 8(X11), X22
+ BNE X19, X20, not_eq
+ BNE X21, X22, not_eq
+ ADD $16, X10
+ ADD $16, X11
+ ADD $-16, X12
+ BGE X12, X23, loop16
+ BEQZ X12, eq
+
+loop4_check:
+ MOV $4, X23
+ BLT X12, X23, loop1
+loop4:
+ MOVBU 0(X10), X19
+ MOVBU 0(X11), X20
+ MOVBU 1(X10), X21
+ MOVBU 1(X11), X22
+ BNE X19, X20, not_eq
+ BNE X21, X22, not_eq
+ MOVBU 2(X10), X14
+ MOVBU 2(X11), X15
+ MOVBU 3(X10), X16
+ MOVBU 3(X11), X17
+ BNE X14, X15, not_eq
+ BNE X16, X17, not_eq
+ ADD $4, X10
+ ADD $4, X11
+ ADD $-4, X12
+ BGE X12, X23, loop4
+
+loop1:
+ BEQZ X12, eq
+ MOVBU 0(X10), X19
+ MOVBU 0(X11), X20
+ BNE X19, X20, not_eq
+ ADD $1, X10
+ ADD $1, X11
+ ADD $-1, X12
+ JMP loop1
+
+not_eq:
+ MOVB ZERO, X10
+ RET
+eq:
+ MOV $1, X10
+ RET
diff --git a/src/internal/bytealg/equal_s390x.s b/src/internal/bytealg/equal_s390x.s
new file mode 100644
index 0000000..67f814d
--- /dev/null
+++ b/src/internal/bytealg/equal_s390x.s
@@ -0,0 +1,92 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+ MOVD a+0(FP), R3
+ MOVD b+8(FP), R5
+ MOVD size+16(FP), R6
+ LA ret+24(FP), R7
+ BR memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+ MOVD a+0(FP), R3
+ MOVD b+8(FP), R5
+ MOVD 8(R12), R6 // compiler stores size at offset 8 in the closure
+ LA ret+16(FP), R7
+ BR memeqbody<>(SB)
+
+// input:
+// R3 = a
+// R5 = b
+// R6 = len
+// R7 = address of output byte (stores 0 or 1 here)
+// a and b have the same length
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMPBEQ R3, R5, equal
+loop:
+ CMPBEQ R6, $0, equal
+ CMPBLT R6, $32, tiny
+ CMP R6, $256
+ BLT tail
+ CLC $256, 0(R3), 0(R5)
+ BNE notequal
+ SUB $256, R6
+ LA 256(R3), R3
+ LA 256(R5), R5
+ BR loop
+tail:
+ SUB $1, R6, R8
+ EXRL $memeqbodyclc<>(SB), R8
+ BEQ equal
+notequal:
+ MOVB $0, 0(R7)
+ RET
+equal:
+ MOVB $1, 0(R7)
+ RET
+tiny:
+ MOVD $0, R2
+ CMPBLT R6, $16, lt16
+ MOVD 0(R3), R8
+ MOVD 0(R5), R9
+ CMPBNE R8, R9, notequal
+ MOVD 8(R3), R8
+ MOVD 8(R5), R9
+ CMPBNE R8, R9, notequal
+ LA 16(R2), R2
+ SUB $16, R6
+lt16:
+ CMPBLT R6, $8, lt8
+ MOVD 0(R3)(R2*1), R8
+ MOVD 0(R5)(R2*1), R9
+ CMPBNE R8, R9, notequal
+ LA 8(R2), R2
+ SUB $8, R6
+lt8:
+ CMPBLT R6, $4, lt4
+ MOVWZ 0(R3)(R2*1), R8
+ MOVWZ 0(R5)(R2*1), R9
+ CMPBNE R8, R9, notequal
+ LA 4(R2), R2
+ SUB $4, R6
+lt4:
+#define CHECK(n) \
+ CMPBEQ R6, $n, equal \
+ MOVB n(R3)(R2*1), R8 \
+ MOVB n(R5)(R2*1), R9 \
+ CMPBNE R8, R9, notequal
+ CHECK(0)
+ CHECK(1)
+ CHECK(2)
+ CHECK(3)
+ BR equal
+
+TEXT memeqbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+ CLC $1, 0(R3), 0(R5)
+ RET
diff --git a/src/internal/bytealg/equal_wasm.s b/src/internal/bytealg/equal_wasm.s
new file mode 100644
index 0000000..a2b76c1
--- /dev/null
+++ b/src/internal/bytealg/equal_wasm.s
@@ -0,0 +1,77 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB), NOSPLIT, $0-25
+ Get SP
+ I64Load a+0(FP)
+ I64Load b+8(FP)
+ I64Load size+16(FP)
+ Call memeqbody<>(SB)
+ I64Store8 ret+24(FP)
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB), NOSPLIT, $0-17
+ Get SP
+ I64Load a+0(FP)
+ I64Load b+8(FP)
+ I64Load 8(CTXT) // compiler stores size at offset 8 in the closure
+ Call memeqbody<>(SB)
+ I64Store8 ret+16(FP)
+ RET
+
+// params: a, b, len
+// ret: 0/1
+TEXT memeqbody<>(SB), NOSPLIT, $0-0
+ Get R0
+ Get R1
+ I64Eq
+ If
+ I64Const $1
+ Return
+ End
+
+loop:
+ Loop
+ Get R2
+ I64Eqz
+ If
+ I64Const $1
+ Return
+ End
+
+ Get R0
+ I32WrapI64
+ I64Load8U $0
+ Get R1
+ I32WrapI64
+ I64Load8U $0
+ I64Ne
+ If
+ I64Const $0
+ Return
+ End
+
+ Get R0
+ I64Const $1
+ I64Add
+ Set R0
+
+ Get R1
+ I64Const $1
+ I64Add
+ Set R1
+
+ Get R2
+ I64Const $1
+ I64Sub
+ Set R2
+
+ Br loop
+ End
+ UNDEF
diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go
new file mode 100644
index 0000000..c7a1941
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ if cpu.X86.HasAVX2 {
+ MaxLen = 63
+ } else {
+ MaxLen = 31
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
new file mode 100644
index 0000000..0431491
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.s
@@ -0,0 +1,276 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+24(FP), R8
+ MOVQ b_len+32(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+48(FP), R11
+ JMP indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+16(FP), R8
+ MOVQ b_len+24(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+32(FP), R11
+ JMP indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// R8: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+ CMPQ AX, DX
+ JA fail
+ CMPQ DX, $16
+ JAE sse42
+no_sse42:
+ CMPQ AX, $2
+ JA _3_or_more
+ MOVW (R8), R8
+ LEAQ -1(DI)(DX*1), DX
+loop2:
+ MOVW (DI), SI
+ CMPW SI,R8
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop2
+ JMP fail
+_3_or_more:
+ CMPQ AX, $3
+ JA _4_or_more
+ MOVW 1(R8), BX
+ MOVW (R8), R8
+ LEAQ -2(DI)(DX*1), DX
+loop3:
+ MOVW (DI), SI
+ CMPW SI,R8
+ JZ partial_success3
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+partial_success3:
+ MOVW 1(DI), SI
+ CMPW SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+_4_or_more:
+ CMPQ AX, $4
+ JA _5_or_more
+ MOVL (R8), R8
+ LEAQ -3(DI)(DX*1), DX
+loop4:
+ MOVL (DI), SI
+ CMPL SI,R8
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop4
+ JMP fail
+_5_or_more:
+ CMPQ AX, $7
+ JA _8_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVL -4(R8)(AX*1), BX
+ MOVL (R8), R8
+loop5to7:
+ MOVL (DI), SI
+ CMPL SI,R8
+ JZ partial_success5to7
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+partial_success5to7:
+ MOVL -4(AX)(DI*1), SI
+ CMPL SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+_8_or_more:
+ CMPQ AX, $8
+ JA _9_or_more
+ MOVQ (R8), R8
+ LEAQ -7(DI)(DX*1), DX
+loop8:
+ MOVQ (DI), SI
+ CMPQ SI,R8
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop8
+ JMP fail
+_9_or_more:
+ CMPQ AX, $15
+ JA _16_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVQ -8(R8)(AX*1), BX
+ MOVQ (R8), R8
+loop9to15:
+ MOVQ (DI), SI
+ CMPQ SI,R8
+ JZ partial_success9to15
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+partial_success9to15:
+ MOVQ -8(AX)(DI*1), SI
+ CMPQ SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+_16_or_more:
+ CMPQ AX, $16
+ JA _17_or_more
+ MOVOU (R8), X1
+ LEAQ -15(DI)(DX*1), DX
+loop16:
+ MOVOU (DI), X2
+ PCMPEQB X1, X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop16
+ JMP fail
+_17_or_more:
+ CMPQ AX, $31
+ JA _32_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVOU -16(R8)(AX*1), X0
+ MOVOU (R8), X1
+loop17to31:
+ MOVOU (DI), X2
+ PCMPEQB X1,X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE partial_success17to31
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+partial_success17to31:
+ MOVOU -16(AX)(DI*1), X3
+ PCMPEQB X0, X3
+ PMOVMSKB X3, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+ CMPQ AX, $32
+ JA _33_to_63
+ VMOVDQU (R8), Y1
+ LEAQ -31(DI)(DX*1), DX
+loop32:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop32
+ JMP fail_avx2
+_33_to_63:
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ VMOVDQU -32(R8)(AX*1), Y0
+ VMOVDQU (R8), Y1
+loop33to63:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE partial_success33to63
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+ JMP fail_avx2
+partial_success33to63:
+ VMOVDQU -32(AX)(DI*1), Y3
+ VPCMPEQB Y0, Y3, Y4
+ VPMOVMSKB Y4, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+fail_avx2:
+ VZEROUPPER
+fail:
+ MOVQ $-1, (R11)
+ RET
+success_avx2:
+ VZEROUPPER
+ JMP success
+sse42:
+#ifndef hasSSE42
+ CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
+ JNE no_sse42
+#endif
+ CMPQ AX, $12
+ // PCMPESTRI is slower than normal compare,
+ // so using it makes sense only if we advance 4+ bytes per compare
+ // This value was determined experimentally and is the ~same
+ // on Nehalem (first with SSE42) and Haswell.
+ JAE _9_or_more
+ LEAQ 16(R8), SI
+ TESTW $0xff0, SI
+ JEQ no_sse42
+ MOVOU (R8), X1
+ LEAQ -15(DI)(DX*1), SI
+ MOVQ $16, R9
+ SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+ // 0x0c means: unsigned byte compare (bits 0,1 are 00)
+ // for equality (bits 2,3 are 11)
+ // result is not masked or inverted (bits 4,5 are 00)
+ // and corresponds to first matching byte (bit 6 is 0)
+ PCMPESTRI $0x0c, (DI), X1
+ // CX == 16 means no match,
+ // CX > R9 means partial match at the end of the string,
+ // otherwise sep is at offset CX from X1 start
+ CMPQ CX, R9
+ JBE sse42_success
+ ADDQ R9, DI
+ CMPQ DI, SI
+ JB loop_sse42
+ PCMPESTRI $0x0c, -1(SI), X1
+ CMPQ CX, R9
+ JA fail
+ LEAQ -1(SI), DI
+sse42_success:
+ ADDQ CX, DI
+success:
+ SUBQ R10, DI
+ MOVQ DI, (R11)
+ RET
diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go
new file mode 100644
index 0000000..e87c109
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.go
@@ -0,0 +1,23 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using Index can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+ // Optimize cases where the length of the substring is less than 32 bytes
+ MaxLen = 32
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 16 characters, plus a few slop to start.
+ return 4 + n>>4
+}
diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s
new file mode 100644
index 0000000..3a551a7
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.s
@@ -0,0 +1,206 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
+ MOVD $ret+48(FP), R9
+ B indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+16(FP), R2
+ MOVD b_len+24(FP), R3
+ MOVD $ret+32(FP), R9
+ B indexbody<>(SB)
+
+// input:
+// R0: haystack
+// R1: length of haystack
+// R2: needle
+// R3: length of needle (2 <= len <= 32)
+// R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
+ // main idea is to load 'sep' into separate register(s)
+ // to avoid repeatedly re-load it again and again
+ // for sebsequent substring comparisons
+ SUB R3, R1, R4
+ // R4 contains the start of last substring for comparison
+ ADD R0, R4, R4
+ ADD $1, R0, R8
+
+ CMP $8, R3
+ BHI greater_8
+ TBZ $3, R3, len_2_7
+len_8:
+ // R5 contains 8-byte of sep
+ MOVD (R2), R5
+loop_8:
+ // R6 contains substring for comparison
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R6
+ CMP R5, R6
+ BNE loop_8
+ B found
+len_2_7:
+ TBZ $2, R3, len_2_3
+ TBZ $1, R3, len_4_5
+ TBZ $0, R3, len_6
+len_7:
+ // R5 and R6 contain 7-byte of sep
+ MOVWU (R2), R5
+ // 1-byte overlap with R5
+ MOVWU 3(R2), R6
+loop_7:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE loop_7
+ MOVWU 2(R0), R3
+ CMP R6, R3
+ BNE loop_7
+ B found
+len_6:
+ // R5 and R6 contain 6-byte of sep
+ MOVWU (R2), R5
+ MOVHU 4(R2), R6
+loop_6:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE loop_6
+ MOVHU 3(R0), R3
+ CMP R6, R3
+ BNE loop_6
+ B found
+len_4_5:
+ TBZ $0, R3, len_4
+len_5:
+ // R5 and R7 contain 5-byte of sep
+ MOVWU (R2), R5
+ MOVBU 4(R2), R7
+loop_5:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R3
+ CMP R5, R3
+ BNE loop_5
+ MOVBU 3(R0), R3
+ CMP R7, R3
+ BNE loop_5
+ B found
+len_4:
+ // R5 contains 4-byte of sep
+ MOVWU (R2), R5
+loop_4:
+ CMP R4, R0
+ BHI not_found
+ MOVWU.P 1(R0), R6
+ CMP R5, R6
+ BNE loop_4
+ B found
+len_2_3:
+ TBZ $0, R3, len_2
+len_3:
+ // R6 and R7 contain 3-byte of sep
+ MOVHU (R2), R6
+ MOVBU 2(R2), R7
+loop_3:
+ CMP R4, R0
+ BHI not_found
+ MOVHU.P 1(R0), R3
+ CMP R6, R3
+ BNE loop_3
+ MOVBU 1(R0), R3
+ CMP R7, R3
+ BNE loop_3
+ B found
+len_2:
+ // R5 contains 2-byte of sep
+ MOVHU (R2), R5
+loop_2:
+ CMP R4, R0
+ BHI not_found
+ MOVHU.P 1(R0), R6
+ CMP R5, R6
+ BNE loop_2
+found:
+ SUB R8, R0, R0
+ MOVD R0, (R9)
+ RET
+not_found:
+ MOVD $-1, R0
+ MOVD R0, (R9)
+ RET
+greater_8:
+ SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes
+ CMP $16, R3
+ BHI greater_16
+len_9_16:
+ MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep
+ SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes
+ MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep
+loop_9_16:
+ // search the first 8 bytes first
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R7
+ CMP R5, R7
+ BNE loop_9_16
+ MOVD (R0)(R11), R7
+ CMP R6, R7 // compare the last 8 bytes
+ BNE loop_9_16
+ B found
+greater_16:
+ CMP $24, R3
+ BHI len_25_32
+len_17_24:
+ LDP.P 16(R2), (R5, R6) // R5 and R6 contain the first 16-byte of sep
+ SUB $24, R3, R10 // len(sep) - 24
+ MOVD (R2)(R10), R7 // R7 contains the last 8-byte of sep
+loop_17_24:
+ // search the first 16 bytes first
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R10
+ CMP R5, R10
+ BNE loop_17_24
+ MOVD 7(R0), R10
+ CMP R6, R10
+ BNE loop_17_24
+ MOVD (R0)(R11), R10
+ CMP R7, R10 // compare the last 8 bytes
+ BNE loop_17_24
+ B found
+len_25_32:
+ LDP.P 16(R2), (R5, R6)
+ MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep
+ SUB $32, R3, R12 // len(sep) - 32
+ MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep
+loop_25_32:
+ // search the first 24 bytes first
+ CMP R4, R0
+ BHI not_found
+ MOVD.P 1(R0), R12
+ CMP R5, R12
+ BNE loop_25_32
+ MOVD 7(R0), R12
+ CMP R6, R12
+ BNE loop_25_32
+ MOVD 15(R0), R12
+ CMP R7, R12
+ BNE loop_25_32
+ MOVD (R0)(R11), R12
+ CMP R10, R12 // compare the last 8 bytes
+ BNE loop_25_32
+ B found
diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go
new file mode 100644
index 0000000..a59e329
--- /dev/null
+++ b/src/internal/bytealg/index_generic.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+ panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+ panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ panic("unimplemented")
+}
diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go
new file mode 100644
index 0000000..6e4a2f3
--- /dev/null
+++ b/src/internal/bytealg/index_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 || arm64 || s390x || ppc64le || ppc64
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
diff --git a/src/internal/bytealg/index_ppc64x.go b/src/internal/bytealg/index_ppc64x.go
new file mode 100644
index 0000000..ab3cbe5
--- /dev/null
+++ b/src/internal/bytealg/index_ppc64x.go
@@ -0,0 +1,26 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (aix || linux) && (ppc64 || ppc64le)
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 16
+
+var SupportsPower9 = cpu.PPC64.IsPOWER9
+
+func init() {
+ MaxLen = 32
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s
new file mode 100644
index 0000000..26205ce
--- /dev/null
+++ b/src/internal/bytealg/index_ppc64x.s
@@ -0,0 +1,802 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is an implementation based on the s390x
+// implementation.
+
+// Find a separator with 2 <= len <= 32 within a string.
+// Separators with lengths of 2, 3 or 4 are handled
+// specially.
+
+// This works on power8 and above. The loads and
+// compares are done in big endian order
+// since that allows the used of VCLZD, and allows
+// the same implementation to work on big and little
+// endian platforms with minimal conditional changes.
+
+// NOTE: There is a power9 implementation that
+// improves performance by 10-15% on little
+// endian for some of the benchmarks.
+// Unrolled index2to16 loop by 4 on ppc64le/power9
+// Work is still needed for a big endian
+// implementation on power9.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Needed to swap LXVD2X loads to the correct
+// byte order to work on POWER8.
+
+#ifdef GOARCH_ppc64
+DATA byteswap<>+0(SB)/8, $0x0001020304050607
+DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
+#else
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+#endif
+
+// Load bytes in big endian order. Address
+// alignment does not need checking.
+#define VLOADSWAP(base, index, vreg, vsreg) \
+ LXVD2X (base)(index), vsreg; \
+ VPERM vreg, vreg, SWAP, vreg
+
+GLOBL byteswap<>+0(SB), RODATA, $16
+
+TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+ // R3 = byte array pointer
+ // R4 = length
+ MOVD R6, R5 // R5 = separator pointer
+ MOVD R7, R6 // R6 = separator length
+
+#ifdef GOARCH_ppc64le
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+ CMP R7, $1
+ BNE power8
+ BR indexbodyp9<>(SB)
+#endif
+power8:
+ BR indexbody<>(SB)
+
+TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // R3 = string
+ // R4 = length
+ // R5 = separator pointer
+ // R6 = separator length
+
+#ifdef GOARCH_ppc64le
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+ CMP R7, $1
+ BNE power8
+ BR indexbodyp9<>(SB)
+
+#endif
+power8:
+ BR indexbody<>(SB)
+
+ // s: string we are searching
+ // sep: string to search for
+ // R3=&s[0], R4=len(s)
+ // R5=&sep[0], R6=len(sep)
+ // R14=&ret (index where sep found)
+ // R7=working addr of string
+ // R16=index value 16
+ // R17=index value 17
+ // R18=index value 18
+ // R19=index value 1
+ // R26=LASTBYTE of string
+ // R27=LASTSTR last start byte to compare with sep
+ // R8, R9 scratch
+ // V0=sep left justified zero fill
+ // CR4=sep length >= 16
+
+#define SEPMASK V17
+#define LASTBYTE R26
+#define LASTSTR R27
+#define ONES V20
+#define SWAP V21
+#define SWAP_ VS53
+TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
+ CMP R6, R4 // Compare lengths
+ BGT notfound // If sep len is > string, notfound
+ ADD R4, R3, LASTBYTE // find last byte addr
+ SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
+ CMP R6, $0 // Check sep len
+ BEQ notfound // sep len 0 -- not found
+ MOVD R3, R7 // Copy of string addr
+ MOVD $16, R16 // Index value 16
+ MOVD $17, R17 // Index value 17
+ MOVD $18, R18 // Index value 18
+ MOVD $1, R19 // Index value 1
+ MOVD $byteswap<>+00(SB), R8
+ VSPLTISB $0xFF, ONES // splat all 1s
+ LXVD2X (R8)(R0), SWAP_ // Set up swap string
+
+ CMP R6, $16, CR4 // CR4 for len(sep) >= 16
+ VOR ONES, ONES, SEPMASK // Set up full SEPMASK
+ BGE CR4, loadge16 // Load for len(sep) >= 16
+ SUB R6, R16, R9 // 16-len of sep
+ SLD $3, R9 // Set up for VSLO
+ MTVSRD R9, V9 // Set up for VSLO
+ VSLDOI $8, V9, V9, V9 // Set up for VSLO
+ VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
+
+loadge16:
+ ANDCC $15, R5, R9 // Find byte offset of sep
+ ADD R9, R6, R10 // Add sep len
+ CMP R10, $16 // Check if sep len+offset > 16
+ BGT sepcross16 // Sep crosses 16 byte boundary
+
+ RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
+ VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
+ SLD $3, R9 // Set up shift count for VSLO
+ MTVSRD R9, V8 // Set up shift count for VSLO
+ VSLDOI $8, V8, V8, V8
+ VSLO V0, V8, V0 // Shift by start byte
+
+ VAND V0, SEPMASK, V0 // Mask separator (< 16)
+ BR index2plus
+
+sepcross16:
+ VLOADSWAP(R5, R0, V0, V0) // Load 16 bytes @R5 into V0
+
+ VAND V0, SEPMASK, V0 // mask out separator
+ BLE CR4, index2to16
+ BR index17plus // Handle sep > 16
+
+index2plus:
+ CMP R6, $2 // Check length of sep
+ BNE index3plus // If not 2, check for 3
+ ADD $16, R7, R9 // Check if next 16 bytes past last
+ CMP R9, LASTBYTE // compare with last
+ BGE index2to16 // 2 <= len(string) <= 16
+ MOVD $0xff00, R21 // Mask for later
+ MTVSRD R21, V25 // Move to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
+ VSPLTISB $0, V10 // Clear V10
+
+ // First case: 2 byte separator
+ // V1: 2 byte separator splatted
+ // V2: 16 bytes at addr
+ // V4: 16 bytes at addr+1
+ // Compare 2 byte separator at start
+ // and at start+1. Use VSEL to combine
+ // those results to find the first
+ // matching start byte, returning
+ // that value when found. Loop as
+ // long as len(string) > 16
+index2loop2:
+ VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
+
+index2loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
+ VCMPEQUH V1, V2, V5 // Search for sep
+ VCMPEQUH V1, V3, V6 // Search for sep offset by 1
+ VSEL V6, V5, V31, V7 // merge even and odd indices
+ VCLZD V7, V18 // find index of first match
+ MFVSRD V18, R25 // get first value
+ CMP R25, $64 // Found if < 64
+ BLT foundR25 // Return byte index where found
+ VSLDOI $8, V18, V18, V18 // Adjust 2nd value
+ MFVSRD V18, R25 // get second value
+ CMP R25, $64 // Found if < 64
+ ADD $64, R25 // Update byte offset
+ BLT foundR25 // Return value
+ ADD $16, R7 // R7+=16 Update string pointer
+ ADD $17, R7, R9 // R9=F7+17 since loop unrolled
+ CMP R9, LASTBYTE // Compare addr+17 against last byte
+ BLT index2loop2 // If < last, continue loop
+ CMP R7, LASTBYTE // Compare addr+16 against last byte
+ BLT index2to16 // If < 16 handle specially
+ VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
+ VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
+ BR index2loop
+
+index3plus:
+ CMP R6, $3 // Check if sep == 3
+ BNE index4plus // If not check larger
+ ADD $19, R7, R9 // Find bytes for use in this loop
+ CMP R9, LASTBYTE // Compare against last byte
+ BGE index2to16 // Remaining string 2<=len<=16
+ MOVD $0xff00, R21 // Set up mask for upcoming loop
+ MTVSRD R21, V25 // Move mask to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
+ VSPLTB $2, V0, V8 // Splat 3rd byte of sep
+
+ // Loop to process 3 byte separator.
+ // string[0:16] is in V2
+ // string[2:18] is in V3
+ // sep[0:2] splatted in V1
+ // sec[3] splatted in v8
+ // Load vectors at string, string+1
+ // and string+2. Compare string, string+1
+ // against first 2 bytes of separator
+ // splatted, and string+2 against 3rd
+ // byte splatted. Merge the results with
+ // VSEL to find the first byte of a match.
+
+ // Special handling for last 16 bytes if the
+ // string fits in 16 byte multiple.
+index3loop2:
+ MOVD $2, R21 // Set up index for 2
+ VSPLTISB $0, V10 // Clear V10
+ VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
+ VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load with correct order
+ VSLDOI $1, V2, V3, V4 // string[1:17]
+ VSLDOI $2, V2, V3, V9 // string[2:18]
+ VCMPEQUH V1, V2, V5 // compare hw even indices
+ VCMPEQUH V1, V4, V6 // compare hw odd indices
+ VCMPEQUB V8, V9, V10 // compare 3rd to last byte
+ VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
+ VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
+ VCLZD V7, V18 // Find first nonzero indexes
+ MFVSRD V18, R25 // Move 1st doubleword
+ CMP R25, $64 // If < 64 found
+ BLT foundR25 // Return matching index
+ VSLDOI $8, V18, V18, V18 // Move value
+ MFVSRD V18, R25 // Move 2nd doubleword
+ CMP R25, $64 // If < 64 found
+ ADD $64, R25 // Update byte index
+ BLT foundR25 // Return matching index
+ ADD $16, R7 // R7+=16 string ptr
+ ADD $19, R7, R9 // Number of string bytes for loop
+ CMP R9, LASTBYTE // Compare against last byte of string
+ BLT index3loop2 // If within, continue this loop
+ CMP R7, LASTSTR // Compare against last start byte
+ BLT index2to16 // Process remainder
+ VSPLTISB $0, V3 // Special case for last 16 bytes
+ BR index3loop // Continue this loop
+
+ // Loop to process 4 byte separator
+ // string[0:16] in V2
+ // string[3:16] in V3
+ // sep[0:4] splatted in V1
+ // Set up vectors with strings at offsets
+ // 0, 1, 2, 3 and compare against the 4 byte
+ // separator also splatted. Use VSEL with the
+ // compare results to find the first byte where
+ // a separator match is found.
+index4plus:
+ CMP R6, $4 // Check if 4 byte separator
+ BNE index5plus // If not next higher
+ ADD $20, R7, R9 // Check string size to load
+ CMP R9, LASTBYTE // Verify string length
+ BGE index2to16 // If not large enough, process remaining
+ MOVD $2, R15 // Set up index
+
+ // Set up masks for use with VSEL
+ MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
+ SLD $24, R21
+ MTVSRD R21, V10
+ VSPLTW $1, V10, V29
+ VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+ MOVD $0xffff, R21
+ SLD $16, R21
+ MTVSRD R21, V10
+ VSPLTW $1, V10, V31 // Mask 0xffff0000ffff0000...
+ VSPLTW $0, V0, V1 // Splat 1st word of separator
+
+index4loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
+
+next4:
+ VSPLTISB $0, V10 // Clear
+ MOVD $3, R9 // Number of bytes beyond 16
+ VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+3 into V3
+ VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
+ VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3
+ VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
+ VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
+ VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
+ VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
+ VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
+ VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+ VSEL V14, V13, V31, V7 // final merge
+ VCLZD V7, V18 // Find first index for each half
+ MFVSRD V18, R25 // Isolate value
+ CMP R25, $64 // If < 64, found
+ BLT foundR25 // Return found index
+ VSLDOI $8, V18, V18, V18 // Move for MFVSRD
+ MFVSRD V18, R25 // Isolate other value
+ CMP R25, $64 // If < 64, found
+ ADD $64, R25 // Update index for high doubleword
+ BLT foundR25 // Return found index
+ ADD $16, R7 // R7+=16 for next string
+ ADD $20, R7, R9 // R+20 for all bytes to load
+ CMP R9, LASTBYTE // Past end? Maybe check for extra?
+ BLT index4loop // If not, continue loop
+ CMP R7, LASTSTR // Check remainder
+ BLE index2to16 // Process remainder
+ BR notfound // Not found
+
+index5plus:
+ CMP R6, $16 // Check for sep > 16
+ BGT index17plus // Handle large sep
+
+ // Assumption is that the separator is smaller than the string at this point
+index2to16:
+ CMP R7, LASTSTR // Compare last start byte
+ BGT notfound // last takes len(sep) into account
+
+ ADD $16, R7, R9 // Check for last byte of string
+ CMP R9, LASTBYTE
+ BGT index2to16tail
+
+ // At least 16 bytes of string left
+ // Mask the number of bytes in sep
+index2to16loop:
+ VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
+
+compare:
+ VAND V1, SEPMASK, V2 // Mask out sep size
+ VCMPEQUBCC V0, V2, V3 // Compare masked string
+ BLT CR6, found // All equal
+ ADD $1, R7 // Update ptr to next byte
+ CMP R7, LASTSTR // Still less than last start byte
+ BGT notfound // Not found
+ ADD $16, R7, R9 // Verify remaining bytes
+ CMP R9, LASTBYTE // At least 16
+ BLT index2to16loop // Try again
+
+ // Less than 16 bytes remaining in string
+ // Separator >= 2
+index2to16tail:
+ ADD R3, R4, R9 // End of string
+ SUB R7, R9, R9 // Number of bytes left
+ ANDCC $15, R7, R10 // 16 byte offset
+ ADD R10, R9, R11 // offset + len
+ CMP R11, $16 // >= 16?
+ BLE short // Does not cross 16 bytes
+ VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
+ BR index2to16next // Continue on
+
+short:
+ RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
+ VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
+ SLD $3, R10 // Set up shift
+ MTVSRD R10, V8 // Set up shift
+ VSLDOI $8, V8, V8, V8
+ VSLO V1, V8, V1 // Shift by start byte
+ VSPLTISB $0, V25 // Clear for later use
+
+index2to16next:
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
+ BR index2to16next // Check the next partial string
+
+index17plus:
+ CMP R6, $32 // Check if 17 < len(sep) <= 32
+ BGT index33plus
+ SUB $16, R6, R9 // Extra > 16
+ SLD $56, R9, R10 // Shift to use in VSLO
+ MTVSRD R10, V9 // Set up for VSLO
+ VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
+ VSLO V1, V9, V1 // Shift left
+ VSPLTISB $0xff, V7 // Splat 1s
+ VSPLTISB $0, V27 // Splat 0
+
+index17to32loop:
+ VLOADSWAP(R7, R0, V2, V2) // Load 16 bytes @R7 into V2
+
+next17:
+ VLOADSWAP(R7, R9, V3, V3) // Load 16 bytes @R7+R9 into V3
+ VSLO V3, V9, V3 // Shift left
+ VCMPEQUB V0, V2, V4 // Compare first 16 bytes
+ VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
+ VAND V4, V5, V6 // Check if both equal
+ VCMPEQUBCC V6, V7, V8 // All equal?
+ BLT CR6, found // Yes
+ ADD $1, R7 // On to next byte
+ CMP R7, LASTSTR // Check if last start byte
+ BGT notfound // If too high, not found
+ BR index17to32loop // Continue
+
+notfound:
+ MOVD $-1, R3 // Return -1 if not found
+ RET
+
+index33plus:
+ MOVD $0, (R0) // Case not implemented
+ RET // Crash before return
+
+foundR25:
+ SRD $3, R25 // Convert from bits to bytes
+ ADD R25, R7 // Add to current string address
+ SUB R3, R7 // Subtract from start of string
+ MOVD R7, R3 // Return byte where found
+ RET
+
+found:
+ SUB R3, R7 // Return byte where found
+ MOVD R7, R3
+ RET
+
+TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
+ CMP R6, R4 // Compare lengths
+ BGT notfound // If sep len is > string, notfound
+ ADD R4, R3, LASTBYTE // find last byte addr
+ SUB R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
+ CMP R6, $0 // Check sep len
+ BEQ notfound // sep len 0 -- not found
+ MOVD R3, R7 // Copy of string addr
+ MOVD $16, R16 // Index value 16
+ MOVD $17, R17 // Index value 17
+ MOVD $18, R18 // Index value 18
+ MOVD $1, R19 // Index value 1
+ VSPLTISB $0xFF, ONES // splat all 1s
+
+ CMP R6, $16, CR4 // CR4 for len(sep) >= 16
+ VOR ONES, ONES, SEPMASK // Set up full SEPMASK
+ BGE CR4, loadge16 // Load for len(sep) >= 16
+ SUB R6, R16, R9 // 16-len of sep
+ SLD $3, R9 // Set up for VSLO
+ MTVSRD R9, V9 // Set up for VSLO
+ VSLDOI $8, V9, V9, V9 // Set up for VSLO
+ VSLO ONES, V9, SEPMASK // Mask for separator len(sep) < 16
+
+loadge16:
+ ANDCC $15, R5, R9 // Find byte offset of sep
+ ADD R9, R6, R10 // Add sep len
+ CMP R10, $16 // Check if sep len+offset > 16
+ BGT sepcross16 // Sep crosses 16 byte boundary
+
+ RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
+ LXVB16X (R8)(R0), V0 // Load 16 bytes @R8 into V0
+ SLD $3, R9 // Set up shift count for VSLO
+ MTVSRD R9, V8 // Set up shift count for VSLO
+ VSLDOI $8, V8, V8, V8
+ VSLO V0, V8, V0 // Shift by start byte
+
+ VAND V0, SEPMASK, V0 // Mask separator (< 16)
+ BR index2plus
+
+sepcross16:
+ LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
+
+ VAND V0, SEPMASK, V0 // mask out separator
+ BLE CR4, index2to16
+ BR index17plus // Handle sep > 16
+
+index2plus:
+ CMP R6, $2 // Check length of sep
+ BNE index3plus // If not 2, check for 3
+ ADD $16, R7, R9 // Check if next 16 bytes past last
+ CMP R9, LASTBYTE // compare with last
+ BGE index2to16 // 2 <= len(string) <= 16
+ MOVD $0xff00, R21 // Mask for later
+ MTVSRD R21, V25 // Move to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st 2 bytes of sep
+ VSPLTISB $0, V10 // Clear V10
+
+ // First case: 2 byte separator
+ // V1: 2 byte separator splatted
+ // V2: 16 bytes at addr
+ // V4: 16 bytes at addr+1
+ // Compare 2 byte separator at start
+ // and at start+1. Use VSEL to combine
+ // those results to find the first
+ // matching start byte, returning
+ // that value when found. Loop as
+ // long as len(string) > 16
+index2loop2:
+ LXVB16X (R7)(R19), V3 // Load 16 bytes @R7+1 into V3
+
+index2loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
+ VCMPEQUH V1, V2, V5 // Search for sep
+ VCMPEQUH V1, V3, V6 // Search for sep offset by 1
+ VSEL V6, V5, V31, V7 // merge even and odd indices
+ VCLZD V7, V18 // find index of first match
+ MFVSRD V18, R25 // get first value
+ CMP R25, $64 // Found if < 64
+ BLT foundR25 // Return byte index where found
+
+ MFVSRLD V18, R25 // get second value
+ CMP R25, $64 // Found if < 64
+ ADD $64, R25 // Update byte offset
+ BLT foundR25 // Return value
+ ADD $16, R7 // R7+=16 Update string pointer
+ ADD $17, R7, R9 // R9=F7+17 since loop unrolled
+ CMP R9, LASTBYTE // Compare addr+17 against last byte
+ BLT index2loop2 // If < last, continue loop
+ CMP R7, LASTBYTE // Compare addr+16 against last byte
+ BLT index2to16 // If < 16 handle specially
+ LXVB16X (R7)(R0), V3 // Load 16 bytes @R7 into V3
+ VSLDOI $1, V3, V10, V3 // Shift left by 1 byte
+ BR index2loop
+
+index3plus:
+ CMP R6, $3 // Check if sep == 3
+ BNE index4plus // If not check larger
+ ADD $19, R7, R9 // Find bytes for use in this loop
+ CMP R9, LASTBYTE // Compare against last byte
+ BGE index2to16 // Remaining string 2<=len<=16
+ MOVD $0xff00, R21 // Set up mask for upcoming loop
+ MTVSRD R21, V25 // Move mask to Vreg
+ VSPLTH $3, V25, V31 // Splat mask
+ VSPLTH $0, V0, V1 // Splat 1st two bytes of sep
+ VSPLTB $2, V0, V8 // Splat 3rd byte of sep
+
+ // Loop to process 3 byte separator.
+ // string[0:16] is in V2
+ // string[2:18] is in V3
+ // sep[0:2] splatted in V1
+ // sec[3] splatted in v8
+ // Load vectors at string, string+1
+ // and string+2. Compare string, string+1
+ // against first 2 bytes of separator
+ // splatted, and string+2 against 3rd
+ // byte splatted. Merge the results with
+ // VSEL to find the first byte of a match.
+
+ // Special handling for last 16 bytes if the
+ // string fits in 16 byte multiple.
+index3loop2:
+ MOVD $2, R21 // Set up index for 2
+ VSPLTISB $0, V10 // Clear V10
+ LXVB16X (R7)(R21), V3 // Load 16 bytes @R7+2 into V3
+ VSLDOI $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7
+ VSLDOI $1, V2, V3, V4 // string[1:17]
+ VSLDOI $2, V2, V3, V9 // string[2:18]
+ VCMPEQUH V1, V2, V5 // compare hw even indices
+ VCMPEQUH V1, V4, V6 // compare hw odd indices
+ VCMPEQUB V8, V9, V10 // compare 3rd to last byte
+ VSEL V6, V5, V31, V7 // Find 1st matching byte using mask
+ VAND V7, V10, V7 // AND matched bytes with matched 3rd byte
+ VCLZD V7, V18 // Find first nonzero indexes
+ MFVSRD V18, R25 // Move 1st doubleword
+ CMP R25, $64 // If < 64 found
+ BLT foundR25 // Return matching index
+
+ MFVSRLD V18, R25 // Move 2nd doubleword
+ CMP R25, $64 // If < 64 found
+ ADD $64, R25 // Update byte index
+ BLT foundR25 // Return matching index
+ ADD $16, R7 // R7+=16 string ptr
+ ADD $19, R7, R9 // Number of string bytes for loop
+ CMP R9, LASTBYTE // Compare against last byte of string
+ BLT index3loop2 // If within, continue this loop
+ CMP R7, LASTSTR // Compare against last start byte
+ BLT index2to16 // Process remainder
+ VSPLTISB $0, V3 // Special case for last 16 bytes
+ BR index3loop // Continue this loop
+
+ // Loop to process 4 byte separator
+ // string[0:16] in V2
+ // string[3:16] in V3
+ // sep[0:4] splatted in V1
+ // Set up vectors with strings at offsets
+ // 0, 1, 2, 3 and compare against the 4 byte
+ // separator also splatted. Use VSEL with the
+ // compare results to find the first byte where
+ // a separator match is found.
+index4plus:
+ CMP R6, $4 // Check if 4 byte separator
+ BNE index5plus // If not next higher
+ ADD $20, R7, R9 // Check string size to load
+ CMP R9, LASTBYTE // Verify string length
+ BGE index2to16 // If not large enough, process remaining
+
+ // Set up masks for use with VSEL
+ MOVD $0xff, R21 // Set up mask 0xff000000ff000000...
+ SLD $24, R21
+ MTVSRWS R21, V29
+
+ VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+ MOVD $0xffff, R21
+ SLD $16, R21
+ MTVSRWS R21, V31
+
+ VSPLTW $0, V0, V1 // Splat 1st word of separator
+
+index4loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
+
+next4:
+ VSPLTISB $0, V10 // Clear
+ MOVD $3, R9 // Number of bytes beyond 16
+ LXVB16X (R7)(R9), V3 // Load 16 bytes @R7 into V3
+ VSLDOI $13, V3, V10, V3 // Shift left last 3 bytes
+ VSLDOI $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDOI $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VSLDOI $3, V2, V3, V10 // V10=(V2:v3)<<3
+ VCMPEQUW V1, V2, V5 // compare index 0, 4, ... with sep
+ VCMPEQUW V1, V4, V6 // compare index 1, 5, ... with sep
+ VCMPEQUW V1, V9, V11 // compare index 2, 6, ... with sep
+ VCMPEQUW V1, V10, V12 // compare index 3, 7, ... with sep
+ VSEL V6, V5, V29, V13 // merge index 0, 1, 4, 5, using mask
+ VSEL V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+ VSEL V14, V13, V31, V7 // final merge
+ VCLZD V7, V18 // Find first index for each half
+ MFVSRD V18, R25 // Isolate value
+ CMP R25, $64 // If < 64, found
+ BLT foundR25 // Return found index
+
+ MFVSRLD V18, R25 // Isolate other value
+ CMP R25, $64 // If < 64, found
+ ADD $64, R25 // Update index for high doubleword
+ BLT foundR25 // Return found index
+ ADD $16, R7 // R7+=16 for next string
+ ADD $20, R7, R9 // R+20 for all bytes to load
+ CMP R9, LASTBYTE // Past end? Maybe check for extra?
+ BLT index4loop // If not, continue loop
+ CMP R7, LASTSTR // Check remainder
+ BLE index2to16 // Process remainder
+ BR notfound // Not found
+
+index5plus:
+ CMP R6, $16 // Check for sep > 16
+ BGT index17plus // Handle large sep
+
+ // Assumption is that the separator is smaller than the string at this point
+index2to16:
+ CMP R7, LASTSTR // Compare last start byte
+ BGT notfound // last takes len(sep) into account
+
+ ADD $19, R7, R9 // To check 4 indices per iteration, need at least 16+3 bytes
+ CMP R9, LASTBYTE
+ // At least 16 bytes of string left
+ // Mask the number of bytes in sep
+ VSPLTISB $0, V10 // Clear
+ BGT index2to16tail
+
+ MOVD $3, R17 // Number of bytes beyond 16
+ PCALIGN $32
+index2to16loop:
+ LXVB16X (R7)(R0), V1 // Load next 16 bytes of string into V1 from R7
+ LXVB16X (R7)(R17), V5 // Load next 16 bytes of string into V5 from R7+3
+
+ VSLDOI $13, V5, V10, V2 // Shift left last 3 bytes
+ VSLDOI $1, V1, V2, V3 // V3=(V1:V2)<<1
+ VSLDOI $2, V1, V2, V4 // V4=(V1:V2)<<2
+ VAND V1, SEPMASK, V8 // Mask out sep size 0th index
+ VAND V3, SEPMASK, V9 // Mask out sep size 1st index
+ VAND V4, SEPMASK, V11 // Mask out sep size 2nd index
+ VAND V5, SEPMASK, V12 // Mask out sep size 3rd index
+ VCMPEQUBCC V0, V8, V8 // compare masked string
+ BLT CR6, found // All equal while comparing 0th index
+ VCMPEQUBCC V0, V9, V9 // compare masked string
+ BLT CR6, found2 // All equal while comparing 1st index
+ VCMPEQUBCC V0, V11, V11 // compare masked string
+ BLT CR6, found3 // All equal while comparing 2nd index
+ VCMPEQUBCC V0, V12, V12 // compare masked string
+ BLT CR6, found4 // All equal while comparing 3rd index
+
+ ADD $4, R7 // Update ptr to next 4 bytes
+ CMP R7, LASTSTR // Still less than last start byte
+ BGT notfound // Not found
+ ADD $19, R7, R9 // Verify remaining bytes
+ CMP R9, LASTBYTE // length of string at least 19
+ BLE index2to16loop // Try again, else do post processing and jump to index2to16next
+
+ // <19 bytes left, post process the remaining string
+index2to16tail:
+ ADD R3, R4, R9 // End of string
+ SUB R7, R9, R9 // Number of bytes left
+ ANDCC $15, R7, R10 // 16 byte offset
+ ADD R10, R9, R11 // offset + len
+ CMP R11, $16 // >= 16?
+ BLE short // Does not cross 16 bytes
+ LXVB16X (R7)(R0), V1 // Load 16 bytes @R7 into V1
+ CMP R9, $16 // Post-processing of unrolled loop
+ BLE index2to16next // continue to index2to16next if <= 16 bytes
+ SUB R16, R9, R10 // R9 should be 18 or 17 hence R10 is 1 or 2
+ LXVB16X (R7)(R10), V9
+ CMP R10, $1 // string length is 17, compare 1 more byte
+ BNE extra2 // string length is 18, compare 2 more bytes
+ VSLDOI $15, V9, V10, V25
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte
+ BR index2to16next // go to remainder loop
+extra2:
+ VSLDOI $14, V9, V10, V25
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VOR V1, V1, V4 // save remaining string
+ VSLDOI $1, V1, V25, V1 // Shift string left by 1 byte for 17th byte
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $2, V4, V25, V1 // Shift saved string left by 2 bytes for 18th byte
+ BR index2to16next // Check the remaining partial string in index2to16next
+
+short:
+ RLDICR $0, R7, $59, R9 // Adjust addr to 16 byte container
+ LXVB16X (R9)(R0), V1 // Load 16 bytes @R9 into V1
+ SLD $3, R10 // Set up shift
+ MTVSRD R10, V8 // Set up shift
+ VSLDOI $8, V8, V8, V8
+ VSLO V1, V8, V1 // Shift by start byte
+ PCALIGN $32
+index2to16next:
+ VAND V1, SEPMASK, V2 // Just compare size of sep
+ VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
+ BLT CR6, found // Found
+ ADD $1, R7 // Not found, try next partial string
+ CMP R7, LASTSTR // Check for end of string
+ BGT notfound // If at end, then not found
+ VSLDOI $1, V1, V10, V1 // Shift string left by 1 byte
+ BR index2to16next // Check the next partial string
+
+index17plus:
+ CMP R6, $32 // Check if 17 < len(sep) <= 32
+ BGT index33plus
+ SUB $16, R6, R9 // Extra > 16
+ SLD $56, R9, R10 // Shift to use in VSLO
+ MTVSRD R10, V9 // Set up for VSLO
+ LXVB16X (R5)(R9), V1 // Load 16 bytes @R5+R9 into V1
+ VSLO V1, V9, V1 // Shift left
+ VSPLTISB $0xff, V7 // Splat 1s
+ VSPLTISB $0, V27 // Splat 0
+
+index17to32loop:
+ LXVB16X (R7)(R0), V2 // Load 16 bytes @R7 into V2
+
+next17:
+ LXVB16X (R7)(R9), V3 // Load 16 bytes @R7+R9 into V3
+ VSLO V3, V9, V3 // Shift left
+ VCMPEQUB V0, V2, V4 // Compare first 16 bytes
+ VCMPEQUB V1, V3, V5 // Compare extra over 16 bytes
+ VAND V4, V5, V6 // Check if both equal
+ VCMPEQUBCC V6, V7, V8 // All equal?
+ BLT CR6, found // Yes
+ ADD $1, R7 // On to next byte
+ CMP R7, LASTSTR // Check if last start byte
+ BGT notfound // If too high, not found
+ BR index17to32loop // Continue
+
+notfound:
+ MOVD $-1, R3 // Return -1 if not found
+ RET
+
+index33plus:
+ MOVD $0, (R0) // Case not implemented
+ RET // Crash before return
+
+foundR25:
+ SRD $3, R25 // Convert from bits to bytes
+ ADD R25, R7 // Add to current string address
+ SUB R3, R7 // Subtract from start of string
+ MOVD R7, R3 // Return byte where found
+ RET
+found4:
+ ADD $1, R7 // found from unrolled loop at index 3
+found3:
+ ADD $1, R7 // found from unrolled loop at index 2
+found2:
+ ADD $1, R7 // found from unrolled loop at index 1
+found: // found at index 0
+ SUB R3, R7 // Return byte where found
+ MOVD R7, R3
+ RET
diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go
new file mode 100644
index 0000000..9340cf1
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ // Note: we're kind of lucky that this flag is available at this point.
+ // The runtime sets HasVX when processing auxv records, and that happens
+ // to happen *before* running the init functions of packages that
+ // the runtime depends on.
+ // TODO: it would really be nicer for internal/cpu to figure out this
+ // flag by itself. Then we wouldn't need to depend on quirks of
+ // early startup initialization order.
+ if cpu.S390X.HasVX {
+ MaxLen = 64
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s
new file mode 100644
index 0000000..491d5bc
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.s
@@ -0,0 +1,216 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+48(FP), R5
+ BR indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+32(FP), R5
+ BR indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+ CMPBGT R4, R2, notfound
+ ADD R1, R2
+ SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+ CMPBEQ R4, $0, notfound
+ SUB $1, R4 // R4=len(sep)-1 for use as VLL index
+ VLL R4, (R3), V0 // contains first 16 bytes of sep
+ MOVD R1, R7
+index2plus:
+ CMPBNE R4, $1, index3plus
+ MOVD $15(R7), R9
+ CMPBGE R9, R2, index2to16
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ CMPBGE R9, R2, index2to16
+index2loop:
+ VL 0(R7), V2 // 16 bytes, even indices
+ VL 1(R7), V4 // 16 bytes, odd indices
+ VCEQH V1, V2, V5 // compare even indices
+ VCEQH V1, V4, V6 // compare odd indices
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index3plus:
+ CMPBNE R4, $2, index4plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $1, R0
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ VREPB $2, V0, V8
+index3loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 2-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VCEQH V1, V2, V5 // compare 2-byte even indices
+ VCEQH V1, V4, V6 // compare 2-byte odd indices
+ VCEQB V8, V9, V10 // compare last bytes
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VN V7, V10, V7 // AND indices with last byte
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index4plus:
+ CMPBNE R4, $3, index5plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $2, R0
+ VGBM $0x8888, V29 // 0xff000000ff000000...
+ VGBM $0x2222, V30 // 0x0000ff000000ff00...
+ VGBM $0xcccc, V31 // 0xffff0000ffff0000...
+ VONE V16
+ VREPF $0, V0, V1
+index4loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 3-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1
+ VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1
+ VCEQF V1, V2, V5 // compare index 0, 4, ...
+ VCEQF V1, V4, V6 // compare index 1, 5, ...
+ VCEQF V1, V9, V11 // compare index 2, 6, ...
+ VCEQF V1, V10, V12 // compare index 3, 7, ...
+ VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
+ VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+ VSEL V13, V14, V31, V7 // final merge
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index5plus:
+ CMPBGT R4, $15, index17plus
+index2to16:
+ CMPBGT R7, R2, notfound
+ MOVD $1(R7), R8
+ CMPBGT R8, R2, index2to16tail
+index2to16loop:
+ // unrolled 2x
+ VLL R4, (R7), V1
+ VLL R4, 1(R7), V2
+ VCEQGS V0, V1, V3
+ BEQ found
+ MOVD $1(R7), R7
+ VCEQGS V0, V2, V4
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLT R7, R2, index2to16loop
+ CMPBGT R7, R2, notfound
+index2to16tail:
+ VLL R4, (R7), V1
+ VCEQGS V0, V1, V2
+ BEQ found
+ BR notfound
+
+index17plus:
+ CMPBGT R4, $31, index33plus
+ SUB $16, R4, R0
+ VLL R0, 16(R3), V1
+ VONE V7
+index17to32loop:
+ VL (R7), V2
+ VLL R0, 16(R7), V3
+ VCEQG V0, V2, V4
+ VCEQG V1, V3, V5
+ VN V4, V5, V6
+ VCEQGS V6, V7, V8
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index17to32loop
+ BR notfound
+
+index33plus:
+ CMPBGT R4, $47, index49plus
+ SUB $32, R4, R0
+ VL 16(R3), V1
+ VLL R0, 32(R3), V2
+ VONE V11
+index33to48loop:
+ VL (R7), V3
+ VL 16(R7), V4
+ VLL R0, 32(R7), V5
+ VCEQG V0, V3, V6
+ VCEQG V1, V4, V7
+ VCEQG V2, V5, V8
+ VN V6, V7, V9
+ VN V8, V9, V10
+ VCEQGS V10, V11, V12
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index33to48loop
+ BR notfound
+
+index49plus:
+ CMPBGT R4, $63, index65plus
+ SUB $48, R4, R0
+ VL 16(R3), V1
+ VL 32(R3), V2
+ VLL R0, 48(R3), V3
+ VONE V15
+index49to64loop:
+ VL (R7), V4
+ VL 16(R7), V5
+ VL 32(R7), V6
+ VLL R0, 48(R7), V7
+ VCEQG V0, V4, V8
+ VCEQG V1, V5, V9
+ VCEQG V2, V6, V10
+ VCEQG V3, V7, V11
+ VN V8, V9, V12
+ VN V10, V11, V13
+ VN V12, V13, V14
+ VCEQGS V14, V15, V16
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index49to64loop
+notfound:
+ MOVD $-1, (R5)
+ RET
+
+index65plus:
+ // not implemented
+ MOVD $0, (R0)
+ RET
+
+foundV17: // index is in doubleword V17[0]
+ VLGVG $0, V17, R8
+ ADD R8, R7
+found:
+ SUB R1, R7
+ MOVD R7, (R5)
+ RET
diff --git a/src/internal/bytealg/indexbyte_386.s b/src/internal/bytealg/indexbyte_386.s
new file mode 100644
index 0000000..8a03054
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_386.s
@@ -0,0 +1,34 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+ MOVL b_base+0(FP), SI
+ MOVL b_len+4(FP), CX
+ MOVB c+12(FP), AL
+ MOVL SI, DI
+ CLD; REPN; SCASB
+ JZ 3(PC)
+ MOVL $-1, ret+16(FP)
+ RET
+ SUBL SI, DI
+ SUBL $1, DI
+ MOVL DI, ret+16(FP)
+ RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+ MOVL s_base+0(FP), SI
+ MOVL s_len+4(FP), CX
+ MOVB c+8(FP), AL
+ MOVL SI, DI
+ CLD; REPN; SCASB
+ JZ 3(PC)
+ MOVL $-1, ret+12(FP)
+ RET
+ SUBL SI, DI
+ SUBL $1, DI
+ MOVL DI, ret+12(FP)
+ RET
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
new file mode 100644
index 0000000..1ca70e3
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -0,0 +1,149 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB), NOSPLIT, $0-40
+ MOVQ b_base+0(FP), SI
+ MOVQ b_len+8(FP), BX
+ MOVB c+24(FP), AL
+ LEAQ ret+32(FP), R8
+ JMP indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB), NOSPLIT, $0-32
+ MOVQ s_base+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+16(FP), AL
+ LEAQ ret+24(FP), R8
+ JMP indexbytebody<>(SB)
+
+// input:
+// SI: data
+// BX: data len
+// AL: byte sought
+// R8: address to put result
+TEXT indexbytebody<>(SB), NOSPLIT, $0
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
+ MOVD AX, X0
+ PUNPCKLBW X0, X0
+ PUNPCKLBW X0, X0
+ PSHUFL $0, X0, X0
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ SI, DI
+
+ CMPQ BX, $32
+ JA avx2
+sse:
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
+ PMOVMSKB X1, DX
+ // Find first set bit, if any.
+ BSFL DX, DX
+ JNZ ssesuccess
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JB sseloop
+
+ // Search the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched, but that's ok.
+ MOVQ AX, DI
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ BSFL DX, DX
+ JNZ ssesuccess
+
+failure:
+ MOVQ $-1, (R8)
+ RET
+
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+ SUBQ SI, DI // Compute offset of chunk within data.
+ ADDQ DX, DI // Add offset of byte within chunk.
+ MOVQ DI, (R8)
+ RET
+
+// handle for lengths < 16
+small:
+ TESTQ BX, BX
+ JEQ failure
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ MOVOU (SI), X1 // Load data
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ CMPL DX, BX
+ JAE failure // Match is past end of data.
+ MOVQ DX, (R8)
+ RET
+
+endofpage:
+ MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ MOVL BX, CX
+ SHLL CX, DX
+ SHRL $16, DX // Shift desired bits down to bottom of register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ MOVQ DX, (R8)
+ RET
+
+avx2:
+#ifndef hasAVX2
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+ JNE sse
+#endif
+ MOVD AX, X0
+ LEAQ -32(SI)(BX*1), R11
+ VPBROADCASTB X0, Y1
+avx2_loop:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPTEST Y3, Y3
+ JNZ avx2success
+ ADDQ $32, DI
+ CMPQ DI, R11
+ JLT avx2_loop
+ MOVQ R11, DI
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPTEST Y3, Y3
+ JNZ avx2success
+ VZEROUPPER
+ MOVQ $-1, (R8)
+ RET
+
+avx2success:
+ VPMOVMSKB Y3, DX
+ BSFL DX, DX
+ SUBQ SI, DI
+ ADDQ DI, DX
+ MOVQ DX, (R8)
+ VZEROUPPER
+ RET
diff --git a/src/internal/bytealg/indexbyte_arm.s b/src/internal/bytealg/indexbyte_arm.s
new file mode 100644
index 0000000..faf9797
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm.s
@@ -0,0 +1,46 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+ MOVW b_base+0(FP), R0
+ MOVW b_len+4(FP), R1
+ MOVBU c+12(FP), R2 // byte to find
+ MOVW $ret+16(FP), R5
+ B indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+ MOVW s_base+0(FP), R0
+ MOVW s_len+4(FP), R1
+ MOVBU c+8(FP), R2 // byte to find
+ MOVW $ret+12(FP), R5
+ B indexbytebody<>(SB)
+
+// input:
+// R0: data
+// R1: data length
+// R2: byte to find
+// R5: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0-0
+ MOVW R0, R4 // store base for later
+ ADD R0, R1 // end
+
+loop:
+ CMP R0, R1
+ B.EQ notfound
+ MOVBU.P 1(R0), R3
+ CMP R2, R3
+ B.NE loop
+
+ SUB $1, R0 // R0 will be one beyond the position we want
+ SUB R4, R0 // remove base
+ MOVW R0, (R5)
+ RET
+
+notfound:
+ MOVW $-1, R0
+ MOVW R0, (R5)
+ RET
diff --git a/src/internal/bytealg/indexbyte_arm64.s b/src/internal/bytealg/indexbyte_arm64.s
new file mode 100644
index 0000000..40843fb
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm64.s
@@ -0,0 +1,126 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+ MOVD b_base+0(FP), R0
+ MOVD b_len+8(FP), R2
+ MOVBU c+24(FP), R1
+ MOVD $ret+32(FP), R8
+ B indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+ MOVD s_base+0(FP), R0
+ MOVD s_len+8(FP), R2
+ MOVBU c+16(FP), R1
+ MOVD $ret+24(FP), R8
+ B indexbytebody<>(SB)
+
+// input:
+// R0: data
+// R1: byte to search
+// R2: data len
+// R8: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0
+ // Core algorithm:
+ // For each 32-byte chunk we calculate a 64-bit syndrome value,
+ // with two bits per byte. For each tuple, bit 0 is set if the
+ // relevant byte matched the requested character and bit 1 is
+ // not used (faster than using a 32bit syndrome). Since the bits
+ // in the syndrome reflect exactly the order in which things occur
+ // in the original string, counting trailing zeros allows to
+ // identify exactly which byte has matched.
+
+ CBZ R2, fail
+ MOVD R0, R11
+ // Magic constant 0x40100401 allows us to identify
+ // which lane matches the requested byte.
+ // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
+ // Different bytes have different bit masks (i.e: 1, 4, 16, 64)
+ MOVD $0x40100401, R5
+ VMOV R1, V0.B16
+ // Work with aligned 32-byte chunks
+ BIC $0x1f, R0, R3
+ VMOV R5, V5.S4
+ ANDS $0x1f, R0, R9
+ AND $0x1f, R2, R10
+ BEQ loop
+
+ // Input string is not 32-byte aligned. We calculate the
+ // syndrome value for the aligned 32 bytes block containing
+ // the first bytes and mask off the irrelevant part.
+ VLD1.P (R3), [V1.B16, V2.B16]
+ SUB $0x20, R9, R4
+ ADDS R4, R2, R2
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ VAND V5.B16, V3.B16, V3.B16
+ VAND V5.B16, V4.B16, V4.B16
+ VADDP V4.B16, V3.B16, V6.B16 // 256->128
+ VADDP V6.B16, V6.B16, V6.B16 // 128->64
+ VMOV V6.D[0], R6
+ // Clear the irrelevant lower bits
+ LSL $1, R9, R4
+ LSR R4, R6, R6
+ LSL R4, R6, R6
+ // The first block can also be the last
+ BLS masklast
+ // Have we found something already?
+ CBNZ R6, tail
+
+loop:
+ VLD1.P (R3), [V1.B16, V2.B16]
+ SUBS $0x20, R2, R2
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ // If we're out of data we finish regardless of the result
+ BLS end
+ // Use a fast check for the termination condition
+ VORR V4.B16, V3.B16, V6.B16
+ VADDP V6.D2, V6.D2, V6.D2
+ VMOV V6.D[0], R6
+ // We're not out of data, loop if we haven't found the character
+ CBZ R6, loop
+
+end:
+ // Termination condition found, let's calculate the syndrome value
+ VAND V5.B16, V3.B16, V3.B16
+ VAND V5.B16, V4.B16, V4.B16
+ VADDP V4.B16, V3.B16, V6.B16
+ VADDP V6.B16, V6.B16, V6.B16
+ VMOV V6.D[0], R6
+ // Only do the clear for the last possible block with less than 32 bytes
+ // Condition flags come from SUBS in the loop
+ BHS tail
+
+masklast:
+ // Clear the irrelevant upper bits
+ ADD R9, R10, R4
+ AND $0x1f, R4, R4
+ SUB $0x20, R4, R4
+ NEG R4<<1, R4
+ LSL R4, R6, R6
+ LSR R4, R6, R6
+
+tail:
+ // Check that we have found a character
+ CBZ R6, fail
+ // Count the trailing zeros using bit reversing
+ RBIT R6, R6
+ // Compensate the last post-increment
+ SUB $0x20, R3, R3
+ // And count the leading zeros
+ CLZ R6, R6
+ // R6 is twice the offset into the fragment
+ ADD R6>>1, R3, R0
+ // Compute the offset result
+ SUB R11, R0, R0
+ MOVD R0, (R8)
+ RET
+
+fail:
+ MOVD $-1, R0
+ MOVD R0, (R8)
+ RET
diff --git a/src/internal/bytealg/indexbyte_generic.go b/src/internal/bytealg/indexbyte_generic.go
new file mode 100644
index 0000000..b89d34f
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_generic.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !mips64 && !mips64le && !riscv64 && !wasm
+
+package bytealg
+
+func IndexByte(b []byte, c byte) int {
+ for i, x := range b {
+ if x == c {
+ return i
+ }
+ }
+ return -1
+}
+
+func IndexByteString(s string, c byte) int {
+ for i := 0; i < len(s); i++ {
+ if s[i] == c {
+ return i
+ }
+ }
+ return -1
+}
diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s
new file mode 100644
index 0000000..baa9c86
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_loong64.s
@@ -0,0 +1,52 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+ MOVV b_base+0(FP), R4
+ MOVV b_len+8(FP), R5
+ MOVBU c+24(FP), R6 // byte to find
+ MOVV R4, R7 // store base for later
+ ADDV R4, R5 // end
+ ADDV $-1, R4
+
+loop:
+ ADDV $1, R4
+ BEQ R4, R5, notfound
+ MOVBU (R4), R8
+ BNE R6, R8, loop
+
+ SUBV R7, R4 // remove base
+ MOVV R4, ret+32(FP)
+ RET
+
+notfound:
+ MOVV $-1, R4
+ MOVV R4, ret+32(FP)
+ RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+ MOVV s_base+0(FP), R4
+ MOVV s_len+8(FP), R5
+ MOVBU c+16(FP), R6 // byte to find
+ MOVV R4, R7 // store base for later
+ ADDV R4, R5 // end
+ ADDV $-1, R4
+
+loop:
+ ADDV $1, R4
+ BEQ R4, R5, notfound
+ MOVBU (R4), R8
+ BNE R6, R8, loop
+
+ SUBV R7, R4 // remove base
+ MOVV R4, ret+24(FP)
+ RET
+
+notfound:
+ MOVV $-1, R4
+ MOVV R4, ret+24(FP)
+ RET
diff --git a/src/internal/bytealg/indexbyte_mips64x.s b/src/internal/bytealg/indexbyte_mips64x.s
new file mode 100644
index 0000000..5689f84
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mips64x.s
@@ -0,0 +1,54 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+ MOVV b_base+0(FP), R1
+ MOVV b_len+8(FP), R2
+ MOVBU c+24(FP), R3 // byte to find
+ MOVV R1, R4 // store base for later
+ ADDV R1, R2 // end
+ ADDV $-1, R1
+
+loop:
+ ADDV $1, R1
+ BEQ R1, R2, notfound
+ MOVBU (R1), R5
+ BNE R3, R5, loop
+
+ SUBV R4, R1 // remove base
+ MOVV R1, ret+32(FP)
+ RET
+
+notfound:
+ MOVV $-1, R1
+ MOVV R1, ret+32(FP)
+ RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+ MOVV s_base+0(FP), R1
+ MOVV s_len+8(FP), R2
+ MOVBU c+16(FP), R3 // byte to find
+ MOVV R1, R4 // store base for later
+ ADDV R1, R2 // end
+ ADDV $-1, R1
+
+loop:
+ ADDV $1, R1
+ BEQ R1, R2, notfound
+ MOVBU (R1), R5
+ BNE R3, R5, loop
+
+ SUBV R4, R1 // remove base
+ MOVV R1, ret+24(FP)
+ RET
+
+notfound:
+ MOVV $-1, R1
+ MOVV R1, ret+24(FP)
+ RET
diff --git a/src/internal/bytealg/indexbyte_mipsx.s b/src/internal/bytealg/indexbyte_mipsx.s
new file mode 100644
index 0000000..1c2b104
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mipsx.s
@@ -0,0 +1,52 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+ MOVW b_base+0(FP), R1
+ MOVW b_len+4(FP), R2
+ MOVBU c+12(FP), R3 // byte to find
+ ADDU $1, R1, R4 // store base+1 for later
+ ADDU R1, R2 // end
+
+loop:
+ BEQ R1, R2, notfound
+ MOVBU (R1), R5
+ ADDU $1, R1
+ BNE R3, R5, loop
+
+ SUBU R4, R1 // R1 will be one beyond the position we want so remove (base+1)
+ MOVW R1, ret+16(FP)
+ RET
+
+notfound:
+ MOVW $-1, R1
+ MOVW R1, ret+16(FP)
+ RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+ MOVW s_base+0(FP), R1
+ MOVW s_len+4(FP), R2
+ MOVBU c+8(FP), R3 // byte to find
+ ADDU $1, R1, R4 // store base+1 for later
+ ADDU R1, R2 // end
+
+loop:
+ BEQ R1, R2, notfound
+ MOVBU (R1), R5
+ ADDU $1, R1
+ BNE R3, R5, loop
+
+ SUBU R4, R1 // remove (base+1)
+ MOVW R1, ret+12(FP)
+ RET
+
+notfound:
+ MOVW $-1, R1
+ MOVW R1, ret+12(FP)
+ RET
diff --git a/src/internal/bytealg/indexbyte_native.go b/src/internal/bytealg/indexbyte_native.go
new file mode 100644
index 0000000..c5bb2df
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_native.go
@@ -0,0 +1,13 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64 || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || mips64 || mips64le || riscv64 || wasm
+
+package bytealg
+
+//go:noescape
+func IndexByte(b []byte, c byte) int
+
+//go:noescape
+func IndexByteString(s string, c byte) int
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..1a6e852
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,391 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // R3 = byte array pointer
+ // R4 = length
+ MOVD R6, R5 // R5 = byte
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ BR indexbytebody<>(SB)
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+ // R3 = string
+ // R4 = length
+ // R5 = byte
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ BR indexbytebody<>(SB)
+
+// R3 = addr of string
+// R4 = len of string
+// R5 = byte to find
+// R16 = 1 if running on a POWER9 system, 0 otherwise
+// On exit:
+// R3 = return value
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+ MOVD R3,R17 // Save base address for calculating the index later.
+ RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
+ RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
+ ADD R4,R3,R7 // Last acceptable address in R7.
+
+ RLDIMI $16,R5,$32,R5
+ CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently.
+ MOVD $-1,R9
+ RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8
+ RLDIMI $32,R5,$0,R5
+ MOVD R7,R10 // Save last acceptable address in R10 for later.
+ ADD $-1,R7,R7
+#ifdef GOARCH_ppc64le
+ SLD R6,R9,R9 // Prepare mask for Little Endian
+#else
+ SRD R6,R9,R9 // Same for Big Endian
+#endif
+ BLT small_string // Jump to the small string case if it's <32 bytes.
+ CMP R16,$1 // optimize for power8 v power9
+ BNE power8
+ VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
+ MTVRD R5,V1
+ LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
+ VSLB V11,V10,V10 // to extract the first bit of match result into GPR
+ VSPLTB $7,V1,V1 // Replicate byte across V1
+ CMP R4,$64
+ MOVD $16,R11
+ MOVD R3,R8
+ BLT cmp32
+ MOVD $32,R12
+ MOVD $48,R6
+
+loop64:
+ LXVB16X (R0)(R8),V2 // scan 64 bytes at a time
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // match found at R8, jump out
+
+ LXVB16X (R8)(R11),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat1 // match found at R8+16 bytes, jump out
+
+ LXVB16X (R8)(R12),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat2 // match found at R8+32 bytes, jump out
+
+ LXVB16X (R8)(R6),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat3 // match found at R8+48 bytes, jump out
+ ADD $64,R8
+ ADD $-64,R4
+ CMP R4,$64 // >=64 bytes left to scan?
+ BGE loop64
+ CMP R4,$32
+ BLT rem // jump to rem if there are < 32 bytes left
+cmp32:
+ LXVB16X (R0)(R8),V2 // 32-63 bytes left
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // match found at R8
+
+ LXVB16X (R11)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat1 // match found at R8+16
+
+ ADD $32,R8
+ ADD $-32,R4
+rem:
+ RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing
+ BR small_string
+
+foundat3:
+ ADD $16,R8
+foundat2:
+ ADD $16,R8
+foundat1:
+ ADD $16,R8
+foundat0:
+ // Compress the result into a single doubleword and
+ // move it to a GPR for the final calculation.
+ VBPERMQ V6,V10,V6
+ MFVRD V6,R3
+ // count leading zeroes upto the match that ends up in low 16 bits
+ // in both endian modes, compute index by subtracting the number by 16
+ CNTLZW R3,R11
+ ADD $-16,R11
+ ADD R8,R11,R3 // Calculate byte address
+ SUB R17,R3
+ RET
+power8:
+ // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+ // in V0, V1 and V10, then branch to the preloop.
+ ANDCC $63,R3,R11
+ BEQ CR0,qw_align
+ RLDICL $0,R3,$61,R11
+
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation
+ BNE CR7,done
+ ADD $8,R8,R8
+ ADD $-8,R4,R4
+ ADD R4,R11,R4
+
+ // Check for quadword alignment
+ ANDCC $15,R8,R11
+ BEQ CR0,qw_align
+
+ // Not aligned, so handle the next doubleword
+ MOVD 0(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR7
+ BNE CR7,done
+ ADD $8,R8,R8
+ ADD $-8,R4,R4
+
+ // Either quadword aligned or 64-byte at this point. We can use LVX.
+qw_align:
+
+ // Set up auxiliary data for the vectorized algorithm.
+ VSPLTISB $0,V0 // Replicate 0 across V0
+ VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
+ MTVRD R5,V1
+ LVSL (R0+R0),V11
+ VSLB V11,V10,V10
+ VSPLTB $7,V1,V1 // Replicate byte across V1
+ CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop
+ BLE tail
+
+ // We will load 4 quardwords per iteration in the loop, so check for
+ // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+
+ // Not 64-byte aligned. Load one quadword at a time until aligned.
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ ADD $-16,R4,R4
+
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ ADD $-16,R4,R4
+
+ ANDCC $63,R8,R11
+ BEQ CR0,preloop
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6 // Check for byte in V4
+ BNE CR6,found_qw_align
+ ADD $-16,R4,R4
+ ADD $16,R8,R8
+
+ // 64-byte aligned. Prepare for the main loop.
+preloop:
+ CMPU R4,$64
+ BLE tail // If len ≤ 64, don't use the vectorized loop
+
+ // We are now aligned to a 64-byte boundary. We will load 4 quadwords
+ // per loop iteration. The last doubleword is in R10, so our loop counter
+ // starts at (R10-R8)/64.
+ SUB R8,R10,R6
+ SRD $6,R6,R9 // Loop counter in R9
+ MOVD R9,CTR
+
+ ADD $-64,R8,R8 // Adjust index for loop entry
+ MOVD $16,R11 // Load offsets for the vector loads
+ MOVD $32,R9
+ MOVD $48,R7
+
+ // Main loop we will load 64 bytes per iteration
+loop:
+ ADD $64,R8,R8 // Fuse addi+lvx for performance
+ LVX (R8+R0),V2 // Load 4 16-byte vectors
+ LVX (R8+R11),V3
+ VCMPEQUB V1,V2,V6 // Look for byte in each vector
+ VCMPEQUB V1,V3,V7
+
+ LVX (R8+R9),V4
+ LVX (R8+R7),V5
+ VCMPEQUB V1,V4,V8
+ VCMPEQUB V1,V5,V9
+
+ VOR V6,V7,V11 // Compress the result in a single vector
+ VOR V8,V9,V12
+ VOR V11,V12,V13
+ VCMPEQUBCC V0,V13,V14 // Check for byte
+ BGE CR6,found
+ BC 16,0,loop // bdnz loop
+
+ // Handle the tailing bytes or R4 ≤ 64
+ RLDICL $0,R6,$58,R4
+ ADD $64,R8,R8
+tail:
+ CMPU R4,$0
+ BEQ notfound
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+ ADD $16,R8,R8
+ CMPU R4,$16,CR6
+ BLE CR6,notfound
+ ADD $-16,R4,R4
+
+ LVX (R8+R0),V4
+ VCMPEQUBCC V1,V4,V6
+ BNE CR6,found_qw_align
+
+notfound:
+ MOVD $-1, R3
+ RET
+
+found:
+ // We will now compress the results into a single doubleword,
+ // so it can be moved to a GPR for the final index calculation.
+
+ // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+ // first bit of each byte into bits 48-63.
+ VBPERMQ V6,V10,V6
+ VBPERMQ V7,V10,V7
+ VBPERMQ V8,V10,V8
+ VBPERMQ V9,V10,V9
+
+ // Shift each 16-bit component into its correct position for
+ // merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+ VSLDOI $2,V7,V7,V7
+ VSLDOI $4,V8,V8,V8
+ VSLDOI $6,V9,V9,V9
+#else
+ VSLDOI $6,V6,V6,V6
+ VSLDOI $4,V7,V7,V7
+ VSLDOI $2,V8,V8,V8
+#endif
+
+ // Merge V6-V9 into a single doubleword and move to a GPR.
+ VOR V6,V7,V11
+ VOR V8,V9,V4
+ VOR V4,V11,V4
+ MFVRD V4,R3
+
+#ifdef GOARCH_ppc64le
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+#else
+ CNTLZD R3,R11 // Count leading zeros (Big Endian).
+#endif
+ ADD R8,R11,R3 // Calculate byte address
+
+return:
+ SUB R17, R3
+ RET
+
+found_qw_align:
+ // Use the same algorithm as above. Compress the result into
+ // a single doubleword and move it to a GPR for the final
+ // calculation.
+ VBPERMQ V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+ MFVRD V6,R3
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11
+#else
+ VSLDOI $6,V6,V6,V6
+ MFVRD V6,R3
+ CNTLZD R3,R11
+#endif
+ ADD R8,R11,R3
+ CMPU R11,R4
+ BLT return
+ BR notfound
+ PCALIGN $16
+
+done:
+ ADD $-1,R10,R6
+ // Offset of last index for the final
+ // doubleword comparison
+ RLDICL $0,R6,$61,R6
+ // At this point, R3 has 0xFF in the same position as the byte we are
+ // looking for in the doubleword. Use that to calculate the exact index
+ // of the byte.
+#ifdef GOARCH_ppc64le
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+#else
+ CNTLZD R3,R11 // Count leading zeros (Big Endian).
+#endif
+ CMPU R8,R7 // Check if we are at the last doubleword.
+ SRD $3,R11 // Convert trailing zeros to bytes.
+ ADD R11,R8,R3
+ CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
+ BNE return
+ BLE CR7,return
+ BR notfound
+
+small_string:
+ // process string of length < 32 bytes
+ // We unroll this loop for better performance.
+ CMPU R4,$0 // Check for length=0
+ BEQ notfound
+
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base.
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7.
+ CMPU R8,R7
+ BNE CR7,done
+ BEQ notfound // Hit length.
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ BNE CR6,done
+ BR notfound
+
diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s
new file mode 100644
index 0000000..8be78ed
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_riscv64.s
@@ -0,0 +1,51 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
+ // X10 = b_base
+ // X11 = b_len
+ // X12 = b_cap (unused)
+ // X13 = byte to find
+ AND $0xff, X13
+ MOV X10, X12 // store base for later
+ ADD X10, X11 // end
+ ADD $-1, X10
+
+loop:
+ ADD $1, X10
+ BEQ X10, X11, notfound
+ MOVBU (X10), X14
+ BNE X13, X14, loop
+
+ SUB X12, X10 // remove base
+ RET
+
+notfound:
+ MOV $-1, X10
+ RET
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
+ // X10 = b_base
+ // X11 = b_len
+ // X12 = byte to find
+ AND $0xff, X12
+ MOV X10, X13 // store base for later
+ ADD X10, X11 // end
+ ADD $-1, X10
+
+loop:
+ ADD $1, X10
+ BEQ X10, X11, notfound
+ MOVBU (X10), X14
+ BNE X12, X14, loop
+
+ SUB X13, X10 // remove base
+ RET
+
+notfound:
+ MOV $-1, X10
+ RET
diff --git a/src/internal/bytealg/indexbyte_s390x.s b/src/internal/bytealg/indexbyte_s390x.s
new file mode 100644
index 0000000..cf88d92
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_s390x.s
@@ -0,0 +1,108 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD b_base+0(FP), R3// b_base => R3
+ MOVD b_len+8(FP), R4 // b_len => R4
+ MOVBZ c+24(FP), R5 // c => R5
+ MOVD $ret+32(FP), R2 // &ret => R9
+ BR indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
+ MOVD s_base+0(FP), R3// s_base => R3
+ MOVD s_len+8(FP), R4 // s_len => R4
+ MOVBZ c+16(FP), R5 // c => R5
+ MOVD $ret+24(FP), R2 // &ret => R9
+ BR indexbytebody<>(SB)
+
+// input:
+// R3: s
+// R4: s_len
+// R5: c -- byte sought
+// R2: &ret -- address to put index into
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0
+ CMPBEQ R4, $0, notfound
+ MOVD R3, R6 // store base for later
+ ADD R3, R4, R8 // the address after the end of the string
+ //if the length is small, use loop; otherwise, use vector or srst search
+ CMPBGE R4, $16, large
+
+residual:
+ CMPBEQ R3, R8, notfound
+ MOVBZ 0(R3), R7
+ LA 1(R3), R3
+ CMPBNE R7, R5, residual
+
+found:
+ SUB R6, R3
+ SUB $1, R3
+ MOVD R3, 0(R2)
+ RET
+
+notfound:
+ MOVD $-1, 0(R2)
+ RET
+
+large:
+ MOVBZ internal∕cpu·S390X+const_offsetS390xHasVX(SB), R1
+ CMPBNE R1, $0, vectorimpl
+
+srstimpl: // no vector facility
+ MOVBZ R5, R0 // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
+srstloop:
+ WORD $0xB25E0083 // srst %r8, %r3 (search the range [R3, R8))
+ BVS srstloop // interrupted - continue
+ BGT notfoundr0
+foundr0:
+ XOR R0, R0 // reset R0
+ SUB R6, R8 // remove base
+ MOVD R8, 0(R2)
+ RET
+notfoundr0:
+ XOR R0, R0 // reset R0
+ MOVD $-1, 0(R2)
+ RET
+
+vectorimpl:
+ //if the address is not 16byte aligned, use loop for the header
+ MOVD R3, R8
+ AND $15, R8
+ CMPBGT R8, $0, notaligned
+
+aligned:
+ ADD R6, R4, R8
+ MOVD R8, R7
+ AND $-16, R7
+ // replicate c across V17
+ VLVGB $0, R5, V19
+ VREPB $0, V19, V17
+
+vectorloop:
+ CMPBGE R3, R7, residual
+ VL 0(R3), V16 // load string to be searched into V16
+ ADD $16, R3
+ VFEEBS V16, V17, V18 // search V17 in V16 and set conditional code accordingly
+ BVS vectorloop
+
+ // when vector search found c in the string
+ VLGVB $7, V18, R7 // load 7th element of V18 containing index into R7
+ SUB $16, R3
+ SUB R6, R3
+ ADD R3, R7
+ MOVD R7, 0(R2)
+ RET
+
+notaligned:
+ MOVD R3, R8
+ AND $-16, R8
+ ADD $16, R8
+notalignedloop:
+ CMPBEQ R3, R8, aligned
+ MOVBZ 0(R3), R7
+ LA 1(R3), R3
+ CMPBNE R7, R5, notalignedloop
+ BR found
diff --git a/src/internal/bytealg/indexbyte_wasm.s b/src/internal/bytealg/indexbyte_wasm.s
new file mode 100644
index 0000000..ef4bd93
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_wasm.s
@@ -0,0 +1,195 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB), NOSPLIT, $0-40
+ I64Load b_base+0(FP)
+ I32WrapI64
+ I32Load8U c+24(FP)
+ I64Load b_len+8(FP)
+ I32WrapI64
+ Call memchr<>(SB)
+ I64ExtendI32S
+ Set R0
+
+ Get SP
+ I64Const $-1
+ Get R0
+ I64Load b_base+0(FP)
+ I64Sub
+ Get R0
+ I64Eqz $0
+ Select
+ I64Store ret+32(FP)
+
+ RET
+
+TEXT ·IndexByteString(SB), NOSPLIT, $0-32
+ Get SP
+ I64Load s_base+0(FP)
+ I32WrapI64
+ I32Load8U c+16(FP)
+ I64Load s_len+8(FP)
+ I32WrapI64
+ Call memchr<>(SB)
+ I64ExtendI32S
+ Set R0
+
+ I64Const $-1
+ Get R0
+ I64Load s_base+0(FP)
+ I64Sub
+ Get R0
+ I64Eqz $0
+ Select
+ I64Store ret+24(FP)
+
+ RET
+
+// initially compiled with emscripten and then modified over time.
+// params:
+// R0: s
+// R1: c
+// R2: len
+// ret: index
+TEXT memchr<>(SB), NOSPLIT, $0
+ Get R1
+ Set R4
+ Block
+ Block
+ Get R2
+ I32Const $0
+ I32Ne
+ Tee R3
+ Get R0
+ I32Const $3
+ I32And
+ I32Const $0
+ I32Ne
+ I32And
+ If
+ Loop
+ Get R0
+ I32Load8U $0
+ Get R1
+ I32Eq
+ BrIf $2
+ Get R2
+ I32Const $-1
+ I32Add
+ Tee R2
+ I32Const $0
+ I32Ne
+ Tee R3
+ Get R0
+ I32Const $1
+ I32Add
+ Tee R0
+ I32Const $3
+ I32And
+ I32Const $0
+ I32Ne
+ I32And
+ BrIf $0
+ End
+ End
+ Get R3
+ BrIf $0
+ I32Const $0
+ Set R1
+ Br $1
+ End
+ Get R0
+ I32Load8U $0
+ Get R4
+ Tee R3
+ I32Eq
+ If
+ Get R2
+ Set R1
+ Else
+ Get R4
+ I32Const $16843009
+ I32Mul
+ Set R4
+ Block
+ Block
+ Get R2
+ I32Const $3
+ I32GtU
+ If
+ Get R2
+ Set R1
+ Loop
+ Get R0
+ I32Load $0
+ Get R4
+ I32Xor
+ Tee R2
+ I32Const $-2139062144
+ I32And
+ I32Const $-2139062144
+ I32Xor
+ Get R2
+ I32Const $-16843009
+ I32Add
+ I32And
+ I32Eqz
+ If
+ Get R0
+ I32Const $4
+ I32Add
+ Set R0
+ Get R1
+ I32Const $-4
+ I32Add
+ Tee R1
+ I32Const $3
+ I32GtU
+ BrIf $1
+ Br $3
+ End
+ End
+ Else
+ Get R2
+ Set R1
+ Br $1
+ End
+ Br $1
+ End
+ Get R1
+ I32Eqz
+ If
+ I32Const $0
+ Set R1
+ Br $3
+ End
+ End
+ Loop
+ Get R0
+ I32Load8U $0
+ Get R3
+ I32Eq
+ BrIf $2
+ Get R0
+ I32Const $1
+ I32Add
+ Set R0
+ Get R1
+ I32Const $-1
+ I32Add
+ Tee R1
+ BrIf $0
+ I32Const $0
+ Set R1
+ End
+ End
+ End
+ Get R0
+ I32Const $0
+ Get R1
+ Select
+ Return