58 files changed, 6876 insertions, 0 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
new file mode 100644
index 0000000..ebebce7
--- /dev/null
+++ b/src/internal/bytealg/bytealg.go
@@ -0,0 +1,150 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+	"internal/cpu"
+	"unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+	offsetX86HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
+	offsetX86HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
+	offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+
+	offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+
+	offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+// If MaxLen is not 0, make sure MaxLen >= 4.
+var MaxLen int
+
+// FIXME: the logic of HashStrBytes, HashStrRevBytes, IndexRabinKarpBytes and HashStr, HashStrRev,
+// IndexRabinKarp are exactly the same, except that the types are different. Can we eliminate
+// three of them without causing allocation?
+
+// PrimeRK is the prime base used in Rabin-Karp algorithm.
+const PrimeRK = 16777619
+
+// HashStrBytes returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStrBytes(sep []byte) (uint32, uint32) {
+	hash := uint32(0)
+	for i := 0; i < len(sep); i++ {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStr(sep string) (uint32, uint32) {
+	hash := uint32(0)
+	for i := 0; i < len(sep); i++ {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStrRevBytes returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRevBytes(sep []byte) (uint32, uint32) {
+	hash := uint32(0)
+	for i := len(sep) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStrRev returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRev(sep string) (uint32, uint32) {
+	hash := uint32(0)
+	for i := len(sep) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// IndexRabinKarpBytes uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarpBytes(s, sep []byte) int {
+	// Rabin-Karp search
+	hashsep, pow := HashStrBytes(sep)
+	n := len(sep)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*PrimeRK + uint32(s[i])
+	}
+	if h == hashsep && Equal(s[:n], sep) {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= PrimeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashsep && Equal(s[i-n:i], sep) {
+			return i - n
+		}
+	}
+	return -1
+}
+
+// IndexRabinKarp uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarp(s, substr string) int {
+	// Rabin-Karp search
+	hashss, pow := HashStr(substr)
+	n := len(substr)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*PrimeRK + uint32(s[i])
+	}
+	if h == hashss && s[:n] == substr {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= PrimeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashss && s[i-n:i] == substr {
+			return i - n
+		}
+	}
+	return -1
+}
diff --git a/src/internal/bytealg/compare_386.s b/src/internal/bytealg/compare_386.s
new file mode 100644
index 0000000..27b660c
--- /dev/null
+++ b/src/internal/bytealg/compare_386.s
@@ -0,0 +1,144 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+	MOVL	a_base+0(FP), SI
+	MOVL	a_len+4(FP), BX
+	MOVL	b_base+12(FP), DI
+	MOVL	b_len+16(FP), DX
+	LEAL	ret+24(FP), AX
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+	MOVL	a_base+0(FP), SI
+	MOVL	a_len+4(FP), BX
+	MOVL	b_base+8(FP), DI
+	MOVL	b_len+12(FP), DX
+	LEAL	ret+16(FP), AX
+	JMP	cmpbody<>(SB)
+
+// input:
+//   SI = a
+//   DI = b
+//   BX = alen
+//   DX = blen
+//   AX = address of return word (set to 1/0/-1)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+	MOVL	DX, BP
+	SUBL	BX, DX // DX = blen-alen
+	JLE	2(PC)
+	MOVL	BX, BP // BP = min(alen, blen)
+	CMPL	SI, DI
+	JEQ	allsame
+	CMPL	BP, $4
+	JB	small
+#ifdef GO386_softfloat
+	JMP	mediumloop
+#endif
+largeloop:
+	CMPL	BP, $16
+	JB	mediumloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, BX
+	XORL	$0xffff, BX	// convert EQ to NE
+	JNE	diff16	// branch if at least one byte is not equal
+	ADDL	$16, SI
+	ADDL	$16, DI
+	SUBL	$16, BP
+	JMP	largeloop
+
+diff16:
+	BSFL	BX, BX	// index of first byte that differs
+	XORL	DX, DX
+	MOVB	(SI)(BX*1), CX
+	CMPB	CX, (DI)(BX*1)
+	SETHI	DX
+	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
+	MOVL	DX, (AX)
+	RET
+
+mediumloop:
+	CMPL	BP, $4
+	JBE	_0through4
+	MOVL	(SI), BX
+	MOVL	(DI), CX
+	CMPL	BX, CX
+	JNE	diff4
+	ADDL	$4, SI
+	ADDL	$4, DI
+	SUBL	$4, BP
+	JMP	mediumloop
+
+_0through4:
+	MOVL	-4(SI)(BP*1), BX
+	MOVL	-4(DI)(BP*1), CX
+	CMPL	BX, CX
+	JEQ	allsame
+
+diff4:
+	BSWAPL	BX	// reverse order of bytes
+	BSWAPL	CX
+	XORL	BX, CX	// find bit differences
+	BSRL	CX, CX	// index of highest bit difference
+	SHRL	CX, BX	// move a's bit to bottom
+	ANDL	$1, BX	// mask bit
+	LEAL	-1(BX*2), BX // 1/0 => +1/-1
+	MOVL	BX, (AX)
+	RET
+
+	// 0-3 bytes in common
+small:
+	LEAL	(BP*8), CX
+	NEGL	CX
+	JEQ	allsame
+
+	// load si
+	CMPB	SI, $0xfc
+	JA	si_high
+	MOVL	(SI), SI
+	JMP	si_finish
+si_high:
+	MOVL	-4(SI)(BP*1), SI
+	SHRL	CX, SI
+si_finish:
+	SHLL	CX, SI
+
+	// same for di
+	CMPB	DI, $0xfc
+	JA	di_high
+	MOVL	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVL	-4(DI)(BP*1), DI
+	SHRL	CX, DI
+di_finish:
+	SHLL	CX, DI
+
+	BSWAPL	SI	// reverse order of bytes
+	BSWAPL	DI
+	XORL	SI, DI	// find bit differences
+	JEQ	allsame
+	BSRL	DI, CX	// index of highest bit difference
+	SHRL	CX, SI	// move a's bit to bottom
+	ANDL	$1, SI	// mask bit
+	LEAL	-1(SI*2), BX // 1/0 => +1/-1
+	MOVL	BX, (AX)
+	RET
+
+	// all the bytes in common are the same, so we just need
+	// to compare the lengths.
+allsame:
+	XORL	BX, BX
+	XORL	CX, CX
+	TESTL	DX, DX
+	SETLT	BX	// 1 if alen > blen
+	SETEQ	CX	// 1 if alen == blen
+	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
+	MOVL	BX, (AX)
+	RET
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
new file mode 100644
index 0000000..fdd015f
--- /dev/null
+++ b/src/internal/bytealg/compare_amd64.s
@@ -0,0 +1,237 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
+	// AX = a_base (want in SI)
+	// BX = a_len  (want in BX)
+	// CX = a_cap  (unused)
+	// DI = b_base (want in DI)
+	// SI = b_len  (want in DX)
+	// R8 = b_cap  (unused)
+	MOVQ	SI, DX
+	MOVQ	AX, SI
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
+	// AX = a_base (want in SI)
+	// BX = a_len  (want in BX)
+	// CX = b_base (want in DI)
+	// DI = b_len  (want in DX)
+	MOVQ	AX, SI
+	MOVQ	DI, DX
+	MOVQ	CX, DI
+	JMP	cmpbody<>(SB)
+
+// input:
+//   SI = a
+//   DI = b
+//   BX = alen
+//   DX = blen
+// output:
+//   AX = output (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+	CMPQ	SI, DI
+	JEQ	allsame
+	CMPQ	BX, DX
+	MOVQ	DX, R8
+	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
+	CMPQ	R8, $8
+	JB	small
+
+	CMPQ	R8, $63
+	JBE	loop
+#ifndef hasAVX2
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JEQ     big_loop_avx2
+	JMP	big_loop
+#else
+	JMP	big_loop_avx2
+#endif
+loop:
+	CMPQ	R8, $16
+	JBE	_0through16
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX	// convert EQ to NE
+	JNE	diff16	// branch if at least one byte is not equal
+	ADDQ	$16, SI
+	ADDQ	$16, DI
+	SUBQ	$16, R8
+	JMP	loop
+
+diff64:
+	ADDQ	$48, SI
+	ADDQ	$48, DI
+	JMP	diff16
+diff48:
+	ADDQ	$32, SI
+	ADDQ	$32, DI
+	JMP	diff16
+diff32:
+	ADDQ	$16, SI
+	ADDQ	$16, DI
+	// AX = bit mask of differences
+diff16:
+	BSFQ	AX, BX	// index of first byte that differs
+	XORQ	AX, AX
+	MOVB	(SI)(BX*1), CX
+	CMPB	CX, (DI)(BX*1)
+	SETHI	AX
+	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
+	RET
+
+	// 0 through 16 bytes left, alen>=8, blen>=8
+_0through16:
+	CMPQ	R8, $8
+	JBE	_0through8
+	MOVQ	(SI), AX
+	MOVQ	(DI), CX
+	CMPQ	AX, CX
+	JNE	diff8
+_0through8:
+	MOVQ	-8(SI)(R8*1), AX
+	MOVQ	-8(DI)(R8*1), CX
+	CMPQ	AX, CX
+	JEQ	allsame
+
+	// AX and CX contain parts of a and b that differ.
+diff8:
+	BSWAPQ	AX	// reverse order of bytes
+	BSWAPQ	CX
+	XORQ	AX, CX
+	BSRQ	CX, CX	// index of highest bit difference
+	SHRQ	CX, AX	// move a's bit to bottom
+	ANDQ	$1, AX	// mask bit
+	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
+	RET
+
+	// 0-7 bytes in common
+small:
+	LEAQ	(R8*8), CX	// bytes left -> bits left
+	NEGQ	CX		//  - bits lift (== 64 - bits left mod 64)
+	JEQ	allsame
+
+	// load bytes of a into high bytes of AX
+	CMPB	SI, $0xf8
+	JA	si_high
+	MOVQ	(SI), SI
+	JMP	si_finish
+si_high:
+	MOVQ	-8(SI)(R8*1), SI
+	SHRQ	CX, SI
+si_finish:
+	SHLQ	CX, SI
+
+	// load bytes of b in to high bytes of BX
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(R8*1), DI
+	SHRQ	CX, DI
+di_finish:
+	SHLQ	CX, DI
+
+	BSWAPQ	SI	// reverse order of bytes
+	BSWAPQ	DI
+	XORQ	SI, DI	// find bit differences
+	JEQ	allsame
+	BSRQ	DI, CX	// index of highest bit difference
+	SHRQ	CX, SI	// move a's bit to bottom
+	ANDQ	$1, SI	// mask bit
+	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
+	RET
+
+allsame:
+	XORQ	AX, AX
+	XORQ	CX, CX
+	CMPQ	BX, DX
+	SETGT	AX	// 1 if alen > blen
+	SETEQ	CX	// 1 if alen == blen
+	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
+	RET
+
+	// this works for >= 64 bytes of data.
+#ifndef hasAVX2
+big_loop:
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff16
+
+	MOVOU	16(SI), X0
+	MOVOU	16(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff32
+
+	MOVOU	32(SI), X0
+	MOVOU	32(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff48
+
+	MOVOU	48(SI), X0
+	MOVOU	48(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff64
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JBE	loop
+	JMP	big_loop
+#endif
+
+	// Compare 64-bytes per loop iteration.
+	// Loop is unrolled and uses AVX2.
+big_loop_avx2:
+	VMOVDQU	(SI), Y2
+	VMOVDQU	(DI), Y3
+	VMOVDQU	32(SI), Y4
+	VMOVDQU	32(DI), Y5
+	VPCMPEQB Y2, Y3, Y0
+	VPMOVMSKB Y0, AX
+	XORL	$0xffffffff, AX
+	JNE	diff32_avx2
+	VPCMPEQB Y4, Y5, Y6
+	VPMOVMSKB Y6, AX
+	XORL	$0xffffffff, AX
+	JNE	diff64_avx2
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JB	big_loop_avx2_exit
+	JMP	big_loop_avx2
+
+	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+	VZEROUPPER
+	JMP diff16
+
+	// Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+	VZEROUPPER
+	JMP diff48
+
+	// For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+	VZEROUPPER
+	JMP loop
diff --git a/src/internal/bytealg/compare_arm.s b/src/internal/bytealg/compare_arm.s
new file mode 100644
index 0000000..80d01a2
--- /dev/null
+++ b/src/internal/bytealg/compare_arm.s
@@ -0,0 +1,86 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	a_base+0(FP), R2
+	MOVW	a_len+4(FP), R0
+	MOVW	b_base+12(FP), R3
+	MOVW	b_len+16(FP), R1
+	ADD	$28, R13, R7
+	B	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20
+	MOVW	a_base+0(FP), R2
+	MOVW	a_len+4(FP), R0
+	MOVW	b_base+8(FP), R3
+	MOVW	b_len+12(FP), R1
+	ADD	$20, R13, R7
+	B	cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, R6 and R8 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	R2, R3
+	BEQ	samebytes
+	CMP 	R0, R1
+	MOVW 	R0, R6
+	MOVW.LT	R1, R6		// R6 is min(R0, R1)
+
+	CMP	$0, R6
+	BEQ	samebytes
+	CMP	$4, R6
+	ADD	R2, R6		// R2 is current byte in a, R6 is the end of the range to compare
+	BLT	byte_loop	// length < 4
+	AND	$3, R2, R8
+	CMP	$0, R8
+	BNE	byte_loop	// unaligned a, use byte-wise compare (TODO: try to align a)
+aligned_a:
+	AND	$3, R3, R8
+	CMP	$0, R8
+	BNE	byte_loop	// unaligned b, use byte-wise compare
+	AND	$0xfffffffc, R6, R8
+	// length >= 4
+chunk4_loop:
+	MOVW.P	4(R2), R4
+	MOVW.P	4(R3), R5
+	CMP	R4, R5
+	BNE	cmp
+	CMP	R2, R8
+	BNE	chunk4_loop
+	CMP	R2, R6
+	BEQ	samebytes	// all compared bytes were the same; compare lengths
+byte_loop:
+	MOVBU.P	1(R2), R4
+	MOVBU.P	1(R3), R5
+	CMP	R4, R5
+	BNE	ret
+	CMP	R2, R6
+	BNE	byte_loop
+samebytes:
+	CMP	R0, R1
+	MOVW.LT	$1, R0
+	MOVW.GT	$-1, R0
+	MOVW.EQ	$0, R0
+	MOVW	R0, (R7)
+	RET
+ret:
+	// bytes differed
+	MOVW.LT	$1, R0
+	MOVW.GT	$-1, R0
+	MOVW	R0, (R7)
+	RET
+cmp:
+	SUB	$4, R2, R2
+	SUB	$4, R3, R3
+	B	byte_loop
diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s
new file mode 100644
index 0000000..cc02c46
--- /dev/null
+++ b/src/internal/bytealg/compare_arm64.s
@@ -0,0 +1,125 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+	// R0 = a_base (want in R0)
+	// R1 = a_len  (want in R1)
+	// R2 = a_cap  (unused)
+	// R3 = b_base (want in R2)
+	// R4 = b_len  (want in R3)
+	// R5 = b_cap  (unused)
+	MOVD	R3, R2
+	MOVD	R4, R3
+	B	cmpbody<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// R0 = a_base
+	// R1 = a_len
+	// R2 = b_base
+	// R3 = b_len
+	B	cmpbody<>(SB)
+
+// On entry:
+// R0 points to the start of a
+// R1 is the length of a
+// R2 points to the start of b
+// R3 is the length of b
+//
+// On exit:
+// R0 is the result
+// R4, R5, R6, R8, R9 and R10 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	R0, R2
+	BEQ	samebytes         // same starting pointers; compare lengths
+	CMP	R1, R3
+	CSEL	LT, R3, R1, R6    // R6 is min(R1, R3)
+
+	CBZ	R6, samebytes
+	BIC	$0xf, R6, R10
+	CBZ	R10, small        // length < 16
+	ADD	R0, R10           // end of chunk16
+	// length >= 16
+chunk16_loop:
+	LDP.P	16(R0), (R4, R8)
+	LDP.P	16(R2), (R5, R9)
+	CMP	R4, R5
+	BNE	cmp
+	CMP	R8, R9
+	BNE	cmpnext
+	CMP	R10, R0
+	BNE	chunk16_loop
+	AND	$0xf, R6, R6
+	CBZ	R6, samebytes
+	SUBS	$8, R6
+	BLT	tail
+	// the length of tail > 8 bytes
+	MOVD.P	8(R0), R4
+	MOVD.P	8(R2), R5
+	CMP	R4, R5
+	BNE	cmp
+	SUB	$8, R6
+	// compare last 8 bytes
+tail:
+	MOVD	(R0)(R6), R4
+	MOVD	(R2)(R6), R5
+	CMP	R4, R5
+	BEQ	samebytes
+cmp:
+	REV	R4, R4
+	REV	R5, R5
+	CMP	R4, R5
+ret:
+	MOVD	$1, R0
+	CNEG	HI, R0, R0
+	RET
+small:
+	TBZ	$3, R6, lt_8
+	MOVD	(R0), R4
+	MOVD	(R2), R5
+	CMP	R4, R5
+	BNE	cmp
+	SUBS	$8, R6
+	BEQ	samebytes
+	ADD	$8, R0
+	ADD	$8, R2
+	SUB	$8, R6
+	B	tail
+lt_8:
+	TBZ	$2, R6, lt_4
+	MOVWU	(R0), R4
+	MOVWU	(R2), R5
+	CMPW	R4, R5
+	BNE	cmp
+	SUBS	$4, R6
+	BEQ	samebytes
+	ADD	$4, R0
+	ADD	$4, R2
+lt_4:
+	TBZ	$1, R6, lt_2
+	MOVHU	(R0), R4
+	MOVHU	(R2), R5
+	CMPW	R4, R5
+	BNE	cmp
+	ADD	$2, R0
+	ADD	$2, R2
+lt_2:
+	TBZ	$0, R6, samebytes
+one:
+	MOVBU	(R0), R4
+	MOVBU	(R2), R5
+	CMPW	R4, R5
+	BNE	ret
+samebytes:
+	CMP	R3, R1
+	CSET	NE, R0
+	CNEG	LO, R0, R0
+	RET
+cmpnext:
+	REV	R8, R4
+	REV	R9, R5
+	CMP	R4, R5
+	B	ret
diff --git a/src/internal/bytealg/compare_generic.go b/src/internal/bytealg/compare_generic.go
new file mode 100644
index 0000000..b04e275
--- /dev/null
+++ b/src/internal/bytealg/compare_generic.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64
+
+package bytealg
+
+import _ "unsafe" // for go:linkname
+
+func Compare(a, b []byte) int {
+	l := len(a)
+	if len(b) < l {
+		l = len(b)
+	}
+	if l == 0 || &a[0] == &b[0] {
+		goto samebytes
+	}
+	for i := 0; i < l; i++ {
+		c1, c2 := a[i], b[i]
+		if c1 < c2 {
+			return -1
+		}
+		if c1 > c2 {
+			return +1
+		}
+	}
+samebytes:
+	if len(a) < len(b) {
+		return -1
+	}
+	if len(a) > len(b) {
+		return +1
+	}
+	return 0
+}
+
+//go:linkname runtime_cmpstring runtime.cmpstring
+func runtime_cmpstring(a, b string) int {
+	l := len(a)
+	if len(b) < l {
+		l = len(b)
+	}
+	for i := 0; i < l; i++ {
+		c1, c2 := a[i], b[i]
+		if c1 < c2 {
+			return -1
+		}
+		if c1 > c2 {
+			return +1
+		}
+	}
+	if len(a) < len(b) {
+		return -1
+	}
+	if len(a) > len(b) {
+		return +1
+	}
+	return 0
+}
diff --git a/src/internal/bytealg/compare_loong64.s b/src/internal/bytealg/compare_loong64.s
new file mode 100644
index 0000000..54c2dab
--- /dev/null
+++ b/src/internal/bytealg/compare_loong64.s
@@ -0,0 +1,86 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+	MOVV	a_base+0(FP), R6
+	MOVV	b_base+24(FP), R7
+	MOVV	a_len+8(FP), R4
+	MOVV	b_len+32(FP), R5
+	MOVV	$ret+48(FP), R13
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+	MOVV	a_base+0(FP), R6
+	MOVV	b_base+16(FP), R7
+	MOVV	a_len+8(FP), R4
+	MOVV	b_len+24(FP), R5
+	MOVV	$ret+32(FP), R13
+	JMP	cmpbody<>(SB)
+
+// On entry:
+// R4 length of a
+// R5 length of b
+// R6 points to the start of a
+// R7 points to the start of b
+// R13 points to the return value (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
+	BEQ	R6, R7, samebytes // same start of a and b
+
+	SGTU	R4, R5, R9
+	BNE	R0, R9, r2_lt_r1
+	MOVV	R4, R14
+	JMP	entry
+r2_lt_r1:
+	MOVV	R5, R14	// R14 is min(R4, R5)
+entry:
+	ADDV	R6, R14, R12	// R6 start of a, R14 end of a
+	BEQ	R6, R12, samebytes // length is 0
+
+	SRLV	$4, R14		// R14 is number of chunks
+	BEQ	R0, R14, byte_loop
+
+	// make sure both a and b are aligned.
+	OR	R6, R7, R15
+	AND	$7, R15
+	BNE	R0, R15, byte_loop
+
+chunk16_loop:
+	BEQ	R0, R14, byte_loop
+	MOVV	(R6), R8
+	MOVV	(R7), R9
+	BNE	R8, R9, byte_loop
+	MOVV	8(R6), R16
+	MOVV	8(R7), R17
+	ADDV	$16, R6
+	ADDV	$16, R7
+	SUBVU	$1, R14
+	BEQ	R16, R17, chunk16_loop
+	SUBV	$8, R6
+	SUBV	$8, R7
+
+byte_loop:
+	BEQ	R6, R12, samebytes
+	MOVBU	(R6), R8
+	ADDVU	$1, R6
+	MOVBU	(R7), R9
+	ADDVU	$1, R7
+	BEQ	R8, R9, byte_loop
+
+byte_cmp:
+	SGTU	R8, R9, R12 // R12 = 1 if (R8 > R9)
+	BNE	R0, R12, ret
+	MOVV	$-1, R12
+	JMP	ret
+
+samebytes:
+	SGTU	R4, R5, R8
+	SGTU	R5, R4, R9
+	SUBV	R9, R8, R12
+
+ret:
+	MOVV	R12, (R13)
+	RET
diff --git a/src/internal/bytealg/compare_mips64x.s b/src/internal/bytealg/compare_mips64x.s
new file mode 100644
index 0000000..117a9ef
--- /dev/null
+++ b/src/internal/bytealg/compare_mips64x.s
@@ -0,0 +1,88 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+	MOVV	a_base+0(FP), R3
+	MOVV	b_base+24(FP), R4
+	MOVV	a_len+8(FP), R1
+	MOVV	b_len+32(FP), R2
+	MOVV	$ret+48(FP), R9
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+	MOVV	a_base+0(FP), R3
+	MOVV	b_base+16(FP), R4
+	MOVV	a_len+8(FP), R1
+	MOVV	b_len+24(FP), R2
+	MOVV	$ret+32(FP), R9
+	JMP	cmpbody<>(SB)
+
+// On entry:
+// R1 length of a
+// R2 length of b
+// R3 points to the start of a
+// R4 points to the start of b
+// R9 points to the return value (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
+	BEQ	R3, R4, samebytes // same start of a and b
+
+	SGTU	R1, R2, R7
+	BNE	R0, R7, r2_lt_r1
+	MOVV	R1, R10
+	JMP	entry
+r2_lt_r1:
+	MOVV	R2, R10	// R10 is min(R1, R2)
+entry:
+	ADDV	R3, R10, R8	// R3 start of a, R8 end of a
+	BEQ	R3, R8, samebytes // length is 0
+
+	SRLV	$4, R10		// R10 is number of chunks
+	BEQ	R0, R10, byte_loop
+
+	// make sure both a and b are aligned.
+	OR	R3, R4, R11
+	AND	$7, R11
+	BNE	R0, R11, byte_loop
+
+chunk16_loop:
+	BEQ	R0, R10, byte_loop
+	MOVV	(R3), R6
+	MOVV	(R4), R7
+	BNE	R6, R7, byte_loop
+	MOVV	8(R3), R13
+	MOVV	8(R4), R14
+	ADDV	$16, R3
+	ADDV	$16, R4
+	SUBVU	$1, R10
+	BEQ	R13, R14, chunk16_loop
+	SUBV	$8, R3
+	SUBV	$8, R4
+
+byte_loop:
+	BEQ	R3, R8, samebytes
+	MOVBU	(R3), R6
+	ADDVU	$1, R3
+	MOVBU	(R4), R7
+	ADDVU	$1, R4
+	BEQ	R6, R7, byte_loop
+
+byte_cmp:
+	SGTU	R6, R7, R8 // R8 = 1 if (R6 > R7)
+	BNE	R0, R8, ret
+	MOVV	$-1, R8
+	JMP	ret
+
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBV	R7, R6, R8
+
+ret:
+	MOVV	R8, (R9)
+	RET
diff --git a/src/internal/bytealg/compare_mipsx.s b/src/internal/bytealg/compare_mipsx.s
new file mode 100644
index 0000000..857ac13
--- /dev/null
+++ b/src/internal/bytealg/compare_mipsx.s
@@ -0,0 +1,72 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+	MOVW	a_base+0(FP), R3
+	MOVW	b_base+12(FP), R4
+	MOVW	a_len+4(FP), R1
+	MOVW	b_len+16(FP), R2
+	BEQ	R3, R4, samebytes
+	SGTU	R1, R2, R7
+	MOVW	R1, R8
+	CMOVN	R7, R2, R8	// R8 is min(R1, R2)
+
+	ADDU	R3, R8	// R3 is current byte in a, R8 is last byte in a to compare
+loop:
+	BEQ	R3, R8, samebytes
+
+	MOVBU	(R3), R6
+	ADDU	$1, R3
+	MOVBU	(R4), R7
+	ADDU	$1, R4
+	BEQ	R6, R7 , loop
+
+	SGTU	R6, R7, R8
+	MOVW	$-1, R6
+	CMOVZ	R8, R6, R8
+	JMP	cmp_ret
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBU	R7, R6, R8
+cmp_ret:
+	MOVW	R8, ret+24(FP)
+	RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+	MOVW	a_base+0(FP), R3
+	MOVW	a_len+4(FP), R1
+	MOVW	b_base+8(FP), R4
+	MOVW	b_len+12(FP), R2
+	BEQ	R3, R4, samebytes
+	SGTU	R1, R2, R7
+	MOVW	R1, R8
+	CMOVN	R7, R2, R8	// R8 is min(R1, R2)
+
+	ADDU	R3, R8	// R3 is current byte in a, R8 is last byte in a to compare
+loop:
+	BEQ	R3, R8, samebytes	// all compared bytes were the same; compare lengths
+
+	MOVBU	(R3), R6
+	ADDU	$1, R3
+	MOVBU	(R4), R7
+	ADDU	$1, R4
+	BEQ	R6, R7 , loop
+	// bytes differed
+	SGTU	R6, R7, R8
+	MOVW	$-1, R6
+	CMOVZ	R8, R6, R8
+	JMP	cmp_ret
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBU	R7, R6, R8
+cmp_ret:
+	MOVW	R8, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/compare_native.go b/src/internal/bytealg/compare_native.go
new file mode 100644
index 0000000..34964e2
--- /dev/null
+++ b/src/internal/bytealg/compare_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64 || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64
+
+package bytealg
+
+import _ "unsafe" // For go:linkname
+
+//go:noescape
+func Compare(a, b []byte) int
+
+// The declaration below generates ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+//go:linkname abigen_runtime_cmpstring runtime.cmpstring
+func abigen_runtime_cmpstring(a, b string) int
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
new file mode 100644
index 0000000..cbe0525
--- /dev/null
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -0,0 +1,502 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+	// incoming:
+	// R3 a addr -> R5
+	// R4 a len  -> R3
+	// R5 a cap unused
+	// R6 b addr -> R6
+	// R7 b len  -> R4
+	// R8 b cap unused
+	MOVD	R3, R5
+	MOVD	R4, R3
+	MOVD	R7, R4
+	CMP     R5,R6,CR7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+greater:
+	MOVD	R8, R3
+	RET
+done:
+	MOVD	$0, R3
+	RET
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// incoming:
+	// R3 a addr -> R5
+	// R4 a len  -> R3
+	// R5 b addr -> R6
+	// R6 b len  -> R4
+	MOVD	R6, R7
+	MOVD	R5, R6
+	MOVD	R3, R5
+	MOVD	R4, R3
+	MOVD	R7, R4
+	CMP     R5,R6,CR7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+greater:
+	MOVD	R8, R3
+	RET
+
+done:
+	MOVD	$0, R3
+	RET
+
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER8
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// On exit:
+// R3 = return value
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BLT	CR2,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// optimize < 32
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$64
+	BLT	cmp32		// process size 32-63
+
+	DCBT	(R5)		// optimize >= 64
+	DCBT	(R6)		// cache hint
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// process size 64 and greater
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different	// jump out if its different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
+	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 64 into vector
+	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 64 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+	
+	CMPU	R9,$32
+	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+rem:
+	MOVD	R9,R8
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+	BR	setup8a
+
+different:
+#ifdef	GOARCH_ppc64le
+	MOVD	$byteswap<>+00(SB), R16
+	LXVD2X	(R16)(R0),SWAP	// Set up swap string
+
+	VPERM	V3,V3,SWAP,V3
+	VPERM	V4,V4,SWAP,V4
+#endif
+	MFVSRD	VS35,R16	// move upper doublwords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	VSLDOI	$8,V3,V3,V3	// move lower doublwords of A and B into GPR for comparison
+	MFVSRD	VS35,R16
+	VSLDOI	$8,V4,V4,V4
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+setup8a:
+	SRADCC	$3,R8,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	CMPU	R8,$8		// optimize 8byte move
+	BEQ	size8
+	CMPU	R8,$16
+	BEQ	size16
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R16,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	BEQ	zeroremainder
+simplecheck:
+	MOVD	R0,R14
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5)(R14),R10
+	MOVWBR	(R6)(R14),R11
+#else
+	MOVWZ	(R5)(R14),R10
+	MOVWZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-4,R9
+	ADD	$4,R14
+	PCALIGN	$16
+
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5)(R14),R10
+	MOVHBR	(R6)(R14),R11
+#else
+	MOVHZ	(R5)(R14),R10
+	MOVHZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-2,R9
+	ADD	$2,R14
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5)(R14),R10
+	MOVBZ	(R6)(R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
+
+less:	MOVD	$-1,R3		// return value if A < B
+	RET
+size16:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+zeroremainder:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+size8:
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
+	CMPU	R16,R10		// match?
+	BGT	greater
+	BLT	less
+	BGT	CR2,greater	// 2nd len > 1st len
+	BLT	CR2,less	// 2nd len < 1st len
+equal:
+	MOVD	$0, R3		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3		// return value if A > B
+	RET
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER9
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// On exit:
+// R3 = return value
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BLT	CR2,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	CMP	R8,$16		// optimize for size<16
+	MOVD	R8,R9
+	BLT	simplecheck
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$32		// optimize for size 16-31
+	BLT	cmp16
+	CMP	R8,$64
+	BLT	cmp32		// optimize for size 32-63
+	DCBT	(R5)		// optimize for size>=64
+	DCBT	(R6)		// cache hint
+
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// process size 64 and greater
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if its different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
+	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
+	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+
+	CMPU	R9,$32
+	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
+	CMPU	R9,$16
+	BGE	cmp16		// loop to cmp16 if there are 16-31 bytes left
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remaining bytes
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if its different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$16		// loop to cmp16 if there are 16-31 bytes left
+	BGE	cmp16
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remainder bytes
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+different:
+
+	MFVSRD	VS35,R16	// move upper doublwords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	MFVSRLD	VS35,R16	// next move lower doublewords of A and B into GPR for comparison
+	MFVSRLD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+
+greater:
+	MOVD	$1,R3		// return value if A > B
+	RET
+cmp16:
+	ANDCC	$16,R9,R31
+	BEQ	tail
+
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$16,R5
+	ADD	$16,R6
+tail:
+	ANDCC	$15,R9		// Load the last 16 bytes (we know there are at least 32b)
+	BEQ	end
+
+	ADD	R9,R5
+	ADD	R9,R6
+	MOVD	$-16,R10
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+end:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if BLT CR2 that is, len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+simplecheck:
+	MOVD	$0,R14		// process 8 bytes
+	CMP	R9,$8
+	BLT	word
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R14),R10
+	MOVDBR	(R6+R14),R11
+#else
+	MOVD	(R5+R14),R10
+	MOVD	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$8,R14
+	ADD	$-8,R9
+	PCALIGN	$16
+word:
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5+R14),R10
+	MOVWBR	(R6+R14),R11
+#else
+	MOVWZ	(R5+R14),R10
+	MOVWZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$4,R14
+	ADD	$-4,R9
+	PCALIGN	$16
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5+R14),R10
+	MOVHBR	(R6+R14),R11
+#else
+	MOVHZ	(R5+R14),R10
+	MOVHZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$2,R14
+	ADD	$-2,R9
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5+R14),R10
+	MOVBZ	(R6+R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
+less:
+	MOVD	$-1,R3		// return value if A < B
+	RET
+equal:
+	MOVD	$0, R3		// return value if A == B
+	RET
diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s
new file mode 100644
index 0000000..44a743d
--- /dev/null
+++ b/src/internal/bytealg/compare_riscv64.s
@@ -0,0 +1,187 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+	// X10 = a_base
+	// X11 = a_len
+	// X12 = a_cap (unused)
+	// X13 = b_base (want in X12)
+	// X14 = b_len (want in X13)
+	// X15 = b_cap (unused)
+	MOV	X13, X12
+	MOV	X14, X13
+	JMP	compare<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// X10 = a_base
+	// X11 = a_len
+	// X12 = b_base
+	// X13 = b_len
+	JMP	compare<>(SB)
+
+// On entry:
+// X10 points to start of a
+// X11 length of a
+// X12 points to start of b
+// X13 length of b
+// for non-regabi X14 points to the address to store the return value (-1/0/1)
+// for regabi the return value in X10
+TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
+	BEQ	X10, X12, cmp_len
+
+	MOV	X11, X5
+	BGE	X13, X5, use_a_len // X5 = min(len(a), len(b))
+	MOV	X13, X5
+use_a_len:
+	BEQZ	X5, cmp_len
+
+	MOV	$32, X6
+	BLT	X5, X6, loop4_check
+
+	// Check alignment - if alignment differs we have to do one byte at a time.
+	AND	$7, X10, X7
+	AND	$7, X12, X8
+	BNE	X7, X8, loop4_check
+	BEQZ	X7, loop32_check
+
+	// Check one byte at a time until we reach 8 byte alignment.
+	SUB	X7, X5, X5
+align:
+	ADD	$-1, X7
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	BNE	X8, X9, cmp
+	ADD	$1, X10
+	ADD	$1, X12
+	BNEZ	X7, align
+
+loop32_check:
+	MOV	$32, X7
+	BLT	X5, X7, loop16_check
+loop32:
+	MOV	0(X10), X15
+	MOV	0(X12), X16
+	MOV	8(X10), X17
+	MOV	8(X12), X18
+	BEQ	X15, X16, loop32a
+	JMP	cmp8a
+loop32a:
+	BEQ	X17, X18, loop32b
+	JMP	cmp8b
+loop32b:
+	MOV	16(X10), X15
+	MOV	16(X12), X16
+	MOV	24(X10), X17
+	MOV	24(X12), X18
+	BEQ	X15, X16, loop32c
+	JMP	cmp8a
+loop32c:
+	BEQ	X17, X18, loop32d
+	JMP	cmp8b
+loop32d:
+	ADD	$32, X10
+	ADD	$32, X12
+	ADD	$-32, X5
+	BGE	X5, X7, loop32
+	BEQZ	X5, cmp_len
+
+loop16_check:
+	MOV	$16, X6
+	BLT	X5, X6, loop4_check
+loop16:
+	MOV	0(X10), X15
+	MOV	0(X12), X16
+	MOV	8(X10), X17
+	MOV	8(X12), X18
+	BEQ	X15, X16, loop16a
+	JMP	cmp8a
+loop16a:
+	BEQ	X17, X18, loop16b
+	JMP	cmp8b
+loop16b:
+	ADD	$16, X10
+	ADD	$16, X12
+	ADD	$-16, X5
+	BGE	X5, X6, loop16
+	BEQZ	X5, cmp_len
+
+loop4_check:
+	MOV	$4, X6
+	BLT	X5, X6, loop1
+loop4:
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	MOVBU	1(X10), X15
+	MOVBU	1(X12), X16
+	BEQ	X8, X9, loop4a
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
+	JMP	cmp_ret
+loop4a:
+	BEQ	X15, X16, loop4b
+	SLTU	X16, X15, X5
+	SLTU	X15, X16, X6
+	JMP	cmp_ret
+loop4b:
+	MOVBU	2(X10), X21
+	MOVBU	2(X12), X22
+	MOVBU	3(X10), X23
+	MOVBU	3(X12), X24
+	BEQ	X21, X22, loop4c
+	SLTU	X22, X21, X5
+	SLTU	X21, X22, X6
+	JMP	cmp_ret
+loop4c:
+	BEQ	X23, X24, loop4d
+	SLTU	X24, X23, X5
+	SLTU	X23, X24, X6
+	JMP	cmp_ret
+loop4d:
+	ADD	$4, X10
+	ADD	$4, X12
+	ADD	$-4, X5
+	BGE	X5, X6, loop4
+
+loop1:
+	BEQZ	X5, cmp_len
+	MOVBU	0(X10), X8
+	MOVBU	0(X12), X9
+	BNE	X8, X9, cmp
+	ADD	$1, X10
+	ADD	$1, X12
+	ADD	$-1, X5
+	JMP	loop1
+
+	// Compare 8 bytes of memory in X15/X16 that are known to differ.
+cmp8a:
+	MOV	$0xff, X19
+cmp8a_loop:
+	AND	X15, X19, X8
+	AND	X16, X19, X9
+	BNE	X8, X9, cmp
+	SLLI	$8, X19
+	JMP	cmp8a_loop
+
+	// Compare 8 bytes of memory in X17/X18 that are known to differ.
+cmp8b:
+	MOV	$0xff, X19
+cmp8b_loop:
+	AND	X17, X19, X8
+	AND	X18, X19, X9
+	BNE	X8, X9, cmp
+	SLLI	$8, X19
+	JMP	cmp8b_loop
+
+cmp_len:
+	MOV	X11, X8
+	MOV	X13, X9
+cmp:
+	SLTU	X9, X8, X5
+	SLTU	X8, X9, X6
+cmp_ret:
+	SUB	X5, X6, X10
+	RET
diff --git a/src/internal/bytealg/compare_s390x.s b/src/internal/bytealg/compare_s390x.s
new file mode 100644
index 0000000..5394548
--- /dev/null
+++ b/src/internal/bytealg/compare_s390x.s
@@ -0,0 +1,69 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	a_base+0(FP), R3
+	MOVD	a_len+8(FP), R4
+	MOVD	b_base+24(FP), R5
+	MOVD	b_len+32(FP), R6
+	LA	ret+48(FP), R7
+	BR	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	a_base+0(FP), R3
+	MOVD	a_len+8(FP), R4
+	MOVD	b_base+16(FP), R5
+	MOVD	b_len+24(FP), R6
+	LA	ret+32(FP), R7
+	BR	cmpbody<>(SB)
+
+// input:
+//   R3 = a
+//   R4 = alen
+//   R5 = b
+//   R6 = blen
+//   R7 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, cmplengths
+	MOVD	R4, R8
+	CMPBLE	R4, R6, amin
+	MOVD	R6, R8
+amin:
+	CMPBEQ	R8, $0, cmplengths
+	CMP	R8, $256
+	BLE	tail
+loop:
+	CLC	$256, 0(R3), 0(R5)
+	BGT	gt
+	BLT	lt
+	SUB	$256, R8
+	MOVD	$256(R3), R3
+	MOVD	$256(R5), R5
+	CMP	R8, $256
+	BGT	loop
+tail:
+	SUB	$1, R8
+	EXRL	$cmpbodyclc<>(SB), R8
+	BGT	gt
+	BLT	lt
+cmplengths:
+	CMP	R4, R6
+	BEQ	eq
+	BLT	lt
+gt:
+	MOVD	$1, 0(R7)
+	RET
+lt:
+	MOVD	$-1, 0(R7)
+	RET
+eq:
+	MOVD	$0, 0(R7)
+	RET
+
+TEXT cmpbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
diff --git a/src/internal/bytealg/compare_wasm.s b/src/internal/bytealg/compare_wasm.s
new file mode 100644
index 0000000..dc8fb33
--- /dev/null
+++ b/src/internal/bytealg/compare_wasm.s
@@ -0,0 +1,115 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB), NOSPLIT, $0-56
+	Get SP
+	I64Load a_base+0(FP)
+	I64Load a_len+8(FP)
+	I64Load b_base+24(FP)
+	I64Load b_len+32(FP)
+	Call cmpbody<>(SB)
+	I64Store ret+48(FP)
+	RET
+
+TEXT runtime·cmpstring(SB), NOSPLIT, $0-40
+	Get SP
+	I64Load a_base+0(FP)
+	I64Load a_len+8(FP)
+	I64Load b_base+16(FP)
+	I64Load b_len+24(FP)
+	Call cmpbody<>(SB)
+	I64Store ret+32(FP)
+	RET
+
+// params: a, alen, b, blen
+// ret: -1/0/1
+TEXT cmpbody<>(SB), NOSPLIT, $0-0
+	// len = min(alen, blen)
+	Get R1
+	Get R3
+	Get R1
+	Get R3
+	I64LtU
+	Select
+	Set R4
+
+	Get R0
+	I32WrapI64
+	Get R2
+	I32WrapI64
+	Get R4
+	I32WrapI64
+	Call memcmp<>(SB)
+	I64ExtendI32S
+	Tee R5
+
+	I64Eqz
+	If
+		// check length
+		Get R1
+		Get R3
+		I64Sub
+		Set R5
+	End
+
+	I64Const $0
+	I64Const $-1
+	I64Const $1
+	Get R5
+	I64Const $0
+	I64LtS
+	Select
+	Get R5
+	I64Eqz
+	Select
+	Return
+
+// compiled with emscripten
+// params: a, b, len
+// ret: <0/0/>0
+TEXT memcmp<>(SB), NOSPLIT, $0-0
+	Get R2
+	If $1
+	Loop
+	Get R0
+	I32Load8S $0
+	Tee R3
+	Get R1
+	I32Load8S $0
+	Tee R4
+	I32Eq
+	If
+	Get R0
+	I32Const $1
+	I32Add
+	Set R0
+	Get R1
+	I32Const $1
+	I32Add
+	Set R1
+	I32Const $0
+	Get R2
+	I32Const $-1
+	I32Add
+	Tee R2
+	I32Eqz
+	BrIf $3
+	Drop
+	Br $1
+	End
+	End
+	Get R3
+	I32Const $255
+	I32And
+	Get R4
+	I32Const $255
+	I32And
+	I32Sub
+	Else
+	I32Const $0
+	End
+	Return
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
new file mode 100644
index 0000000..efb17f8
--- /dev/null
+++ b/src/internal/bytealg/count_amd64.s
@@ -0,0 +1,208 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGeneric(SB)
+#endif
+	MOVQ	b_base+0(FP), SI
+	MOVQ	b_len+8(FP), BX
+	MOVB	c+24(FP), AL
+	LEAQ	ret+32(FP), R8
+	JMP	countbody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGenericString(SB)
+#endif
+	MOVQ	s_base+0(FP), SI
+	MOVQ	s_len+8(FP), BX
+	MOVB	c+16(FP), AL
+	LEAQ	ret+24(FP), R8
+	JMP	countbody<>(SB)
+
+// input:
+//   SI: data
+//   BX: data len
+//   AL: byte sought
+//   R8: address to put result
+// This function requires the POPCNT instruction.
+TEXT countbody<>(SB),NOSPLIT,$0
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
+
+	CMPQ BX, $16
+	JLT small
+
+	MOVQ $0, R12 // Accumulator
+
+	MOVQ SI, DI
+
+	CMPQ BX, $32
+	JA avx2
+sse:
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
+	PMOVMSKB X1, DX
+	// Count number of matching bytes
+	POPCNTL DX, DX
+	// Accumulate into R12
+	ADDQ DX, R12
+	// Advance to next block.
+	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JBE	sseloop
+
+	// Get the number of bytes to consider in the last 16 bytes
+	ANDQ $15, BX
+	JZ end
+
+	// Create mask to ignore overlap between previous 16 byte block
+	// and the next.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Process the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched so we need to mask part of it.
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+end:
+	MOVQ R12, (R8)
+	RET
+
+// handle for lengths < 16
+small:
+	TESTQ	BX, BX
+	JEQ	endzero
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	// We must ignore high bytes as they aren't part of our slice.
+	// Create mask.
+	MOVB BX, CX
+	MOVQ $1, R10
+	SALQ CL, R10
+	SUBQ $1, R10
+
+	// Load data
+	MOVOU	(SI), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	MOVQ	DX, (R8)
+	RET
+endzero:
+	MOVQ $0, (R8)
+	RET
+
+endofpage:
+	// We must ignore low bytes as they aren't part of our slice.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Load data into the high end of X1.
+	MOVOU	-16(SI)(BX*1), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	POPCNTL DX, DX
+	MOVQ	DX, (R8)
+	RET
+
+avx2:
+#ifndef hasAVX2
+	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE sse
+#endif
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, Y1
+avx2_loop:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLE avx2_loop
+
+	// If last block is already processed,
+	// skip to the end.
+	CMPQ DI, R11
+	JEQ endavx
+
+	// Load address of the last 32 bytes.
+	// There is an overlap with the previous block.
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, DX
+	// Exit AVX mode.
+	VZEROUPPER
+
+	// Create mask to ignore overlap between previous 32 byte block
+	// and the next.
+	ANDQ $31, BX
+	MOVQ $32,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFFFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+	MOVQ R12, (R8)
+	RET
+endavx:
+	// Exit AVX mode.
+	VZEROUPPER
+	MOVQ R12, (R8)
+	RET
diff --git a/src/internal/bytealg/count_arm.s b/src/internal/bytealg/count_arm.s
new file mode 100644
index 0000000..f704ea0
--- /dev/null
+++ b/src/internal/bytealg/count_arm.s
@@ -0,0 +1,43 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R0
+	MOVW	b_len+4(FP), R1
+	MOVBU	c+12(FP), R2
+	MOVW	$ret+16(FP), R7
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R0
+	MOVW	s_len+4(FP), R1
+	MOVBU	c+8(FP), R2
+	MOVW	$ret+12(FP), R7
+	B	countbytebody<>(SB)
+
+// Input:
+// R0: data
+// R1: data length
+// R2: byte to find
+// R7: address to put result
+//
+// On exit:
+// R4 and R8 are clobbered
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	MOVW	$0, R8	// R8 = count of byte to search
+	CMP	$0, R1
+	B.EQ	done	// short path to handle 0-byte case
+	ADD	R0, R1	// R1 is the end of the range
+byte_loop:
+	MOVBU.P	1(R0), R4
+	CMP	R4, R2
+	ADD.EQ	$1, R8
+	CMP	R0, R1
+	B.NE	byte_loop
+done:
+	MOVW	R8, (R7)
+	RET
diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s
new file mode 100644
index 0000000..8cd703d
--- /dev/null
+++ b/src/internal/bytealg/count_arm64.s
@@ -0,0 +1,90 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	countbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R2: data len
+//   R1: byte to find
+//   R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	// R11 = count of byte to search
+	MOVD	$0, R11
+	// short path to handle 0-byte case
+	CBZ	R2, done
+	CMP	$0x20, R2
+	// jump directly to tail if length < 32
+	BLO	tail
+	ANDS	$0x1f, R0, R9
+	BEQ	chunk
+	// Work with not 32-byte aligned head
+	BIC	$0x1f, R0, R3
+	ADD	$0x20, R3
+head_loop:
+	MOVBU.P	1(R0), R5
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	SUB	$1, R2, R2
+	CMP	R0, R3
+	BNE	head_loop
+	// Work with 32-byte aligned chunks
+chunk:
+	BIC	$0x1f, R2, R9
+	// The first chunk can also be the last
+	CBZ	R9, tail
+	// R3 = end of 32-byte chunks
+	ADD	R0, R9, R3
+	MOVD	$1, R5
+	VMOV	R5, V5.B16
+	// R2 = length of tail
+	SUB	R9, R2, R2
+	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
+	VMOV	R1, V0.B16
+	// Clear the low 64-bit element of V7 and V8
+	VEOR	V7.B8, V7.B8, V7.B8
+	VEOR	V8.B8, V8.B8, V8.B8
+	// Count the target byte in 32-byte chunk
+chunk_loop:
+	VLD1.P	(R0), [V1.B16, V2.B16]
+	CMP	R0, R3
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// Clear the higher 7 bits
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	// Count lanes match the requested byte
+	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
+	VUADDLV	V6.B16, V7
+	// Accumulate the count in low 64-bit element of V8 when inside the loop
+	VADD	V7, V8
+	BNE	chunk_loop
+	VMOV	V8.D[0], R6
+	ADD	R6, R11, R11
+	CBZ	R2, done
+tail:
+	// Work with tail shorter than 32 bytes
+	MOVBU.P	1(R0), R5
+	SUB	$1, R2, R2
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	CBNZ	R2, tail
+done:
+	MOVD	R11, (R8)
+	RET
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
new file mode 100644
index 0000000..932a7c5
--- /dev/null
+++ b/src/internal/bytealg/count_generic.go
@@ -0,0 +1,27 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !arm && !arm64 && !ppc64le && !ppc64 && !riscv64 && !s390x
+
+package bytealg
+
+func Count(b []byte, c byte) int {
+	n := 0
+	for _, x := range b {
+		if x == c {
+			n++
+		}
+	}
+	return n
+}
+
+func CountString(s string, c byte) int {
+	n := 0
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			n++
+		}
+	}
+	return n
+}
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
new file mode 100644
index 0000000..90189c9
--- /dev/null
+++ b/src/internal/bytealg/count_native.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 || arm || arm64 || ppc64le || ppc64 || riscv64 || s390x
+
+package bytealg
+
+//go:noescape
+func Count(b []byte, c byte) int
+
+//go:noescape
+func CountString(s string, c byte) int
+
+// A backup implementation to use by assembly.
+func countGeneric(b []byte, c byte) int {
+	n := 0
+	for _, x := range b {
+		if x == c {
+			n++
+		}
+	}
+	return n
+}
+func countGenericString(s string, c byte) int {
+	n := 0
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			n++
+		}
+	}
+	return n
+}
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
new file mode 100644
index 0000000..2d2490b
--- /dev/null
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -0,0 +1,96 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64le || ppc64
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// R3 = byte array pointer 
+	// R4 = length
+	MOVBZ R6, R5              // R5 = byte
+	BR    countbytebody<>(SB)
+
+TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
+	// R3 = byte array pointer
+	// R4 = length
+	MOVBZ R5, R5              // R5 = byte
+	BR    countbytebody<>(SB)
+
+// R3: addr of string
+// R4: len of string
+// R5: byte to count
+// On exit:
+// R3: return value
+// endianness shouldn't matter since we are just counting and order
+// is irrelevant
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+	DCBT (R3)    // Prepare cache line.
+	MOVD R0, R18 // byte count
+	MOVD R3, R19 // Save base address for calculating the index later.
+	MOVD R4, R16
+
+	MOVD   R5, R6
+	RLDIMI $8, R6, $48, R6
+	RLDIMI $16, R6, $32, R6
+	RLDIMI $32, R6, $0, R6  // fill reg with the byte to count
+
+	VSPLTISW $3, V4     // used for shift
+	MTVRD    R6, V1     // move compare byte
+	VSPLTB   $7, V1, V1 // replicate byte across V1
+
+	CMPU   R4, $32          // Check if it's a small string (<32 bytes)
+	BLT    tail             // Jump to the small string case
+	XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
+
+cmploop:
+	LXVW4X (R3), VS32 // load bytes from string
+
+	// when the bytes match, the corresponding byte contains all 1s
+	VCMPEQUB V1, V0, V2     // compare bytes
+	VPOPCNTD V2, V3         // each double word contains its count
+	VADDUDM  V3, V5, V5     // accumulate bit count in each double word
+	ADD      $16, R3, R3    // increment pointer
+	SUB      $16, R16, R16  // remaining bytes
+	CMP      R16, $16       // at least 16 remaining?
+	BGE      cmploop
+	VSRD     V5, V4, V5     // shift by 3 to convert bits to bytes
+	VSLDOI   $8, V5, V5, V6 // get the double word values from vector
+	MFVSRD   V5, R9
+	MFVSRD   V6, R10
+	ADD      R9, R10, R9
+	ADD      R9, R18, R18
+
+tail:
+	CMP R16, $8 // 8 bytes left?
+	BLT small
+
+	MOVD    (R3), R12     // load 8 bytes
+	CMPB    R12, R6, R17  // compare bytes
+	POPCNTD R17, R15      // bit count
+	SRD     $3, R15, R15  // byte count
+	ADD     R15, R18, R18 // add to byte count
+
+next1:
+	ADD $8, R3, R3
+	SUB $8, R16, R16 // remaining bytes
+	BR  tail
+
+small:
+	CMP   $0, R16   // any remaining
+	BEQ   done
+	MOVBZ (R3), R12 // check each remaining byte
+	CMP   R12, R5
+	BNE   next2
+	ADD   $1, R18
+
+next2:
+	SUB $1, R16
+	ADD $1, R3  // inc address
+	BR  small
+
+done:
+	MOVD R18, R3    // return count
+	RET
diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s
new file mode 100644
index 0000000..d123cbd
--- /dev/null
+++ b/src/internal/bytealg/count_riscv64.s
@@ -0,0 +1,47 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
+	// X10 = b_base
+	// X11 = b_len
+	// X12 = b_cap (unused)
+	// X13 = byte to count (want in X12)
+	AND	$0xff, X13, X12
+	MOV	ZERO, X14	// count
+	ADD	X10, X11	// end
+
+loop:
+	BEQ	X10, X11, done
+	MOVBU	(X10), X15
+	ADD	$1, X10
+	BNE	X12, X15, loop
+	ADD	$1, X14
+	JMP	loop
+
+done:
+	MOV	X14, X10
+	RET
+
+TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
+	// X10 = s_base
+	// X11 = s_len
+	// X12 = byte to count
+	AND	$0xff, X12
+	MOV	ZERO, X14	// count
+	ADD	X10, X11	// end
+
+loop:
+	BEQ	X10, X11, done
+	MOVBU	(X10), X15
+	ADD	$1, X10
+	BNE	X12, X15, loop
+	ADD	$1, X14
+	JMP	loop
+
+done:
+	MOV	X14, X10
+	RET
diff --git a/src/internal/bytealg/count_s390x.s b/src/internal/bytealg/count_s390x.s
new file mode 100644
index 0000000..2a3b5c0
--- /dev/null
+++ b/src/internal/bytealg/count_s390x.s
@@ -0,0 +1,169 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// condition code masks
+#define EQ 8
+#define NE 7
+
+// register assignments
+#define R_ZERO R0
+#define R_VAL  R1
+#define R_TMP  R2
+#define R_PTR  R3
+#define R_LEN  R4
+#define R_CHAR R5
+#define R_RET  R6
+#define R_ITER R7
+#define R_CNT  R8
+#define R_MPTR R9
+
+// vector register assignments
+#define V_ZERO V0
+#define V_CHAR V1
+#define V_MASK V2
+#define V_VAL  V3
+#define V_CNT  V4
+
+// mask for trailing bytes in vector implementation
+GLOBL countbytemask<>(SB), RODATA, $16
+DATA countbytemask<>+0(SB)/8, $0x0101010101010101
+DATA countbytemask<>+8(SB)/8, $0x0101010101010101
+
+// func Count(b []byte, c byte) int
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+	LMG   b+0(FP), R_PTR, R_LEN
+	MOVBZ c+24(FP), R_CHAR
+	MOVD  $ret+32(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// func CountString(s string, c byte) int
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+	LMG   s+0(FP), R_PTR, R_LEN
+	MOVBZ c+16(FP), R_CHAR
+	MOVD  $ret+24(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// input:
+// R_PTR  = address of array of bytes
+// R_LEN  = number of bytes in array
+// R_CHAR = byte value to count zero (extended to register width)
+// R_RET  = address of return value
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
+	MOVD  $countbytemask<>(SB), R_MPTR
+	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
+	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
+	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
+	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
+
+	// Start of vector code (have vector facility).
+	//
+	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
+	// vector 'load with length' (VLL). It will be in the range [-1,14].
+	// Also replicate c across a 16-byte vector and initialize V_ZERO.
+	ANDW  $0xf, R_LEN
+	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
+	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
+	ADDW  $-1, R_LEN
+	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
+
+	// Jump to loop if we have more than 15 bytes to process.
+	CGIJ $NE, R_ITER, $0, vxchunks
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+vxchunks:
+	// Load 0x01 into every byte element in the 16-byte mask vector.
+	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
+	VZERO  V_CNT      // initial uint128 count of 0
+
+vxloop:
+	// Load input bytes in 16-byte chunks.
+	VL (R_PTR), V_VAL
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Increment input string address.
+	MOVD $16(R_PTR), R_PTR
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+	// Repeat until all 16-byte chunks are done.
+	BRCTG R_ITER, vxloop
+
+	// Skip to end if there are no trailing bytes.
+	CIJ $EQ, R_LEN, $-1, vxret
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL
+	VN    V_MASK, V_VAL, V_VAL
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+vxret:
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+novx:
+	// Start of non-vector code (the vector facility not available).
+	//
+	// Initialise counter and constant zero.
+	MOVD $0, R_CNT
+	MOVD $0, R_ZERO
+
+loop:
+	// Read 1-byte from input and compare.
+	// Note: avoid putting LOCGR in critical path.
+	MOVBZ (R_PTR), R_VAL
+	MOVD  $1, R_TMP
+	MOVD  $1(R_PTR), R_PTR
+	CMPW  R_VAL, R_CHAR
+	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
+	ADD   R_TMP, R_CNT       // accumulate 64-bit result
+
+	// Repeat until all bytes have been checked.
+	BRCTG R_LEN, loop
+
+ret:
+	MOVD R_CNT, (R_RET)
+	RET
+
+ret0:
+	MOVD $0, (R_RET)
+	RET
diff --git a/src/internal/bytealg/equal_386.s b/src/internal/bytealg/equal_386.s
new file mode 100644
index 0000000..58b3cbe
--- /dev/null
+++ b/src/internal/bytealg/equal_386.s
@@ -0,0 +1,130 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+	MOVL	a+0(FP), SI
+	MOVL	b+4(FP), DI
+	CMPL	SI, DI
+	JEQ	eq
+	MOVL	size+8(FP), BX
+	LEAL	ret+12(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB    $1, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+	MOVL    a+0(FP), SI
+	MOVL    b+4(FP), DI
+	CMPL    SI, DI
+	JEQ     eq
+	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
+	LEAL	ret+8(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB    $1, ret+8(FP)
+	RET
+
+// a in SI
+// b in DI
+// count in BX
+// address of result byte in AX
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+	CMPL	BX, $4
+	JB	small
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPL	BX, $64
+	JB	bigloop
+#ifdef GO386_softfloat
+	JMP	bigloop
+#endif
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0
+	PAND	X6, X4
+	PAND	X4, X0
+	PMOVMSKB X0, DX
+	ADDL	$64, SI
+	ADDL	$64, DI
+	SUBL	$64, BX
+	CMPL	DX, $0xffff
+	JEQ	hugeloop
+	MOVB	$0, (AX)
+	RET
+
+	// 4 bytes at a time using 32-bit register
+bigloop:
+	CMPL	BX, $4
+	JBE	leftover
+	MOVL	(SI), CX
+	MOVL	(DI), DX
+	ADDL	$4, SI
+	ADDL	$4, DI
+	SUBL	$4, BX
+	CMPL	CX, DX
+	JEQ	bigloop
+	MOVB	$0, (AX)
+	RET
+
+	// remaining 0-4 bytes
+leftover:
+	MOVL	-4(SI)(BX*1), CX
+	MOVL	-4(DI)(BX*1), DX
+	CMPL	CX, DX
+	SETEQ	(AX)
+	RET
+
+small:
+	CMPL	BX, $0
+	JEQ	equal
+
+	LEAL	0(BX*8), CX
+	NEGL	CX
+
+	MOVL	SI, DX
+	CMPB	DX, $0xfc
+	JA	si_high
+
+	// load at SI won't cross a page boundary.
+	MOVL	(SI), SI
+	JMP	si_finish
+si_high:
+	// address ends in 111111xx. Load up to bytes we want, move to correct position.
+	MOVL	-4(SI)(BX*1), SI
+	SHRL	CX, SI
+si_finish:
+
+	// same for DI.
+	MOVL	DI, DX
+	CMPB	DX, $0xfc
+	JA	di_high
+	MOVL	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVL	-4(DI)(BX*1), DI
+	SHRL	CX, DI
+di_finish:
+
+	SUBL	SI, DI
+	SHLL	CX, DI
+equal:
+	SETEQ	(AX)
+	RET
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
new file mode 100644
index 0000000..d178a33
--- /dev/null
+++ b/src/internal/bytealg/equal_amd64.s
@@ -0,0 +1,162 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
+	// AX = a    (want in SI)
+	// BX = b    (want in DI)
+	// CX = size (want in BX)
+	CMPQ	AX, BX
+	JNE	neq
+	MOVQ	$1, AX	// return 1
+	RET
+neq:
+	MOVQ	AX, SI
+	MOVQ	BX, DI
+	MOVQ	CX, BX
+	JMP	memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+	// AX = a       (want in SI)
+	// BX = b       (want in DI)
+	// 8(DX) = size (want in BX)
+	CMPQ	AX, BX
+	JNE	neq
+	MOVQ	$1, AX	// return 1
+	RET
+neq:
+	MOVQ	AX, SI
+	MOVQ	BX, DI
+	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
+	JMP	memeqbody<>(SB)
+
+// Input:
+//   a in SI
+//   b in DI
+//   count in BX
+// Output:
+//   result in AX
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+	CMPQ	BX, $8
+	JB	small
+	CMPQ	BX, $64
+	JB	bigloop
+#ifndef hasAVX2
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JE	hugeloop_avx2
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPQ	BX, $64
+	JB	bigloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0
+	PAND	X6, X4
+	PAND	X4, X0
+	PMOVMSKB X0, DX
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffff
+	JEQ	hugeloop
+	XORQ	AX, AX	// return 0
+	RET
+#endif
+
+	// 64 bytes at a time using ymm registers
+hugeloop_avx2:
+	CMPQ	BX, $64
+	JB	bigloop_avx2
+	VMOVDQU	(SI), Y0
+	VMOVDQU	(DI), Y1
+	VMOVDQU	32(SI), Y2
+	VMOVDQU	32(DI), Y3
+	VPCMPEQB	Y1, Y0, Y4
+	VPCMPEQB	Y2, Y3, Y5
+	VPAND	Y4, Y5, Y6
+	VPMOVMSKB Y6, DX
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffffffff
+	JEQ	hugeloop_avx2
+	VZEROUPPER
+	XORQ	AX, AX	// return 0
+	RET
+
+bigloop_avx2:
+	VZEROUPPER
+
+	// 8 bytes at a time using 64-bit register
+bigloop:
+	CMPQ	BX, $8
+	JBE	leftover
+	MOVQ	(SI), CX
+	MOVQ	(DI), DX
+	ADDQ	$8, SI
+	ADDQ	$8, DI
+	SUBQ	$8, BX
+	CMPQ	CX, DX
+	JEQ	bigloop
+	XORQ	AX, AX	// return 0
+	RET
+
+	// remaining 0-8 bytes
+leftover:
+	MOVQ	-8(SI)(BX*1), CX
+	MOVQ	-8(DI)(BX*1), DX
+	CMPQ	CX, DX
+	SETEQ	AX
+	RET
+
+small:
+	CMPQ	BX, $0
+	JEQ	equal
+
+	LEAQ	0(BX*8), CX
+	NEGQ	CX
+
+	CMPB	SI, $0xf8
+	JA	si_high
+
+	// load at SI won't cross a page boundary.
+	MOVQ	(SI), SI
+	JMP	si_finish
+si_high:
+	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
+	MOVQ	-8(SI)(BX*1), SI
+	SHRQ	CX, SI
+si_finish:
+
+	// same for DI.
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(BX*1), DI
+	SHRQ	CX, DI
+di_finish:
+
+	SUBQ	SI, DI
+	SHLQ	CX, DI
+equal:
+	SETEQ	AX
+	RET
diff --git a/src/internal/bytealg/equal_arm.s b/src/internal/bytealg/equal_arm.s
new file mode 100644
index 0000000..a6c4369
--- /dev/null
+++ b/src/internal/bytealg/equal_arm.s
@@ -0,0 +1,91 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-13
+	MOVW	a+0(FP), R0
+	MOVW	b+4(FP), R2
+	CMP	R0, R2
+	B.EQ	eq
+	MOVW	size+8(FP), R1
+	CMP	$0, R1
+	B.EQ	eq		// short path to handle 0-byte case
+	MOVW	$ret+12(FP), R7
+	B	memeqbody<>(SB)
+eq:
+	MOVW	$1, R0
+	MOVB	R0, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-9
+	MOVW	a+0(FP), R0
+	MOVW	b+4(FP), R2
+	CMP	R0, R2
+	B.EQ	eq
+	MOVW	4(R7), R1	// compiler stores size at offset 4 in the closure
+	CMP	$0, R1
+	B.EQ	eq		// short path to handle 0-byte case
+	MOVW	$ret+8(FP), R7
+	B	memeqbody<>(SB)
+eq:
+	MOVW	$1, R0
+	MOVB	R0, ret+8(FP)
+	RET
+
+// Input:
+// R0: data of a
+// R1: length
+// R2: data of b
+// R7: points to return value
+//
+// On exit:
+// R4, R5 and R6 are clobbered
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	$1, R1
+	B.EQ	one		// 1-byte special case for better performance
+
+	CMP	$4, R1
+	ADD	R0, R1		// R1 is the end of the range to compare
+	B.LT	byte_loop	// length < 4
+	AND	$3, R0, R6
+	CMP	$0, R6
+	B.NE	byte_loop	// unaligned a, use byte-wise compare (TODO: try to align a)
+	AND	$3, R2, R6
+	CMP	$0, R6
+	B.NE	byte_loop	// unaligned b, use byte-wise compare
+	AND	$0xfffffffc, R1, R6
+	// length >= 4
+chunk4_loop:
+	MOVW.P	4(R0), R4
+	MOVW.P	4(R2), R5
+	CMP	R4, R5
+	B.NE	notequal
+	CMP	R0, R6
+	B.NE	chunk4_loop
+	CMP	R0, R1
+	B.EQ	equal		// reached the end
+byte_loop:
+	MOVBU.P	1(R0), R4
+	MOVBU.P	1(R2), R5
+	CMP	R4, R5
+	B.NE	notequal
+	CMP	R0, R1
+	B.NE	byte_loop
+equal:
+	MOVW	$1, R0
+	MOVB	R0, (R7)
+	RET
+one:
+	MOVBU	(R0), R4
+	MOVBU	(R2), R5
+	CMP	R4, R5
+	B.EQ	equal
+notequal:
+	MOVW	$0, R0
+	MOVB	R0, (R7)
+	RET
diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s
new file mode 100644
index 0000000..d3aabba
--- /dev/null
+++ b/src/internal/bytealg/equal_arm64.s
@@ -0,0 +1,121 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+	// short path to handle 0-byte case
+	CBZ	R2, equal
+	B	memeqbody<>(SB)
+equal:
+	MOVD	$1, R0
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+	CMP	R0, R1
+	BEQ	eq
+	MOVD	8(R26), R2    // compiler stores size at offset 8 in the closure
+	CBZ	R2, eq
+	B	memeqbody<>(SB)
+eq:
+	MOVD	$1, R0
+	RET
+
+// input:
+// R0: pointer a
+// R1: pointer b
+// R2: data len
+// at return: result in R0
+TEXT memeqbody<>(SB),NOSPLIT,$0
+	CMP	$1, R2
+	// handle 1-byte special case for better performance
+	BEQ	one
+	CMP	$16, R2
+	// handle specially if length < 16
+	BLO	tail
+	BIC	$0x3f, R2, R3
+	CBZ	R3, chunk16
+	// work with 64-byte chunks
+	ADD	R3, R0, R6	// end of chunks
+chunk64_loop:
+	VLD1.P	(R0), [V0.D2, V1.D2, V2.D2, V3.D2]
+	VLD1.P	(R1), [V4.D2, V5.D2, V6.D2, V7.D2]
+	VCMEQ	V0.D2, V4.D2, V8.D2
+	VCMEQ	V1.D2, V5.D2, V9.D2
+	VCMEQ	V2.D2, V6.D2, V10.D2
+	VCMEQ	V3.D2, V7.D2, V11.D2
+	VAND	V8.B16, V9.B16, V8.B16
+	VAND	V8.B16, V10.B16, V8.B16
+	VAND	V8.B16, V11.B16, V8.B16
+	CMP	R0, R6
+	VMOV	V8.D[0], R4
+	VMOV	V8.D[1], R5
+	CBZ	R4, not_equal
+	CBZ	R5, not_equal
+	BNE	chunk64_loop
+	AND	$0x3f, R2, R2
+	CBZ	R2, equal
+chunk16:
+	// work with 16-byte chunks
+	BIC	$0xf, R2, R3
+	CBZ	R3, tail
+	ADD	R3, R0, R6	// end of chunks
+chunk16_loop:
+	LDP.P	16(R0), (R4, R5)
+	LDP.P	16(R1), (R7, R9)
+	EOR	R4, R7
+	CBNZ	R7, not_equal
+	EOR	R5, R9
+	CBNZ	R9, not_equal
+	CMP	R0, R6
+	BNE	chunk16_loop
+	AND	$0xf, R2, R2
+	CBZ	R2, equal
+tail:
+	// special compare of tail with length < 16
+	TBZ	$3, R2, lt_8
+	MOVD	(R0), R4
+	MOVD	(R1), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$8, R2, R6	// offset of the last 8 bytes
+	MOVD	(R0)(R6), R4
+	MOVD	(R1)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
+lt_8:
+	TBZ	$2, R2, lt_4
+	MOVWU	(R0), R4
+	MOVWU	(R1), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$4, R2, R6	// offset of the last 4 bytes
+	MOVWU	(R0)(R6), R4
+	MOVWU	(R1)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
+lt_4:
+	TBZ	$1, R2, lt_2
+	MOVHU.P	2(R0), R4
+	MOVHU.P	2(R1), R5
+	CMP	R4, R5
+	BNE	not_equal
+lt_2:
+	TBZ	$0, R2, equal
+one:
+	MOVBU	(R0), R4
+	MOVBU	(R1), R5
+	CMP	R4, R5
+	BNE	not_equal
+equal:
+	MOVD	$1, R0
+	RET
+not_equal:
+	MOVB	ZR, R0
+	RET
diff --git a/src/internal/bytealg/equal_generic.go b/src/internal/bytealg/equal_generic.go
new file mode 100644
index 0000000..59bdf8f
--- /dev/null
+++ b/src/internal/bytealg/equal_generic.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Equal reports whether a and b
+// are the same length and contain the same bytes.
+// A nil argument is equivalent to an empty slice.
+//
+// Equal is equivalent to bytes.Equal.
+// It is provided here for convenience,
+// because some packages cannot depend on bytes.
+func Equal(a, b []byte) bool {
+	// Neither cmd/compile nor gccgo allocates for these string conversions.
+	// There is a test for this in package bytes.
+	return string(a) == string(b)
+}
diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s
new file mode 100644
index 0000000..dcdde89
--- /dev/null
+++ b/src/internal/bytealg/equal_loong64.s
@@ -0,0 +1,52 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	REGCTXT	R29
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVV	a+0(FP), R4
+	MOVV	b+8(FP), R5
+	BEQ	R4, R5, eq
+	MOVV	size+16(FP), R6
+	ADDV	R4, R6, R7
+loop:
+	BNE	R4, R7, test
+	MOVV	$1, R4
+	MOVB	R4, ret+24(FP)
+	RET
+test:
+	MOVBU	(R4), R9
+	ADDV	$1, R4
+	MOVBU	(R5), R10
+	ADDV	$1, R5
+	BEQ	R9, R10, loop
+
+	MOVB	R0, ret+24(FP)
+	RET
+eq:
+	MOVV	$1, R4
+	MOVB	R4, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOVV	a+0(FP), R4
+	MOVV	b+8(FP), R5
+	BEQ	R4, R5, eq
+	MOVV	8(REGCTXT), R6    // compiler stores size at offset 8 in the closure
+	MOVV	R4, 8(R3)
+	MOVV	R5, 16(R3)
+	MOVV	R6, 24(R3)
+	JAL	runtime·memequal(SB)
+	MOVBU	32(R3), R4
+	MOVB	R4, ret+16(FP)
+	RET
+eq:
+	MOVV	$1, R4
+	MOVB	R4, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/equal_mips64x.s b/src/internal/bytealg/equal_mips64x.s
new file mode 100644
index 0000000..d92f225
--- /dev/null
+++ b/src/internal/bytealg/equal_mips64x.s
@@ -0,0 +1,118 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	REGCTXT	R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	size+16(FP), R3
+	ADDV	R1, R3, R4
+
+	// chunk size is 16
+	SGTU	$16, R3, R8
+	BEQ	R0, R8, chunk_entry
+
+byte_loop:
+	BNE	R1, R4, byte_test
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+byte_test:
+	MOVBU	(R1), R6
+	ADDV	$1, R1
+	MOVBU	(R2), R7
+	ADDV	$1, R2
+	BEQ	R6, R7, byte_loop
+	JMP	not_eq
+
+chunk_entry:
+	// make sure both a and b are aligned
+	OR	R1, R2, R9
+	AND	$0x7, R9
+	BNE	R0, R9, byte_loop
+	JMP	chunk_loop_1
+
+chunk_loop:
+	// chunk size is 16
+	SGTU	$16, R3, R8
+	BNE	R0, R8, chunk_tail_8
+chunk_loop_1:
+	MOVV	(R1), R6
+	MOVV	(R2), R7
+	BNE	R6, R7, not_eq
+	MOVV	8(R1), R12
+	MOVV	8(R2), R13
+	ADDV	$16, R1
+	ADDV	$16, R2
+	SUBV	$16, R3
+	BEQ	R12, R13, chunk_loop
+	JMP	not_eq
+
+chunk_tail_8:
+	AND	$8, R3, R14
+	BEQ	R0, R14, chunk_tail_4
+	MOVV	(R1), R6
+	MOVV	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$8, R1
+	ADDV	$8, R2
+
+chunk_tail_4:
+	AND	$4, R3, R14
+	BEQ	R0, R14, chunk_tail_2
+	MOVWU	(R1), R6
+	MOVWU	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$4, R1
+	ADDV	$4, R2
+
+chunk_tail_2:
+	AND	$2, R3, R14
+	BEQ	R0, R14, chunk_tail_1
+	MOVHU	(R1), R6
+	MOVHU	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$2, R1
+	ADDV	$2, R2
+
+chunk_tail_1:
+	AND	$1, R3, R14
+	BEQ	R0, R14, eq
+	MOVBU	(R1), R6
+	MOVBU	(R2), R7
+	BEQ	R6, R7, eq
+
+not_eq:
+	MOVB	R0, ret+24(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	8(REGCTXT), R3    // compiler stores size at offset 8 in the closure
+	MOVV	R1, 8(R29)
+	MOVV	R2, 16(R29)
+	MOVV	R3, 24(R29)
+	JAL	runtime·memequal(SB)
+	MOVBU	32(R29), R1
+	MOVB	R1, ret+16(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/equal_mipsx.s b/src/internal/bytealg/equal_mipsx.s
new file mode 100644
index 0000000..4c46dd4
--- /dev/null
+++ b/src/internal/bytealg/equal_mipsx.s
@@ -0,0 +1,62 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	REGCTXT	R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+	MOVW	a+0(FP), R1
+	MOVW	b+4(FP), R2
+	BEQ	R1, R2, eq
+	MOVW	size+8(FP), R3
+	ADDU	R1, R3, R4
+loop:
+	BNE	R1, R4, test
+	MOVW	$1, R1
+	MOVB	R1, ret+12(FP)
+	RET
+test:
+	MOVBU	(R1), R6
+	ADDU	$1, R1
+	MOVBU	(R2), R7
+	ADDU	$1, R2
+	BEQ	R6, R7, loop
+
+	MOVB	R0, ret+12(FP)
+	RET
+eq:
+	MOVW	$1, R1
+	MOVB	R1, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+	MOVW	a+0(FP), R1
+	MOVW	b+4(FP), R2
+	BEQ	R1, R2, eq
+	MOVW	4(REGCTXT), R3	// compiler stores size at offset 4 in the closure
+	ADDU	R1, R3, R4
+loop:
+	BNE	R1, R4, test
+	MOVW	$1, R1
+	MOVB	R1, ret+8(FP)
+	RET
+test:
+	MOVBU	(R1), R6
+	ADDU	$1, R1
+	MOVBU	(R2), R7
+	ADDU	$1, R2
+	BEQ	R6, R7, loop
+
+	MOVB	R0, ret+8(FP)
+	RET
+eq:
+	MOVW	$1, R1
+	MOVB	R1, ret+8(FP)
+	RET
diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go
new file mode 100644
index 0000000..cf3a245
--- /dev/null
+++ b/src/internal/bytealg/equal_native.go
@@ -0,0 +1,21 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "unsafe"
+
+// The declarations below generate ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+// The compiler generates calls to runtime.memequal and runtime.memequal_varlen.
+// In addition, the runtime calls runtime.memequal explicitly.
+// Those functions are implemented in this package.
+
+//go:linkname abigen_runtime_memequal runtime.memequal
+func abigen_runtime_memequal(a, b unsafe.Pointer, size uintptr) bool
+
+//go:linkname abigen_runtime_memequal_varlen runtime.memequal_varlen
+func abigen_runtime_memequal_varlen(a, b unsafe.Pointer) bool
diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s
new file mode 100644
index 0000000..f2c7cc1
--- /dev/null
+++ b/src/internal/bytealg/equal_ppc64x.s
@@ -0,0 +1,205 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// 4K (smallest case) page size offset mask for PPC64.
+#define PAGE_OFFSET 4095
+
+// TODO: At writing, ISEL and BC do not support CR bit type arguments,
+// define them here for readability.
+#define CR0LT 4*0+0
+#define CR0EQ 4*0+2
+#define CR1LT 4*1+0
+#define CR6LT 4*6+0
+
+// Likewise, the BC opcode is hard to read, and no extended
+// mnemonics are offered for these forms.
+#define BGELR_CR6 BC  4, CR6LT, (LR)
+#define BEQLR     BC 12, CR0EQ, (LR)
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+	// R3 = a
+	// R4 = b
+	// R5 = size
+	BR	memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
+	// R3 = a
+	// R4 = b
+	CMP	R3, R4
+	BEQ	eq
+	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
+	BR	memeqbody<>(SB)
+eq:
+	MOVD	$1, R3
+	RET
+
+// Do an efficient memequal for ppc64
+// R3 = s1
+// R4 = s2
+// R5 = len
+// On exit:
+// R3 = return value
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3, R8		// Move s1 into R8
+	ADD	R5, R3, R9	// &s1[len(s1)]
+	ADD	R5, R4, R10	// &s2[len(s2)]
+	MOVD	$1, R11
+	CMP	R5, $16		// Use GPR checks for check for len <= 16
+	BLE	check0_16
+	MOVD	$0, R3		// Assume no-match in case BGELR CR6 returns
+	CMP	R5, $32		// Use overlapping VSX loads for len <= 32
+	BLE	check17_32	// Do a pair of overlapping VSR compares
+	CMP	R5, $64
+	BLE	check33_64	// Hybrid check + overlap compare.
+
+setup64:
+	SRD	$6, R5, R6	// number of 64 byte chunks to compare
+	MOVD	R6, CTR
+	MOVD	$16, R14	// index for VSX loads and stores
+	MOVD	$32, R15
+	MOVD	$48, R16
+	ANDCC	$0x3F, R5, R5	// len%64==0?
+
+	PCALIGN $32
+loop64:
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC V0, V1, V2	// compare, setting CR6
+	BGELR_CR6
+	LXVD2X	(R8+R14), V0
+	LXVD2X	(R4+R14), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R15), V0
+	LXVD2X	(R4+R15), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R16), V0
+	LXVD2X	(R4+R16), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	ADD	$64,R8		// bump up to next 64
+	ADD	$64,R4
+	BDNZ	loop64
+
+	ISEL	$CR0EQ, R11, R3, R3	// If no tail, return 1, otherwise R3 remains 0.
+	BEQLR				// return if no tail.
+
+	ADD	$-64, R9, R8
+	ADD	$-64, R10, R4
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R14), V0
+	LXVD2X	(R4+R14), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R15), V0
+	LXVD2X	(R4+R15), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R16), V0
+	LXVD2X	(R4+R16), V1
+	VCMPEQUBCC	V0, V1, V2
+	ISEL	$CR6LT, R11, R0, R3
+	RET
+
+check33_64:
+	// Bytes 0-15
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	ADD	$16, R8
+	ADD	$16, R4
+
+	// Bytes 16-31
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+
+	// A little tricky, but point R4,R8 to &sx[len-32],
+	// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
+	ADD	$-32, R9, R8
+	ADD	$-32, R10, R4
+	// Fallthrough
+
+check17_32:
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	ISEL	$CR6LT, R11, R0, R5
+
+	// Load sX[len(sX)-16:len(sX)] and compare.
+	ADD	$-16, R9
+	ADD	$-16, R10
+	LXVD2X	(R9+R0), V0
+	LXVD2X	(R10+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	ISEL	$CR6LT, R5, R0, R3
+	RET
+
+check0_16:
+	CMP	R5, $8
+	BLT	check0_7
+	// Load sX[0:7] and compare.
+	MOVD	(R8), R6
+	MOVD	(R4), R7
+	CMP	R6, R7
+	ISEL	$CR0EQ, R11, R0, R5
+	// Load sX[len(sX)-8:len(sX)] and compare.
+	MOVD	-8(R9), R6
+	MOVD	-8(R10), R7
+	CMP	R6, R7
+	ISEL	$CR0EQ, R5, R0, R3
+	RET
+
+check0_7:
+	CMP	R5,$0
+	MOVD	$1, R3
+	BEQLR		// return if len == 0
+
+	// Check < 8B loads with a single compare, but select the load address
+	// such that it cannot cross a page boundary. Load a few bytes from the
+	// lower address if that does not cross the lower page. Or, load a few
+	// extra bytes from the higher addresses. And align those values
+	// consistently in register as either address may have differing
+	// alignment requirements.
+	ANDCC	$PAGE_OFFSET, R8, R6	// &sX & PAGE_OFFSET
+	ANDCC	$PAGE_OFFSET, R4, R9
+	SUBC	R5, $8, R12		// 8-len
+	SLD	$3, R12, R14		// (8-len)*8
+	CMPU	R6, R12, CR1		// Enough bytes lower in the page to load lower?
+	CMPU	R9, R12, CR0
+	SUB	R12, R8, R6		// compute lower load address
+	SUB	R12, R4, R9
+	ISEL	$CR1LT, R8, R6, R8	// R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
+	ISEL	$CR0LT, R4, R9, R4	// Similar for s2
+	MOVD	(R8), R15
+	MOVD	(R4), R16
+	SLD	R14, R15, R7
+	SLD	R14, R16, R17
+	SRD	R14, R7, R7		// Clear the upper (8-len) bytes (with 2 shifts)
+	SRD	R14, R17, R17
+	SRD	R14, R15, R6		// Clear the lower (8-len) bytes
+	SRD	R14, R16, R9
+#ifdef GOARCH_ppc64le
+	ISEL	$CR1LT, R7, R6, R8      // Choose the correct len bytes to compare based on alignment
+	ISEL	$CR0LT, R17, R9, R4
+#else
+	ISEL	$CR1LT, R6, R7, R8
+	ISEL	$CR0LT, R9, R17, R4
+#endif
+	CMP	R4, R8
+	ISEL	$CR0EQ, R11, R0, R3
+	RET
diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s
new file mode 100644
index 0000000..3834083
--- /dev/null
+++ b/src/internal/bytealg/equal_riscv64.s
@@ -0,0 +1,124 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	CTXT	S10
+
+// func memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+	// X10 = a_base
+	// X11 = b_base
+	// X12 = size
+	JMP	memequal<>(SB)
+
+// func memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
+	MOV	8(CTXT), X12    // compiler stores size at offset 8 in the closure
+	// X10 = a_base
+	// X11 = b_base
+	JMP	memequal<>(SB)
+
+// On entry X10 and X11 contain pointers, X12 contains length.
+// For non-regabi X13 contains address for return value.
+// For regabi return value in X10.
+TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0
+	BEQ	X10, X11, eq
+
+	MOV	$32, X23
+	BLT	X12, X23, loop4_check
+
+	// Check alignment - if alignment differs we have to do one byte at a time.
+	AND	$7, X10, X9
+	AND	$7, X11, X19
+	BNE	X9, X19, loop4_check
+	BEQZ	X9, loop32_check
+
+	// Check one byte at a time until we reach 8 byte alignment.
+	SUB	X9, X12, X12
+align:
+	ADD	$-1, X9
+	MOVBU	0(X10), X19
+	MOVBU	0(X11), X20
+	BNE	X19, X20, not_eq
+	ADD	$1, X10
+	ADD	$1, X11
+	BNEZ	X9, align
+
+loop32_check:
+	MOV	$32, X9
+	BLT	X12, X9, loop16_check
+loop32:
+	MOV	0(X10), X19
+	MOV	0(X11), X20
+	MOV	8(X10), X21
+	MOV	8(X11), X22
+	BNE	X19, X20, not_eq
+	BNE	X21, X22, not_eq
+	MOV	16(X10), X14
+	MOV	16(X11), X15
+	MOV	24(X10), X16
+	MOV	24(X11), X17
+	BNE	X14, X15, not_eq
+	BNE	X16, X17, not_eq
+	ADD	$32, X10
+	ADD	$32, X11
+	ADD	$-32, X12
+	BGE	X12, X9, loop32
+	BEQZ	X12, eq
+
+loop16_check:
+	MOV	$16, X23
+	BLT	X12, X23, loop4_check
+loop16:
+	MOV	0(X10), X19
+	MOV	0(X11), X20
+	MOV	8(X10), X21
+	MOV	8(X11), X22
+	BNE	X19, X20, not_eq
+	BNE	X21, X22, not_eq
+	ADD	$16, X10
+	ADD	$16, X11
+	ADD	$-16, X12
+	BGE	X12, X23, loop16
+	BEQZ	X12, eq
+
+loop4_check:
+	MOV	$4, X23
+	BLT	X12, X23, loop1
+loop4:
+	MOVBU	0(X10), X19
+	MOVBU	0(X11), X20
+	MOVBU	1(X10), X21
+	MOVBU	1(X11), X22
+	BNE	X19, X20, not_eq
+	BNE	X21, X22, not_eq
+	MOVBU	2(X10), X14
+	MOVBU	2(X11), X15
+	MOVBU	3(X10), X16
+	MOVBU	3(X11), X17
+	BNE	X14, X15, not_eq
+	BNE	X16, X17, not_eq
+	ADD	$4, X10
+	ADD	$4, X11
+	ADD	$-4, X12
+	BGE	X12, X23, loop4
+
+loop1:
+	BEQZ	X12, eq
+	MOVBU	0(X10), X19
+	MOVBU	0(X11), X20
+	BNE	X19, X20, not_eq
+	ADD	$1, X10
+	ADD	$1, X11
+	ADD	$-1, X12
+	JMP	loop1
+
+not_eq:
+	MOVB	ZERO, X10
+	RET
+eq:
+	MOV	$1, X10
+	RET
diff --git a/src/internal/bytealg/equal_s390x.s b/src/internal/bytealg/equal_s390x.s
new file mode 100644
index 0000000..67f814d
--- /dev/null
+++ b/src/internal/bytealg/equal_s390x.s
@@ -0,0 +1,92 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	size+16(FP), R6
+	LA	ret+24(FP), R7
+	BR	memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	8(R12), R6    // compiler stores size at offset 8 in the closure
+	LA	ret+16(FP), R7
+	BR	memeqbody<>(SB)
+
+// input:
+//   R3 = a
+//   R5 = b
+//   R6 = len
+//   R7 = address of output byte (stores 0 or 1 here)
+//   a and b have the same length
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, equal
+loop:
+	CMPBEQ	R6, $0, equal
+	CMPBLT	R6, $32, tiny
+	CMP	R6, $256
+	BLT	tail
+	CLC	$256, 0(R3), 0(R5)
+	BNE	notequal
+	SUB	$256, R6
+	LA	256(R3), R3
+	LA	256(R5), R5
+	BR	loop
+tail:
+	SUB	$1, R6, R8
+	EXRL	$memeqbodyclc<>(SB), R8
+	BEQ	equal
+notequal:
+	MOVB	$0, 0(R7)
+	RET
+equal:
+	MOVB	$1, 0(R7)
+	RET
+tiny:
+	MOVD	$0, R2
+	CMPBLT	R6, $16, lt16
+	MOVD	0(R3), R8
+	MOVD	0(R5), R9
+	CMPBNE	R8, R9, notequal
+	MOVD	8(R3), R8
+	MOVD	8(R5), R9
+	CMPBNE	R8, R9, notequal
+	LA	16(R2), R2
+	SUB	$16, R6
+lt16:
+	CMPBLT	R6, $8, lt8
+	MOVD	0(R3)(R2*1), R8
+	MOVD	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	8(R2), R2
+	SUB	$8, R6
+lt8:
+	CMPBLT	R6, $4, lt4
+	MOVWZ	0(R3)(R2*1), R8
+	MOVWZ	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	4(R2), R2
+	SUB	$4, R6
+lt4:
+#define CHECK(n) \
+	CMPBEQ	R6, $n, equal \
+	MOVB	n(R3)(R2*1), R8 \
+	MOVB	n(R5)(R2*1), R9 \
+	CMPBNE	R8, R9, notequal
+	CHECK(0)
+	CHECK(1)
+	CHECK(2)
+	CHECK(3)
+	BR	equal
+
+TEXT memeqbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
diff --git a/src/internal/bytealg/equal_wasm.s b/src/internal/bytealg/equal_wasm.s
new file mode 100644
index 0000000..a2b76c1
--- /dev/null
+++ b/src/internal/bytealg/equal_wasm.s
@@ -0,0 +1,77 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB), NOSPLIT, $0-25
+	Get SP
+	I64Load a+0(FP)
+	I64Load b+8(FP)
+	I64Load size+16(FP)
+	Call memeqbody<>(SB)
+	I64Store8 ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB), NOSPLIT, $0-17
+	Get SP
+	I64Load a+0(FP)
+	I64Load b+8(FP)
+	I64Load 8(CTXT) // compiler stores size at offset 8 in the closure
+	Call memeqbody<>(SB)
+	I64Store8 ret+16(FP)
+	RET
+
+// params: a, b, len
+// ret: 0/1
+TEXT memeqbody<>(SB), NOSPLIT, $0-0
+	Get R0
+	Get R1
+	I64Eq
+	If
+		I64Const $1
+		Return
+	End
+
+loop:
+	Loop
+		Get R2
+		I64Eqz
+		If
+			I64Const $1
+			Return
+		End
+
+		Get R0
+		I32WrapI64
+		I64Load8U $0
+		Get R1
+		I32WrapI64
+		I64Load8U $0
+		I64Ne
+		If
+			I64Const $0
+			Return
+		End
+
+		Get R0
+		I64Const $1
+		I64Add
+		Set R0
+
+		Get R1
+		I64Const $1
+		I64Add
+		Set R1
+
+		Get R2
+		I64Const $1
+		I64Sub
+		Set R2
+
+		Br loop
+	End
+	UNDEF
diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go
new file mode 100644
index 0000000..c7a1941
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	if cpu.X86.HasAVX2 {
+		MaxLen = 63
+	} else {
+		MaxLen = 31
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
new file mode 100644
index 0000000..0431491
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.s
@@ -0,0 +1,276 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+24(FP), R8
+	MOVQ b_len+32(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+48(FP), R11
+	JMP  indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+16(FP), R8
+	MOVQ b_len+24(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+32(FP), R11
+	JMP  indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// R8: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+	CMPQ AX, DX
+	JA fail
+	CMPQ DX, $16
+	JAE sse42
+no_sse42:
+	CMPQ AX, $2
+	JA   _3_or_more
+	MOVW (R8), R8
+	LEAQ -1(DI)(DX*1), DX
+loop2:
+	MOVW (DI), SI
+	CMPW SI,R8
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop2
+	JMP fail
+_3_or_more:
+	CMPQ AX, $3
+	JA   _4_or_more
+	MOVW 1(R8), BX
+	MOVW (R8), R8
+	LEAQ -2(DI)(DX*1), DX
+loop3:
+	MOVW (DI), SI
+	CMPW SI,R8
+	JZ   partial_success3
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+partial_success3:
+	MOVW 1(DI), SI
+	CMPW SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+_4_or_more:
+	CMPQ AX, $4
+	JA   _5_or_more
+	MOVL (R8), R8
+	LEAQ -3(DI)(DX*1), DX
+loop4:
+	MOVL (DI), SI
+	CMPL SI,R8
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop4
+	JMP fail
+_5_or_more:
+	CMPQ AX, $7
+	JA   _8_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVL -4(R8)(AX*1), BX
+	MOVL (R8), R8
+loop5to7:
+	MOVL (DI), SI
+	CMPL SI,R8
+	JZ   partial_success5to7
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+partial_success5to7:
+	MOVL -4(AX)(DI*1), SI
+	CMPL SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+_8_or_more:
+	CMPQ AX, $8
+	JA   _9_or_more
+	MOVQ (R8), R8
+	LEAQ -7(DI)(DX*1), DX
+loop8:
+	MOVQ (DI), SI
+	CMPQ SI,R8
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop8
+	JMP fail
+_9_or_more:
+	CMPQ AX, $15
+	JA   _16_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVQ -8(R8)(AX*1), BX
+	MOVQ (R8), R8
+loop9to15:
+	MOVQ (DI), SI
+	CMPQ SI,R8
+	JZ   partial_success9to15
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+partial_success9to15:
+	MOVQ -8(AX)(DI*1), SI
+	CMPQ SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+_16_or_more:
+	CMPQ AX, $16
+	JA   _17_or_more
+	MOVOU (R8), X1
+	LEAQ -15(DI)(DX*1), DX
+loop16:
+	MOVOU (DI), X2
+	PCMPEQB X1, X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop16
+	JMP fail
+_17_or_more:
+	CMPQ AX, $31
+	JA   _32_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVOU -16(R8)(AX*1), X0
+	MOVOU (R8), X1
+loop17to31:
+	MOVOU (DI), X2
+	PCMPEQB X1,X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   partial_success17to31
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+partial_success17to31:
+	MOVOU -16(AX)(DI*1), X3
+	PCMPEQB X0, X3
+	PMOVMSKB X3, SI
+	CMPQ  SI, $0xffff
+	JE success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+	CMPQ AX, $32
+	JA   _33_to_63
+	VMOVDQU (R8), Y1
+	LEAQ -31(DI)(DX*1), DX
+loop32:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop32
+	JMP fail_avx2
+_33_to_63:
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	VMOVDQU -32(R8)(AX*1), Y0
+	VMOVDQU (R8), Y1
+loop33to63:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   partial_success33to63
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+	JMP fail_avx2
+partial_success33to63:
+	VMOVDQU -32(AX)(DI*1), Y3
+	VPCMPEQB Y0, Y3, Y4
+	VPMOVMSKB Y4, SI
+	CMPL  SI, $0xffffffff
+	JE success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+fail_avx2:
+	VZEROUPPER
+fail:
+	MOVQ $-1, (R11)
+	RET
+success_avx2:
+	VZEROUPPER
+	JMP success
+sse42:
+#ifndef hasSSE42
+	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
+	JNE no_sse42
+#endif
+	CMPQ AX, $12
+	// PCMPESTRI is slower than normal compare,
+	// so using it makes sense only if we advance 4+ bytes per compare
+	// This value was determined experimentally and is the ~same
+	// on Nehalem (first with SSE42) and Haswell.
+	JAE _9_or_more
+	LEAQ 16(R8), SI
+	TESTW $0xff0, SI
+	JEQ no_sse42
+	MOVOU (R8), X1
+	LEAQ -15(DI)(DX*1), SI
+	MOVQ $16, R9
+	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
+	// for equality (bits 2,3 are 11)
+	// result is not masked or inverted (bits 4,5 are 00)
+	// and corresponds to first matching byte (bit 6 is 0)
+	PCMPESTRI $0x0c, (DI), X1
+	// CX == 16 means no match,
+	// CX > R9 means partial match at the end of the string,
+	// otherwise sep is at offset CX from X1 start
+	CMPQ CX, R9
+	JBE sse42_success
+	ADDQ R9, DI
+	CMPQ DI, SI
+	JB loop_sse42
+	PCMPESTRI $0x0c, -1(SI), X1
+	CMPQ CX, R9
+	JA fail
+	LEAQ -1(SI), DI
+sse42_success:
+	ADDQ CX, DI
+success:
+	SUBQ R10, DI
+	MOVQ DI, (R11)
+	RET
diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go
new file mode 100644
index 0000000..e87c109
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.go
@@ -0,0 +1,23 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using Index can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+	// Optimize cases where the length of the substring is less than 32 bytes
+	MaxLen = 32
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 16 characters, plus a few slop to start.
+	return 4 + n>>4
+}
diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s
new file mode 100644
index 0000000..3a551a7
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.s
@@ -0,0 +1,206 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+24(FP), R2
+	MOVD	b_len+32(FP), R3
+	MOVD	$ret+48(FP), R9
+	B	indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+16(FP), R2
+	MOVD	b_len+24(FP), R3
+	MOVD	$ret+32(FP), R9
+	B	indexbody<>(SB)
+
+// input:
+//   R0: haystack
+//   R1: length of haystack
+//   R2: needle
+//   R3: length of needle (2 <= len <= 32)
+//   R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
+	// main idea is to load 'sep' into separate register(s)
+	// to avoid repeatedly re-load it again and again
+	// for sebsequent substring comparisons
+	SUB	R3, R1, R4
+	// R4 contains the start of last substring for comparison
+	ADD	R0, R4, R4
+	ADD	$1, R0, R8
+
+	CMP	$8, R3
+	BHI	greater_8
+	TBZ	$3, R3, len_2_7
+len_8:
+	// R5 contains 8-byte of sep
+	MOVD	(R2), R5
+loop_8:
+	// R6 contains substring for comparison
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_8
+	B	found
+len_2_7:
+	TBZ	$2, R3, len_2_3
+	TBZ	$1, R3, len_4_5
+	TBZ	$0, R3, len_6
+len_7:
+	// R5 and R6 contain 7-byte of sep
+	MOVWU	(R2), R5
+	// 1-byte overlap with R5
+	MOVWU	3(R2), R6
+loop_7:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_7
+	MOVWU	2(R0), R3
+	CMP	R6, R3
+	BNE	loop_7
+	B	found
+len_6:
+	// R5 and R6 contain 6-byte of sep
+	MOVWU	(R2), R5
+	MOVHU	4(R2), R6
+loop_6:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_6
+	MOVHU	3(R0), R3
+	CMP	R6, R3
+	BNE	loop_6
+	B	found
+len_4_5:
+	TBZ	$0, R3, len_4
+len_5:
+	// R5 and R7 contain 5-byte of sep
+	MOVWU	(R2), R5
+	MOVBU	4(R2), R7
+loop_5:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_5
+	MOVBU	3(R0), R3
+	CMP	R7, R3
+	BNE	loop_5
+	B	found
+len_4:
+	// R5 contains 4-byte of sep
+	MOVWU	(R2), R5
+loop_4:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_4
+	B	found
+len_2_3:
+	TBZ	$0, R3, len_2
+len_3:
+	// R6 and R7 contain 3-byte of sep
+	MOVHU	(R2), R6
+	MOVBU	2(R2), R7
+loop_3:
+	CMP	R4, R0
+	BHI	not_found
+	MOVHU.P	1(R0), R3
+	CMP	R6, R3
+	BNE	loop_3
+	MOVBU	1(R0), R3
+	CMP	R7, R3
+	BNE	loop_3
+	B	found
+len_2:
+	// R5 contains 2-byte of sep
+	MOVHU	(R2), R5
+loop_2:
+	CMP	R4, R0
+	BHI	not_found
+	MOVHU.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_2
+found:
+	SUB	R8, R0, R0
+	MOVD	R0, (R9)
+	RET
+not_found:
+	MOVD	$-1, R0
+	MOVD	R0, (R9)
+	RET
+greater_8:
+	SUB	$9, R3, R11	// len(sep) - 9, offset of R0 for last 8 bytes
+	CMP	$16, R3
+	BHI	greater_16
+len_9_16:
+	MOVD.P	8(R2), R5	// R5 contains the first 8-byte of sep
+	SUB	$16, R3, R7	// len(sep) - 16, offset of R2 for last 8 bytes
+	MOVD	(R2)(R7), R6	// R6 contains the last 8-byte of sep
+loop_9_16:
+	// search the first 8 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R7
+	CMP	R5, R7
+	BNE	loop_9_16
+	MOVD	(R0)(R11), R7
+	CMP	R6, R7		// compare the last 8 bytes
+	BNE	loop_9_16
+	B	found
+greater_16:
+	CMP	$24, R3
+	BHI	len_25_32
+len_17_24:
+	LDP.P	16(R2), (R5, R6)	// R5 and R6 contain the first 16-byte of sep
+	SUB	$24, R3, R10		// len(sep) - 24
+	MOVD	(R2)(R10), R7		// R7 contains the last 8-byte of sep
+loop_17_24:
+	// search the first 16 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R10
+	CMP	R5, R10
+	BNE	loop_17_24
+	MOVD	7(R0), R10
+	CMP	R6, R10
+	BNE	loop_17_24
+	MOVD	(R0)(R11), R10
+	CMP	R7, R10		// compare the last 8 bytes
+	BNE	loop_17_24
+	B	found
+len_25_32:
+	LDP.P	16(R2), (R5, R6)
+	MOVD.P	8(R2), R7	// R5, R6 and R7 contain the first 24-byte of sep
+	SUB	$32, R3, R12	// len(sep) - 32
+	MOVD	(R2)(R12), R10	// R10 contains the last 8-byte of sep
+loop_25_32:
+	// search the first 24 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R12
+	CMP	R5, R12
+	BNE	loop_25_32
+	MOVD	7(R0), R12
+	CMP	R6, R12
+	BNE	loop_25_32
+	MOVD	15(R0), R12
+	CMP	R7, R12
+	BNE	loop_25_32
+	MOVD	(R0)(R11), R12
+	CMP	R10, R12	// compare the last 8 bytes
+	BNE	loop_25_32
+	B	found
diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go
new file mode 100644
index 0000000..a59e329
--- /dev/null
+++ b/src/internal/bytealg/index_generic.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+	panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+	panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	panic("unimplemented")
+}
diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go
new file mode 100644
index 0000000..6e4a2f3
--- /dev/null
+++ b/src/internal/bytealg/index_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 || arm64 || s390x || ppc64le || ppc64
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
diff --git a/src/internal/bytealg/index_ppc64x.go b/src/internal/bytealg/index_ppc64x.go
new file mode 100644
index 0000000..ab3cbe5
--- /dev/null
+++ b/src/internal/bytealg/index_ppc64x.go
@@ -0,0 +1,26 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (aix || linux) && (ppc64 || ppc64le)
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 16
+
+var SupportsPower9 = cpu.PPC64.IsPOWER9
+
+func init() {
+	MaxLen = 32
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s
new file mode 100644
index 0000000..26205ce
--- /dev/null
+++ b/src/internal/bytealg/index_ppc64x.s
@@ -0,0 +1,802 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is an implementation based on the s390x
+// implementation.
+
+// Find a separator with 2 <= len <= 32 within a string.
+// Separators with lengths of 2, 3 or 4 are handled
+// specially.
+
+// This works on power8 and above. The loads and
+// compares are done in big endian order
+// since that allows the used of VCLZD, and allows
+// the same implementation to work on big and little
+// endian platforms with minimal conditional changes.
+
+// NOTE: There is a power9 implementation that
+// improves performance by 10-15% on little
+// endian for some of the benchmarks.
+// Unrolled index2to16 loop by 4 on ppc64le/power9
+// Work is still needed for a big endian
+// implementation on power9.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Needed to swap LXVD2X loads to the correct
+// byte order to work on POWER8.
+
+#ifdef GOARCH_ppc64
+DATA byteswap<>+0(SB)/8, $0x0001020304050607
+DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
+#else
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+#endif
+
+// Load bytes in big endian order. Address
+// alignment does not need checking.
+#define VLOADSWAP(base, index, vreg, vsreg) \
+	LXVD2X (base)(index), vsreg;  \
+	VPERM  vreg, vreg, SWAP, vreg
+
+GLOBL byteswap<>+0(SB), RODATA, $16
+
+TEXT ·Index<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+	// R3 = byte array pointer
+	// R4 = length
+	MOVD R6, R5             // R5 = separator pointer
+	MOVD R7, R6             // R6 = separator length
+
+#ifdef GOARCH_ppc64le
+	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+	CMP   R7, $1
+	BNE   power8
+	BR    indexbodyp9<>(SB)
+#endif
+power8:
+	BR indexbody<>(SB)
+
+TEXT ·IndexString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// R3 = string
+	// R4 = length
+	// R5 = separator pointer
+	// R6 = separator length
+
+#ifdef GOARCH_ppc64le
+	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+	CMP   R7, $1
+	BNE   power8
+	BR    indexbodyp9<>(SB)
+
+#endif
+power8:
+	BR indexbody<>(SB)
+
+	// s: string we are searching
+	// sep: string to search for
+	// R3=&s[0], R4=len(s)
+	// R5=&sep[0], R6=len(sep)
+	// R14=&ret (index where sep found)
+	// R7=working addr of string
+	// R16=index value 16
+	// R17=index value 17
+	// R18=index value 18
+	// R19=index value 1
+	// R26=LASTBYTE of string
+	// R27=LASTSTR last start byte to compare with sep
+	// R8, R9 scratch
+	// V0=sep left justified zero fill
+	// CR4=sep length >= 16
+
+#define SEPMASK V17
+#define LASTBYTE R26
+#define LASTSTR R27
+#define ONES V20
+#define SWAP V21
+#define SWAP_ VS53
+TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
+	CMP      R6, R4                 // Compare lengths
+	BGT      notfound               // If sep len is > string, notfound
+	ADD      R4, R3, LASTBYTE       // find last byte addr
+	SUB      R6, LASTBYTE, LASTSTR  // LAST=&s[len(s)-len(sep)] (last valid start index)
+	CMP      R6, $0                 // Check sep len
+	BEQ      notfound               // sep len 0 -- not found
+	MOVD     R3, R7                 // Copy of string addr
+	MOVD     $16, R16               // Index value 16
+	MOVD     $17, R17               // Index value 17
+	MOVD     $18, R18               // Index value 18
+	MOVD     $1, R19                // Index value 1
+	MOVD     $byteswap<>+00(SB), R8
+	VSPLTISB $0xFF, ONES            // splat all 1s
+	LXVD2X   (R8)(R0), SWAP_        // Set up swap string
+
+	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
+	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
+	BGE    CR4, loadge16       // Load for len(sep) >= 16
+	SUB    R6, R16, R9         // 16-len of sep
+	SLD    $3, R9              // Set up for VSLO
+	MTVSRD R9, V9              // Set up for VSLO
+	VSLDOI $8, V9, V9, V9      // Set up for VSLO
+	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
+
+loadge16:
+	ANDCC $15, R5, R9 // Find byte offset of sep
+	ADD   R9, R6, R10 // Add sep len
+	CMP   R10, $16    // Check if sep len+offset > 16
+	BGT   sepcross16  // Sep crosses 16 byte boundary
+
+	RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
+	VLOADSWAP(R8, R0, V0, V0) // Load 16 bytes @R8 into V0
+	SLD    $3, R9          // Set up shift count for VSLO
+	MTVSRD R9, V8         // Set up shift count for VSLO
+	VSLDOI $8, V8, V8, V8
+	VSLO   V0, V8, V0      // Shift by start byte
+
+	VAND V0, SEPMASK, V0 // Mask separator (< 16)
+	BR   index2plus
+
+sepcross16:
+	VLOADSWAP(R5, R0, V0, V0)  // Load 16 bytes @R5 into V0
+
+	VAND V0, SEPMASK, V0 // mask out separator
+	BLE  CR4, index2to16
+	BR   index17plus     // Handle sep > 16
+
+index2plus:
+	CMP      R6, $2       // Check length of sep
+	BNE      index3plus   // If not 2, check for 3
+	ADD      $16, R7, R9  // Check if next 16 bytes past last
+	CMP      R9, LASTBYTE // compare with last
+	BGE      index2to16   // 2 <= len(string) <= 16
+	MOVD     $0xff00, R21 // Mask for later
+	MTVSRD   R21, V25     // Move to Vreg
+	VSPLTH   $3, V25, V31 // Splat mask
+	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
+	VSPLTISB $0, V10      // Clear V10
+
+	// First case: 2 byte separator
+	// V1: 2 byte separator splatted
+	// V2: 16 bytes at addr
+	// V4: 16 bytes at addr+1
+	// Compare 2 byte separator at start
+	// and at start+1. Use VSEL to combine
+	// those results to find the first
+	// matching start byte, returning
+	// that value when found. Loop as
+	// long as len(string) > 16
+index2loop2:
+	VLOADSWAP(R7, R19, V3, V3) // Load 16 bytes @R7+1 into V3
+
+index2loop:
+	VLOADSWAP(R7, R0, V2, V2)  // Load 16 bytes @R7 into V2
+	VCMPEQUH V1, V2, V5        // Search for sep
+	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
+	VSEL     V6, V5, V31, V7   // merge even and odd indices
+	VCLZD    V7, V18           // find index of first match
+	MFVSRD   V18, R25          // get first value
+	CMP      R25, $64          // Found if < 64
+	BLT      foundR25          // Return byte index where found
+	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
+	MFVSRD   V18, R25          // get second value
+	CMP      R25, $64          // Found if < 64
+	ADD      $64, R25          // Update byte offset
+	BLT      foundR25          // Return value
+	ADD      $16, R7           // R7+=16 Update string pointer
+	ADD      $17, R7, R9       // R9=F7+17 since loop unrolled
+	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
+	BLT      index2loop2       // If < last, continue loop
+	CMP      R7, LASTBYTE      // Compare addr+16 against last byte
+	BLT      index2to16        // If < 16 handle specially
+	VLOADSWAP(R7, R0, V3, V3) // Load 16 bytes @R7 into V3
+	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
+	BR       index2loop
+
+index3plus:
+	CMP    R6, $3       // Check if sep == 3
+	BNE    index4plus   // If not check larger
+	ADD    $19, R7, R9  // Find bytes for use in this loop
+	CMP    R9, LASTBYTE // Compare against last byte
+	BGE    index2to16   // Remaining string 2<=len<=16
+	MOVD   $0xff00, R21 // Set up mask for upcoming loop
+	MTVSRD R21, V25     // Move mask to Vreg
+	VSPLTH $3, V25, V31 // Splat mask
+	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
+	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
+
+	// Loop to process 3 byte separator.
+	// string[0:16] is in V2
+	// string[2:18] is in V3
+	// sep[0:2] splatted in V1
+	// sec[3] splatted in v8
+	// Load vectors at string, string+1
+	// and string+2. Compare string, string+1
+	// against first 2 bytes of separator
+	// splatted, and string+2 against 3rd
+	// byte splatted. Merge the results with
+	// VSEL to find the first byte of a match.
+
+	// Special handling for last 16 bytes if the
+	// string fits in 16 byte multiple.
+index3loop2:
+	MOVD     $2, R21          // Set up index for 2
+	VSPLTISB $0, V10          // Clear V10
+	VLOADSWAP(R7, R21, V3, V3)// Load 16 bytes @R7+2 into V3
+	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+	VLOADSWAP(R7, R0, V2, V2)  // Load with correct order
+	VSLDOI   $1, V2, V3, V4    // string[1:17]
+	VSLDOI   $2, V2, V3, V9    // string[2:18]
+	VCMPEQUH V1, V2, V5        // compare hw even indices
+	VCMPEQUH V1, V4, V6        // compare hw odd indices
+	VCMPEQUB V8, V9, V10       // compare 3rd to last byte
+	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
+	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
+	VCLZD    V7, V18           // Find first nonzero indexes
+	MFVSRD   V18, R25          // Move 1st doubleword
+	CMP      R25, $64          // If < 64 found
+	BLT      foundR25          // Return matching index
+	VSLDOI   $8, V18, V18, V18 // Move value
+	MFVSRD   V18, R25          // Move 2nd doubleword
+	CMP      R25, $64          // If < 64 found
+	ADD      $64, R25          // Update byte index
+	BLT      foundR25          // Return matching index
+	ADD      $16, R7           // R7+=16 string ptr
+	ADD      $19, R7, R9       // Number of string bytes for loop
+	CMP      R9, LASTBYTE      // Compare against last byte of string
+	BLT      index3loop2       // If within, continue this loop
+	CMP      R7, LASTSTR       // Compare against last start byte
+	BLT      index2to16        // Process remainder
+	VSPLTISB $0, V3            // Special case for last 16 bytes
+	BR       index3loop        // Continue this loop
+
+	// Loop to process 4 byte separator
+	// string[0:16] in V2
+	// string[3:16] in V3
+	// sep[0:4] splatted in V1
+	// Set up vectors with strings at offsets
+	// 0, 1, 2, 3 and compare against the 4 byte
+	// separator also splatted. Use VSEL with the
+	// compare results to find the first byte where
+	// a separator match is found.
+index4plus:
+	CMP  R6, $4       // Check if 4 byte separator
+	BNE  index5plus   // If not next higher
+	ADD  $20, R7, R9  // Check string size to load
+	CMP  R9, LASTBYTE // Verify string length
+	BGE  index2to16   // If not large enough, process remaining
+	MOVD $2, R15      // Set up index
+
+	// Set up masks for use with VSEL
+	MOVD   $0xff, R21        // Set up mask 0xff000000ff000000...
+	SLD    $24, R21
+	MTVSRD R21, V10
+	VSPLTW $1, V10, V29
+	VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+	MOVD   $0xffff, R21
+	SLD    $16, R21
+	MTVSRD R21, V10
+	VSPLTW $1, V10, V31      // Mask 0xffff0000ffff0000...
+	VSPLTW $0, V0, V1        // Splat 1st word of separator
+
+index4loop:
+	VLOADSWAP(R7, R0, V2, V2)   // Load 16 bytes @R7 into V2
+
+next4:
+	VSPLTISB $0, V10            // Clear
+	MOVD     $3, R9             // Number of bytes beyond 16
+	VLOADSWAP(R7, R9, V3, V3)   // Load 16 bytes @R7+3 into V3
+	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
+	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
+	VSLDOI   $3, V2, V3, V10    // V10=(V2:v3)<<3
+	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
+	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
+	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
+	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
+	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
+	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+	VSEL     V14, V13, V31, V7  // final merge
+	VCLZD    V7, V18            // Find first index for each half
+	MFVSRD   V18, R25           // Isolate value
+	CMP      R25, $64           // If < 64, found
+	BLT      foundR25           // Return found index
+	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
+	MFVSRD   V18, R25           // Isolate other value
+	CMP      R25, $64           // If < 64, found
+	ADD      $64, R25           // Update index for high doubleword
+	BLT      foundR25           // Return found index
+	ADD      $16, R7            // R7+=16 for next string
+	ADD      $20, R7, R9        // R+20 for all bytes to load
+	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
+	BLT      index4loop         // If not, continue loop
+	CMP      R7, LASTSTR        // Check remainder
+	BLE      index2to16         // Process remainder
+	BR       notfound           // Not found
+
+index5plus:
+	CMP R6, $16     // Check for sep > 16
+	BGT index17plus // Handle large sep
+
+	// Assumption is that the separator is smaller than the string at this point
+index2to16:
+	CMP R7, LASTSTR // Compare last start byte
+	BGT notfound    // last takes len(sep) into account
+
+	ADD $16, R7, R9    // Check for last byte of string
+	CMP R9, LASTBYTE
+	BGT index2to16tail
+
+	// At least 16 bytes of string left
+	// Mask the number of bytes in sep
+index2to16loop:
+	VLOADSWAP(R7, R0, V1, V1)  // Load 16 bytes @R7 into V1
+
+compare:
+	VAND       V1, SEPMASK, V2 // Mask out sep size
+	VCMPEQUBCC V0, V2, V3      // Compare masked string
+	BLT        CR6, found      // All equal
+	ADD        $1, R7          // Update ptr to next byte
+	CMP        R7, LASTSTR     // Still less than last start byte
+	BGT        notfound        // Not found
+	ADD        $16, R7, R9     // Verify remaining bytes
+	CMP        R9, LASTBYTE    // At least 16
+	BLT        index2to16loop  // Try again
+
+	// Less than 16 bytes remaining in string
+	// Separator >= 2
+index2to16tail:
+	ADD   R3, R4, R9     // End of string
+	SUB   R7, R9, R9     // Number of bytes left
+	ANDCC $15, R7, R10   // 16 byte offset
+	ADD   R10, R9, R11   // offset + len
+	CMP   R11, $16       // >= 16?
+	BLE   short          // Does not cross 16 bytes
+	VLOADSWAP(R7, R0, V1, V1) // Load 16 bytes @R7 into V1
+	BR    index2to16next // Continue on
+
+short:
+	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
+	VLOADSWAP(R9, R0, V1, V1)// Load 16 bytes @R9 into V1
+	SLD      $3, R10         // Set up shift
+	MTVSRD   R10, V8         // Set up shift
+	VSLDOI   $8, V8, V8, V8
+	VSLO     V1, V8, V1      // Shift by start byte
+	VSPLTISB $0, V25         // Clear for later use
+
+index2to16next:
+	VAND       V1, SEPMASK, V2 // Just compare size of sep
+	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
+	BLT        CR6, found      // Found
+	ADD        $1, R7          // Not found, try next partial string
+	CMP        R7, LASTSTR     // Check for end of string
+	BGT        notfound        // If at end, then not found
+	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
+	BR         index2to16next  // Check the next partial string
+
+index17plus:
+	CMP      R6, $32      // Check if 17 < len(sep) <= 32
+	BGT      index33plus
+	SUB      $16, R6, R9  // Extra > 16
+	SLD      $56, R9, R10 // Shift to use in VSLO
+	MTVSRD   R10, V9      // Set up for VSLO
+	VLOADSWAP(R5, R9, V1, V1)// Load 16 bytes @R5+R9 into V1
+	VSLO     V1, V9, V1   // Shift left
+	VSPLTISB $0xff, V7    // Splat 1s
+	VSPLTISB $0, V27      // Splat 0
+
+index17to32loop:
+	VLOADSWAP(R7, R0, V2, V2)  // Load 16 bytes @R7 into V2
+
+next17:
+	VLOADSWAP(R7, R9, V3, V3)  // Load 16 bytes @R7+R9 into V3
+	VSLO       V3, V9, V3      // Shift left
+	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
+	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
+	VAND       V4, V5, V6      // Check if both equal
+	VCMPEQUBCC V6, V7, V8      // All equal?
+	BLT        CR6, found      // Yes
+	ADD        $1, R7          // On to next byte
+	CMP        R7, LASTSTR     // Check if last start byte
+	BGT        notfound        // If too high, not found
+	BR         index17to32loop // Continue
+
+notfound:
+	MOVD $-1, R3   // Return -1 if not found
+	RET
+
+index33plus:
+	MOVD $0, (R0) // Case not implemented
+	RET           // Crash before return
+
+foundR25:
+	SRD  $3, R25   // Convert from bits to bytes
+	ADD  R25, R7   // Add to current string address
+	SUB  R3, R7    // Subtract from start of string
+	MOVD R7, R3    // Return byte where found
+	RET
+
+found:
+	SUB  R3, R7    // Return byte where found
+	MOVD R7, R3
+	RET
+
+TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
+	CMP      R6, R4                // Compare lengths
+	BGT      notfound              // If sep len is > string, notfound
+	ADD      R4, R3, LASTBYTE      // find last byte addr
+	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
+	CMP      R6, $0                // Check sep len
+	BEQ      notfound              // sep len 0 -- not found
+	MOVD     R3, R7                // Copy of string addr
+	MOVD     $16, R16              // Index value 16
+	MOVD     $17, R17              // Index value 17
+	MOVD     $18, R18              // Index value 18
+	MOVD     $1, R19               // Index value 1
+	VSPLTISB $0xFF, ONES           // splat all 1s
+
+	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
+	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
+	BGE    CR4, loadge16       // Load for len(sep) >= 16
+	SUB    R6, R16, R9         // 16-len of sep
+	SLD    $3, R9              // Set up for VSLO
+	MTVSRD R9, V9              // Set up for VSLO
+	VSLDOI $8, V9, V9, V9      // Set up for VSLO
+	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
+
+loadge16:
+	ANDCC $15, R5, R9 // Find byte offset of sep
+	ADD   R9, R6, R10 // Add sep len
+	CMP   R10, $16    // Check if sep len+offset > 16
+	BGT   sepcross16  // Sep crosses 16 byte boundary
+
+	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
+	LXVB16X (R8)(R0), V0    // Load 16 bytes @R8 into V0
+	SLD     $3, R9          // Set up shift count for VSLO
+	MTVSRD  R9, V8          // Set up shift count for VSLO
+	VSLDOI  $8, V8, V8, V8
+	VSLO    V0, V8, V0      // Shift by start byte
+
+	VAND V0, SEPMASK, V0 // Mask separator (< 16)
+	BR   index2plus
+
+sepcross16:
+	LXVB16X (R5)(R0), V0 // Load 16 bytes @R5 into V0
+
+	VAND V0, SEPMASK, V0 // mask out separator
+	BLE  CR4, index2to16
+	BR   index17plus     // Handle sep > 16
+
+index2plus:
+	CMP      R6, $2       // Check length of sep
+	BNE      index3plus   // If not 2, check for 3
+	ADD      $16, R7, R9  // Check if next 16 bytes past last
+	CMP      R9, LASTBYTE // compare with last
+	BGE      index2to16   // 2 <= len(string) <= 16
+	MOVD     $0xff00, R21 // Mask for later
+	MTVSRD   R21, V25     // Move to Vreg
+	VSPLTH   $3, V25, V31 // Splat mask
+	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
+	VSPLTISB $0, V10      // Clear V10
+
+	// First case: 2 byte separator
+	// V1: 2 byte separator splatted
+	// V2: 16 bytes at addr
+	// V4: 16 bytes at addr+1
+	// Compare 2 byte separator at start
+	// and at start+1. Use VSEL to combine
+	// those results to find the first
+	// matching start byte, returning
+	// that value when found. Loop as
+	// long as len(string) > 16
+index2loop2:
+	LXVB16X (R7)(R19), V3  // Load 16 bytes @R7+1 into V3
+
+index2loop:
+	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7 into V2
+	VCMPEQUH V1, V2, V5      // Search for sep
+	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
+	VSEL     V6, V5, V31, V7 // merge even and odd indices
+	VCLZD    V7, V18         // find index of first match
+	MFVSRD   V18, R25        // get first value
+	CMP      R25, $64        // Found if < 64
+	BLT      foundR25        // Return byte index where found
+
+	MFVSRLD V18, R25        // get second value
+	CMP     R25, $64        // Found if < 64
+	ADD     $64, R25        // Update byte offset
+	BLT     foundR25        // Return value
+	ADD     $16, R7         // R7+=16 Update string pointer
+	ADD     $17, R7, R9     // R9=F7+17 since loop unrolled
+	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
+	BLT     index2loop2     // If < last, continue loop
+	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
+	BLT     index2to16      // If < 16 handle specially
+	LXVB16X (R7)(R0), V3    // Load 16 bytes @R7 into V3
+	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
+	BR      index2loop
+
+index3plus:
+	CMP    R6, $3       // Check if sep == 3
+	BNE    index4plus   // If not check larger
+	ADD    $19, R7, R9  // Find bytes for use in this loop
+	CMP    R9, LASTBYTE // Compare against last byte
+	BGE    index2to16   // Remaining string 2<=len<=16
+	MOVD   $0xff00, R21 // Set up mask for upcoming loop
+	MTVSRD R21, V25     // Move mask to Vreg
+	VSPLTH $3, V25, V31 // Splat mask
+	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
+	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
+
+	// Loop to process 3 byte separator.
+	// string[0:16] is in V2
+	// string[2:18] is in V3
+	// sep[0:2] splatted in V1
+	// sec[3] splatted in v8
+	// Load vectors at string, string+1
+	// and string+2. Compare string, string+1
+	// against first 2 bytes of separator
+	// splatted, and string+2 against 3rd
+	// byte splatted. Merge the results with
+	// VSEL to find the first byte of a match.
+
+	// Special handling for last 16 bytes if the
+	// string fits in 16 byte multiple.
+index3loop2:
+	MOVD     $2, R21          // Set up index for 2
+	VSPLTISB $0, V10          // Clear V10
+	LXVB16X  (R7)(R21), V3    // Load 16 bytes @R7+2 into V3
+	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+	LXVB16X  (R7)(R0), V2    // Load 16 bytes @R7
+	VSLDOI   $1, V2, V3, V4  // string[1:17]
+	VSLDOI   $2, V2, V3, V9  // string[2:18]
+	VCMPEQUH V1, V2, V5      // compare hw even indices
+	VCMPEQUH V1, V4, V6      // compare hw odd indices
+	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
+	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
+	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
+	VCLZD    V7, V18         // Find first nonzero indexes
+	MFVSRD   V18, R25        // Move 1st doubleword
+	CMP      R25, $64        // If < 64 found
+	BLT      foundR25        // Return matching index
+
+	MFVSRLD  V18, R25     // Move 2nd doubleword
+	CMP      R25, $64     // If < 64 found
+	ADD      $64, R25     // Update byte index
+	BLT      foundR25     // Return matching index
+	ADD      $16, R7      // R7+=16 string ptr
+	ADD      $19, R7, R9  // Number of string bytes for loop
+	CMP      R9, LASTBYTE // Compare against last byte of string
+	BLT      index3loop2  // If within, continue this loop
+	CMP      R7, LASTSTR  // Compare against last start byte
+	BLT      index2to16   // Process remainder
+	VSPLTISB $0, V3       // Special case for last 16 bytes
+	BR       index3loop   // Continue this loop
+
+	// Loop to process 4 byte separator
+	// string[0:16] in V2
+	// string[3:16] in V3
+	// sep[0:4] splatted in V1
+	// Set up vectors with strings at offsets
+	// 0, 1, 2, 3 and compare against the 4 byte
+	// separator also splatted. Use VSEL with the
+	// compare results to find the first byte where
+	// a separator match is found.
+index4plus:
+	CMP  R6, $4       // Check if 4 byte separator
+	BNE  index5plus   // If not next higher
+	ADD  $20, R7, R9  // Check string size to load
+	CMP  R9, LASTBYTE // Verify string length
+	BGE  index2to16   // If not large enough, process remaining
+
+	// Set up masks for use with VSEL
+	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
+	SLD     $24, R21
+	MTVSRWS R21, V29
+
+	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+	MOVD    $0xffff, R21
+	SLD     $16, R21
+	MTVSRWS R21, V31
+
+	VSPLTW $0, V0, V1 // Splat 1st word of separator
+
+index4loop:
+	LXVB16X (R7)(R0), V2  // Load 16 bytes @R7 into V2
+
+next4:
+	VSPLTISB $0, V10            // Clear
+	MOVD     $3, R9             // Number of bytes beyond 16
+	LXVB16X  (R7)(R9), V3       // Load 16 bytes @R7 into V3
+	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
+	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
+	VSLDOI   $3, V2, V3, V10    // V10=(V2:v3)<<3
+	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
+	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
+	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
+	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
+	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
+	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+	VSEL     V14, V13, V31, V7  // final merge
+	VCLZD    V7, V18            // Find first index for each half
+	MFVSRD   V18, R25           // Isolate value
+	CMP      R25, $64           // If < 64, found
+	BLT      foundR25           // Return found index
+
+	MFVSRLD V18, R25     // Isolate other value
+	CMP     R25, $64     // If < 64, found
+	ADD     $64, R25     // Update index for high doubleword
+	BLT     foundR25     // Return found index
+	ADD     $16, R7      // R7+=16 for next string
+	ADD     $20, R7, R9  // R+20 for all bytes to load
+	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
+	BLT     index4loop   // If not, continue loop
+	CMP     R7, LASTSTR  // Check remainder
+	BLE     index2to16   // Process remainder
+	BR      notfound     // Not found
+
+index5plus:
+	CMP R6, $16     // Check for sep > 16
+	BGT index17plus // Handle large sep
+
+	// Assumption is that the separator is smaller than the string at this point
+index2to16:
+	CMP R7, LASTSTR // Compare last start byte
+	BGT notfound    // last takes len(sep) into account
+
+	ADD $19, R7, R9    // To check 4 indices per iteration, need at least 16+3 bytes
+	CMP R9, LASTBYTE
+	// At least 16 bytes of string left
+	// Mask the number of bytes in sep
+	VSPLTISB $0, V10            // Clear
+	BGT index2to16tail
+
+	MOVD     $3, R17            // Number of bytes beyond 16
+	PCALIGN  $32
+index2to16loop:
+	LXVB16X  (R7)(R0), V1       // Load next 16 bytes of string into V1 from R7
+	LXVB16X  (R7)(R17), V5      // Load next 16 bytes of string into V5 from R7+3
+
+	VSLDOI   $13, V5, V10, V2  // Shift left last 3 bytes
+	VSLDOI  $1, V1, V2, V3     // V3=(V1:V2)<<1
+	VSLDOI  $2, V1, V2, V4     // V4=(V1:V2)<<2
+	VAND    V1, SEPMASK, V8    // Mask out sep size 0th index
+	VAND    V3, SEPMASK, V9    // Mask out sep size 1st index
+	VAND    V4, SEPMASK, V11   // Mask out sep size 2nd index
+	VAND    V5, SEPMASK, V12   // Mask out sep size 3rd index
+	VCMPEQUBCC      V0, V8, V8 // compare masked string
+	BLT     CR6, found         // All equal while comparing 0th index
+	VCMPEQUBCC      V0, V9, V9 // compare masked string
+	BLT     CR6, found2        // All equal while comparing 1st index
+	VCMPEQUBCC      V0, V11, V11    // compare masked string
+	BLT     CR6, found3        // All equal while comparing 2nd index
+	VCMPEQUBCC      V0, V12, V12    // compare masked string
+	BLT     CR6, found4        // All equal while comparing 3rd index
+
+	ADD        $4, R7          // Update ptr to next 4 bytes
+	CMP        R7, LASTSTR     // Still less than last start byte
+	BGT        notfound        // Not found
+	ADD        $19, R7, R9     // Verify remaining bytes
+	CMP        R9, LASTBYTE    // length of string at least 19
+	BLE        index2to16loop  // Try again, else do post processing and jump to index2to16next
+
+	// <19 bytes left, post process the remaining string
+index2to16tail:
+	ADD     R3, R4, R9         // End of string
+	SUB     R7, R9, R9         // Number of bytes left
+	ANDCC   $15, R7, R10       // 16 byte offset
+	ADD     R10, R9, R11       // offset + len
+	CMP     R11, $16           // >= 16?
+	BLE     short              // Does not cross 16 bytes
+	LXVB16X (R7)(R0), V1       // Load 16 bytes @R7 into V1
+	CMP     R9, $16            // Post-processing of unrolled loop
+	BLE     index2to16next     // continue to index2to16next if <= 16 bytes
+	SUB     R16, R9, R10       // R9 should be 18 or 17 hence R10 is 1 or 2
+	LXVB16X (R7)(R10), V9
+	CMP     R10, $1            // string length is 17, compare 1 more byte
+	BNE     extra2             // string length is 18, compare 2 more bytes
+	VSLDOI  $15, V9, V10, V25
+	VAND       V1, SEPMASK, V2 // Just compare size of sep
+	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
+	BLT        CR6, found      // Found
+	ADD        $1, R7          // Not found, try next partial string
+	CMP        R7, LASTSTR     // Check for end of string
+	BGT        notfound        // If at end, then not found
+	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
+	BR         index2to16next  // go to remainder loop
+extra2:
+	VSLDOI  $14, V9, V10, V25
+	VAND       V1, SEPMASK, V2 // Just compare size of sep
+	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
+	BLT        CR6, found      // Found
+	ADD        $1, R7          // Not found, try next partial string
+	CMP        R7, LASTSTR     // Check for end of string
+	BGT        notfound        // If at end, then not found
+	VOR        V1, V1, V4      // save remaining string
+	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte for 17th byte
+	VAND       V1, SEPMASK, V2 // Just compare size of sep
+	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
+	BLT        CR6, found      // Found
+	ADD        $1, R7          // Not found, try next partial string
+	CMP        R7, LASTSTR     // Check for end of string
+	BGT        notfound        // If at end, then not found
+	VSLDOI     $2, V4, V25, V1 // Shift saved string left by 2 bytes for 18th byte
+	BR         index2to16next  // Check the remaining partial string in index2to16next
+
+short:
+	RLDICR   $0, R7, $59, R9   // Adjust addr to 16 byte container
+	LXVB16X  (R9)(R0), V1      // Load 16 bytes @R9 into V1
+	SLD      $3, R10           // Set up shift
+	MTVSRD   R10, V8           // Set up shift
+	VSLDOI   $8, V8, V8, V8
+	VSLO     V1, V8, V1        // Shift by start byte
+	PCALIGN  $32
+index2to16next:
+	VAND       V1, SEPMASK, V2 // Just compare size of sep
+	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
+	BLT        CR6, found      // Found
+	ADD        $1, R7          // Not found, try next partial string
+	CMP        R7, LASTSTR     // Check for end of string
+	BGT        notfound        // If at end, then not found
+	VSLDOI     $1, V1, V10, V1 // Shift string left by 1 byte
+	BR         index2to16next  // Check the next partial string
+
+index17plus:
+	CMP      R6, $32       // Check if 17 < len(sep) <= 32
+	BGT      index33plus
+	SUB      $16, R6, R9   // Extra > 16
+	SLD      $56, R9, R10  // Shift to use in VSLO
+	MTVSRD   R10, V9       // Set up for VSLO
+	LXVB16X  (R5)(R9), V1  // Load 16 bytes @R5+R9 into V1
+	VSLO     V1, V9, V1    // Shift left
+	VSPLTISB $0xff, V7     // Splat 1s
+	VSPLTISB $0, V27       // Splat 0
+
+index17to32loop:
+	LXVB16X (R7)(R0), V2  // Load 16 bytes @R7 into V2
+
+next17:
+	LXVB16X    (R7)(R9), V3    // Load 16 bytes @R7+R9 into V3
+	VSLO       V3, V9, V3      // Shift left
+	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
+	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
+	VAND       V4, V5, V6      // Check if both equal
+	VCMPEQUBCC V6, V7, V8      // All equal?
+	BLT        CR6, found      // Yes
+	ADD        $1, R7          // On to next byte
+	CMP        R7, LASTSTR     // Check if last start byte
+	BGT        notfound        // If too high, not found
+	BR         index17to32loop // Continue
+
+notfound:
+	MOVD $-1, R3   // Return -1 if not found
+	RET
+
+index33plus:
+	MOVD $0, (R0) // Case not implemented
+	RET           // Crash before return
+
+foundR25:
+	SRD  $3, R25   // Convert from bits to bytes
+	ADD  R25, R7   // Add to current string address
+	SUB  R3, R7    // Subtract from start of string
+	MOVD R7, R3    // Return byte where found
+	RET
+found4:
+	ADD $1, R7     // found from unrolled loop at index 3
+found3:
+	ADD $1, R7     // found from unrolled loop at index 2
+found2:
+	ADD $1, R7     // found from unrolled loop at index 1
+found:                 // found at index 0
+	SUB  R3, R7    // Return byte where found
+	MOVD R7, R3
+	RET
diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go
new file mode 100644
index 0000000..9340cf1
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	// Note: we're kind of lucky that this flag is available at this point.
+	// The runtime sets HasVX when processing auxv records, and that happens
+	// to happen *before* running the init functions of packages that
+	// the runtime depends on.
+	// TODO: it would really be nicer for internal/cpu to figure out this
+	// flag by itself. Then we wouldn't need to depend on quirks of
+	// early startup initialization order.
+	if cpu.S390X.HasVX {
+		MaxLen = 64
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s
new file mode 100644
index 0000000..491d5bc
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.s
@@ -0,0 +1,216 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+48(FP), R5
+	BR	indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+32(FP), R5
+	BR	indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBGT	R4, R2, notfound
+	ADD	R1, R2
+	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+	CMPBEQ	R4, $0, notfound
+	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
+	VLL	R4, (R3), V0 // contains first 16 bytes of sep
+	MOVD	R1, R7
+index2plus:
+	CMPBNE	R4, $1, index3plus
+	MOVD	$15(R7), R9
+	CMPBGE	R9, R2, index2to16
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	CMPBGE	R9, R2, index2to16
+index2loop:
+	VL	0(R7), V2          // 16 bytes, even indices
+	VL	1(R7), V4          // 16 bytes, odd indices
+	VCEQH	V1, V2, V5         // compare even indices
+	VCEQH	V1, V4, V6         // compare odd indices
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index3plus:
+	CMPBNE	R4, $2, index4plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$1, R0
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	VREPB	$2, V0, V8
+index3loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 2-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
+	VCEQH	V1, V2, V5         // compare 2-byte even indices
+	VCEQH	V1, V4, V6         // compare 2-byte odd indices
+	VCEQB	V8, V9, V10        // compare last bytes
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VN	V7, V10, V7        // AND indices with last byte
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index4plus:
+	CMPBNE	R4, $3, index5plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$2, R0
+	VGBM	$0x8888, V29       // 0xff000000ff000000...
+	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
+	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
+	VONE	V16
+	VREPF	$0, V0, V1
+index4loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 3-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<1
+	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<1
+	VCEQF	V1, V2, V5         // compare index 0, 4, ...
+	VCEQF	V1, V4, V6         // compare index 1, 5, ...
+	VCEQF	V1, V9, V11        // compare index 2, 6, ...
+	VCEQF	V1, V10, V12       // compare index 3, 7, ...
+	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
+	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+	VSEL	V13, V14, V31, V7  // final merge
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index5plus:
+	CMPBGT	R4, $15, index17plus
+index2to16:
+	CMPBGT	R7, R2, notfound
+	MOVD	$1(R7), R8
+	CMPBGT	R8, R2, index2to16tail
+index2to16loop:
+	// unrolled 2x
+	VLL	R4, (R7), V1
+	VLL	R4, 1(R7), V2
+	VCEQGS	V0, V1, V3
+	BEQ	found
+	MOVD	$1(R7), R7
+	VCEQGS	V0, V2, V4
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLT	R7, R2, index2to16loop
+	CMPBGT	R7, R2, notfound
+index2to16tail:
+	VLL	R4, (R7), V1
+	VCEQGS	V0, V1, V2
+	BEQ	found
+	BR	notfound
+
+index17plus:
+	CMPBGT	R4, $31, index33plus
+	SUB	$16, R4, R0
+	VLL	R0, 16(R3), V1
+	VONE	V7
+index17to32loop:
+	VL	(R7), V2
+	VLL	R0, 16(R7), V3
+	VCEQG	V0, V2, V4
+	VCEQG	V1, V3, V5
+	VN	V4, V5, V6
+	VCEQGS	V6, V7, V8
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index17to32loop
+	BR	notfound
+
+index33plus:
+	CMPBGT	R4, $47, index49plus
+	SUB	$32, R4, R0
+	VL	16(R3), V1
+	VLL	R0, 32(R3), V2
+	VONE	V11
+index33to48loop:
+	VL	(R7), V3
+	VL	16(R7), V4
+	VLL	R0, 32(R7), V5
+	VCEQG	V0, V3, V6
+	VCEQG	V1, V4, V7
+	VCEQG	V2, V5, V8
+	VN	V6, V7, V9
+	VN	V8, V9, V10
+	VCEQGS	V10, V11, V12
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index33to48loop
+	BR	notfound
+
+index49plus:
+	CMPBGT	R4, $63, index65plus
+	SUB	$48, R4, R0
+	VL	16(R3), V1
+	VL	32(R3), V2
+	VLL	R0, 48(R3), V3
+	VONE	V15
+index49to64loop:
+	VL	(R7), V4
+	VL	16(R7), V5
+	VL	32(R7), V6
+	VLL	R0, 48(R7), V7
+	VCEQG	V0, V4, V8
+	VCEQG	V1, V5, V9
+	VCEQG	V2, V6, V10
+	VCEQG	V3, V7, V11
+	VN	V8, V9, V12
+	VN	V10, V11, V13
+	VN	V12, V13, V14
+	VCEQGS	V14, V15, V16
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index49to64loop
+notfound:
+	MOVD	$-1, (R5)
+	RET
+
+index65plus:
+	// not implemented
+	MOVD	$0, (R0)
+	RET
+
+foundV17: // index is in doubleword V17[0]
+	VLGVG	$0, V17, R8
+	ADD	R8, R7
+found:
+	SUB	R1, R7
+	MOVD	R7, (R5)
+	RET
diff --git a/src/internal/bytealg/indexbyte_386.s b/src/internal/bytealg/indexbyte_386.s
new file mode 100644
index 0000000..8a03054
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_386.s
@@ -0,0 +1,34 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVL	b_base+0(FP), SI
+	MOVL	b_len+4(FP), CX
+	MOVB	c+12(FP), AL
+	MOVL	SI, DI
+	CLD; REPN; SCASB
+	JZ 3(PC)
+	MOVL	$-1, ret+16(FP)
+	RET
+	SUBL	SI, DI
+	SUBL	$1, DI
+	MOVL	DI, ret+16(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVL	s_base+0(FP), SI
+	MOVL	s_len+4(FP), CX
+	MOVB	c+8(FP), AL
+	MOVL	SI, DI
+	CLD; REPN; SCASB
+	JZ 3(PC)
+	MOVL	$-1, ret+12(FP)
+	RET
+	SUBL	SI, DI
+	SUBL	$1, DI
+	MOVL	DI, ret+12(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
new file mode 100644
index 0000000..1ca70e3
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -0,0 +1,149 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT	·IndexByte(SB), NOSPLIT, $0-40
+	MOVQ b_base+0(FP), SI
+	MOVQ b_len+8(FP), BX
+	MOVB c+24(FP), AL
+	LEAQ ret+32(FP), R8
+	JMP  indexbytebody<>(SB)
+
+TEXT	·IndexByteString(SB), NOSPLIT, $0-32
+	MOVQ s_base+0(FP), SI
+	MOVQ s_len+8(FP), BX
+	MOVB c+16(FP), AL
+	LEAQ ret+24(FP), R8
+	JMP  indexbytebody<>(SB)
+
+// input:
+//   SI: data
+//   BX: data len
+//   AL: byte sought
+//   R8: address to put result
+TEXT	indexbytebody<>(SB), NOSPLIT, $0
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
+
+	CMPQ BX, $16
+	JLT small
+
+	MOVQ SI, DI
+
+	CMPQ BX, $32
+	JA avx2
+sse:
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
+	PMOVMSKB X1, DX
+	// Find first set bit, if any.
+	BSFL	DX, DX
+	JNZ	ssesuccess
+	// Advance to next block.
+	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JB	sseloop
+
+	// Search the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched, but that's ok.
+	MOVQ	AX, DI
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	BSFL	DX, DX
+	JNZ	ssesuccess
+
+failure:
+	MOVQ $-1, (R8)
+	RET
+
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+	SUBQ SI, DI	// Compute offset of chunk within data.
+	ADDQ DX, DI	// Add offset of byte within chunk.
+	MOVQ DI, (R8)
+	RET
+
+// handle for lengths < 16
+small:
+	TESTQ	BX, BX
+	JEQ	failure
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	MOVOU	(SI), X1 // Load data
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	CMPL	DX, BX
+	JAE	failure	// Match is past end of data.
+	MOVQ	DX, (R8)
+	RET
+
+endofpage:
+	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	MOVL	BX, CX
+	SHLL	CX, DX
+	SHRL	$16, DX	// Shift desired bits down to bottom of register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	MOVQ	DX, (R8)
+	RET
+
+avx2:
+#ifndef hasAVX2
+	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE sse
+#endif
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, Y1
+avx2_loop:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLT avx2_loop
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	VZEROUPPER
+	MOVQ $-1, (R8)
+	RET
+
+avx2success:
+	VPMOVMSKB Y3, DX
+	BSFL DX, DX
+	SUBQ SI, DI
+	ADDQ DI, DX
+	MOVQ DX, (R8)
+	VZEROUPPER
+	RET
diff --git a/src/internal/bytealg/indexbyte_arm.s b/src/internal/bytealg/indexbyte_arm.s
new file mode 100644
index 0000000..faf9797
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm.s
@@ -0,0 +1,46 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R0
+	MOVW	b_len+4(FP), R1
+	MOVBU	c+12(FP), R2	// byte to find
+	MOVW	$ret+16(FP), R5
+	B	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R0
+	MOVW	s_len+4(FP), R1
+	MOVBU	c+8(FP), R2	// byte to find
+	MOVW	$ret+12(FP), R5
+	B	indexbytebody<>(SB)
+
+// input:
+//  R0: data
+//  R1: data length
+//  R2: byte to find
+//  R5: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0-0
+	MOVW	R0, R4		// store base for later
+	ADD	R0, R1		// end
+
+loop:
+	CMP	R0, R1
+	B.EQ	notfound
+	MOVBU.P	1(R0), R3
+	CMP	R2, R3
+	B.NE	loop
+
+	SUB	$1, R0		// R0 will be one beyond the position we want
+	SUB	R4, R0		// remove base
+	MOVW	R0, (R5)
+	RET
+
+notfound:
+	MOVW	$-1, R0
+	MOVW	R0, (R5)
+	RET
diff --git a/src/internal/bytealg/indexbyte_arm64.s b/src/internal/bytealg/indexbyte_arm64.s
new file mode 100644
index 0000000..40843fb
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm64.s
@@ -0,0 +1,126 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	indexbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R1: byte to search
+//   R2: data len
+//   R8: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0
+	// Core algorithm:
+	// For each 32-byte chunk we calculate a 64-bit syndrome value,
+	// with two bits per byte. For each tuple, bit 0 is set if the
+	// relevant byte matched the requested character and bit 1 is
+	// not used (faster than using a 32bit syndrome). Since the bits
+	// in the syndrome reflect exactly the order in which things occur
+	// in the original string, counting trailing zeros allows to
+	// identify exactly which byte has matched.
+
+	CBZ	R2, fail
+	MOVD	R0, R11
+	// Magic constant 0x40100401 allows us to identify
+	// which lane matches the requested byte.
+	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
+	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
+	MOVD	$0x40100401, R5
+	VMOV	R1, V0.B16
+	// Work with aligned 32-byte chunks
+	BIC	$0x1f, R0, R3
+	VMOV	R5, V5.S4
+	ANDS	$0x1f, R0, R9
+	AND	$0x1f, R2, R10
+	BEQ	loop
+
+	// Input string is not 32-byte aligned. We calculate the
+	// syndrome value for the aligned 32 bytes block containing
+	// the first bytes and mask off the irrelevant part.
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUB	$0x20, R9, R4
+	ADDS	R4, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
+	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
+	VMOV	V6.D[0], R6
+	// Clear the irrelevant lower bits
+	LSL	$1, R9, R4
+	LSR	R4, R6, R6
+	LSL	R4, R6, R6
+	// The first block can also be the last
+	BLS	masklast
+	// Have we found something already?
+	CBNZ	R6, tail
+
+loop:
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUBS	$0x20, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// If we're out of data we finish regardless of the result
+	BLS	end
+	// Use a fast check for the termination condition
+	VORR	V4.B16, V3.B16, V6.B16
+	VADDP	V6.D2, V6.D2, V6.D2
+	VMOV	V6.D[0], R6
+	// We're not out of data, loop if we haven't found the character
+	CBZ	R6, loop
+
+end:
+	// Termination condition found, let's calculate the syndrome value
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16
+	VADDP	V6.B16, V6.B16, V6.B16
+	VMOV	V6.D[0], R6
+	// Only do the clear for the last possible block with less than 32 bytes
+	// Condition flags come from SUBS in the loop
+	BHS	tail
+
+masklast:
+	// Clear the irrelevant upper bits
+	ADD	R9, R10, R4
+	AND	$0x1f, R4, R4
+	SUB	$0x20, R4, R4
+	NEG	R4<<1, R4
+	LSL	R4, R6, R6
+	LSR	R4, R6, R6
+
+tail:
+	// Check that we have found a character
+	CBZ	R6, fail
+	// Count the trailing zeros using bit reversing
+	RBIT	R6, R6
+	// Compensate the last post-increment
+	SUB	$0x20, R3, R3
+	// And count the leading zeros
+	CLZ	R6, R6
+	// R6 is twice the offset into the fragment
+	ADD	R6>>1, R3, R0
+	// Compute the offset result
+	SUB	R11, R0, R0
+	MOVD	R0, (R8)
+	RET
+
+fail:
+	MOVD	$-1, R0
+	MOVD	R0, (R8)
+	RET
diff --git a/src/internal/bytealg/indexbyte_generic.go b/src/internal/bytealg/indexbyte_generic.go
new file mode 100644
index 0000000..b89d34f
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_generic.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !mips && !mipsle && !mips64 && !mips64le && !riscv64 && !wasm
+
+package bytealg
+
+func IndexByte(b []byte, c byte) int {
+	for i, x := range b {
+		if x == c {
+			return i
+		}
+	}
+	return -1
+}
+
+func IndexByteString(s string, c byte) int {
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			return i
+		}
+	}
+	return -1
+}
diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s
new file mode 100644
index 0000000..baa9c86
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_loong64.s
@@ -0,0 +1,52 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOVV	b_base+0(FP), R4
+	MOVV	b_len+8(FP), R5
+	MOVBU	c+24(FP), R6	// byte to find
+	MOVV	R4, R7		// store base for later
+	ADDV	R4, R5		// end
+	ADDV	$-1, R4
+
+loop:
+	ADDV	$1, R4
+	BEQ	R4, R5, notfound
+	MOVBU	(R4), R8
+	BNE	R6, R8, loop
+
+	SUBV	R7, R4		// remove base
+	MOVV	R4, ret+32(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R4
+	MOVV	R4, ret+32(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOVV	s_base+0(FP), R4
+	MOVV	s_len+8(FP), R5
+	MOVBU	c+16(FP), R6	// byte to find
+	MOVV	R4, R7		// store base for later
+	ADDV	R4, R5		// end
+	ADDV	$-1, R4
+
+loop:
+	ADDV	$1, R4
+	BEQ	R4, R5, notfound
+	MOVBU	(R4), R8
+	BNE	R6, R8, loop
+
+	SUBV	R7, R4		// remove base
+	MOVV	R4, ret+24(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R4
+	MOVV	R4, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_mips64x.s b/src/internal/bytealg/indexbyte_mips64x.s
new file mode 100644
index 0000000..5689f84
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mips64x.s
@@ -0,0 +1,54 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOVV	b_base+0(FP), R1
+	MOVV	b_len+8(FP), R2
+	MOVBU	c+24(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+32(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+32(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOVV	s_base+0(FP), R1
+	MOVV	s_len+8(FP), R2
+	MOVBU	c+16(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+24(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_mipsx.s b/src/internal/bytealg/indexbyte_mipsx.s
new file mode 100644
index 0000000..1c2b104
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mipsx.s
@@ -0,0 +1,52 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R1
+	MOVW	b_len+4(FP), R2
+	MOVBU	c+12(FP), R3	// byte to find
+	ADDU	$1, R1, R4	// store base+1 for later
+	ADDU	R1, R2	// end
+
+loop:
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	ADDU	$1, R1
+	BNE	R3, R5, loop
+
+	SUBU	R4, R1	// R1 will be one beyond the position we want so remove (base+1)
+	MOVW	R1, ret+16(FP)
+	RET
+
+notfound:
+	MOVW	$-1, R1
+	MOVW	R1, ret+16(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R1
+	MOVW	s_len+4(FP), R2
+	MOVBU	c+8(FP), R3	// byte to find
+	ADDU	$1, R1, R4	// store base+1 for later
+	ADDU	R1, R2	// end
+
+loop:
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	ADDU	$1, R1
+	BNE	R3, R5, loop
+
+	SUBU	R4, R1	// remove (base+1)
+	MOVW	R1, ret+12(FP)
+	RET
+
+notfound:
+	MOVW	$-1, R1
+	MOVW	R1, ret+12(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_native.go b/src/internal/bytealg/indexbyte_native.go
new file mode 100644
index 0000000..c5bb2df
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_native.go
@@ -0,0 +1,13 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64 || s390x || arm || arm64 || loong64 || ppc64 || ppc64le || mips || mipsle || mips64 || mips64le || riscv64 || wasm
+
+package bytealg
+
+//go:noescape
+func IndexByte(b []byte, c byte) int
+
+//go:noescape
+func IndexByteString(s string, c byte) int
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..1a6e852
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,391 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// R3 = byte array pointer
+	// R4 = length
+	MOVD	R6, R5		// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+	// R3 = string
+	// R4 = length
+	// R5 = byte
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	BR	indexbytebody<>(SB)
+
+// R3 = addr of string
+// R4 = len of string
+// R5 = byte to find
+// R16 = 1 if running on a POWER9 system, 0 otherwise
+// On exit:
+// R3 = return value
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R17		// Save base address for calculating the index later.
+	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
+	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
+	ADD	R4,R3,R7	// Last acceptable address in R7.
+
+	RLDIMI	$16,R5,$32,R5
+	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
+	MOVD	$-1,R9
+	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
+	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
+	ADD	$-1,R7,R7
+#ifdef GOARCH_ppc64le
+	SLD	R6,R9,R9	// Prepare mask for Little Endian
+#else
+	SRD	R6,R9,R9	// Same for Big Endian
+#endif
+	BLT	small_string	// Jump to the small string case if it's <32 bytes.
+	CMP	R16,$1		// optimize for power8 v power9
+	BNE	power8
+	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
+	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMP	R4,$64
+	MOVD	$16,R11
+	MOVD	R3,R8
+	BLT	cmp32
+	MOVD	$32,R12
+	MOVD	$48,R6
+
+loop64:
+	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8, jump out
+
+	LXVB16X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out
+
+	LXVB16X	(R8)(R12),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out
+
+	LXVB16X	(R8)(R6),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
+	ADD	$64,R8
+	ADD	$-64,R4
+	CMP	R4,$64		// >=64 bytes left to scan?
+	BGE	loop64
+	CMP	R4,$32
+	BLT	rem		// jump to rem if there are < 32 bytes left
+cmp32:
+	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// match found at R8
+
+	LXVB16X	(R11)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// match found at R8+16
+
+	ADD	$32,R8
+	ADD	$-32,R4
+rem:
+	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
+	BR	small_string
+
+foundat3:
+	ADD	$16,R8
+foundat2:
+	ADD	$16,R8
+foundat1:
+	ADD	$16,R8
+foundat0:
+	// Compress the result into a single doubleword and
+	// move it to a GPR for the final calculation.
+	VBPERMQ	V6,V10,V6
+	MFVRD	V6,R3
+	// count leading zeroes upto the match that ends up in low 16 bits
+	// in both endian modes, compute index by subtracting the number by 16
+	CNTLZW	R3,R11
+	ADD	$-16,R11
+	ADD	R8,R11,R3	// Calculate byte address
+	SUB	R17,R3
+	RET
+power8:
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
+
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
+
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR7
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+
+	// Either quadword aligned or 64-byte at this point. We can use LVX.
+qw_align:
+
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB  $0,V0		// Replicate 0 across V0
+	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	  R5,V1
+	LVSL	  (R0+R0),V11
+	VSLB	  V11,V10,V10
+	VSPLTB	  $7,V1,V1	// Replicate byte across V1
+	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
+	BLE	  tail
+
+	// We will load 4 quardwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	  $63,R8,R11
+	BEQ	  CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	ADD	    $-16,R4,R4
+
+	ANDCC	    $63,R8,R11
+	BEQ	    CR0,preloop
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	ADD	    $-16,R4,R4
+
+	ANDCC	    $63,R8,R11
+	BEQ	    CR0,preloop
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $-16,R4,R4
+	ADD	    $16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail	      // If len ≤ 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9      // Loop counter in R9
+	MOVD	R9,CTR
+
+	ADD	$-64,R8,R8   // Adjust index for loop entry
+	MOVD	$16,R11      // Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
+
+	// Main loop we will load 64 bytes per iteration
+loop:
+	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
+	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
+	LVX	    (R8+R11),V3
+	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
+	VCMPEQUB    V1,V3,V7
+
+	LVX	    (R8+R9),V4
+	LVX	    (R8+R7),V5
+	VCMPEQUB    V1,V4,V8
+	VCMPEQUB    V1,V5,V9
+
+	VOR	    V6,V7,V11	      // Compress the result in a single vector
+	VOR	    V8,V9,V12
+	VOR	    V11,V12,V13
+	VCMPEQUBCC  V0,V13,V14	      // Check for byte
+	BGE	    CR6,found
+	BC	    16,0,loop	      // bdnz loop
+
+	// Handle the tailing bytes or R4 ≤ 64
+	RLDICL	$0,R6,$58,R4
+	ADD	$64,R8,R8
+tail:
+	CMPU	    R4,$0
+	BEQ	    notfound
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+
+notfound:
+	MOVD	$-1, R3
+	RET
+
+found:
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
+
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	  V6,V10,V6
+	VBPERMQ	  V7,V10,V7
+	VBPERMQ	  V8,V10,V8
+	VBPERMQ	  V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	  $2,V7,V7,V7
+	VSLDOI	  $4,V8,V8,V8
+	VSLDOI	  $6,V9,V9,V9
+#else
+	VSLDOI	  $6,V6,V6,V6
+	VSLDOI	  $4,V7,V7,V7
+	VSLDOI	  $2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	  $-1,R3,R11
+	ANDN	  R3,R11,R11
+	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17, R3
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	  V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	  V6,R3
+	ADD	  $-1,R3,R11
+	ANDN	  R3,R11,R11
+	POPCNTD	  R11,R11
+#else
+	VSLDOI	  $6,V6,V6,V6
+	MFVRD	  V6,R3
+	CNTLZD	  R3,R11
+#endif
+	ADD	  R8,R11,R3
+	CMPU	  R11,R4
+	BLT	  return
+	BR	  notfound
+	PCALIGN	  $16
+
+done:
+	ADD	$-1,R10,R6
+	// Offset of last index for the final
+	// doubleword comparison
+	RLDICL	$0,R6,$61,R6
+	// At this point, R3 has 0xFF in the same position as the byte we are
+	// looking for in the doubleword. Use that to calculate the exact index
+	// of the byte.
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	CMPU	R8,R7		// Check if we are at the last doubleword.
+	SRD	$3,R11		// Convert trailing zeros to bytes.
+	ADD	R11,R8,R3
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
+	BNE	return
+	BLE	CR7,return
+	BR	notfound
+
+small_string:
+	// process string of length < 32 bytes
+	// We unroll this loop for better performance.
+	CMPU	R4,$0		// Check for length=0
+	BEQ	notfound
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base.
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
+	CMPU	R8,R7
+	BNE	CR7,done
+	BEQ	notfound	// Hit length.
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	BNE	CR6,done
+	BR	notfound
+
diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s
new file mode 100644
index 0000000..8be78ed
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_riscv64.s
@@ -0,0 +1,51 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
+	// X10 = b_base
+	// X11 = b_len
+	// X12 = b_cap (unused)
+	// X13 = byte to find
+	AND	$0xff, X13
+	MOV	X10, X12		// store base for later
+	ADD	X10, X11		// end
+	ADD	$-1, X10
+
+loop:
+	ADD	$1, X10
+	BEQ	X10, X11, notfound
+	MOVBU	(X10), X14
+	BNE	X13, X14, loop
+
+	SUB	X12, X10		// remove base
+	RET
+
+notfound:
+	MOV	$-1, X10
+	RET
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
+	// X10 = b_base
+	// X11 = b_len
+	// X12 = byte to find
+	AND	$0xff, X12
+	MOV	X10, X13		// store base for later
+	ADD	X10, X11		// end
+	ADD	$-1, X10
+
+loop:
+	ADD	$1, X10
+	BEQ	X10, X11, notfound
+	MOVBU	(X10), X14
+	BNE	X12, X14, loop
+
+	SUB	X13, X10		// remove base
+	RET
+
+notfound:
+	MOV	$-1, X10
+	RET
diff --git a/src/internal/bytealg/indexbyte_s390x.s b/src/internal/bytealg/indexbyte_s390x.s
new file mode 100644
index 0000000..cf88d92
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_s390x.s
@@ -0,0 +1,108 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	b_base+0(FP), R3// b_base => R3
+	MOVD	b_len+8(FP), R4 // b_len => R4
+	MOVBZ	c+24(FP), R5    // c => R5
+	MOVD	$ret+32(FP), R2 // &ret => R9
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	s_base+0(FP), R3// s_base => R3
+	MOVD	s_len+8(FP), R4 // s_len => R4
+	MOVBZ	c+16(FP), R5    // c => R5
+	MOVD	$ret+24(FP), R2 // &ret => R9
+	BR	indexbytebody<>(SB)
+
+// input:
+// R3: s
+// R4: s_len
+// R5: c -- byte sought
+// R2: &ret -- address to put index into
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBEQ	R4, $0, notfound
+	MOVD	R3, R6          // store base for later
+	ADD	R3, R4, R8      // the address after the end of the string
+	//if the length is small, use loop; otherwise, use vector or srst search
+	CMPBGE	R4, $16, large
+
+residual:
+	CMPBEQ	R3, R8, notfound
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, residual
+
+found:
+	SUB	R6, R3
+	SUB	$1, R3
+	MOVD	R3, 0(R2)
+	RET
+
+notfound:
+	MOVD	$-1, 0(R2)
+	RET
+
+large:
+	MOVBZ	internal∕cpu·S390X+const_offsetS390xHasVX(SB), R1
+	CMPBNE	R1, $0, vectorimpl
+
+srstimpl:                       // no vector facility
+	MOVBZ	R5, R0          // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
+srstloop:
+	WORD	$0xB25E0083     // srst %r8, %r3 (search the range [R3, R8))
+	BVS	srstloop        // interrupted - continue
+	BGT	notfoundr0
+foundr0:
+	XOR	R0, R0          // reset R0
+	SUB	R6, R8          // remove base
+	MOVD	R8, 0(R2)
+	RET
+notfoundr0:
+	XOR	R0, R0          // reset R0
+	MOVD	$-1, 0(R2)
+	RET
+
+vectorimpl:
+	//if the address is not 16byte aligned, use loop for the header
+	MOVD	R3, R8
+	AND	$15, R8
+	CMPBGT	R8, $0, notaligned
+
+aligned:
+	ADD	R6, R4, R8
+	MOVD	R8, R7
+	AND	$-16, R7
+	// replicate c across V17
+	VLVGB	$0, R5, V19
+	VREPB	$0, V19, V17
+
+vectorloop:
+	CMPBGE	R3, R7, residual
+	VL	0(R3), V16    // load string to be searched into V16
+	ADD	$16, R3
+	VFEEBS	V16, V17, V18 // search V17 in V16 and set conditional code accordingly
+	BVS	vectorloop
+
+	// when vector search found c in the string
+	VLGVB	$7, V18, R7   // load 7th element of V18 containing index into R7
+	SUB	$16, R3
+	SUB	R6, R3
+	ADD	R3, R7
+	MOVD	R7, 0(R2)
+	RET
+
+notaligned:
+	MOVD	R3, R8
+	AND	$-16, R8
+	ADD     $16, R8
+notalignedloop:
+	CMPBEQ	R3, R8, aligned
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, notalignedloop
+	BR	found
diff --git a/src/internal/bytealg/indexbyte_wasm.s b/src/internal/bytealg/indexbyte_wasm.s
new file mode 100644
index 0000000..ef4bd93
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_wasm.s
@@ -0,0 +1,195 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB), NOSPLIT, $0-40
+	I64Load b_base+0(FP)
+	I32WrapI64
+	I32Load8U c+24(FP)
+	I64Load b_len+8(FP)
+	I32WrapI64
+	Call memchr<>(SB)
+	I64ExtendI32S
+	Set R0
+
+	Get SP
+	I64Const $-1
+	Get R0
+	I64Load b_base+0(FP)
+	I64Sub
+	Get R0
+	I64Eqz $0
+	Select
+	I64Store ret+32(FP)
+
+	RET
+
+TEXT ·IndexByteString(SB), NOSPLIT, $0-32
+	Get SP
+	I64Load s_base+0(FP)
+	I32WrapI64
+	I32Load8U c+16(FP)
+	I64Load s_len+8(FP)
+	I32WrapI64
+	Call memchr<>(SB)
+	I64ExtendI32S
+	Set R0
+
+	I64Const $-1
+	Get R0
+	I64Load s_base+0(FP)
+	I64Sub
+	Get R0
+	I64Eqz $0
+	Select
+	I64Store ret+24(FP)
+
+	RET
+
+// initially compiled with emscripten and then modified over time.
+// params:
+//   R0: s
+//   R1: c
+//   R2: len
+// ret: index
+TEXT memchr<>(SB), NOSPLIT, $0
+	Get R1
+	Set R4
+	Block
+		Block
+			Get R2
+			I32Const $0
+			I32Ne
+			Tee R3
+			Get R0
+			I32Const $3
+			I32And
+			I32Const $0
+			I32Ne
+			I32And
+			If
+				Loop
+					Get R0
+					I32Load8U $0
+					Get R1
+					I32Eq
+					BrIf $2
+					Get R2
+					I32Const $-1
+					I32Add
+					Tee R2
+					I32Const $0
+					I32Ne
+					Tee R3
+					Get R0
+					I32Const $1
+					I32Add
+					Tee R0
+					I32Const $3
+					I32And
+					I32Const $0
+					I32Ne
+					I32And
+					BrIf $0
+				End
+			End
+			Get R3
+			BrIf $0
+			I32Const $0
+			Set R1
+			Br $1
+		End
+		Get R0
+		I32Load8U $0
+		Get R4
+		Tee R3
+		I32Eq
+		If
+			Get R2
+			Set R1
+		Else
+			Get R4
+			I32Const $16843009
+			I32Mul
+			Set R4
+			Block
+				Block
+					Get R2
+					I32Const $3
+					I32GtU
+					If
+						Get R2
+						Set R1
+						Loop
+							Get R0
+							I32Load $0
+							Get R4
+							I32Xor
+							Tee R2
+							I32Const $-2139062144
+							I32And
+							I32Const $-2139062144
+							I32Xor
+							Get R2
+							I32Const $-16843009
+							I32Add
+							I32And
+							I32Eqz
+							If
+								Get R0
+								I32Const $4
+								I32Add
+								Set R0
+								Get R1
+								I32Const $-4
+								I32Add
+								Tee R1
+								I32Const $3
+								I32GtU
+								BrIf $1
+								Br $3
+							End
+						End
+					Else
+						Get R2
+						Set R1
+						Br $1
+					End
+					Br $1
+				End
+				Get R1
+				I32Eqz
+				If
+					I32Const $0
+					Set R1
+					Br $3
+				End
+			End
+			Loop
+				Get R0
+				I32Load8U $0
+				Get R3
+				I32Eq
+				BrIf $2
+				Get R0
+				I32Const $1
+				I32Add
+				Set R0
+				Get R1
+				I32Const $-1
+				I32Add
+				Tee R1
+				BrIf $0
+				I32Const $0
+				Set R1
+			End
+		End
+	End
+	Get R0
+	I32Const $0
+	Get R1
+	Select
+	Return