54 files changed, 6081 insertions, 0 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
new file mode 100644
index 0000000..6b2b540
--- /dev/null
+++ b/src/internal/bytealg/bytealg.go
@@ -0,0 +1,151 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+	"internal/cpu"
+	"unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+	offsetX86HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
+	offsetX86HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
+	offsetX86HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
+	offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+
+	offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+
+	offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+// If MaxLen is not 0, make sure MaxLen >= 4.
+var MaxLen int
+
+// FIXME: the logic of HashStrBytes, HashStrRevBytes, IndexRabinKarpBytes and HashStr, HashStrRev,
+// IndexRabinKarp are exactly the same, except that the types are different. Can we eliminate
+// three of them without causing allocation?
+
+// PrimeRK is the prime base used in Rabin-Karp algorithm.
+const PrimeRK = 16777619
+
+// HashStrBytes returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStrBytes(sep []byte) (uint32, uint32) {
+	hash := uint32(0)
+	for i := 0; i < len(sep); i++ {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStr(sep string) (uint32, uint32) {
+	hash := uint32(0)
+	for i := 0; i < len(sep); i++ {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStrRevBytes returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRevBytes(sep []byte) (uint32, uint32) {
+	hash := uint32(0)
+	for i := len(sep) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStrRev returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRev(sep string) (uint32, uint32) {
+	hash := uint32(0)
+	for i := len(sep) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// IndexRabinKarpBytes uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarpBytes(s, sep []byte) int {
+	// Rabin-Karp search
+	hashsep, pow := HashStrBytes(sep)
+	n := len(sep)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*PrimeRK + uint32(s[i])
+	}
+	if h == hashsep && Equal(s[:n], sep) {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= PrimeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashsep && Equal(s[i-n:i], sep) {
+			return i - n
+		}
+	}
+	return -1
+}
+
+// IndexRabinKarp uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarp(s, substr string) int {
+	// Rabin-Karp search
+	hashss, pow := HashStr(substr)
+	n := len(substr)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*PrimeRK + uint32(s[i])
+	}
+	if h == hashss && s[:n] == substr {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= PrimeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashss && s[i-n:i] == substr {
+			return i - n
+		}
+	}
+	return -1
+}
diff --git a/src/internal/bytealg/compare_386.s b/src/internal/bytealg/compare_386.s
new file mode 100644
index 0000000..0981983
--- /dev/null
+++ b/src/internal/bytealg/compare_386.s
@@ -0,0 +1,143 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+	MOVL	a_base+0(FP), SI
+	MOVL	a_len+4(FP), BX
+	MOVL	b_base+12(FP), DI
+	MOVL	b_len+16(FP), DX
+	LEAL	ret+24(FP), AX
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+	MOVL	a_base+0(FP), SI
+	MOVL	a_len+4(FP), BX
+	MOVL	b_base+8(FP), DI
+	MOVL	b_len+12(FP), DX
+	LEAL	ret+16(FP), AX
+	JMP	cmpbody<>(SB)
+
+// input:
+//   SI = a
+//   DI = b
+//   BX = alen
+//   DX = blen
+//   AX = address of return word (set to 1/0/-1)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+	MOVL	DX, BP
+	SUBL	BX, DX // DX = blen-alen
+	JLE	2(PC)
+	MOVL	BX, BP // BP = min(alen, blen)
+	CMPL	SI, DI
+	JEQ	allsame
+	CMPL	BP, $4
+	JB	small
+	CMPB	internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
+	JNE	mediumloop
+largeloop:
+	CMPL	BP, $16
+	JB	mediumloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, BX
+	XORL	$0xffff, BX	// convert EQ to NE
+	JNE	diff16	// branch if at least one byte is not equal
+	ADDL	$16, SI
+	ADDL	$16, DI
+	SUBL	$16, BP
+	JMP	largeloop
+
+diff16:
+	BSFL	BX, BX	// index of first byte that differs
+	XORL	DX, DX
+	MOVB	(SI)(BX*1), CX
+	CMPB	CX, (DI)(BX*1)
+	SETHI	DX
+	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
+	MOVL	DX, (AX)
+	RET
+
+mediumloop:
+	CMPL	BP, $4
+	JBE	_0through4
+	MOVL	(SI), BX
+	MOVL	(DI), CX
+	CMPL	BX, CX
+	JNE	diff4
+	ADDL	$4, SI
+	ADDL	$4, DI
+	SUBL	$4, BP
+	JMP	mediumloop
+
+_0through4:
+	MOVL	-4(SI)(BP*1), BX
+	MOVL	-4(DI)(BP*1), CX
+	CMPL	BX, CX
+	JEQ	allsame
+
+diff4:
+	BSWAPL	BX	// reverse order of bytes
+	BSWAPL	CX
+	XORL	BX, CX	// find bit differences
+	BSRL	CX, CX	// index of highest bit difference
+	SHRL	CX, BX	// move a's bit to bottom
+	ANDL	$1, BX	// mask bit
+	LEAL	-1(BX*2), BX // 1/0 => +1/-1
+	MOVL	BX, (AX)
+	RET
+
+	// 0-3 bytes in common
+small:
+	LEAL	(BP*8), CX
+	NEGL	CX
+	JEQ	allsame
+
+	// load si
+	CMPB	SI, $0xfc
+	JA	si_high
+	MOVL	(SI), SI
+	JMP	si_finish
+si_high:
+	MOVL	-4(SI)(BP*1), SI
+	SHRL	CX, SI
+si_finish:
+	SHLL	CX, SI
+
+	// same for di
+	CMPB	DI, $0xfc
+	JA	di_high
+	MOVL	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVL	-4(DI)(BP*1), DI
+	SHRL	CX, DI
+di_finish:
+	SHLL	CX, DI
+
+	BSWAPL	SI	// reverse order of bytes
+	BSWAPL	DI
+	XORL	SI, DI	// find bit differences
+	JEQ	allsame
+	BSRL	DI, CX	// index of highest bit difference
+	SHRL	CX, SI	// move a's bit to bottom
+	ANDL	$1, SI	// mask bit
+	LEAL	-1(SI*2), BX // 1/0 => +1/-1
+	MOVL	BX, (AX)
+	RET
+
+	// all the bytes in common are the same, so we just need
+	// to compare the lengths.
+allsame:
+	XORL	BX, BX
+	XORL	CX, CX
+	TESTL	DX, DX
+	SETLT	BX	// 1 if alen > blen
+	SETEQ	CX	// 1 if alen == blen
+	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
+	MOVL	BX, (AX)
+	RET
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
new file mode 100644
index 0000000..8295acb
--- /dev/null
+++ b/src/internal/bytealg/compare_amd64.s
@@ -0,0 +1,262 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = a_base (want in SI)
+	// BX = a_len  (want in BX)
+	// CX = a_cap  (unused)
+	// DI = b_base (want in DI)
+	// SI = b_len  (want in DX)
+	// R8 = b_cap  (unused)
+	MOVQ	SI, DX
+	MOVQ	AX, SI
+#else
+	MOVQ	a_base+0(FP), SI
+	MOVQ	a_len+8(FP), BX
+	MOVQ	b_base+24(FP), DI
+	MOVQ	b_len+32(FP), DX
+	LEAQ	ret+48(FP), R9
+#endif
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = a_base (want in SI)
+	// BX = a_len  (want in BX)
+	// CX = b_base (want in DI)
+	// DI = b_len  (want in DX)
+	MOVQ	AX, SI
+	MOVQ	DI, DX
+	MOVQ	CX, DI
+#else
+	MOVQ	a_base+0(FP), SI
+	MOVQ	a_len+8(FP), BX
+	MOVQ	b_base+16(FP), DI
+	MOVQ	b_len+24(FP), DX
+	LEAQ	ret+32(FP), R9
+#endif
+	JMP	cmpbody<>(SB)
+
+// input:
+//   SI = a
+//   DI = b
+//   BX = alen
+//   DX = blen
+#ifndef GOEXPERIMENT_regabiargs
+//   R9 = address of output word (stores -1/0/1 here)
+#else
+// output:
+//   AX = output (-1/0/1)
+#endif
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+	CMPQ	SI, DI
+	JEQ	allsame
+	CMPQ	BX, DX
+	MOVQ	DX, R8
+	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
+	CMPQ	R8, $8
+	JB	small
+
+	CMPQ	R8, $63
+	JBE	loop
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JEQ     big_loop_avx2
+	JMP	big_loop
+loop:
+	CMPQ	R8, $16
+	JBE	_0through16
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX	// convert EQ to NE
+	JNE	diff16	// branch if at least one byte is not equal
+	ADDQ	$16, SI
+	ADDQ	$16, DI
+	SUBQ	$16, R8
+	JMP	loop
+
+diff64:
+	ADDQ	$48, SI
+	ADDQ	$48, DI
+	JMP	diff16
+diff48:
+	ADDQ	$32, SI
+	ADDQ	$32, DI
+	JMP	diff16
+diff32:
+	ADDQ	$16, SI
+	ADDQ	$16, DI
+	// AX = bit mask of differences
+diff16:
+	BSFQ	AX, BX	// index of first byte that differs
+	XORQ	AX, AX
+	MOVB	(SI)(BX*1), CX
+	CMPB	CX, (DI)(BX*1)
+	SETHI	AX
+	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
+#ifndef GOEXPERIMENT_regabiargs
+	MOVQ	AX, (R9)
+#endif
+	RET
+
+	// 0 through 16 bytes left, alen>=8, blen>=8
+_0through16:
+	CMPQ	R8, $8
+	JBE	_0through8
+	MOVQ	(SI), AX
+	MOVQ	(DI), CX
+	CMPQ	AX, CX
+	JNE	diff8
+_0through8:
+	MOVQ	-8(SI)(R8*1), AX
+	MOVQ	-8(DI)(R8*1), CX
+	CMPQ	AX, CX
+	JEQ	allsame
+
+	// AX and CX contain parts of a and b that differ.
+diff8:
+	BSWAPQ	AX	// reverse order of bytes
+	BSWAPQ	CX
+	XORQ	AX, CX
+	BSRQ	CX, CX	// index of highest bit difference
+	SHRQ	CX, AX	// move a's bit to bottom
+	ANDQ	$1, AX	// mask bit
+	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
+#ifndef GOEXPERIMENT_regabiargs
+	MOVQ	AX, (R9)
+#endif
+	RET
+
+	// 0-7 bytes in common
+small:
+	LEAQ	(R8*8), CX	// bytes left -> bits left
+	NEGQ	CX		//  - bits lift (== 64 - bits left mod 64)
+	JEQ	allsame
+
+	// load bytes of a into high bytes of AX
+	CMPB	SI, $0xf8
+	JA	si_high
+	MOVQ	(SI), SI
+	JMP	si_finish
+si_high:
+	MOVQ	-8(SI)(R8*1), SI
+	SHRQ	CX, SI
+si_finish:
+	SHLQ	CX, SI
+
+	// load bytes of b in to high bytes of BX
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(R8*1), DI
+	SHRQ	CX, DI
+di_finish:
+	SHLQ	CX, DI
+
+	BSWAPQ	SI	// reverse order of bytes
+	BSWAPQ	DI
+	XORQ	SI, DI	// find bit differences
+	JEQ	allsame
+	BSRQ	DI, CX	// index of highest bit difference
+	SHRQ	CX, SI	// move a's bit to bottom
+	ANDQ	$1, SI	// mask bit
+	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
+#ifndef GOEXPERIMENT_regabiargs
+	MOVQ	AX, (R9)
+#endif
+	RET
+
+allsame:
+	XORQ	AX, AX
+	XORQ	CX, CX
+	CMPQ	BX, DX
+	SETGT	AX	// 1 if alen > blen
+	SETEQ	CX	// 1 if alen == blen
+	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
+#ifndef GOEXPERIMENT_regabiargs
+	MOVQ	AX, (R9)
+#endif
+	RET
+
+	// this works for >= 64 bytes of data.
+big_loop:
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff16
+
+	MOVOU	16(SI), X0
+	MOVOU	16(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff32
+
+	MOVOU	32(SI), X0
+	MOVOU	32(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff48
+
+	MOVOU	48(SI), X0
+	MOVOU	48(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff64
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JBE	loop
+	JMP	big_loop
+
+	// Compare 64-bytes per loop iteration.
+	// Loop is unrolled and uses AVX2.
+big_loop_avx2:
+	VMOVDQU	(SI), Y2
+	VMOVDQU	(DI), Y3
+	VMOVDQU	32(SI), Y4
+	VMOVDQU	32(DI), Y5
+	VPCMPEQB Y2, Y3, Y0
+	VPMOVMSKB Y0, AX
+	XORL	$0xffffffff, AX
+	JNE	diff32_avx2
+	VPCMPEQB Y4, Y5, Y6
+	VPMOVMSKB Y6, AX
+	XORL	$0xffffffff, AX
+	JNE	diff64_avx2
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JB	big_loop_avx2_exit
+	JMP	big_loop_avx2
+
+	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+	VZEROUPPER
+	JMP diff16
+
+	// Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+	VZEROUPPER
+	JMP diff48
+
+	// For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+	VZEROUPPER
+	JMP loop
diff --git a/src/internal/bytealg/compare_arm.s b/src/internal/bytealg/compare_arm.s
new file mode 100644
index 0000000..80d01a2
--- /dev/null
+++ b/src/internal/bytealg/compare_arm.s
@@ -0,0 +1,86 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	a_base+0(FP), R2
+	MOVW	a_len+4(FP), R0
+	MOVW	b_base+12(FP), R3
+	MOVW	b_len+16(FP), R1
+	ADD	$28, R13, R7
+	B	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20
+	MOVW	a_base+0(FP), R2
+	MOVW	a_len+4(FP), R0
+	MOVW	b_base+8(FP), R3
+	MOVW	b_len+12(FP), R1
+	ADD	$20, R13, R7
+	B	cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, R6 and R8 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	R2, R3
+	BEQ	samebytes
+	CMP 	R0, R1
+	MOVW 	R0, R6
+	MOVW.LT	R1, R6		// R6 is min(R0, R1)
+
+	CMP	$0, R6
+	BEQ	samebytes
+	CMP	$4, R6
+	ADD	R2, R6		// R2 is current byte in a, R6 is the end of the range to compare
+	BLT	byte_loop	// length < 4
+	AND	$3, R2, R8
+	CMP	$0, R8
+	BNE	byte_loop	// unaligned a, use byte-wise compare (TODO: try to align a)
+aligned_a:
+	AND	$3, R3, R8
+	CMP	$0, R8
+	BNE	byte_loop	// unaligned b, use byte-wise compare
+	AND	$0xfffffffc, R6, R8
+	// length >= 4
+chunk4_loop:
+	MOVW.P	4(R2), R4
+	MOVW.P	4(R3), R5
+	CMP	R4, R5
+	BNE	cmp
+	CMP	R2, R8
+	BNE	chunk4_loop
+	CMP	R2, R6
+	BEQ	samebytes	// all compared bytes were the same; compare lengths
+byte_loop:
+	MOVBU.P	1(R2), R4
+	MOVBU.P	1(R3), R5
+	CMP	R4, R5
+	BNE	ret
+	CMP	R2, R6
+	BNE	byte_loop
+samebytes:
+	CMP	R0, R1
+	MOVW.LT	$1, R0
+	MOVW.GT	$-1, R0
+	MOVW.EQ	$0, R0
+	MOVW	R0, (R7)
+	RET
+ret:
+	// bytes differed
+	MOVW.LT	$1, R0
+	MOVW.GT	$-1, R0
+	MOVW	R0, (R7)
+	RET
+cmp:
+	SUB	$4, R2, R2
+	SUB	$4, R3, R3
+	B	byte_loop
diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s
new file mode 100644
index 0000000..56d56f2
--- /dev/null
+++ b/src/internal/bytealg/compare_arm64.s
@@ -0,0 +1,125 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	a_base+0(FP), R2
+	MOVD	a_len+8(FP), R0
+	MOVD	b_base+24(FP), R3
+	MOVD	b_len+32(FP), R1
+	MOVD	$ret+48(FP), R7
+	B	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	a_base+0(FP), R2
+	MOVD	a_len+8(FP), R0
+	MOVD	b_base+16(FP), R3
+	MOVD	b_len+24(FP), R1
+	MOVD	$ret+32(FP), R7
+	B	cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, R6, R8, R9 and R10 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	R2, R3
+	BEQ	samebytes         // same starting pointers; compare lengths
+	CMP	R0, R1
+	CSEL	LT, R1, R0, R6    // R6 is min(R0, R1)
+
+	CBZ	R6, samebytes
+	BIC	$0xf, R6, R10
+	CBZ	R10, small        // length < 16
+	ADD	R2, R10           // end of chunk16
+	// length >= 16
+chunk16_loop:
+	LDP.P	16(R2), (R4, R8)
+	LDP.P	16(R3), (R5, R9)
+	CMP	R4, R5
+	BNE	cmp
+	CMP	R8, R9
+	BNE	cmpnext
+	CMP	R10, R2
+	BNE	chunk16_loop
+	AND	$0xf, R6, R6
+	CBZ	R6, samebytes
+	SUBS	$8, R6
+	BLT	tail
+	// the length of tail > 8 bytes
+	MOVD.P	8(R2), R4
+	MOVD.P	8(R3), R5
+	CMP	R4, R5
+	BNE	cmp
+	SUB	$8, R6
+	// compare last 8 bytes
+tail:
+	MOVD	(R2)(R6), R4
+	MOVD	(R3)(R6), R5
+	CMP	R4, R5
+	BEQ	samebytes
+cmp:
+	REV	R4, R4
+	REV	R5, R5
+	CMP	R4, R5
+ret:
+	MOVD	$1, R4
+	CNEG	HI, R4, R4
+	MOVD	R4, (R7)
+	RET
+small:
+	TBZ	$3, R6, lt_8
+	MOVD	(R2), R4
+	MOVD	(R3), R5
+	CMP	R4, R5
+	BNE	cmp
+	SUBS	$8, R6
+	BEQ	samebytes
+	ADD	$8, R2
+	ADD	$8, R3
+	SUB	$8, R6
+	B	tail
+lt_8:
+	TBZ	$2, R6, lt_4
+	MOVWU	(R2), R4
+	MOVWU	(R3), R5
+	CMPW	R4, R5
+	BNE	cmp
+	SUBS	$4, R6
+	BEQ	samebytes
+	ADD	$4, R2
+	ADD	$4, R3
+lt_4:
+	TBZ	$1, R6, lt_2
+	MOVHU	(R2), R4
+	MOVHU	(R3), R5
+	CMPW	R4, R5
+	BNE	cmp
+	ADD	$2, R2
+	ADD	$2, R3
+lt_2:
+	TBZ	$0, R6, samebytes
+one:
+	MOVBU	(R2), R4
+	MOVBU	(R3), R5
+	CMPW	R4, R5
+	BNE	ret
+samebytes:
+	CMP	R1, R0
+	CSET	NE, R4
+	CNEG	LO, R4, R4
+	MOVD	R4, (R7)
+	RET
+cmpnext:
+	REV	R8, R4
+	REV	R9, R5
+	CMP	R4, R5
+	B	ret
diff --git a/src/internal/bytealg/compare_generic.go b/src/internal/bytealg/compare_generic.go
new file mode 100644
index 0000000..0690d0c
--- /dev/null
+++ b/src/internal/bytealg/compare_generic.go
@@ -0,0 +1,61 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le
+// +build !386,!amd64,!s390x,!arm,!arm64,!ppc64,!ppc64le,!mips,!mipsle,!wasm,!mips64,!mips64le
+
+package bytealg
+
+import _ "unsafe" // for go:linkname
+
+func Compare(a, b []byte) int {
+	l := len(a)
+	if len(b) < l {
+		l = len(b)
+	}
+	if l == 0 || &a[0] == &b[0] {
+		goto samebytes
+	}
+	for i := 0; i < l; i++ {
+		c1, c2 := a[i], b[i]
+		if c1 < c2 {
+			return -1
+		}
+		if c1 > c2 {
+			return +1
+		}
+	}
+samebytes:
+	if len(a) < len(b) {
+		return -1
+	}
+	if len(a) > len(b) {
+		return +1
+	}
+	return 0
+}
+
+//go:linkname runtime_cmpstring runtime.cmpstring
+func runtime_cmpstring(a, b string) int {
+	l := len(a)
+	if len(b) < l {
+		l = len(b)
+	}
+	for i := 0; i < l; i++ {
+		c1, c2 := a[i], b[i]
+		if c1 < c2 {
+			return -1
+		}
+		if c1 > c2 {
+			return +1
+		}
+	}
+	if len(a) < len(b) {
+		return -1
+	}
+	if len(a) > len(b) {
+		return +1
+	}
+	return 0
+}
diff --git a/src/internal/bytealg/compare_mips64x.s b/src/internal/bytealg/compare_mips64x.s
new file mode 100644
index 0000000..b472e51
--- /dev/null
+++ b/src/internal/bytealg/compare_mips64x.s
@@ -0,0 +1,89 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+	MOVV	a_base+0(FP), R3
+	MOVV	b_base+24(FP), R4
+	MOVV	a_len+8(FP), R1
+	MOVV	b_len+32(FP), R2
+	MOVV	$ret+48(FP), R9
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+	MOVV	a_base+0(FP), R3
+	MOVV	b_base+16(FP), R4
+	MOVV	a_len+8(FP), R1
+	MOVV	b_len+24(FP), R2
+	MOVV	$ret+32(FP), R9
+	JMP	cmpbody<>(SB)
+
+// On entry:
+// R1 length of a
+// R2 length of b
+// R3 points to the start of a
+// R4 points to the start of b
+// R9 points to the return value (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
+	BEQ	R3, R4, samebytes // same start of a and b
+
+	SGTU	R1, R2, R7
+	BNE	R0, R7, r2_lt_r1
+	MOVV	R1, R10
+	JMP	entry
+r2_lt_r1:
+	MOVV	R2, R10	// R10 is min(R1, R2)
+entry:
+	ADDV	R3, R10, R8	// R3 start of a, R8 end of a
+	BEQ	R3, R8, samebytes // length is 0
+
+	SRLV	$4, R10		// R10 is number of chunks
+	BEQ	R0, R10, byte_loop
+
+	// make sure both a and b are aligned.
+	OR	R3, R4, R11
+	AND	$7, R11
+	BNE	R0, R11, byte_loop
+
+chunk16_loop:
+	BEQ	R0, R10, byte_loop
+	MOVV	(R3), R6
+	MOVV	(R4), R7
+	BNE	R6, R7, byte_loop
+	MOVV	8(R3), R13
+	MOVV	8(R4), R14
+	ADDV	$16, R3
+	ADDV	$16, R4
+	SUBVU	$1, R10
+	BEQ	R13, R14, chunk16_loop
+	SUBV	$8, R3
+	SUBV	$8, R4
+
+byte_loop:
+	BEQ	R3, R8, samebytes
+	MOVBU	(R3), R6
+	ADDVU	$1, R3
+	MOVBU	(R4), R7
+	ADDVU	$1, R4
+	BEQ	R6, R7, byte_loop
+
+byte_cmp:
+	SGTU	R6, R7, R8 // R8 = 1 if (R6 > R7)
+	BNE	R0, R8, ret
+	MOVV	$-1, R8
+	JMP	ret
+
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBV	R7, R6, R8
+
+ret:
+	MOVV	R8, (R9)
+	RET
diff --git a/src/internal/bytealg/compare_mipsx.s b/src/internal/bytealg/compare_mipsx.s
new file mode 100644
index 0000000..dcc4916
--- /dev/null
+++ b/src/internal/bytealg/compare_mipsx.s
@@ -0,0 +1,73 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+	MOVW	a_base+0(FP), R3
+	MOVW	b_base+12(FP), R4
+	MOVW	a_len+4(FP), R1
+	MOVW	b_len+16(FP), R2
+	BEQ	R3, R4, samebytes
+	SGTU	R1, R2, R7
+	MOVW	R1, R8
+	CMOVN	R7, R2, R8	// R8 is min(R1, R2)
+
+	ADDU	R3, R8	// R3 is current byte in a, R8 is last byte in a to compare
+loop:
+	BEQ	R3, R8, samebytes
+
+	MOVBU	(R3), R6
+	ADDU	$1, R3
+	MOVBU	(R4), R7
+	ADDU	$1, R4
+	BEQ	R6, R7 , loop
+
+	SGTU	R6, R7, R8
+	MOVW	$-1, R6
+	CMOVZ	R8, R6, R8
+	JMP	cmp_ret
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBU	R7, R6, R8
+cmp_ret:
+	MOVW	R8, ret+24(FP)
+	RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+	MOVW	a_base+0(FP), R3
+	MOVW	a_len+4(FP), R1
+	MOVW	b_base+8(FP), R4
+	MOVW	b_len+12(FP), R2
+	BEQ	R3, R4, samebytes
+	SGTU	R1, R2, R7
+	MOVW	R1, R8
+	CMOVN	R7, R2, R8	// R8 is min(R1, R2)
+
+	ADDU	R3, R8	// R3 is current byte in a, R8 is last byte in a to compare
+loop:
+	BEQ	R3, R8, samebytes	// all compared bytes were the same; compare lengths
+
+	MOVBU	(R3), R6
+	ADDU	$1, R3
+	MOVBU	(R4), R7
+	ADDU	$1, R4
+	BEQ	R6, R7 , loop
+	// bytes differed
+	SGTU	R6, R7, R8
+	MOVW	$-1, R6
+	CMOVZ	R8, R6, R8
+	JMP	cmp_ret
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBU	R7, R6, R8
+cmp_ret:
+	MOVW	R8, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/compare_native.go b/src/internal/bytealg/compare_native.go
new file mode 100644
index 0000000..baa188f
--- /dev/null
+++ b/src/internal/bytealg/compare_native.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le
+// +build 386 amd64 s390x arm arm64 ppc64 ppc64le mips mipsle wasm mips64 mips64le
+
+package bytealg
+
+import _ "unsafe" // For go:linkname
+
+//go:noescape
+func Compare(a, b []byte) int
+
+// The declaration below generates ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+//go:linkname abigen_runtime_cmpstring runtime.cmpstring
+func abigen_runtime_cmpstring(a, b string) int
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
new file mode 100644
index 0000000..83444fa
--- /dev/null
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -0,0 +1,278 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	a_base+0(FP), R5
+	MOVD	b_base+24(FP), R6
+	MOVD	a_len+8(FP), R3
+	CMP	R5,R6,CR7
+	MOVD	b_len+32(FP), R4
+	MOVD	$ret+48(FP), R7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif
+
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+
+greater:
+	MOVD	R8, (R7)
+	RET
+
+done:
+	MOVD	$0, (R7)
+	RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	a_base+0(FP), R5
+	MOVD	b_base+16(FP), R6
+	MOVD	a_len+8(FP), R3
+	CMP	R5,R6,CR7
+	MOVD	b_len+24(FP), R4
+	MOVD	$ret+32(FP), R7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif
+
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+
+greater:
+	MOVD	R8, (R7)
+	RET
+
+done:
+	MOVD	$0, (R7)
+	RET
+
+// Do an efficient memcmp for ppc64le
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// R7 = addr of return value
+TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+
+        // Special processing for 32 bytes or longer.
+        // Loading this way is faster and correct as long as the
+	// doublewords being compared are equal. Once they
+	// are found unequal, reload them in proper byte order
+	// to determine greater or less than.
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	MOVD	$0,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R14,R15		// bytes match?
+	MOVD	$8,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD    24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	MOVD	$16,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU    R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BNE	cmpne
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVDBR	(R5+R0),R9	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// 1st len > 2nd len must be greater
+simple:
+	MOVBZ	0(R5), R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6), R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9, R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+	BR	less		// must be less
+cmpne:				// only here is not equal
+	MOVDBR	(R5+R16),R8	// reload in reverse order
+	MOVDBR	(R6+R16),R9
+	CMPU	R8,R9		// compare correct endianness
+	BGT	greater		// here only if NE
+less:
+	MOVD	$-1,R3
+	MOVD	R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD	$0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
+
+// Do an efficient memcmp for ppc64 (BE)
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// R7 = addr of return value
+TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R14,R15		// bytes match?
+	BLT	less		// found less
+	BGT	greater		// found greater
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD	24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU	R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BLT	less		// with BE, byte ordering is
+	BGT	greater		// good for compare
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVD	(R5),R9
+	MOVD	(R6),R10
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC 	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// same len, must be equal
+simple:
+	MOVBZ	0(R5),R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6),R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9,R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+less:
+	MOVD	$-1,R3
+	MOVD    R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD    $0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
diff --git a/src/internal/bytealg/compare_s390x.s b/src/internal/bytealg/compare_s390x.s
new file mode 100644
index 0000000..5394548
--- /dev/null
+++ b/src/internal/bytealg/compare_s390x.s
@@ -0,0 +1,69 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	a_base+0(FP), R3
+	MOVD	a_len+8(FP), R4
+	MOVD	b_base+24(FP), R5
+	MOVD	b_len+32(FP), R6
+	LA	ret+48(FP), R7
+	BR	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	a_base+0(FP), R3
+	MOVD	a_len+8(FP), R4
+	MOVD	b_base+16(FP), R5
+	MOVD	b_len+24(FP), R6
+	LA	ret+32(FP), R7
+	BR	cmpbody<>(SB)
+
+// input:
+//   R3 = a
+//   R4 = alen
+//   R5 = b
+//   R6 = blen
+//   R7 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, cmplengths
+	MOVD	R4, R8
+	CMPBLE	R4, R6, amin
+	MOVD	R6, R8
+amin:
+	CMPBEQ	R8, $0, cmplengths
+	CMP	R8, $256
+	BLE	tail
+loop:
+	CLC	$256, 0(R3), 0(R5)
+	BGT	gt
+	BLT	lt
+	SUB	$256, R8
+	MOVD	$256(R3), R3
+	MOVD	$256(R5), R5
+	CMP	R8, $256
+	BGT	loop
+tail:
+	SUB	$1, R8
+	EXRL	$cmpbodyclc<>(SB), R8
+	BGT	gt
+	BLT	lt
+cmplengths:
+	CMP	R4, R6
+	BEQ	eq
+	BLT	lt
+gt:
+	MOVD	$1, 0(R7)
+	RET
+lt:
+	MOVD	$-1, 0(R7)
+	RET
+eq:
+	MOVD	$0, 0(R7)
+	RET
+
+TEXT cmpbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
diff --git a/src/internal/bytealg/compare_wasm.s b/src/internal/bytealg/compare_wasm.s
new file mode 100644
index 0000000..dc8fb33
--- /dev/null
+++ b/src/internal/bytealg/compare_wasm.s
@@ -0,0 +1,115 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB), NOSPLIT, $0-56
+	Get SP
+	I64Load a_base+0(FP)
+	I64Load a_len+8(FP)
+	I64Load b_base+24(FP)
+	I64Load b_len+32(FP)
+	Call cmpbody<>(SB)
+	I64Store ret+48(FP)
+	RET
+
+TEXT runtime·cmpstring(SB), NOSPLIT, $0-40
+	Get SP
+	I64Load a_base+0(FP)
+	I64Load a_len+8(FP)
+	I64Load b_base+16(FP)
+	I64Load b_len+24(FP)
+	Call cmpbody<>(SB)
+	I64Store ret+32(FP)
+	RET
+
+// params: a, alen, b, blen
+// ret: -1/0/1
+TEXT cmpbody<>(SB), NOSPLIT, $0-0
+	// len = min(alen, blen)
+	Get R1
+	Get R3
+	Get R1
+	Get R3
+	I64LtU
+	Select
+	Set R4
+
+	Get R0
+	I32WrapI64
+	Get R2
+	I32WrapI64
+	Get R4
+	I32WrapI64
+	Call memcmp<>(SB)
+	I64ExtendI32S
+	Tee R5
+
+	I64Eqz
+	If
+		// check length
+		Get R1
+		Get R3
+		I64Sub
+		Set R5
+	End
+
+	I64Const $0
+	I64Const $-1
+	I64Const $1
+	Get R5
+	I64Const $0
+	I64LtS
+	Select
+	Get R5
+	I64Eqz
+	Select
+	Return
+
+// compiled with emscripten
+// params: a, b, len
+// ret: <0/0/>0
+TEXT memcmp<>(SB), NOSPLIT, $0-0
+	Get R2
+	If $1
+	Loop
+	Get R0
+	I32Load8S $0
+	Tee R3
+	Get R1
+	I32Load8S $0
+	Tee R4
+	I32Eq
+	If
+	Get R0
+	I32Const $1
+	I32Add
+	Set R0
+	Get R1
+	I32Const $1
+	I32Add
+	Set R1
+	I32Const $0
+	Get R2
+	I32Const $-1
+	I32Add
+	Tee R2
+	I32Eqz
+	BrIf $3
+	Drop
+	Br $1
+	End
+	End
+	Get R3
+	I32Const $255
+	I32And
+	Get R4
+	I32Const $255
+	I32And
+	I32Sub
+	Else
+	I32Const $0
+	End
+	Return
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
new file mode 100644
index 0000000..fa864c4
--- /dev/null
+++ b/src/internal/bytealg/count_amd64.s
@@ -0,0 +1,201 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGeneric(SB)
+	MOVQ	b_base+0(FP), SI
+	MOVQ	b_len+8(FP), BX
+	MOVB	c+24(FP), AL
+	LEAQ	ret+32(FP), R8
+	JMP	countbody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGenericString(SB)
+	MOVQ	s_base+0(FP), SI
+	MOVQ	s_len+8(FP), BX
+	MOVB	c+16(FP), AL
+	LEAQ	ret+24(FP), R8
+	JMP	countbody<>(SB)
+
+// input:
+//   SI: data
+//   BX: data len
+//   AL: byte sought
+//   R8: address to put result
+// This function requires the POPCNT instruction.
+TEXT countbody<>(SB),NOSPLIT,$0
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
+
+	CMPQ BX, $16
+	JLT small
+
+	MOVQ $0, R12 // Accumulator
+
+	MOVQ SI, DI
+
+	CMPQ BX, $32
+	JA avx2
+sse:
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
+	PMOVMSKB X1, DX
+	// Count number of matching bytes
+	POPCNTL DX, DX
+	// Accumulate into R12
+	ADDQ DX, R12
+	// Advance to next block.
+	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JBE	sseloop
+
+	// Get the number of bytes to consider in the last 16 bytes
+	ANDQ $15, BX
+	JZ end
+
+	// Create mask to ignore overlap between previous 16 byte block
+	// and the next.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Process the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched so we need to mask part of it.
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+end:
+	MOVQ R12, (R8)
+	RET
+
+// handle for lengths < 16
+small:
+	TESTQ	BX, BX
+	JEQ	endzero
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	// We must ignore high bytes as they aren't part of our slice.
+	// Create mask.
+	MOVB BX, CX
+	MOVQ $1, R10
+	SALQ CL, R10
+	SUBQ $1, R10
+
+	// Load data
+	MOVOU	(SI), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	MOVQ	DX, (R8)
+	RET
+endzero:
+	MOVQ $0, (R8)
+	RET
+
+endofpage:
+	// We must ignore low bytes as they aren't part of our slice.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Load data into the high end of X1.
+	MOVOU	-16(SI)(BX*1), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	POPCNTL DX, DX
+	MOVQ	DX, (R8)
+	RET
+
+avx2:
+	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE sse
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, Y1
+avx2_loop:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLE avx2_loop
+
+	// If last block is already processed,
+	// skip to the end.
+	CMPQ DI, R11
+	JEQ endavx
+
+	// Load address of the last 32 bytes.
+	// There is an overlap with the previous block.
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, DX
+	// Exit AVX mode.
+	VZEROUPPER
+
+	// Create mask to ignore overlap between previous 32 byte block
+	// and the next.
+	ANDQ $31, BX
+	MOVQ $32,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFFFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+	MOVQ R12, (R8)
+	RET
+endavx:
+	// Exit AVX mode.
+	VZEROUPPER
+	MOVQ R12, (R8)
+	RET
diff --git a/src/internal/bytealg/count_arm.s b/src/internal/bytealg/count_arm.s
new file mode 100644
index 0000000..f704ea0
--- /dev/null
+++ b/src/internal/bytealg/count_arm.s
@@ -0,0 +1,43 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R0
+	MOVW	b_len+4(FP), R1
+	MOVBU	c+12(FP), R2
+	MOVW	$ret+16(FP), R7
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R0
+	MOVW	s_len+4(FP), R1
+	MOVBU	c+8(FP), R2
+	MOVW	$ret+12(FP), R7
+	B	countbytebody<>(SB)
+
+// Input:
+// R0: data
+// R1: data length
+// R2: byte to find
+// R7: address to put result
+//
+// On exit:
+// R4 and R8 are clobbered
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	MOVW	$0, R8	// R8 = count of byte to search
+	CMP	$0, R1
+	B.EQ	done	// short path to handle 0-byte case
+	ADD	R0, R1	// R1 is the end of the range
+byte_loop:
+	MOVBU.P	1(R0), R4
+	CMP	R4, R2
+	ADD.EQ	$1, R8
+	CMP	R0, R1
+	B.NE	byte_loop
+done:
+	MOVW	R8, (R7)
+	RET
diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s
new file mode 100644
index 0000000..8cd703d
--- /dev/null
+++ b/src/internal/bytealg/count_arm64.s
@@ -0,0 +1,90 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	countbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R2: data len
+//   R1: byte to find
+//   R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	// R11 = count of byte to search
+	MOVD	$0, R11
+	// short path to handle 0-byte case
+	CBZ	R2, done
+	CMP	$0x20, R2
+	// jump directly to tail if length < 32
+	BLO	tail
+	ANDS	$0x1f, R0, R9
+	BEQ	chunk
+	// Work with not 32-byte aligned head
+	BIC	$0x1f, R0, R3
+	ADD	$0x20, R3
+head_loop:
+	MOVBU.P	1(R0), R5
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	SUB	$1, R2, R2
+	CMP	R0, R3
+	BNE	head_loop
+	// Work with 32-byte aligned chunks
+chunk:
+	BIC	$0x1f, R2, R9
+	// The first chunk can also be the last
+	CBZ	R9, tail
+	// R3 = end of 32-byte chunks
+	ADD	R0, R9, R3
+	MOVD	$1, R5
+	VMOV	R5, V5.B16
+	// R2 = length of tail
+	SUB	R9, R2, R2
+	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
+	VMOV	R1, V0.B16
+	// Clear the low 64-bit element of V7 and V8
+	VEOR	V7.B8, V7.B8, V7.B8
+	VEOR	V8.B8, V8.B8, V8.B8
+	// Count the target byte in 32-byte chunk
+chunk_loop:
+	VLD1.P	(R0), [V1.B16, V2.B16]
+	CMP	R0, R3
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// Clear the higher 7 bits
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	// Count lanes match the requested byte
+	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
+	VUADDLV	V6.B16, V7
+	// Accumulate the count in low 64-bit element of V8 when inside the loop
+	VADD	V7, V8
+	BNE	chunk_loop
+	VMOV	V8.D[0], R6
+	ADD	R6, R11, R11
+	CBZ	R2, done
+tail:
+	// Work with tail shorter than 32 bytes
+	MOVBU.P	1(R0), R5
+	SUB	$1, R2, R2
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	CBNZ	R2, tail
+done:
+	MOVD	R11, (R8)
+	RET
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
new file mode 100644
index 0000000..1891d29
--- /dev/null
+++ b/src/internal/bytealg/count_generic.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !arm && !arm64 && !ppc64le && !ppc64 && !riscv64 && !s390x
+// +build !amd64,!arm,!arm64,!ppc64le,!ppc64,!riscv64,!s390x
+
+package bytealg
+
+func Count(b []byte, c byte) int {
+	n := 0
+	for _, x := range b {
+		if x == c {
+			n++
+		}
+	}
+	return n
+}
+
+func CountString(s string, c byte) int {
+	n := 0
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			n++
+		}
+	}
+	return n
+}
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
new file mode 100644
index 0000000..a19a6f8
--- /dev/null
+++ b/src/internal/bytealg/count_native.go
@@ -0,0 +1,34 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 || arm || arm64 || ppc64le || ppc64 || riscv64 || s390x
+// +build amd64 arm arm64 ppc64le ppc64 riscv64 s390x
+
+package bytealg
+
+//go:noescape
+func Count(b []byte, c byte) int
+
+//go:noescape
+func CountString(s string, c byte) int
+
+// A backup implementation to use by assembly.
+func countGeneric(b []byte, c byte) int {
+	n := 0
+	for _, x := range b {
+		if x == c {
+			n++
+		}
+	}
+	return n
+}
+func countGenericString(s string, c byte) int {
+	n := 0
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			n++
+		}
+	}
+	return n
+}
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
new file mode 100644
index 0000000..94163cb
--- /dev/null
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -0,0 +1,98 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64le || ppc64
+// +build ppc64le ppc64
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+	MOVD  b_base+0(FP), R3    // R3 = byte array pointer
+	MOVD  b_len+8(FP), R4     // R4 = length
+	MOVBZ c+24(FP), R5        // R5 = byte
+	MOVD  $ret+32(FP), R14    // R14 = &ret
+	BR    countbytebody<>(SB)
+
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+	MOVD  s_base+0(FP), R3    // R3 = string
+	MOVD  s_len+8(FP), R4     // R4 = length
+	MOVBZ c+16(FP), R5        // R5 = byte
+	MOVD  $ret+24(FP), R14    // R14 = &ret
+	BR    countbytebody<>(SB)
+
+// R3: addr of string
+// R4: len of string
+// R5: byte to count
+// R14: addr for return value
+// endianness shouldn't matter since we are just counting and order
+// is irrelevant
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+	DCBT (R3)    // Prepare cache line.
+	MOVD R0, R18 // byte count
+	MOVD R3, R19 // Save base address for calculating the index later.
+	MOVD R4, R16
+
+	MOVD   R5, R6
+	RLDIMI $8, R6, $48, R6
+	RLDIMI $16, R6, $32, R6
+	RLDIMI $32, R6, $0, R6  // fill reg with the byte to count
+
+	VSPLTISW $3, V4     // used for shift
+	MTVRD    R6, V1     // move compare byte
+	VSPLTB   $7, V1, V1 // replicate byte across V1
+
+	CMPU   R4, $32          // Check if it's a small string (<32 bytes)
+	BLT    tail             // Jump to the small string case
+	XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
+
+cmploop:
+	LXVW4X (R3), VS32 // load bytes from string
+
+	// when the bytes match, the corresponding byte contains all 1s
+	VCMPEQUB V1, V0, V2     // compare bytes
+	VPOPCNTD V2, V3         // each double word contains its count
+	VADDUDM  V3, V5, V5     // accumulate bit count in each double word
+	ADD      $16, R3, R3    // increment pointer
+	SUB      $16, R16, R16  // remaining bytes
+	CMP      R16, $16       // at least 16 remaining?
+	BGE      cmploop
+	VSRD     V5, V4, V5     // shift by 3 to convert bits to bytes
+	VSLDOI   $8, V5, V5, V6 // get the double word values from vector
+	MFVSRD   V5, R9
+	MFVSRD   V6, R10
+	ADD      R9, R10, R9
+	ADD      R9, R18, R18
+
+tail:
+	CMP R16, $8 // 8 bytes left?
+	BLT small
+
+	MOVD    (R3), R12     // load 8 bytes
+	CMPB    R12, R6, R17  // compare bytes
+	POPCNTD R17, R15      // bit count
+	SRD     $3, R15, R15  // byte count
+	ADD     R15, R18, R18 // add to byte count
+
+next1:
+	ADD $8, R3, R3
+	SUB $8, R16, R16 // remaining bytes
+	BR  tail
+
+small:
+	CMP   $0, R16   // any remaining
+	BEQ   done
+	MOVBZ (R3), R12 // check each remaining byte
+	CMP   R12, R5
+	BNE   next2
+	ADD   $1, R18
+
+next2:
+	SUB $1, R16
+	ADD $1, R3  // inc address
+	BR  small
+
+done:
+	MOVD R18, (R14) // return count
+	RET
diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s
new file mode 100644
index 0000000..3f4eb23
--- /dev/null
+++ b/src/internal/bytealg/count_riscv64.s
@@ -0,0 +1,44 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	MOV	b_base+0(FP), A1
+	MOV	b_len+8(FP), A2
+	MOVBU	c+24(FP), A3	// byte to count
+	MOV	ZERO, A4	// count
+	ADD	A1, A2		// end
+
+loop:
+	BEQ	A1, A2, done
+	MOVBU	(A1), A5
+	ADD	$1, A1
+	BNE	A3, A5, loop
+	ADD	$1, A4
+	JMP	loop
+
+done:
+	MOV	A4, ret+32(FP)
+	RET
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	MOV	s_base+0(FP), A1
+	MOV	s_len+8(FP), A2
+	MOVBU	c+16(FP), A3	// byte to count
+	MOV	ZERO, A4	// count
+	ADD	A1, A2		// end
+
+loop:
+	BEQ	A1, A2, done
+	MOVBU	(A1), A5
+	ADD	$1, A1
+	BNE	A3, A5, loop
+	ADD	$1, A4
+	JMP	loop
+
+done:
+	MOV	A4, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/count_s390x.s b/src/internal/bytealg/count_s390x.s
new file mode 100644
index 0000000..2a3b5c0
--- /dev/null
+++ b/src/internal/bytealg/count_s390x.s
@@ -0,0 +1,169 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// condition code masks
+#define EQ 8
+#define NE 7
+
+// register assignments
+#define R_ZERO R0
+#define R_VAL  R1
+#define R_TMP  R2
+#define R_PTR  R3
+#define R_LEN  R4
+#define R_CHAR R5
+#define R_RET  R6
+#define R_ITER R7
+#define R_CNT  R8
+#define R_MPTR R9
+
+// vector register assignments
+#define V_ZERO V0
+#define V_CHAR V1
+#define V_MASK V2
+#define V_VAL  V3
+#define V_CNT  V4
+
+// mask for trailing bytes in vector implementation
+GLOBL countbytemask<>(SB), RODATA, $16
+DATA countbytemask<>+0(SB)/8, $0x0101010101010101
+DATA countbytemask<>+8(SB)/8, $0x0101010101010101
+
+// func Count(b []byte, c byte) int
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+	LMG   b+0(FP), R_PTR, R_LEN
+	MOVBZ c+24(FP), R_CHAR
+	MOVD  $ret+32(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// func CountString(s string, c byte) int
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+	LMG   s+0(FP), R_PTR, R_LEN
+	MOVBZ c+16(FP), R_CHAR
+	MOVD  $ret+24(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// input:
+// R_PTR  = address of array of bytes
+// R_LEN  = number of bytes in array
+// R_CHAR = byte value to count zero (extended to register width)
+// R_RET  = address of return value
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
+	MOVD  $countbytemask<>(SB), R_MPTR
+	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
+	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
+	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
+	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
+
+	// Start of vector code (have vector facility).
+	//
+	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
+	// vector 'load with length' (VLL). It will be in the range [-1,14].
+	// Also replicate c across a 16-byte vector and initialize V_ZERO.
+	ANDW  $0xf, R_LEN
+	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
+	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
+	ADDW  $-1, R_LEN
+	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
+
+	// Jump to loop if we have more than 15 bytes to process.
+	CGIJ $NE, R_ITER, $0, vxchunks
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+vxchunks:
+	// Load 0x01 into every byte element in the 16-byte mask vector.
+	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
+	VZERO  V_CNT      // initial uint128 count of 0
+
+vxloop:
+	// Load input bytes in 16-byte chunks.
+	VL (R_PTR), V_VAL
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Increment input string address.
+	MOVD $16(R_PTR), R_PTR
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+	// Repeat until all 16-byte chunks are done.
+	BRCTG R_ITER, vxloop
+
+	// Skip to end if there are no trailing bytes.
+	CIJ $EQ, R_LEN, $-1, vxret
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL
+	VN    V_MASK, V_VAL, V_VAL
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+vxret:
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+novx:
+	// Start of non-vector code (the vector facility not available).
+	//
+	// Initialise counter and constant zero.
+	MOVD $0, R_CNT
+	MOVD $0, R_ZERO
+
+loop:
+	// Read 1-byte from input and compare.
+	// Note: avoid putting LOCGR in critical path.
+	MOVBZ (R_PTR), R_VAL
+	MOVD  $1, R_TMP
+	MOVD  $1(R_PTR), R_PTR
+	CMPW  R_VAL, R_CHAR
+	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
+	ADD   R_TMP, R_CNT       // accumulate 64-bit result
+
+	// Repeat until all bytes have been checked.
+	BRCTG R_LEN, loop
+
+ret:
+	MOVD R_CNT, (R_RET)
+	RET
+
+ret0:
+	MOVD $0, (R_RET)
+	RET
diff --git a/src/internal/bytealg/equal_386.s b/src/internal/bytealg/equal_386.s
new file mode 100644
index 0000000..8723363
--- /dev/null
+++ b/src/internal/bytealg/equal_386.s
@@ -0,0 +1,129 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+	MOVL	a+0(FP), SI
+	MOVL	b+4(FP), DI
+	CMPL	SI, DI
+	JEQ	eq
+	MOVL	size+8(FP), BX
+	LEAL	ret+12(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB    $1, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+	MOVL    a+0(FP), SI
+	MOVL    b+4(FP), DI
+	CMPL    SI, DI
+	JEQ     eq
+	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
+	LEAL	ret+8(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB    $1, ret+8(FP)
+	RET
+
+// a in SI
+// b in DI
+// count in BX
+// address of result byte in AX
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+	CMPL	BX, $4
+	JB	small
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPL	BX, $64
+	JB	bigloop
+	CMPB	internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
+	JNE	bigloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0
+	PAND	X6, X4
+	PAND	X4, X0
+	PMOVMSKB X0, DX
+	ADDL	$64, SI
+	ADDL	$64, DI
+	SUBL	$64, BX
+	CMPL	DX, $0xffff
+	JEQ	hugeloop
+	MOVB	$0, (AX)
+	RET
+
+	// 4 bytes at a time using 32-bit register
+bigloop:
+	CMPL	BX, $4
+	JBE	leftover
+	MOVL	(SI), CX
+	MOVL	(DI), DX
+	ADDL	$4, SI
+	ADDL	$4, DI
+	SUBL	$4, BX
+	CMPL	CX, DX
+	JEQ	bigloop
+	MOVB	$0, (AX)
+	RET
+
+	// remaining 0-4 bytes
+leftover:
+	MOVL	-4(SI)(BX*1), CX
+	MOVL	-4(DI)(BX*1), DX
+	CMPL	CX, DX
+	SETEQ	(AX)
+	RET
+
+small:
+	CMPL	BX, $0
+	JEQ	equal
+
+	LEAL	0(BX*8), CX
+	NEGL	CX
+
+	MOVL	SI, DX
+	CMPB	DX, $0xfc
+	JA	si_high
+
+	// load at SI won't cross a page boundary.
+	MOVL	(SI), SI
+	JMP	si_finish
+si_high:
+	// address ends in 111111xx. Load up to bytes we want, move to correct position.
+	MOVL	-4(SI)(BX*1), SI
+	SHRL	CX, SI
+si_finish:
+
+	// same for DI.
+	MOVL	DI, DX
+	CMPB	DX, $0xfc
+	JA	di_high
+	MOVL	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVL	-4(DI)(BX*1), DI
+	SHRL	CX, DI
+di_finish:
+
+	SUBL	SI, DI
+	SHLL	CX, DI
+equal:
+	SETEQ	(AX)
+	RET
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
new file mode 100644
index 0000000..6f12d2a
--- /dev/null
+++ b/src/internal/bytealg/equal_amd64.s
@@ -0,0 +1,210 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = a    (want in SI)
+	// BX = b    (want in DI)
+	// CX = size (want in BX)
+	CMPQ	AX, BX
+	JNE	neq
+	MOVQ	$1, AX	// return 1
+	RET
+neq:
+	MOVQ	AX, SI
+	MOVQ	BX, DI
+	MOVQ	CX, BX
+	JMP	memeqbody<>(SB)
+#else
+	MOVQ	a+0(FP), SI
+	MOVQ	b+8(FP), DI
+	CMPQ	SI, DI
+	JEQ	eq
+	MOVQ	size+16(FP), BX
+	LEAQ	ret+24(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB	$1, ret+24(FP)
+	RET
+#endif
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = a       (want in SI)
+	// BX = b       (want in DI)
+	// 8(DX) = size (want in BX)
+	CMPQ	AX, BX
+	JNE	neq
+	MOVQ	$1, AX	// return 1
+	RET
+neq:
+	MOVQ	AX, SI
+	MOVQ	BX, DI
+	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
+	JMP	memeqbody<>(SB)
+#else
+	MOVQ	a+0(FP), SI
+	MOVQ	b+8(FP), DI
+	CMPQ	SI, DI
+	JEQ	eq
+	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
+	LEAQ	ret+16(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB	$1, ret+16(FP)
+	RET
+#endif
+
+// Input:
+//   a in SI
+//   b in DI
+//   count in BX
+#ifndef GOEXPERIMENT_regabiargs
+//   address of result byte in AX
+#else
+// Output:
+//   result in AX
+#endif
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+	CMPQ	BX, $8
+	JB	small
+	CMPQ	BX, $64
+	JB	bigloop
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JE	hugeloop_avx2
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPQ	BX, $64
+	JB	bigloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0
+	PAND	X6, X4
+	PAND	X4, X0
+	PMOVMSKB X0, DX
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffff
+	JEQ	hugeloop
+#ifdef GOEXPERIMENT_regabiargs
+	XORQ	AX, AX	// return 0
+#else
+	MOVB	$0, (AX)
+#endif
+	RET
+
+	// 64 bytes at a time using ymm registers
+hugeloop_avx2:
+	CMPQ	BX, $64
+	JB	bigloop_avx2
+	VMOVDQU	(SI), Y0
+	VMOVDQU	(DI), Y1
+	VMOVDQU	32(SI), Y2
+	VMOVDQU	32(DI), Y3
+	VPCMPEQB	Y1, Y0, Y4
+	VPCMPEQB	Y2, Y3, Y5
+	VPAND	Y4, Y5, Y6
+	VPMOVMSKB Y6, DX
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffffffff
+	JEQ	hugeloop_avx2
+	VZEROUPPER
+#ifdef GOEXPERIMENT_regabiargs
+	XORQ	AX, AX	// return 0
+#else
+	MOVB	$0, (AX)
+#endif
+	RET
+
+bigloop_avx2:
+	VZEROUPPER
+
+	// 8 bytes at a time using 64-bit register
+bigloop:
+	CMPQ	BX, $8
+	JBE	leftover
+	MOVQ	(SI), CX
+	MOVQ	(DI), DX
+	ADDQ	$8, SI
+	ADDQ	$8, DI
+	SUBQ	$8, BX
+	CMPQ	CX, DX
+	JEQ	bigloop
+#ifdef GOEXPERIMENT_regabiargs
+	XORQ	AX, AX	// return 0
+#else
+	MOVB	$0, (AX)
+#endif
+	RET
+
+	// remaining 0-8 bytes
+leftover:
+	MOVQ	-8(SI)(BX*1), CX
+	MOVQ	-8(DI)(BX*1), DX
+	CMPQ	CX, DX
+#ifdef GOEXPERIMENT_regabiargs
+	SETEQ	AX
+#else
+	SETEQ	(AX)
+#endif
+	RET
+
+small:
+	CMPQ	BX, $0
+	JEQ	equal
+
+	LEAQ	0(BX*8), CX
+	NEGQ	CX
+
+	CMPB	SI, $0xf8
+	JA	si_high
+
+	// load at SI won't cross a page boundary.
+	MOVQ	(SI), SI
+	JMP	si_finish
+si_high:
+	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
+	MOVQ	-8(SI)(BX*1), SI
+	SHRQ	CX, SI
+si_finish:
+
+	// same for DI.
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(BX*1), DI
+	SHRQ	CX, DI
+di_finish:
+
+	SUBQ	SI, DI
+	SHLQ	CX, DI
+equal:
+#ifdef GOEXPERIMENT_regabiargs
+	SETEQ	AX
+#else
+	SETEQ	(AX)
+#endif
+	RET
+
diff --git a/src/internal/bytealg/equal_arm.s b/src/internal/bytealg/equal_arm.s
new file mode 100644
index 0000000..a6c4369
--- /dev/null
+++ b/src/internal/bytealg/equal_arm.s
@@ -0,0 +1,91 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-13
+	MOVW	a+0(FP), R0
+	MOVW	b+4(FP), R2
+	CMP	R0, R2
+	B.EQ	eq
+	MOVW	size+8(FP), R1
+	CMP	$0, R1
+	B.EQ	eq		// short path to handle 0-byte case
+	MOVW	$ret+12(FP), R7
+	B	memeqbody<>(SB)
+eq:
+	MOVW	$1, R0
+	MOVB	R0, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-9
+	MOVW	a+0(FP), R0
+	MOVW	b+4(FP), R2
+	CMP	R0, R2
+	B.EQ	eq
+	MOVW	4(R7), R1	// compiler stores size at offset 4 in the closure
+	CMP	$0, R1
+	B.EQ	eq		// short path to handle 0-byte case
+	MOVW	$ret+8(FP), R7
+	B	memeqbody<>(SB)
+eq:
+	MOVW	$1, R0
+	MOVB	R0, ret+8(FP)
+	RET
+
+// Input:
+// R0: data of a
+// R1: length
+// R2: data of b
+// R7: points to return value
+//
+// On exit:
+// R4, R5 and R6 are clobbered
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	$1, R1
+	B.EQ	one		// 1-byte special case for better performance
+
+	CMP	$4, R1
+	ADD	R0, R1		// R1 is the end of the range to compare
+	B.LT	byte_loop	// length < 4
+	AND	$3, R0, R6
+	CMP	$0, R6
+	B.NE	byte_loop	// unaligned a, use byte-wise compare (TODO: try to align a)
+	AND	$3, R2, R6
+	CMP	$0, R6
+	B.NE	byte_loop	// unaligned b, use byte-wise compare
+	AND	$0xfffffffc, R1, R6
+	// length >= 4
+chunk4_loop:
+	MOVW.P	4(R0), R4
+	MOVW.P	4(R2), R5
+	CMP	R4, R5
+	B.NE	notequal
+	CMP	R0, R6
+	B.NE	chunk4_loop
+	CMP	R0, R1
+	B.EQ	equal		// reached the end
+byte_loop:
+	MOVBU.P	1(R0), R4
+	MOVBU.P	1(R2), R5
+	CMP	R4, R5
+	B.NE	notequal
+	CMP	R0, R1
+	B.NE	byte_loop
+equal:
+	MOVW	$1, R0
+	MOVB	R0, (R7)
+	RET
+one:
+	MOVBU	(R0), R4
+	MOVBU	(R2), R5
+	CMP	R4, R5
+	B.EQ	equal
+notequal:
+	MOVW	$0, R0
+	MOVB	R0, (R7)
+	RET
diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s
new file mode 100644
index 0000000..01aa7b7
--- /dev/null
+++ b/src/internal/bytealg/equal_arm64.s
@@ -0,0 +1,136 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD	size+16(FP), R1
+	// short path to handle 0-byte case
+	CBZ	R1, equal
+	MOVD	a+0(FP), R0
+	MOVD	b+8(FP), R2
+	MOVD	$ret+24(FP), R8
+	B	memeqbody<>(SB)
+equal:
+	MOVD	$1, R0
+	MOVB	R0, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R4
+	CMP	R3, R4
+	BEQ	eq
+	MOVD	8(R26), R5    // compiler stores size at offset 8 in the closure
+	CBZ	R5, eq
+	MOVD	R3, 8(RSP)
+	MOVD	R4, 16(RSP)
+	MOVD	R5, 24(RSP)
+	BL	runtime·memequal(SB)
+	MOVBU	32(RSP), R3
+	MOVB	R3, ret+16(FP)
+	RET
+eq:
+	MOVD	$1, R3
+	MOVB	R3, ret+16(FP)
+	RET
+
+// input:
+// R0: pointer a
+// R1: data len
+// R2: pointer b
+// R8: address to put result
+TEXT memeqbody<>(SB),NOSPLIT,$0
+	CMP	$1, R1
+	// handle 1-byte special case for better performance
+	BEQ	one
+	CMP	$16, R1
+	// handle specially if length < 16
+	BLO	tail
+	BIC	$0x3f, R1, R3
+	CBZ	R3, chunk16
+	// work with 64-byte chunks
+	ADD	R3, R0, R6	// end of chunks
+chunk64_loop:
+	VLD1.P	(R0), [V0.D2, V1.D2, V2.D2, V3.D2]
+	VLD1.P	(R2), [V4.D2, V5.D2, V6.D2, V7.D2]
+	VCMEQ	V0.D2, V4.D2, V8.D2
+	VCMEQ	V1.D2, V5.D2, V9.D2
+	VCMEQ	V2.D2, V6.D2, V10.D2
+	VCMEQ	V3.D2, V7.D2, V11.D2
+	VAND	V8.B16, V9.B16, V8.B16
+	VAND	V8.B16, V10.B16, V8.B16
+	VAND	V8.B16, V11.B16, V8.B16
+	CMP	R0, R6
+	VMOV	V8.D[0], R4
+	VMOV	V8.D[1], R5
+	CBZ	R4, not_equal
+	CBZ	R5, not_equal
+	BNE	chunk64_loop
+	AND	$0x3f, R1, R1
+	CBZ	R1, equal
+chunk16:
+	// work with 16-byte chunks
+	BIC	$0xf, R1, R3
+	CBZ	R3, tail
+	ADD	R3, R0, R6	// end of chunks
+chunk16_loop:
+	LDP.P	16(R0), (R4, R5)
+	LDP.P	16(R2), (R7, R9)
+	EOR	R4, R7
+	CBNZ	R7, not_equal
+	EOR	R5, R9
+	CBNZ	R9, not_equal
+	CMP	R0, R6
+	BNE	chunk16_loop
+	AND	$0xf, R1, R1
+	CBZ	R1, equal
+tail:
+	// special compare of tail with length < 16
+	TBZ	$3, R1, lt_8
+	MOVD	(R0), R4
+	MOVD	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$8, R1, R6	// offset of the last 8 bytes
+	MOVD	(R0)(R6), R4
+	MOVD	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
+lt_8:
+	TBZ	$2, R1, lt_4
+	MOVWU	(R0), R4
+	MOVWU	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$4, R1, R6	// offset of the last 4 bytes
+	MOVWU	(R0)(R6), R4
+	MOVWU	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
+lt_4:
+	TBZ	$1, R1, lt_2
+	MOVHU.P	2(R0), R4
+	MOVHU.P	2(R2), R5
+	CMP	R4, R5
+	BNE	not_equal
+lt_2:
+	TBZ	$0, R1, equal
+one:
+	MOVBU	(R0), R4
+	MOVBU	(R2), R5
+	CMP	R4, R5
+	BNE	not_equal
+equal:
+	MOVD	$1, R0
+	MOVB	R0, (R8)
+	RET
+not_equal:
+	MOVB	ZR, (R8)
+	RET
diff --git a/src/internal/bytealg/equal_generic.go b/src/internal/bytealg/equal_generic.go
new file mode 100644
index 0000000..59bdf8f
--- /dev/null
+++ b/src/internal/bytealg/equal_generic.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Equal reports whether a and b
+// are the same length and contain the same bytes.
+// A nil argument is equivalent to an empty slice.
+//
+// Equal is equivalent to bytes.Equal.
+// It is provided here for convenience,
+// because some packages cannot depend on bytes.
+func Equal(a, b []byte) bool {
+	// Neither cmd/compile nor gccgo allocates for these string conversions.
+	// There is a test for this in package bytes.
+	return string(a) == string(b)
+}
diff --git a/src/internal/bytealg/equal_mips64x.s b/src/internal/bytealg/equal_mips64x.s
new file mode 100644
index 0000000..c2f7d39
--- /dev/null
+++ b/src/internal/bytealg/equal_mips64x.s
@@ -0,0 +1,119 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	REGCTXT	R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	size+16(FP), R3
+	ADDV	R1, R3, R4
+
+	// chunk size is 16
+	SGTU	$16, R3, R8
+	BEQ	R0, R8, chunk_entry
+
+byte_loop:
+	BNE	R1, R4, byte_test
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+byte_test:
+	MOVBU	(R1), R6
+	ADDV	$1, R1
+	MOVBU	(R2), R7
+	ADDV	$1, R2
+	BEQ	R6, R7, byte_loop
+	JMP	not_eq
+
+chunk_entry:
+	// make sure both a and b are aligned
+	OR	R1, R2, R9
+	AND	$0x7, R9
+	BNE	R0, R9, byte_loop
+	JMP	chunk_loop_1
+
+chunk_loop:
+	// chunk size is 16
+	SGTU	$16, R3, R8
+	BNE	R0, R8, chunk_tail_8
+chunk_loop_1:
+	MOVV	(R1), R6
+	MOVV	(R2), R7
+	BNE	R6, R7, not_eq
+	MOVV	8(R1), R12
+	MOVV	8(R2), R13
+	ADDV	$16, R1
+	ADDV	$16, R2
+	SUBV	$16, R3
+	BEQ	R12, R13, chunk_loop
+	JMP	not_eq
+
+chunk_tail_8:
+	AND	$8, R3, R14
+	BEQ	R0, R14, chunk_tail_4
+	MOVV	(R1), R6
+	MOVV	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$8, R1
+	ADDV	$8, R2
+
+chunk_tail_4:
+	AND	$4, R3, R14
+	BEQ	R0, R14, chunk_tail_2
+	MOVWU	(R1), R6
+	MOVWU	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$4, R1
+	ADDV	$4, R2
+
+chunk_tail_2:
+	AND	$2, R3, R14
+	BEQ	R0, R14, chunk_tail_1
+	MOVHU	(R1), R6
+	MOVHU	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$2, R1
+	ADDV	$2, R2
+
+chunk_tail_1:
+	AND	$1, R3, R14
+	BEQ	R0, R14, eq
+	MOVBU	(R1), R6
+	MOVBU	(R2), R7
+	BEQ	R6, R7, eq
+
+not_eq:
+	MOVB	R0, ret+24(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	8(REGCTXT), R3    // compiler stores size at offset 8 in the closure
+	MOVV	R1, 8(R29)
+	MOVV	R2, 16(R29)
+	MOVV	R3, 24(R29)
+	JAL	runtime·memequal(SB)
+	MOVBU	32(R29), R1
+	MOVB	R1, ret+16(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/equal_mipsx.s b/src/internal/bytealg/equal_mipsx.s
new file mode 100644
index 0000000..11e5549
--- /dev/null
+++ b/src/internal/bytealg/equal_mipsx.s
@@ -0,0 +1,63 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	REGCTXT	R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+	MOVW	a+0(FP), R1
+	MOVW	b+4(FP), R2
+	BEQ	R1, R2, eq
+	MOVW	size+8(FP), R3
+	ADDU	R1, R3, R4
+loop:
+	BNE	R1, R4, test
+	MOVW	$1, R1
+	MOVB	R1, ret+12(FP)
+	RET
+test:
+	MOVBU	(R1), R6
+	ADDU	$1, R1
+	MOVBU	(R2), R7
+	ADDU	$1, R2
+	BEQ	R6, R7, loop
+
+	MOVB	R0, ret+12(FP)
+	RET
+eq:
+	MOVW	$1, R1
+	MOVB	R1, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+	MOVW	a+0(FP), R1
+	MOVW	b+4(FP), R2
+	BEQ	R1, R2, eq
+	MOVW	4(REGCTXT), R3	// compiler stores size at offset 4 in the closure
+	ADDU	R1, R3, R4
+loop:
+	BNE	R1, R4, test
+	MOVW	$1, R1
+	MOVB	R1, ret+8(FP)
+	RET
+test:
+	MOVBU	(R1), R6
+	ADDU	$1, R1
+	MOVBU	(R2), R7
+	ADDU	$1, R2
+	BEQ	R6, R7, loop
+
+	MOVB	R0, ret+8(FP)
+	RET
+eq:
+	MOVW	$1, R1
+	MOVB	R1, ret+8(FP)
+	RET
diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go
new file mode 100644
index 0000000..cf3a245
--- /dev/null
+++ b/src/internal/bytealg/equal_native.go
@@ -0,0 +1,21 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "unsafe"
+
+// The declarations below generate ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+// The compiler generates calls to runtime.memequal and runtime.memequal_varlen.
+// In addition, the runtime calls runtime.memequal explicitly.
+// Those functions are implemented in this package.
+
+//go:linkname abigen_runtime_memequal runtime.memequal
+func abigen_runtime_memequal(a, b unsafe.Pointer, size uintptr) bool
+
+//go:linkname abigen_runtime_memequal_varlen runtime.memequal_varlen
+func abigen_runtime_memequal_varlen(a, b unsafe.Pointer) bool
diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s
new file mode 100644
index 0000000..5f0fea5
--- /dev/null
+++ b/src/internal/bytealg/equal_ppc64x.s
@@ -0,0 +1,103 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD    a+0(FP), R3
+	MOVD    b+8(FP), R4
+	MOVD    size+16(FP), R5
+	MOVD    $ret+24(FP), R10
+
+	BR	memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R4
+	CMP	R3, R4
+	BEQ	eq
+	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
+	MOVD    $ret+16(FP), R10
+	BR	memeqbody<>(SB)
+eq:
+	MOVD	$1, R3
+	MOVB	R3, ret+16(FP)
+	RET
+
+// Do an efficient memequal for ppc64
+// R3 = s1
+// R4 = s2
+// R5 = len
+// R10 = addr of return value (byte)
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD    R5,CTR
+	CMP     R5,$8		// only optimize >=8
+	BLT     simplecheck
+	DCBT	(R3)		// cache hint
+	DCBT	(R4)
+	CMP	R5,$32		// optimize >= 32
+	MOVD	R5,R6		// needed if setup8a branch
+	BLT	setup8a		// 8 byte moves only
+setup32a:                       // 8 byte aligned, >= 32 bytes
+	SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
+	MOVD	R6,CTR
+	MOVD	$16,R14		// index for VSX loads and stores
+loop32a:
+	LXVD2X  (R3+R0), VS32	// VS32 = V0
+	LXVD2X  (R4+R0), VS33	// VS33 = V1
+	VCMPEQUBCC V0, V1, V2	// compare, setting CR6
+	BGE     CR6, noteq
+	LXVD2X  (R3+R14), VS32
+	LXVD2X  (R4+R14), VS33
+	VCMPEQUBCC V0, V1, V2
+	BGE     CR6, noteq
+	ADD     $32,R3		// bump up to next 32
+	ADD     $32,R4
+	BC      16, 0, loop32a  // br ctr and cr
+	ANDCC	$24,R5,R6       // Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC  $3,R6,R6        // get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD    R6,CTR
+loop8:
+	MOVD    0(R3),R6        // doublewords to compare
+	ADD	$8,R3
+	MOVD    0(R4),R7
+	ADD     $8,R4
+	CMP     R6,R7           // match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BNE     noteq
+leftover:
+	ANDCC   $7,R5,R6        // check for leftover bytes
+	BEQ     equal
+	MOVD    R6,CTR
+	BR	simple
+simplecheck:
+	CMP	R5,$0
+	BEQ	equal
+simple:
+	MOVBZ   0(R3), R6
+	ADD	$1,R3
+	MOVBZ   0(R4), R7
+	ADD     $1,R4
+	CMP     R6, R7
+	BNE     noteq
+	BC      8,2,simple
+	BNE	noteq
+	BR	equal
+noteq:
+	MOVB    $0, (R10)
+	RET
+equal:
+	MOVD	$1, R3
+	MOVB	R3, (R10)
+	RET
+
diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s
new file mode 100644
index 0000000..22cb4fa
--- /dev/null
+++ b/src/internal/bytealg/equal_riscv64.s
@@ -0,0 +1,49 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	CTXT	S4
+
+// func memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOV	a+0(FP), A1
+	MOV	b+8(FP), A2
+	BEQ	A1, A2, eq
+	MOV	size+16(FP), A3
+	ADD	A1, A3, A4
+loop:
+	BEQ	A1, A4, eq
+
+	MOVBU	(A1), A6
+	ADD	$1, A1
+	MOVBU	(A2), A7
+	ADD	$1, A2
+	BEQ	A6, A7, loop
+
+	MOVB	ZERO, ret+24(FP)
+	RET
+eq:
+	MOV	$1, A1
+	MOVB	A1, ret+24(FP)
+	RET
+
+// func memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOV	a+0(FP), A1
+	MOV	b+8(FP), A2
+	BEQ	A1, A2, eq
+	MOV	8(CTXT), A3    // compiler stores size at offset 8 in the closure
+	MOV	A1, 8(X2)
+	MOV	A2, 16(X2)
+	MOV	A3, 24(X2)
+	CALL	runtime·memequal(SB)
+	MOVBU	32(X2), A1
+	MOVB	A1, ret+16(FP)
+	RET
+eq:
+	MOV	$1, A1
+	MOVB	A1, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/equal_s390x.s b/src/internal/bytealg/equal_s390x.s
new file mode 100644
index 0000000..67f814d
--- /dev/null
+++ b/src/internal/bytealg/equal_s390x.s
@@ -0,0 +1,92 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	size+16(FP), R6
+	LA	ret+24(FP), R7
+	BR	memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	8(R12), R6    // compiler stores size at offset 8 in the closure
+	LA	ret+16(FP), R7
+	BR	memeqbody<>(SB)
+
+// input:
+//   R3 = a
+//   R5 = b
+//   R6 = len
+//   R7 = address of output byte (stores 0 or 1 here)
+//   a and b have the same length
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, equal
+loop:
+	CMPBEQ	R6, $0, equal
+	CMPBLT	R6, $32, tiny
+	CMP	R6, $256
+	BLT	tail
+	CLC	$256, 0(R3), 0(R5)
+	BNE	notequal
+	SUB	$256, R6
+	LA	256(R3), R3
+	LA	256(R5), R5
+	BR	loop
+tail:
+	SUB	$1, R6, R8
+	EXRL	$memeqbodyclc<>(SB), R8
+	BEQ	equal
+notequal:
+	MOVB	$0, 0(R7)
+	RET
+equal:
+	MOVB	$1, 0(R7)
+	RET
+tiny:
+	MOVD	$0, R2
+	CMPBLT	R6, $16, lt16
+	MOVD	0(R3), R8
+	MOVD	0(R5), R9
+	CMPBNE	R8, R9, notequal
+	MOVD	8(R3), R8
+	MOVD	8(R5), R9
+	CMPBNE	R8, R9, notequal
+	LA	16(R2), R2
+	SUB	$16, R6
+lt16:
+	CMPBLT	R6, $8, lt8
+	MOVD	0(R3)(R2*1), R8
+	MOVD	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	8(R2), R2
+	SUB	$8, R6
+lt8:
+	CMPBLT	R6, $4, lt4
+	MOVWZ	0(R3)(R2*1), R8
+	MOVWZ	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	4(R2), R2
+	SUB	$4, R6
+lt4:
+#define CHECK(n) \
+	CMPBEQ	R6, $n, equal \
+	MOVB	n(R3)(R2*1), R8 \
+	MOVB	n(R5)(R2*1), R9 \
+	CMPBNE	R8, R9, notequal
+	CHECK(0)
+	CHECK(1)
+	CHECK(2)
+	CHECK(3)
+	BR	equal
+
+TEXT memeqbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
diff --git a/src/internal/bytealg/equal_wasm.s b/src/internal/bytealg/equal_wasm.s
new file mode 100644
index 0000000..a2b76c1
--- /dev/null
+++ b/src/internal/bytealg/equal_wasm.s
@@ -0,0 +1,77 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB), NOSPLIT, $0-25
+	Get SP
+	I64Load a+0(FP)
+	I64Load b+8(FP)
+	I64Load size+16(FP)
+	Call memeqbody<>(SB)
+	I64Store8 ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB), NOSPLIT, $0-17
+	Get SP
+	I64Load a+0(FP)
+	I64Load b+8(FP)
+	I64Load 8(CTXT) // compiler stores size at offset 8 in the closure
+	Call memeqbody<>(SB)
+	I64Store8 ret+16(FP)
+	RET
+
+// params: a, b, len
+// ret: 0/1
+TEXT memeqbody<>(SB), NOSPLIT, $0-0
+	Get R0
+	Get R1
+	I64Eq
+	If
+		I64Const $1
+		Return
+	End
+
+loop:
+	Loop
+		Get R2
+		I64Eqz
+		If
+			I64Const $1
+			Return
+		End
+
+		Get R0
+		I32WrapI64
+		I64Load8U $0
+		Get R1
+		I32WrapI64
+		I64Load8U $0
+		I64Ne
+		If
+			I64Const $0
+			Return
+		End
+
+		Get R0
+		I64Const $1
+		I64Add
+		Set R0
+
+		Get R1
+		I64Const $1
+		I64Add
+		Set R1
+
+		Get R2
+		I64Const $1
+		I64Sub
+		Set R2
+
+		Br loop
+	End
+	UNDEF
diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go
new file mode 100644
index 0000000..c7a1941
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	if cpu.X86.HasAVX2 {
+		MaxLen = 63
+	} else {
+		MaxLen = 31
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
new file mode 100644
index 0000000..6193b57
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.s
@@ -0,0 +1,274 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+24(FP), R8
+	MOVQ b_len+32(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+48(FP), R11
+	JMP  indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+16(FP), R8
+	MOVQ b_len+24(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+32(FP), R11
+	JMP  indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// R8: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+	CMPQ AX, DX
+	JA fail
+	CMPQ DX, $16
+	JAE sse42
+no_sse42:
+	CMPQ AX, $2
+	JA   _3_or_more
+	MOVW (R8), R8
+	LEAQ -1(DI)(DX*1), DX
+loop2:
+	MOVW (DI), SI
+	CMPW SI,R8
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop2
+	JMP fail
+_3_or_more:
+	CMPQ AX, $3
+	JA   _4_or_more
+	MOVW 1(R8), BX
+	MOVW (R8), R8
+	LEAQ -2(DI)(DX*1), DX
+loop3:
+	MOVW (DI), SI
+	CMPW SI,R8
+	JZ   partial_success3
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+partial_success3:
+	MOVW 1(DI), SI
+	CMPW SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+_4_or_more:
+	CMPQ AX, $4
+	JA   _5_or_more
+	MOVL (R8), R8
+	LEAQ -3(DI)(DX*1), DX
+loop4:
+	MOVL (DI), SI
+	CMPL SI,R8
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop4
+	JMP fail
+_5_or_more:
+	CMPQ AX, $7
+	JA   _8_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVL -4(R8)(AX*1), BX
+	MOVL (R8), R8
+loop5to7:
+	MOVL (DI), SI
+	CMPL SI,R8
+	JZ   partial_success5to7
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+partial_success5to7:
+	MOVL -4(AX)(DI*1), SI
+	CMPL SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+_8_or_more:
+	CMPQ AX, $8
+	JA   _9_or_more
+	MOVQ (R8), R8
+	LEAQ -7(DI)(DX*1), DX
+loop8:
+	MOVQ (DI), SI
+	CMPQ SI,R8
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop8
+	JMP fail
+_9_or_more:
+	CMPQ AX, $15
+	JA   _16_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVQ -8(R8)(AX*1), BX
+	MOVQ (R8), R8
+loop9to15:
+	MOVQ (DI), SI
+	CMPQ SI,R8
+	JZ   partial_success9to15
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+partial_success9to15:
+	MOVQ -8(AX)(DI*1), SI
+	CMPQ SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+_16_or_more:
+	CMPQ AX, $16
+	JA   _17_or_more
+	MOVOU (R8), X1
+	LEAQ -15(DI)(DX*1), DX
+loop16:
+	MOVOU (DI), X2
+	PCMPEQB X1, X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop16
+	JMP fail
+_17_or_more:
+	CMPQ AX, $31
+	JA   _32_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVOU -16(R8)(AX*1), X0
+	MOVOU (R8), X1
+loop17to31:
+	MOVOU (DI), X2
+	PCMPEQB X1,X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   partial_success17to31
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+partial_success17to31:
+	MOVOU -16(AX)(DI*1), X3
+	PCMPEQB X0, X3
+	PMOVMSKB X3, SI
+	CMPQ  SI, $0xffff
+	JE success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+	CMPQ AX, $32
+	JA   _33_to_63
+	VMOVDQU (R8), Y1
+	LEAQ -31(DI)(DX*1), DX
+loop32:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop32
+	JMP fail_avx2
+_33_to_63:
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	VMOVDQU -32(R8)(AX*1), Y0
+	VMOVDQU (R8), Y1
+loop33to63:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   partial_success33to63
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+	JMP fail_avx2
+partial_success33to63:
+	VMOVDQU -32(AX)(DI*1), Y3
+	VPCMPEQB Y0, Y3, Y4
+	VPMOVMSKB Y4, SI
+	CMPL  SI, $0xffffffff
+	JE success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+fail_avx2:
+	VZEROUPPER
+fail:
+	MOVQ $-1, (R11)
+	RET
+success_avx2:
+	VZEROUPPER
+	JMP success
+sse42:
+	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
+	JNE no_sse42
+	CMPQ AX, $12
+	// PCMPESTRI is slower than normal compare,
+	// so using it makes sense only if we advance 4+ bytes per compare
+	// This value was determined experimentally and is the ~same
+	// on Nehalem (first with SSE42) and Haswell.
+	JAE _9_or_more
+	LEAQ 16(R8), SI
+	TESTW $0xff0, SI
+	JEQ no_sse42
+	MOVOU (R8), X1
+	LEAQ -15(DI)(DX*1), SI
+	MOVQ $16, R9
+	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
+	// for equality (bits 2,3 are 11)
+	// result is not masked or inverted (bits 4,5 are 00)
+	// and corresponds to first matching byte (bit 6 is 0)
+	PCMPESTRI $0x0c, (DI), X1
+	// CX == 16 means no match,
+	// CX > R9 means partial match at the end of the string,
+	// otherwise sep is at offset CX from X1 start
+	CMPQ CX, R9
+	JBE sse42_success
+	ADDQ R9, DI
+	CMPQ DI, SI
+	JB loop_sse42
+	PCMPESTRI $0x0c, -1(SI), X1
+	CMPQ CX, R9
+	JA fail
+	LEAQ -1(SI), DI
+sse42_success:
+	ADDQ CX, DI
+success:
+	SUBQ R10, DI
+	MOVQ DI, (R11)
+	RET
diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go
new file mode 100644
index 0000000..e87c109
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.go
@@ -0,0 +1,23 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using Index can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+	// Optimize cases where the length of the substring is less than 32 bytes
+	MaxLen = 32
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 16 characters, plus a few slop to start.
+	return 4 + n>>4
+}
diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s
new file mode 100644
index 0000000..3a551a7
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.s
@@ -0,0 +1,206 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+24(FP), R2
+	MOVD	b_len+32(FP), R3
+	MOVD	$ret+48(FP), R9
+	B	indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+16(FP), R2
+	MOVD	b_len+24(FP), R3
+	MOVD	$ret+32(FP), R9
+	B	indexbody<>(SB)
+
+// input:
+//   R0: haystack
+//   R1: length of haystack
+//   R2: needle
+//   R3: length of needle (2 <= len <= 32)
+//   R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
+	// main idea is to load 'sep' into separate register(s)
+	// to avoid repeatedly re-load it again and again
+	// for sebsequent substring comparisons
+	SUB	R3, R1, R4
+	// R4 contains the start of last substring for comparison
+	ADD	R0, R4, R4
+	ADD	$1, R0, R8
+
+	CMP	$8, R3
+	BHI	greater_8
+	TBZ	$3, R3, len_2_7
+len_8:
+	// R5 contains 8-byte of sep
+	MOVD	(R2), R5
+loop_8:
+	// R6 contains substring for comparison
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_8
+	B	found
+len_2_7:
+	TBZ	$2, R3, len_2_3
+	TBZ	$1, R3, len_4_5
+	TBZ	$0, R3, len_6
+len_7:
+	// R5 and R6 contain 7-byte of sep
+	MOVWU	(R2), R5
+	// 1-byte overlap with R5
+	MOVWU	3(R2), R6
+loop_7:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_7
+	MOVWU	2(R0), R3
+	CMP	R6, R3
+	BNE	loop_7
+	B	found
+len_6:
+	// R5 and R6 contain 6-byte of sep
+	MOVWU	(R2), R5
+	MOVHU	4(R2), R6
+loop_6:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_6
+	MOVHU	3(R0), R3
+	CMP	R6, R3
+	BNE	loop_6
+	B	found
+len_4_5:
+	TBZ	$0, R3, len_4
+len_5:
+	// R5 and R7 contain 5-byte of sep
+	MOVWU	(R2), R5
+	MOVBU	4(R2), R7
+loop_5:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_5
+	MOVBU	3(R0), R3
+	CMP	R7, R3
+	BNE	loop_5
+	B	found
+len_4:
+	// R5 contains 4-byte of sep
+	MOVWU	(R2), R5
+loop_4:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_4
+	B	found
+len_2_3:
+	TBZ	$0, R3, len_2
+len_3:
+	// R6 and R7 contain 3-byte of sep
+	MOVHU	(R2), R6
+	MOVBU	2(R2), R7
+loop_3:
+	CMP	R4, R0
+	BHI	not_found
+	MOVHU.P	1(R0), R3
+	CMP	R6, R3
+	BNE	loop_3
+	MOVBU	1(R0), R3
+	CMP	R7, R3
+	BNE	loop_3
+	B	found
+len_2:
+	// R5 contains 2-byte of sep
+	MOVHU	(R2), R5
+loop_2:
+	CMP	R4, R0
+	BHI	not_found
+	MOVHU.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_2
+found:
+	SUB	R8, R0, R0
+	MOVD	R0, (R9)
+	RET
+not_found:
+	MOVD	$-1, R0
+	MOVD	R0, (R9)
+	RET
+greater_8:
+	SUB	$9, R3, R11	// len(sep) - 9, offset of R0 for last 8 bytes
+	CMP	$16, R3
+	BHI	greater_16
+len_9_16:
+	MOVD.P	8(R2), R5	// R5 contains the first 8-byte of sep
+	SUB	$16, R3, R7	// len(sep) - 16, offset of R2 for last 8 bytes
+	MOVD	(R2)(R7), R6	// R6 contains the last 8-byte of sep
+loop_9_16:
+	// search the first 8 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R7
+	CMP	R5, R7
+	BNE	loop_9_16
+	MOVD	(R0)(R11), R7
+	CMP	R6, R7		// compare the last 8 bytes
+	BNE	loop_9_16
+	B	found
+greater_16:
+	CMP	$24, R3
+	BHI	len_25_32
+len_17_24:
+	LDP.P	16(R2), (R5, R6)	// R5 and R6 contain the first 16-byte of sep
+	SUB	$24, R3, R10		// len(sep) - 24
+	MOVD	(R2)(R10), R7		// R7 contains the last 8-byte of sep
+loop_17_24:
+	// search the first 16 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R10
+	CMP	R5, R10
+	BNE	loop_17_24
+	MOVD	7(R0), R10
+	CMP	R6, R10
+	BNE	loop_17_24
+	MOVD	(R0)(R11), R10
+	CMP	R7, R10		// compare the last 8 bytes
+	BNE	loop_17_24
+	B	found
+len_25_32:
+	LDP.P	16(R2), (R5, R6)
+	MOVD.P	8(R2), R7	// R5, R6 and R7 contain the first 24-byte of sep
+	SUB	$32, R3, R12	// len(sep) - 32
+	MOVD	(R2)(R12), R10	// R10 contains the last 8-byte of sep
+loop_25_32:
+	// search the first 24 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R12
+	CMP	R5, R12
+	BNE	loop_25_32
+	MOVD	7(R0), R12
+	CMP	R6, R12
+	BNE	loop_25_32
+	MOVD	15(R0), R12
+	CMP	R7, R12
+	BNE	loop_25_32
+	MOVD	(R0)(R11), R12
+	CMP	R10, R12	// compare the last 8 bytes
+	BNE	loop_25_32
+	B	found
diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go
new file mode 100644
index 0000000..0a6eb90
--- /dev/null
+++ b/src/internal/bytealg/index_generic.go
@@ -0,0 +1,30 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
+// +build !amd64,!arm64,!s390x,!ppc64le,!ppc64
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+	panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+	panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	panic("unimplemented")
+}
diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go
new file mode 100644
index 0000000..9547a5d
--- /dev/null
+++ b/src/internal/bytealg/index_native.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 || arm64 || s390x || ppc64le || ppc64
+// +build amd64 arm64 s390x ppc64le ppc64
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
diff --git a/src/internal/bytealg/index_ppc64x.go b/src/internal/bytealg/index_ppc64x.go
new file mode 100644
index 0000000..c9b2b5a
--- /dev/null
+++ b/src/internal/bytealg/index_ppc64x.go
@@ -0,0 +1,28 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (aix || linux) && (ppc64 || ppc64le)
+// +build aix linux
+// +build ppc64 ppc64le
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 16
+
+var SupportsPower9 = cpu.PPC64.IsPOWER9
+
+func init() {
+	MaxLen = 32
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_ppc64x.s b/src/internal/bytealg/index_ppc64x.s
new file mode 100644
index 0000000..3ed9442
--- /dev/null
+++ b/src/internal/bytealg/index_ppc64x.s
@@ -0,0 +1,768 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is an implementation based on the s390x
+// implementation.
+
+// Find a separator with 2 <= len <= 32 within a string.
+// Separators with lengths of 2, 3 or 4 are handled
+// specially.
+
+// This works on power8 and above. The loads and
+// compares are done in big endian order
+// since that allows the used of VCLZD, and allows
+// the same implementation to work on big and little
+// endian platforms with minimal conditional changes.
+
+// NOTE: There is a power9 implementation that
+// improves performance by 10-15% on little
+// endian for some of the benchmarks, but
+// work is still needed for a big endian
+// implementation on power9.
+
+//go:build ppc64 || ppc64le
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Needed to swap LXVD2X loads to the correct
+// byte order to work on POWER8.
+
+#ifdef GOARCH_ppc64
+DATA byteswap<>+0(SB)/8, $0x0001020304050607
+DATA byteswap<>+8(SB)/8, $0x08090a0b0c0d0e0f
+#else
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+#endif
+
+// Load bytes in big endian order. Address
+// alignment does not need checking.
+#define VLOADSWAP(base, index, vreg, vsreg) \
+	LXVD2X (base)(index), vsreg;  \
+	VPERM  vreg, vreg, SWAP, vreg
+
+GLOBL byteswap<>+0(SB), RODATA, $16
+
+TEXT ·Index(SB), NOSPLIT|NOFRAME, $0-56
+	MOVD a_base+0(FP), R3  // R3 = byte array pointer
+	MOVD a_len+8(FP), R4   // R4 = length
+	MOVD b_base+24(FP), R5 // R5 = separator pointer
+	MOVD b_len+32(FP), R6  // R6 = separator length
+	MOVD $ret+48(FP), R14  // R14 = &ret
+
+#ifdef GOARCH_ppc64le
+	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+	CMP   R7, $1
+	BNE   power8
+	BR    indexbodyp9<>(SB)
+
+#endif
+power8:
+	BR indexbody<>(SB)
+
+TEXT ·IndexString(SB), NOSPLIT|NOFRAME, $0-40
+	MOVD a_base+0(FP), R3  // R3 = string
+	MOVD a_len+8(FP), R4   // R4 = length
+	MOVD b_base+16(FP), R5 // R5 = separator pointer
+	MOVD b_len+24(FP), R6  // R6 = separator length
+	MOVD $ret+32(FP), R14  // R14 = &ret
+
+#ifdef GOARCH_ppc64le
+	MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R7
+	CMP   R7, $1
+	BNE   power8
+	BR    indexbody<>(SB)
+
+#endif
+power8:
+	BR indexbody<>(SB)
+
+	// s: string we are searching
+	// sep: string to search for
+	// R3=&s[0], R4=len(s)
+	// R5=&sep[0], R6=len(sep)
+	// R14=&ret (index where sep found)
+	// R7=working addr of string
+	// R16=index value 16
+	// R17=index value 17
+	// R18=index value 18
+	// R19=index value 1
+	// R26=LASTBYTE of string
+	// R27=LASTSTR last start byte to compare with sep
+	// R8, R9 scratch
+	// V0=sep left justified zero fill
+	// CR4=sep length >= 16
+
+#define SEPMASK V17
+#define LASTBYTE R26
+#define LASTSTR R27
+#define ONES V20
+#define SWAP V21
+#define V0_ VS32
+#define V1_ VS33
+#define V2_ VS34
+#define V3_ VS35
+#define V4_ VS36
+#define V5_ VS37
+#define V6_ VS38
+#define V7_ VS39
+#define V8_ VS40
+#define V9_ VS41
+#define SWAP_ VS53
+TEXT indexbody<>(SB), NOSPLIT|NOFRAME, $0
+	CMP      R6, R4                 // Compare lengths
+	BGT      notfound               // If sep len is > string, notfound
+	ADD      R4, R3, LASTBYTE       // find last byte addr
+	SUB      R6, LASTBYTE, LASTSTR  // LAST=&s[len(s)-len(sep)] (last valid start index)
+	CMP      R6, $0                 // Check sep len
+	BEQ      notfound               // sep len 0 -- not found
+	MOVD     R3, R7                 // Copy of string addr
+	MOVD     $16, R16               // Index value 16
+	MOVD     $17, R17               // Index value 17
+	MOVD     $18, R18               // Index value 18
+	MOVD     $1, R19                // Index value 1
+	MOVD     $byteswap<>+00(SB), R8
+	VSPLTISB $0xFF, ONES            // splat all 1s
+	LXVD2X   (R8)(R0), SWAP_        // Set up swap string
+
+	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
+	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
+	BGE    CR4, loadge16       // Load for len(sep) >= 16
+	SUB    R6, R16, R9         // 16-len of sep
+	SLD    $3, R9              // Set up for VSLO
+	MTVSRD R9, V9_             // Set up for VSLO
+	VSLDOI $8, V9, V9, V9      // Set up for VSLO
+	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
+
+loadge16:
+	ANDCC $15, R5, R9 // Find byte offset of sep
+	ADD   R9, R6, R10 // Add sep len
+	CMP   R10, $16    // Check if sep len+offset > 16
+	BGE   sepcross16  // Sep crosses 16 byte boundary
+
+	RLDICR $0, R5, $59, R8 // Adjust addr to 16 byte container
+	VLOADSWAP(R8, R0, V0, V0_)// Load 16 bytes @R8 into V0
+	SLD    $3, R9          // Set up shift count for VSLO
+	MTVSRD R9, V8_         // Set up shift count for VSLO
+	VSLDOI $8, V8, V8, V8
+	VSLO   V0, V8, V0      // Shift by start byte
+
+	VAND V0, SEPMASK, V0 // Mask separator (< 16)
+	BR   index2plus
+
+sepcross16:
+	VLOADSWAP(R5, R0, V0, V0_) // Load 16 bytes @R5 into V0
+
+	VAND V0, SEPMASK, V0 // mask out separator
+	BLE  CR4, index2to16
+	BR   index17plus     // Handle sep > 16
+
+index2plus:
+	CMP      R6, $2       // Check length of sep
+	BNE      index3plus   // If not 2, check for 3
+	ADD      $16, R7, R9  // Check if next 16 bytes past last
+	CMP      R9, LASTBYTE // compare with last
+	BGE      index2to16   // 2 <= len(string) <= 16
+	MOVD     $0xff00, R21 // Mask for later
+	MTVSRD   R21, V25     // Move to Vreg
+	VSPLTH   $3, V25, V31 // Splat mask
+	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
+	VSPLTISB $0, V10      // Clear V10
+
+	// First case: 2 byte separator
+	// V1: 2 byte separator splatted
+	// V2: 16 bytes at addr
+	// V4: 16 bytes at addr+1
+	// Compare 2 byte separator at start
+	// and at start+1. Use VSEL to combine
+	// those results to find the first
+	// matching start byte, returning
+	// that value when found. Loop as
+	// long as len(string) > 16
+index2loop2:
+	VLOADSWAP(R7, R19, V3, V3_) // Load 16 bytes @R7+1 into V3
+
+index2loop:
+	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
+	VCMPEQUH V1, V2, V5        // Search for sep
+	VCMPEQUH V1, V3, V6        // Search for sep offset by 1
+	VSEL     V6, V5, V31, V7   // merge even and odd indices
+	VCLZD    V7, V18           // find index of first match
+	MFVSRD   V18, R25          // get first value
+	CMP      R25, $64          // Found if < 64
+	BLT      foundR25          // Return byte index where found
+	VSLDOI   $8, V18, V18, V18 // Adjust 2nd value
+	MFVSRD   V18, R25          // get second value
+	CMP      R25, $64          // Found if < 64
+	ADD      $64, R25          // Update byte offset
+	BLT      foundR25          // Return value
+	ADD      $16, R7           // R7+=16 Update string pointer
+	ADD      $17, R7, R9       // R9=F7+17 since loop unrolled
+	CMP      R9, LASTBYTE      // Compare addr+17 against last byte
+	BLT      index2loop2       // If < last, continue loop
+	CMP      R7, LASTBYTE      // Compare addr+16 against last byte
+	BLT      index2to16        // If < 16 handle specially
+	VLOADSWAP(R7, R0, V3, V3_) // Load 16 bytes @R7 into V3
+	VSLDOI   $1, V3, V10, V3   // Shift left by 1 byte
+	BR       index2loop
+
+index3plus:
+	CMP    R6, $3       // Check if sep == 3
+	BNE    index4plus   // If not check larger
+	ADD    $19, R7, R9  // Find bytes for use in this loop
+	CMP    R9, LASTBYTE // Compare against last byte
+	BGE    index2to16   // Remaining string 2<=len<=16
+	MOVD   $0xff00, R21 // Set up mask for upcoming loop
+	MTVSRD R21, V25     // Move mask to Vreg
+	VSPLTH $3, V25, V31 // Splat mask
+	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
+	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
+
+	// Loop to process 3 byte separator.
+	// string[0:16] is in V2
+	// string[2:18] is in V3
+	// sep[0:2] splatted in V1
+	// sec[3] splatted in v8
+	// Load vectors at string, string+1
+	// and string+2. Compare string, string+1
+	// against first 2 bytes of separator
+	// splatted, and string+2 against 3rd
+	// byte splatted. Merge the results with
+	// VSEL to find the first byte of a match.
+
+	// Special handling for last 16 bytes if the
+	// string fits in 16 byte multiple.
+index3loop2:
+	MOVD     $2, R21          // Set up index for 2
+	VSPLTISB $0, V10          // Clear V10
+	VLOADSWAP(R7, R21, V3, V3_)// Load 16 bytes @R7+2 into V3
+	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+	VLOADSWAP(R7, R0, V2, V2_) // Load with correct order
+	VSLDOI   $1, V2, V3, V4    // string[1:17]
+	VSLDOI   $2, V2, V3, V9    // string[2:18]
+	VCMPEQUH V1, V2, V5        // compare hw even indices
+	VCMPEQUH V1, V4, V6        // compare hw odd indices
+	VCMPEQUB V8, V9, V10       // compare 3rd to last byte
+	VSEL     V6, V5, V31, V7   // Find 1st matching byte using mask
+	VAND     V7, V10, V7       // AND matched bytes with matched 3rd byte
+	VCLZD    V7, V18           // Find first nonzero indexes
+	MFVSRD   V18, R25          // Move 1st doubleword
+	CMP      R25, $64          // If < 64 found
+	BLT      foundR25          // Return matching index
+	VSLDOI   $8, V18, V18, V18 // Move value
+	MFVSRD   V18, R25          // Move 2nd doubleword
+	CMP      R25, $64          // If < 64 found
+	ADD      $64, R25          // Update byte index
+	BLT      foundR25          // Return matching index
+	ADD      $16, R7           // R7+=16 string ptr
+	ADD      $19, R7, R9       // Number of string bytes for loop
+	CMP      R9, LASTBYTE      // Compare against last byte of string
+	BLT      index3loop2       // If within, continue this loop
+	CMP      R7, LASTSTR       // Compare against last start byte
+	BLT      index2to16        // Process remainder
+	VSPLTISB $0, V3            // Special case for last 16 bytes
+	BR       index3loop        // Continue this loop
+
+	// Loop to process 4 byte separator
+	// string[0:16] in V2
+	// string[3:16] in V3
+	// sep[0:4] splatted in V1
+	// Set up vectors with strings at offsets
+	// 0, 1, 2, 3 and compare against the 4 byte
+	// separator also splatted. Use VSEL with the
+	// compare results to find the first byte where
+	// a separator match is found.
+index4plus:
+	CMP  R6, $4       // Check if 4 byte separator
+	BNE  index5plus   // If not next higher
+	ADD  $20, R7, R9  // Check string size to load
+	CMP  R9, LASTBYTE // Verify string length
+	BGE  index2to16   // If not large enough, process remaining
+	MOVD $2, R15      // Set up index
+
+	// Set up masks for use with VSEL
+	MOVD   $0xff, R21        // Set up mask 0xff000000ff000000...
+	SLD    $24, R21
+	MTVSRD R21, V10
+	VSPLTW $1, V10, V29
+	VSLDOI $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+	MOVD   $0xffff, R21
+	SLD    $16, R21
+	MTVSRD R21, V10
+	VSPLTW $1, V10, V31      // Mask 0xffff0000ffff0000...
+	VSPLTW $0, V0, V1        // Splat 1st word of separator
+
+index4loop:
+	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
+
+next4:
+	VSPLTISB $0, V10            // Clear
+	MOVD     $3, R9             // Number of bytes beyond 16
+	VLOADSWAP(R7, R9, V3, V3_)  // Load 16 bytes @R7+3 into V3
+	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
+	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
+	VSLDOI   $3, V2, V3, V10    // V10=(V2:v3)<<3
+	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
+	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
+	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
+	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
+	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
+	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+	VSEL     V14, V13, V31, V7  // final merge
+	VCLZD    V7, V18            // Find first index for each half
+	MFVSRD   V18, R25           // Isolate value
+	CMP      R25, $64           // If < 64, found
+	BLT      foundR25           // Return found index
+	VSLDOI   $8, V18, V18, V18  // Move for MFVSRD
+	MFVSRD   V18, R25           // Isolate other value
+	CMP      R25, $64           // If < 64, found
+	ADD      $64, R25           // Update index for high doubleword
+	BLT      foundR25           // Return found index
+	ADD      $16, R7            // R7+=16 for next string
+	ADD      $20, R7, R9        // R+20 for all bytes to load
+	CMP      R9, LASTBYTE       // Past end? Maybe check for extra?
+	BLT      index4loop         // If not, continue loop
+	CMP      R7, LASTSTR        // Check remainder
+	BLE      index2to16         // Process remainder
+	BR       notfound           // Not found
+
+index5plus:
+	CMP R6, $16     // Check for sep > 16
+	BGT index17plus // Handle large sep
+
+	// Assumption is that the separator is smaller than the string at this point
+index2to16:
+	CMP R7, LASTSTR // Compare last start byte
+	BGT notfound    // last takes len(sep) into account
+
+	ADD $16, R7, R9    // Check for last byte of string
+	CMP R9, LASTBYTE
+	BGT index2to16tail
+
+	// At least 16 bytes of string left
+	// Mask the number of bytes in sep
+index2to16loop:
+	VLOADSWAP(R7, R0, V1, V1_) // Load 16 bytes @R7 into V1
+
+compare:
+	VAND       V1, SEPMASK, V2 // Mask out sep size
+	VCMPEQUBCC V0, V2, V3      // Compare masked string
+	BLT        CR6, found      // All equal
+	ADD        $1, R7          // Update ptr to next byte
+	CMP        R7, LASTSTR     // Still less than last start byte
+	BGT        notfound        // Not found
+	ADD        $16, R7, R9     // Verify remaining bytes
+	CMP        R9, LASTBYTE    // At least 16
+	BLT        index2to16loop  // Try again
+
+	// Less than 16 bytes remaining in string
+	// Separator >= 2
+index2to16tail:
+	ADD   R3, R4, R9     // End of string
+	SUB   R7, R9, R9     // Number of bytes left
+	ANDCC $15, R7, R10   // 16 byte offset
+	ADD   R10, R9, R11   // offset + len
+	CMP   R11, $16       // >= 16?
+	BLE   short          // Does not cross 16 bytes
+	VLOADSWAP(R7, R0, V1, V1_)// Load 16 bytes @R7 into V1
+	BR    index2to16next // Continue on
+
+short:
+	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
+	VLOADSWAP(R9, R0, V1, V1_)// Load 16 bytes @R9 into V1
+	SLD      $3, R10         // Set up shift
+	MTVSRD   R10, V8_        // Set up shift
+	VSLDOI   $8, V8, V8, V8
+	VSLO     V1, V8, V1      // Shift by start byte
+	VSPLTISB $0, V25         // Clear for later use
+
+index2to16next:
+	VAND       V1, SEPMASK, V2 // Just compare size of sep
+	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
+	BLT        CR6, found      // Found
+	ADD        $1, R7          // Not found, try next partial string
+	CMP        R7, LASTSTR     // Check for end of string
+	BGT        notfound        // If at end, then not found
+	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
+	BR         index2to16next  // Check the next partial string
+
+index17plus:
+	CMP      R6, $32      // Check if 17 < len(sep) <= 32
+	BGT      index33plus
+	SUB      $16, R6, R9  // Extra > 16
+	SLD      $56, R9, R10 // Shift to use in VSLO
+	MTVSRD   R10, V9_     // Set up for VSLO
+	VLOADSWAP(R5, R9, V1, V1_)// Load 16 bytes @R5+R9 into V1
+	VSLO     V1, V9, V1   // Shift left
+	VSPLTISB $0xff, V7    // Splat 1s
+	VSPLTISB $0, V27      // Splat 0
+
+index17to32loop:
+	VLOADSWAP(R7, R0, V2, V2_) // Load 16 bytes @R7 into V2
+
+next17:
+	VLOADSWAP(R7, R9, V3, V3_) // Load 16 bytes @R7+R9 into V3
+	VSLO       V3, V9, V3      // Shift left
+	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
+	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
+	VAND       V4, V5, V6      // Check if both equal
+	VCMPEQUBCC V6, V7, V8      // All equal?
+	BLT        CR6, found      // Yes
+	ADD        $1, R7          // On to next byte
+	CMP        R7, LASTSTR     // Check if last start byte
+	BGT        notfound        // If too high, not found
+	BR         index17to32loop // Continue
+
+notfound:
+	MOVD $-1, R8   // Return -1 if not found
+	MOVD R8, (R14)
+	RET
+
+index33plus:
+	MOVD $0, (R0) // Case not implemented
+	RET           // Crash before return
+
+foundR25:
+	SRD  $3, R25   // Convert from bits to bytes
+	ADD  R25, R7   // Add to current string address
+	SUB  R3, R7    // Subtract from start of string
+	MOVD R7, (R14) // Return byte where found
+	RET
+
+found:
+	SUB  R3, R7    // Return byte where found
+	MOVD R7, (R14)
+	RET
+
+TEXT indexbodyp9<>(SB), NOSPLIT|NOFRAME, $0
+	CMP      R6, R4                // Compare lengths
+	BGT      notfound              // If sep len is > string, notfound
+	ADD      R4, R3, LASTBYTE      // find last byte addr
+	SUB      R6, LASTBYTE, LASTSTR // LAST=&s[len(s)-len(sep)] (last valid start index)
+	CMP      R6, $0                // Check sep len
+	BEQ      notfound              // sep len 0 -- not found
+	MOVD     R3, R7                // Copy of string addr
+	MOVD     $16, R16              // Index value 16
+	MOVD     $17, R17              // Index value 17
+	MOVD     $18, R18              // Index value 18
+	MOVD     $1, R19               // Index value 1
+	VSPLTISB $0xFF, ONES           // splat all 1s
+
+	CMP    R6, $16, CR4        // CR4 for len(sep) >= 16
+	VOR    ONES, ONES, SEPMASK // Set up full SEPMASK
+	BGE    CR4, loadge16       // Load for len(sep) >= 16
+	SUB    R6, R16, R9         // 16-len of sep
+	SLD    $3, R9              // Set up for VSLO
+	MTVSRD R9, V9_             // Set up for VSLO
+	VSLDOI $8, V9, V9, V9      // Set up for VSLO
+	VSLO   ONES, V9, SEPMASK   // Mask for separator len(sep) < 16
+
+loadge16:
+	ANDCC $15, R5, R9 // Find byte offset of sep
+	ADD   R9, R6, R10 // Add sep len
+	CMP   R10, $16    // Check if sep len+offset > 16
+	BGE   sepcross16  // Sep crosses 16 byte boundary
+
+	RLDICR  $0, R5, $59, R8 // Adjust addr to 16 byte container
+	LXVB16X (R8)(R0), V0_   // Load 16 bytes @R8 into V0
+	SLD     $3, R9          // Set up shift count for VSLO
+	MTVSRD  R9, V8_         // Set up shift count for VSLO
+	VSLDOI  $8, V8, V8, V8
+	VSLO    V0, V8, V0      // Shift by start byte
+
+	VAND V0, SEPMASK, V0 // Mask separator (< 16)
+	BR   index2plus
+
+sepcross16:
+	LXVB16X (R5)(R0), V0_ // Load 16 bytes @R5 into V0
+
+	VAND V0, SEPMASK, V0 // mask out separator
+	BLE  CR4, index2to16
+	BR   index17plus     // Handle sep > 16
+
+index2plus:
+	CMP      R6, $2       // Check length of sep
+	BNE      index3plus   // If not 2, check for 3
+	ADD      $16, R7, R9  // Check if next 16 bytes past last
+	CMP      R9, LASTBYTE // compare with last
+	BGE      index2to16   // 2 <= len(string) <= 16
+	MOVD     $0xff00, R21 // Mask for later
+	MTVSRD   R21, V25     // Move to Vreg
+	VSPLTH   $3, V25, V31 // Splat mask
+	VSPLTH   $0, V0, V1   // Splat 1st 2 bytes of sep
+	VSPLTISB $0, V10      // Clear V10
+
+	// First case: 2 byte separator
+	// V1: 2 byte separator splatted
+	// V2: 16 bytes at addr
+	// V4: 16 bytes at addr+1
+	// Compare 2 byte separator at start
+	// and at start+1. Use VSEL to combine
+	// those results to find the first
+	// matching start byte, returning
+	// that value when found. Loop as
+	// long as len(string) > 16
+index2loop2:
+	LXVB16X (R7)(R19), V3_ // Load 16 bytes @R7+1 into V3
+
+index2loop:
+	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7 into V2
+	VCMPEQUH V1, V2, V5      // Search for sep
+	VCMPEQUH V1, V3, V6      // Search for sep offset by 1
+	VSEL     V6, V5, V31, V7 // merge even and odd indices
+	VCLZD    V7, V18         // find index of first match
+	MFVSRD   V18, R25        // get first value
+	CMP      R25, $64        // Found if < 64
+	BLT      foundR25        // Return byte index where found
+
+	MFVSRLD V18, R25        // get second value
+	CMP     R25, $64        // Found if < 64
+	ADD     $64, R25        // Update byte offset
+	BLT     foundR25        // Return value
+	ADD     $16, R7         // R7+=16 Update string pointer
+	ADD     $17, R7, R9     // R9=F7+17 since loop unrolled
+	CMP     R9, LASTBYTE    // Compare addr+17 against last byte
+	BLT     index2loop2     // If < last, continue loop
+	CMP     R7, LASTBYTE    // Compare addr+16 against last byte
+	BLT     index2to16      // If < 16 handle specially
+	LXVB16X (R7)(R0), V3_   // Load 16 bytes @R7 into V3
+	VSLDOI  $1, V3, V10, V3 // Shift left by 1 byte
+	BR      index2loop
+
+index3plus:
+	CMP    R6, $3       // Check if sep == 3
+	BNE    index4plus   // If not check larger
+	ADD    $19, R7, R9  // Find bytes for use in this loop
+	CMP    R9, LASTBYTE // Compare against last byte
+	BGE    index2to16   // Remaining string 2<=len<=16
+	MOVD   $0xff00, R21 // Set up mask for upcoming loop
+	MTVSRD R21, V25     // Move mask to Vreg
+	VSPLTH $3, V25, V31 // Splat mask
+	VSPLTH $0, V0, V1   // Splat 1st two bytes of sep
+	VSPLTB $2, V0, V8   // Splat 3rd byte of sep
+
+	// Loop to process 3 byte separator.
+	// string[0:16] is in V2
+	// string[2:18] is in V3
+	// sep[0:2] splatted in V1
+	// sec[3] splatted in v8
+	// Load vectors at string, string+1
+	// and string+2. Compare string, string+1
+	// against first 2 bytes of separator
+	// splatted, and string+2 against 3rd
+	// byte splatted. Merge the results with
+	// VSEL to find the first byte of a match.
+
+	// Special handling for last 16 bytes if the
+	// string fits in 16 byte multiple.
+index3loop2:
+	MOVD     $2, R21          // Set up index for 2
+	VSPLTISB $0, V10          // Clear V10
+	LXVB16X  (R7)(R21), V3_   // Load 16 bytes @R7+2 into V3
+	VSLDOI   $14, V3, V10, V3 // Left justify next 2 bytes
+
+index3loop:
+	LXVB16X  (R7)(R0), V2_   // Load 16 bytes @R7
+	VSLDOI   $1, V2, V3, V4  // string[1:17]
+	VSLDOI   $2, V2, V3, V9  // string[2:18]
+	VCMPEQUH V1, V2, V5      // compare hw even indices
+	VCMPEQUH V1, V4, V6      // compare hw odd indices
+	VCMPEQUB V8, V9, V10     // compare 3rd to last byte
+	VSEL     V6, V5, V31, V7 // Find 1st matching byte using mask
+	VAND     V7, V10, V7     // AND matched bytes with matched 3rd byte
+	VCLZD    V7, V18         // Find first nonzero indexes
+	MFVSRD   V18, R25        // Move 1st doubleword
+	CMP      R25, $64        // If < 64 found
+	BLT      foundR25        // Return matching index
+
+	MFVSRLD  V18, R25     // Move 2nd doubleword
+	CMP      R25, $64     // If < 64 found
+	ADD      $64, R25     // Update byte index
+	BLT      foundR25     // Return matching index
+	ADD      $16, R7      // R7+=16 string ptr
+	ADD      $19, R7, R9  // Number of string bytes for loop
+	CMP      R9, LASTBYTE // Compare against last byte of string
+	BLT      index3loop2  // If within, continue this loop
+	CMP      R7, LASTSTR  // Compare against last start byte
+	BLT      index2to16   // Process remainder
+	VSPLTISB $0, V3       // Special case for last 16 bytes
+	BR       index3loop   // Continue this loop
+
+	// Loop to process 4 byte separator
+	// string[0:16] in V2
+	// string[3:16] in V3
+	// sep[0:4] splatted in V1
+	// Set up vectors with strings at offsets
+	// 0, 1, 2, 3 and compare against the 4 byte
+	// separator also splatted. Use VSEL with the
+	// compare results to find the first byte where
+	// a separator match is found.
+index4plus:
+	CMP  R6, $4       // Check if 4 byte separator
+	BNE  index5plus   // If not next higher
+	ADD  $20, R7, R9  // Check string size to load
+	CMP  R9, LASTBYTE // Verify string length
+	BGE  index2to16   // If not large enough, process remaining
+	MOVD $2, R15      // Set up index
+
+	// Set up masks for use with VSEL
+	MOVD    $0xff, R21 // Set up mask 0xff000000ff000000...
+	SLD     $24, R21
+	MTVSRWS R21, V29
+
+	VSLDOI  $2, V29, V29, V30 // Mask 0x0000ff000000ff00...
+	MOVD    $0xffff, R21
+	SLD     $16, R21
+	MTVSRWS R21, V31
+
+	VSPLTW $0, V0, V1 // Splat 1st word of separator
+
+index4loop:
+	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
+
+next4:
+	VSPLTISB $0, V10            // Clear
+	MOVD     $3, R9             // Number of bytes beyond 16
+	LXVB16X  (R7)(R9), V3_      // Load 16 bytes @R7 into V2
+	VSLDOI   $13, V3, V10, V3   // Shift left last 3 bytes
+	VSLDOI   $1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDOI   $2, V2, V3, V9     // V9=(V2:V3)<<2
+	VSLDOI   $3, V2, V3, V10    // V10=(V2:v3)<<3
+	VCMPEQUW V1, V2, V5         // compare index 0, 4, ... with sep
+	VCMPEQUW V1, V4, V6         // compare index 1, 5, ... with sep
+	VCMPEQUW V1, V9, V11        // compare index 2, 6, ... with sep
+	VCMPEQUW V1, V10, V12       // compare index 3, 7, ... with sep
+	VSEL     V6, V5, V29, V13   // merge index 0, 1, 4, 5, using mask
+	VSEL     V12, V11, V30, V14 // merge index 2, 3, 6, 7, using mask
+	VSEL     V14, V13, V31, V7  // final merge
+	VCLZD    V7, V18            // Find first index for each half
+	MFVSRD   V18, R25           // Isolate value
+	CMP      R25, $64           // If < 64, found
+	BLT      foundR25           // Return found index
+
+	MFVSRLD V18, R25     // Isolate other value
+	CMP     R25, $64     // If < 64, found
+	ADD     $64, R25     // Update index for high doubleword
+	BLT     foundR25     // Return found index
+	ADD     $16, R7      // R7+=16 for next string
+	ADD     $20, R7, R9  // R+20 for all bytes to load
+	CMP     R9, LASTBYTE // Past end? Maybe check for extra?
+	BLT     index4loop   // If not, continue loop
+	CMP     R7, LASTSTR  // Check remainder
+	BLE     index2to16   // Process remainder
+	BR      notfound     // Not found
+
+index5plus:
+	CMP R6, $16     // Check for sep > 16
+	BGT index17plus // Handle large sep
+
+	// Assumption is that the separator is smaller than the string at this point
+index2to16:
+	CMP R7, LASTSTR // Compare last start byte
+	BGT notfound    // last takes len(sep) into account
+
+	ADD $16, R7, R9    // Check for last byte of string
+	CMP R9, LASTBYTE
+	BGT index2to16tail
+
+	// At least 16 bytes of string left
+	// Mask the number of bytes in sep
+index2to16loop:
+	LXVB16X (R7)(R0), V1_ // Load 16 bytes @R7 into V1
+
+compare:
+	VAND       V1, SEPMASK, V2 // Mask out sep size
+	VCMPEQUBCC V0, V2, V3      // Compare masked string
+	BLT        CR6, found      // All equal
+	ADD        $1, R7          // Update ptr to next byte
+	CMP        R7, LASTSTR     // Still less than last start byte
+	BGT        notfound        // Not found
+	ADD        $16, R7, R9     // Verify remaining bytes
+	CMP        R9, LASTBYTE    // At least 16
+	BLT        index2to16loop  // Try again
+
+	// Less than 16 bytes remaining in string
+	// Separator >= 2
+index2to16tail:
+	ADD     R3, R4, R9     // End of string
+	SUB     R7, R9, R9     // Number of bytes left
+	ANDCC   $15, R7, R10   // 16 byte offset
+	ADD     R10, R9, R11   // offset + len
+	CMP     R11, $16       // >= 16?
+	BLE     short          // Does not cross 16 bytes
+	LXVB16X (R7)(R0), V1_  // Load 16 bytes @R7 into V1
+	BR      index2to16next // Continue on
+
+short:
+	RLDICR   $0, R7, $59, R9 // Adjust addr to 16 byte container
+	LXVB16X  (R9)(R0), V1_   // Load 16 bytes @R9 into V1
+	SLD      $3, R10         // Set up shift
+	MTVSRD   R10, V8_        // Set up shift
+	VSLDOI   $8, V8, V8, V8
+	VSLO     V1, V8, V1      // Shift by start byte
+	VSPLTISB $0, V25         // Clear for later use
+
+index2to16next:
+	VAND       V1, SEPMASK, V2 // Just compare size of sep
+	VCMPEQUBCC V0, V2, V3      // Compare sep and partial string
+	BLT        CR6, found      // Found
+	ADD        $1, R7          // Not found, try next partial string
+	CMP        R7, LASTSTR     // Check for end of string
+	BGT        notfound        // If at end, then not found
+	VSLDOI     $1, V1, V25, V1 // Shift string left by 1 byte
+	BR         index2to16next  // Check the next partial string
+
+index17plus:
+	CMP      R6, $32       // Check if 17 < len(sep) <= 32
+	BGT      index33plus
+	SUB      $16, R6, R9   // Extra > 16
+	SLD      $56, R9, R10  // Shift to use in VSLO
+	MTVSRD   R10, V9_      // Set up for VSLO
+	LXVB16X  (R5)(R9), V1_ // Load 16 bytes @R5+R9 into V1
+	VSLO     V1, V9, V1    // Shift left
+	VSPLTISB $0xff, V7     // Splat 1s
+	VSPLTISB $0, V27       // Splat 0
+
+index17to32loop:
+	LXVB16X (R7)(R0), V2_ // Load 16 bytes @R7 into V2
+
+next17:
+	LXVB16X    (R7)(R9), V3_   // Load 16 bytes @R7+R9 into V3
+	VSLO       V3, V9, V3      // Shift left
+	VCMPEQUB   V0, V2, V4      // Compare first 16 bytes
+	VCMPEQUB   V1, V3, V5      // Compare extra over 16 bytes
+	VAND       V4, V5, V6      // Check if both equal
+	VCMPEQUBCC V6, V7, V8      // All equal?
+	BLT        CR6, found      // Yes
+	ADD        $1, R7          // On to next byte
+	CMP        R7, LASTSTR     // Check if last start byte
+	BGT        notfound        // If too high, not found
+	BR         index17to32loop // Continue
+
+notfound:
+	MOVD $-1, R8   // Return -1 if not found
+	MOVD R8, (R14)
+	RET
+
+index33plus:
+	MOVD $0, (R0) // Case not implemented
+	RET           // Crash before return
+
+foundR25:
+	SRD  $3, R25   // Convert from bits to bytes
+	ADD  R25, R7   // Add to current string address
+	SUB  R3, R7    // Subtract from start of string
+	MOVD R7, (R14) // Return byte where found
+	RET
+
+found:
+	SUB  R3, R7    // Return byte where found
+	MOVD R7, (R14)
+	RET
+
diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go
new file mode 100644
index 0000000..9340cf1
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	// Note: we're kind of lucky that this flag is available at this point.
+	// The runtime sets HasVX when processing auxv records, and that happens
+	// to happen *before* running the init functions of packages that
+	// the runtime depends on.
+	// TODO: it would really be nicer for internal/cpu to figure out this
+	// flag by itself. Then we wouldn't need to depend on quirks of
+	// early startup initialization order.
+	if cpu.S390X.HasVX {
+		MaxLen = 64
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s
new file mode 100644
index 0000000..491d5bc
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.s
@@ -0,0 +1,216 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+48(FP), R5
+	BR	indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+32(FP), R5
+	BR	indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBGT	R4, R2, notfound
+	ADD	R1, R2
+	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+	CMPBEQ	R4, $0, notfound
+	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
+	VLL	R4, (R3), V0 // contains first 16 bytes of sep
+	MOVD	R1, R7
+index2plus:
+	CMPBNE	R4, $1, index3plus
+	MOVD	$15(R7), R9
+	CMPBGE	R9, R2, index2to16
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	CMPBGE	R9, R2, index2to16
+index2loop:
+	VL	0(R7), V2          // 16 bytes, even indices
+	VL	1(R7), V4          // 16 bytes, odd indices
+	VCEQH	V1, V2, V5         // compare even indices
+	VCEQH	V1, V4, V6         // compare odd indices
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index3plus:
+	CMPBNE	R4, $2, index4plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$1, R0
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	VREPB	$2, V0, V8
+index3loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 2-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
+	VCEQH	V1, V2, V5         // compare 2-byte even indices
+	VCEQH	V1, V4, V6         // compare 2-byte odd indices
+	VCEQB	V8, V9, V10        // compare last bytes
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VN	V7, V10, V7        // AND indices with last byte
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index4plus:
+	CMPBNE	R4, $3, index5plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$2, R0
+	VGBM	$0x8888, V29       // 0xff000000ff000000...
+	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
+	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
+	VONE	V16
+	VREPF	$0, V0, V1
+index4loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 3-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<1
+	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<1
+	VCEQF	V1, V2, V5         // compare index 0, 4, ...
+	VCEQF	V1, V4, V6         // compare index 1, 5, ...
+	VCEQF	V1, V9, V11        // compare index 2, 6, ...
+	VCEQF	V1, V10, V12       // compare index 3, 7, ...
+	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
+	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+	VSEL	V13, V14, V31, V7  // final merge
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index5plus:
+	CMPBGT	R4, $15, index17plus
+index2to16:
+	CMPBGT	R7, R2, notfound
+	MOVD	$1(R7), R8
+	CMPBGT	R8, R2, index2to16tail
+index2to16loop:
+	// unrolled 2x
+	VLL	R4, (R7), V1
+	VLL	R4, 1(R7), V2
+	VCEQGS	V0, V1, V3
+	BEQ	found
+	MOVD	$1(R7), R7
+	VCEQGS	V0, V2, V4
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLT	R7, R2, index2to16loop
+	CMPBGT	R7, R2, notfound
+index2to16tail:
+	VLL	R4, (R7), V1
+	VCEQGS	V0, V1, V2
+	BEQ	found
+	BR	notfound
+
+index17plus:
+	CMPBGT	R4, $31, index33plus
+	SUB	$16, R4, R0
+	VLL	R0, 16(R3), V1
+	VONE	V7
+index17to32loop:
+	VL	(R7), V2
+	VLL	R0, 16(R7), V3
+	VCEQG	V0, V2, V4
+	VCEQG	V1, V3, V5
+	VN	V4, V5, V6
+	VCEQGS	V6, V7, V8
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index17to32loop
+	BR	notfound
+
+index33plus:
+	CMPBGT	R4, $47, index49plus
+	SUB	$32, R4, R0
+	VL	16(R3), V1
+	VLL	R0, 32(R3), V2
+	VONE	V11
+index33to48loop:
+	VL	(R7), V3
+	VL	16(R7), V4
+	VLL	R0, 32(R7), V5
+	VCEQG	V0, V3, V6
+	VCEQG	V1, V4, V7
+	VCEQG	V2, V5, V8
+	VN	V6, V7, V9
+	VN	V8, V9, V10
+	VCEQGS	V10, V11, V12
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index33to48loop
+	BR	notfound
+
+index49plus:
+	CMPBGT	R4, $63, index65plus
+	SUB	$48, R4, R0
+	VL	16(R3), V1
+	VL	32(R3), V2
+	VLL	R0, 48(R3), V3
+	VONE	V15
+index49to64loop:
+	VL	(R7), V4
+	VL	16(R7), V5
+	VL	32(R7), V6
+	VLL	R0, 48(R7), V7
+	VCEQG	V0, V4, V8
+	VCEQG	V1, V5, V9
+	VCEQG	V2, V6, V10
+	VCEQG	V3, V7, V11
+	VN	V8, V9, V12
+	VN	V10, V11, V13
+	VN	V12, V13, V14
+	VCEQGS	V14, V15, V16
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index49to64loop
+notfound:
+	MOVD	$-1, (R5)
+	RET
+
+index65plus:
+	// not implemented
+	MOVD	$0, (R0)
+	RET
+
+foundV17: // index is in doubleword V17[0]
+	VLGVG	$0, V17, R8
+	ADD	R8, R7
+found:
+	SUB	R1, R7
+	MOVD	R7, (R5)
+	RET
diff --git a/src/internal/bytealg/indexbyte_386.s b/src/internal/bytealg/indexbyte_386.s
new file mode 100644
index 0000000..8a03054
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_386.s
@@ -0,0 +1,34 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVL	b_base+0(FP), SI
+	MOVL	b_len+4(FP), CX
+	MOVB	c+12(FP), AL
+	MOVL	SI, DI
+	CLD; REPN; SCASB
+	JZ 3(PC)
+	MOVL	$-1, ret+16(FP)
+	RET
+	SUBL	SI, DI
+	SUBL	$1, DI
+	MOVL	DI, ret+16(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVL	s_base+0(FP), SI
+	MOVL	s_len+4(FP), CX
+	MOVB	c+8(FP), AL
+	MOVL	SI, DI
+	CLD; REPN; SCASB
+	JZ 3(PC)
+	MOVL	$-1, ret+12(FP)
+	RET
+	SUBL	SI, DI
+	SUBL	$1, DI
+	MOVL	DI, ret+12(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
new file mode 100644
index 0000000..f78093c
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -0,0 +1,147 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT	·IndexByte(SB), NOSPLIT, $0-40
+	MOVQ b_base+0(FP), SI
+	MOVQ b_len+8(FP), BX
+	MOVB c+24(FP), AL
+	LEAQ ret+32(FP), R8
+	JMP  indexbytebody<>(SB)
+
+TEXT	·IndexByteString(SB), NOSPLIT, $0-32
+	MOVQ s_base+0(FP), SI
+	MOVQ s_len+8(FP), BX
+	MOVB c+16(FP), AL
+	LEAQ ret+24(FP), R8
+	JMP  indexbytebody<>(SB)
+
+// input:
+//   SI: data
+//   BX: data len
+//   AL: byte sought
+//   R8: address to put result
+TEXT	indexbytebody<>(SB), NOSPLIT, $0
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
+
+	CMPQ BX, $16
+	JLT small
+
+	MOVQ SI, DI
+
+	CMPQ BX, $32
+	JA avx2
+sse:
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
+	PMOVMSKB X1, DX
+	// Find first set bit, if any.
+	BSFL	DX, DX
+	JNZ	ssesuccess
+	// Advance to next block.
+	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JB	sseloop
+
+	// Search the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched, but that's ok.
+	MOVQ	AX, DI
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	BSFL	DX, DX
+	JNZ	ssesuccess
+
+failure:
+	MOVQ $-1, (R8)
+	RET
+
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+	SUBQ SI, DI	// Compute offset of chunk within data.
+	ADDQ DX, DI	// Add offset of byte within chunk.
+	MOVQ DI, (R8)
+	RET
+
+// handle for lengths < 16
+small:
+	TESTQ	BX, BX
+	JEQ	failure
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	MOVOU	(SI), X1 // Load data
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	CMPL	DX, BX
+	JAE	failure	// Match is past end of data.
+	MOVQ	DX, (R8)
+	RET
+
+endofpage:
+	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	MOVL	BX, CX
+	SHLL	CX, DX
+	SHRL	$16, DX	// Shift desired bits down to bottom of register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	MOVQ	DX, (R8)
+	RET
+
+avx2:
+	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE sse
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, Y1
+avx2_loop:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLT avx2_loop
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	VZEROUPPER
+	MOVQ $-1, (R8)
+	RET
+
+avx2success:
+	VPMOVMSKB Y3, DX
+	BSFL DX, DX
+	SUBQ SI, DI
+	ADDQ DI, DX
+	MOVQ DX, (R8)
+	VZEROUPPER
+	RET
diff --git a/src/internal/bytealg/indexbyte_arm.s b/src/internal/bytealg/indexbyte_arm.s
new file mode 100644
index 0000000..faf9797
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm.s
@@ -0,0 +1,46 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R0
+	MOVW	b_len+4(FP), R1
+	MOVBU	c+12(FP), R2	// byte to find
+	MOVW	$ret+16(FP), R5
+	B	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R0
+	MOVW	s_len+4(FP), R1
+	MOVBU	c+8(FP), R2	// byte to find
+	MOVW	$ret+12(FP), R5
+	B	indexbytebody<>(SB)
+
+// input:
+//  R0: data
+//  R1: data length
+//  R2: byte to find
+//  R5: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0-0
+	MOVW	R0, R4		// store base for later
+	ADD	R0, R1		// end
+
+loop:
+	CMP	R0, R1
+	B.EQ	notfound
+	MOVBU.P	1(R0), R3
+	CMP	R2, R3
+	B.NE	loop
+
+	SUB	$1, R0		// R0 will be one beyond the position we want
+	SUB	R4, R0		// remove base
+	MOVW	R0, (R5)
+	RET
+
+notfound:
+	MOVW	$-1, R0
+	MOVW	R0, (R5)
+	RET
diff --git a/src/internal/bytealg/indexbyte_arm64.s b/src/internal/bytealg/indexbyte_arm64.s
new file mode 100644
index 0000000..40843fb
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm64.s
@@ -0,0 +1,126 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	indexbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R1: byte to search
+//   R2: data len
+//   R8: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0
+	// Core algorithm:
+	// For each 32-byte chunk we calculate a 64-bit syndrome value,
+	// with two bits per byte. For each tuple, bit 0 is set if the
+	// relevant byte matched the requested character and bit 1 is
+	// not used (faster than using a 32bit syndrome). Since the bits
+	// in the syndrome reflect exactly the order in which things occur
+	// in the original string, counting trailing zeros allows to
+	// identify exactly which byte has matched.
+
+	CBZ	R2, fail
+	MOVD	R0, R11
+	// Magic constant 0x40100401 allows us to identify
+	// which lane matches the requested byte.
+	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
+	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
+	MOVD	$0x40100401, R5
+	VMOV	R1, V0.B16
+	// Work with aligned 32-byte chunks
+	BIC	$0x1f, R0, R3
+	VMOV	R5, V5.S4
+	ANDS	$0x1f, R0, R9
+	AND	$0x1f, R2, R10
+	BEQ	loop
+
+	// Input string is not 32-byte aligned. We calculate the
+	// syndrome value for the aligned 32 bytes block containing
+	// the first bytes and mask off the irrelevant part.
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUB	$0x20, R9, R4
+	ADDS	R4, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
+	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
+	VMOV	V6.D[0], R6
+	// Clear the irrelevant lower bits
+	LSL	$1, R9, R4
+	LSR	R4, R6, R6
+	LSL	R4, R6, R6
+	// The first block can also be the last
+	BLS	masklast
+	// Have we found something already?
+	CBNZ	R6, tail
+
+loop:
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUBS	$0x20, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// If we're out of data we finish regardless of the result
+	BLS	end
+	// Use a fast check for the termination condition
+	VORR	V4.B16, V3.B16, V6.B16
+	VADDP	V6.D2, V6.D2, V6.D2
+	VMOV	V6.D[0], R6
+	// We're not out of data, loop if we haven't found the character
+	CBZ	R6, loop
+
+end:
+	// Termination condition found, let's calculate the syndrome value
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16
+	VADDP	V6.B16, V6.B16, V6.B16
+	VMOV	V6.D[0], R6
+	// Only do the clear for the last possible block with less than 32 bytes
+	// Condition flags come from SUBS in the loop
+	BHS	tail
+
+masklast:
+	// Clear the irrelevant upper bits
+	ADD	R9, R10, R4
+	AND	$0x1f, R4, R4
+	SUB	$0x20, R4, R4
+	NEG	R4<<1, R4
+	LSL	R4, R6, R6
+	LSR	R4, R6, R6
+
+tail:
+	// Check that we have found a character
+	CBZ	R6, fail
+	// Count the trailing zeros using bit reversing
+	RBIT	R6, R6
+	// Compensate the last post-increment
+	SUB	$0x20, R3, R3
+	// And count the leading zeros
+	CLZ	R6, R6
+	// R6 is twice the offset into the fragment
+	ADD	R6>>1, R3, R0
+	// Compute the offset result
+	SUB	R11, R0, R0
+	MOVD	R0, (R8)
+	RET
+
+fail:
+	MOVD	$-1, R0
+	MOVD	R0, (R8)
+	RET
diff --git a/src/internal/bytealg/indexbyte_generic.go b/src/internal/bytealg/indexbyte_generic.go
new file mode 100644
index 0000000..6ef639f
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_generic.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !mips64 && !mips64le && !riscv64 && !wasm
+// +build !386,!amd64,!s390x,!arm,!arm64,!ppc64,!ppc64le,!mips,!mipsle,!mips64,!mips64le,!riscv64,!wasm
+
+package bytealg
+
+func IndexByte(b []byte, c byte) int {
+	for i, x := range b {
+		if x == c {
+			return i
+		}
+	}
+	return -1
+}
+
+func IndexByteString(s string, c byte) int {
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			return i
+		}
+	}
+	return -1
+}
diff --git a/src/internal/bytealg/indexbyte_mips64x.s b/src/internal/bytealg/indexbyte_mips64x.s
new file mode 100644
index 0000000..0f377f5
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mips64x.s
@@ -0,0 +1,55 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips64 || mips64le
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOVV	b_base+0(FP), R1
+	MOVV	b_len+8(FP), R2
+	MOVBU	c+24(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+32(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+32(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOVV	s_base+0(FP), R1
+	MOVV	s_len+8(FP), R2
+	MOVBU	c+16(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+24(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_mipsx.s b/src/internal/bytealg/indexbyte_mipsx.s
new file mode 100644
index 0000000..bed015b
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mipsx.s
@@ -0,0 +1,53 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build mips || mipsle
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R1
+	MOVW	b_len+4(FP), R2
+	MOVBU	c+12(FP), R3	// byte to find
+	ADDU	$1, R1, R4	// store base+1 for later
+	ADDU	R1, R2	// end
+
+loop:
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	ADDU	$1, R1
+	BNE	R3, R5, loop
+
+	SUBU	R4, R1	// R1 will be one beyond the position we want so remove (base+1)
+	MOVW	R1, ret+16(FP)
+	RET
+
+notfound:
+	MOVW	$-1, R1
+	MOVW	R1, ret+16(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R1
+	MOVW	s_len+4(FP), R2
+	MOVBU	c+8(FP), R3	// byte to find
+	ADDU	$1, R1, R4	// store base+1 for later
+	ADDU	R1, R2	// end
+
+loop:
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	ADDU	$1, R1
+	BNE	R3, R5, loop
+
+	SUBU	R4, R1	// remove (base+1)
+	MOVW	R1, ret+12(FP)
+	RET
+
+notfound:
+	MOVW	$-1, R1
+	MOVW	R1, ret+12(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_native.go b/src/internal/bytealg/indexbyte_native.go
new file mode 100644
index 0000000..965f38f
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_native.go
@@ -0,0 +1,14 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || mips64 || mips64le || riscv64 || wasm
+// +build 386 amd64 s390x arm arm64 ppc64 ppc64le mips mipsle mips64 mips64le riscv64 wasm
+
+package bytealg
+
+//go:noescape
+func IndexByte(b []byte, c byte) int
+
+//go:noescape
+func IndexByteString(s string, c byte) int
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..8e13c5a
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,316 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
+	MOVD	b_len+8(FP), R4		// R4 = length
+	MOVBZ	c+24(FP), R5		// R5 = byte
+	MOVD	$ret+32(FP), R14	// R14 = &ret
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	s_base+0(FP), R3  // R3 = string
+	MOVD	s_len+8(FP), R4	  // R4 = length
+	MOVBZ	c+16(FP), R5	  // R5 = byte
+	MOVD	$ret+24(FP), R14  // R14 = &ret
+	BR	indexbytebody<>(SB)
+
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R17		// Save base address for calculating the index later.
+	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
+	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
+	ADD	R4,R3,R7	// Last acceptable address in R7.
+	DCBT	(R8)		// Prepare cache line.
+
+	RLDIMI	$16,R5,$32,R5
+	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
+	MOVD	$-1,R9
+	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
+	ADD	$-1,R7,R7
+#ifdef GOARCH_ppc64le
+	SLD	R6,R9,R9	// Prepare mask for Little Endian
+#else
+	SRD	R6,R9,R9	// Same for Big Endian
+#endif
+	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.
+
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base
+	RLDICL	$0,R7,$61,R6	// length-1
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
+
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
+
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR7
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+
+	// Either quadword aligned or 64-byte at this point. We can use LVX.
+qw_align:
+
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB  $0,V0		// Replicate 0 across V0
+	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	  R5,V1
+	LVSL	  (R0+R0),V11
+	VSLB	  V11,V10,V10
+	VSPLTB	  $7,V1,V1	// Replicate byte across V1
+	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
+	BLE	  tail
+
+	// We will load 4 quardwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	  $63,R8,R11
+	BEQ	  CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	ADD	    $-16,R4,R4
+
+	ANDCC	    $63,R8,R11
+	BEQ	    CR0,preloop
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	ADD	    $-16,R4,R4
+
+	ANDCC	    $63,R8,R11
+	BEQ	    CR0,preloop
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $-16,R4,R4
+	ADD	    $16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail	      // If len ≤ 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9      // Loop counter in R9
+	MOVD	R9,CTR
+
+	ADD	$-64,R8,R8   // Adjust index for loop entry
+	MOVD	$16,R11      // Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
+
+	// Main loop we will load 64 bytes per iteration
+loop:
+	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
+	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
+	LVX	    (R8+R11),V3
+	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
+	VCMPEQUB    V1,V3,V7
+
+	LVX	    (R8+R9),V4
+	LVX	    (R8+R7),V5
+	VCMPEQUB    V1,V4,V8
+	VCMPEQUB    V1,V5,V9
+
+	VOR	    V6,V7,V11	      // Compress the result in a single vector
+	VOR	    V8,V9,V12
+	VOR	    V11,V12,V13
+	VCMPEQUBCC  V0,V13,V14	      // Check for byte
+	BGE	    CR6,found
+	BC	    16,0,loop	      // bdnz loop
+
+	// Handle the tailing bytes or R4 ≤ 64
+	RLDICL	$0,R6,$58,R4
+	ADD	$64,R8,R8
+tail:
+	CMPU	    R4,$0
+	BEQ	    notfound
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+
+notfound:
+	MOVD	$-1,R3
+	MOVD	R3,(R14)
+	RET
+
+found:
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
+
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	  V6,V10,V6
+	VBPERMQ	  V7,V10,V7
+	VBPERMQ	  V8,V10,V8
+	VBPERMQ	  V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	  $2,V7,V7,V7
+	VSLDOI	  $4,V8,V8,V8
+	VSLDOI	  $6,V9,V9,V9
+#else
+	VSLDOI	  $6,V6,V6,V6
+	VSLDOI	  $4,V7,V7,V7
+	VSLDOI	  $2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	  $-1,R3,R11
+	ANDN	  R3,R11,R11
+	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17,R3
+	MOVD	R3,(R14)
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	  V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	  V6,R3
+	ADD	  $-1,R3,R11
+	ANDN	  R3,R11,R11
+	POPCNTD	  R11,R11
+#else
+	VSLDOI	  $6,V6,V6,V6
+	MFVRD	  V6,R3
+	CNTLZD	  R3,R11
+#endif
+	ADD	  R8,R11,R3
+	CMPU	  R11,R4
+	BLT	  return
+	BR	  notfound
+
+done:
+	// At this point, R3 has 0xFF in the same position as the byte we are
+	// looking for in the doubleword. Use that to calculate the exact index
+	// of the byte.
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	CMPU	R8,R7		// Check if we are at the last doubleword.
+	SRD	$3,R11		// Convert trailing zeros to bytes.
+	ADD	R11,R8,R3
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
+	BNE	return
+	BLE	CR7,return
+	BR	notfound
+
+small_string:
+	// We unroll this loop for better performance.
+	CMPU	R4,$0		// Check for length=0
+	BEQ	notfound
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base.
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
+	RLDICL	$0,R7,$61,R6	// length-1
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
+	CMPU	R8,R7
+	BNE	CR7,done
+	BEQ	notfound	// Hit length.
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	BNE	CR6,done
+	BR	notfound
+
diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s
new file mode 100644
index 0000000..156c303
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_riscv64.s
@@ -0,0 +1,52 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOV	b_base+0(FP), A1
+	MOV	b_len+8(FP), A2
+	MOVBU	c+24(FP), A3	// byte to find
+	MOV	A1, A4		// store base for later
+	ADD	A1, A2		// end
+	ADD	$-1, A1
+
+loop:
+	ADD	$1, A1
+	BEQ	A1, A2, notfound
+	MOVBU	(A1), A5
+	BNE	A3, A5, loop
+
+	SUB	A4, A1		// remove base
+	MOV	A1, ret+32(FP)
+	RET
+
+notfound:
+	MOV	$-1, A1
+	MOV	A1, ret+32(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOV	s_base+0(FP), A1
+	MOV	s_len+8(FP), A2
+	MOVBU	c+16(FP), A3	// byte to find
+	MOV	A1, A4		// store base for later
+	ADD	A1, A2		// end
+	ADD	$-1, A1
+
+loop:
+	ADD	$1, A1
+	BEQ	A1, A2, notfound
+	MOVBU	(A1), A5
+	BNE	A3, A5, loop
+
+	SUB	A4, A1		// remove base
+	MOV	A1, ret+24(FP)
+	RET
+
+notfound:
+	MOV	$-1, A1
+	MOV	A1, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_s390x.s b/src/internal/bytealg/indexbyte_s390x.s
new file mode 100644
index 0000000..cf88d92
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_s390x.s
@@ -0,0 +1,108 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	b_base+0(FP), R3// b_base => R3
+	MOVD	b_len+8(FP), R4 // b_len => R4
+	MOVBZ	c+24(FP), R5    // c => R5
+	MOVD	$ret+32(FP), R2 // &ret => R9
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	s_base+0(FP), R3// s_base => R3
+	MOVD	s_len+8(FP), R4 // s_len => R4
+	MOVBZ	c+16(FP), R5    // c => R5
+	MOVD	$ret+24(FP), R2 // &ret => R9
+	BR	indexbytebody<>(SB)
+
+// input:
+// R3: s
+// R4: s_len
+// R5: c -- byte sought
+// R2: &ret -- address to put index into
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBEQ	R4, $0, notfound
+	MOVD	R3, R6          // store base for later
+	ADD	R3, R4, R8      // the address after the end of the string
+	//if the length is small, use loop; otherwise, use vector or srst search
+	CMPBGE	R4, $16, large
+
+residual:
+	CMPBEQ	R3, R8, notfound
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, residual
+
+found:
+	SUB	R6, R3
+	SUB	$1, R3
+	MOVD	R3, 0(R2)
+	RET
+
+notfound:
+	MOVD	$-1, 0(R2)
+	RET
+
+large:
+	MOVBZ	internal∕cpu·S390X+const_offsetS390xHasVX(SB), R1
+	CMPBNE	R1, $0, vectorimpl
+
+srstimpl:                       // no vector facility
+	MOVBZ	R5, R0          // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
+srstloop:
+	WORD	$0xB25E0083     // srst %r8, %r3 (search the range [R3, R8))
+	BVS	srstloop        // interrupted - continue
+	BGT	notfoundr0
+foundr0:
+	XOR	R0, R0          // reset R0
+	SUB	R6, R8          // remove base
+	MOVD	R8, 0(R2)
+	RET
+notfoundr0:
+	XOR	R0, R0          // reset R0
+	MOVD	$-1, 0(R2)
+	RET
+
+vectorimpl:
+	//if the address is not 16byte aligned, use loop for the header
+	MOVD	R3, R8
+	AND	$15, R8
+	CMPBGT	R8, $0, notaligned
+
+aligned:
+	ADD	R6, R4, R8
+	MOVD	R8, R7
+	AND	$-16, R7
+	// replicate c across V17
+	VLVGB	$0, R5, V19
+	VREPB	$0, V19, V17
+
+vectorloop:
+	CMPBGE	R3, R7, residual
+	VL	0(R3), V16    // load string to be searched into V16
+	ADD	$16, R3
+	VFEEBS	V16, V17, V18 // search V17 in V16 and set conditional code accordingly
+	BVS	vectorloop
+
+	// when vector search found c in the string
+	VLGVB	$7, V18, R7   // load 7th element of V18 containing index into R7
+	SUB	$16, R3
+	SUB	R6, R3
+	ADD	R3, R7
+	MOVD	R7, 0(R2)
+	RET
+
+notaligned:
+	MOVD	R3, R8
+	AND	$-16, R8
+	ADD     $16, R8
+notalignedloop:
+	CMPBEQ	R3, R8, aligned
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, notalignedloop
+	BR	found
diff --git a/src/internal/bytealg/indexbyte_wasm.s b/src/internal/bytealg/indexbyte_wasm.s
new file mode 100644
index 0000000..ef4bd93
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_wasm.s
@@ -0,0 +1,195 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB), NOSPLIT, $0-40
+	I64Load b_base+0(FP)
+	I32WrapI64
+	I32Load8U c+24(FP)
+	I64Load b_len+8(FP)
+	I32WrapI64
+	Call memchr<>(SB)
+	I64ExtendI32S
+	Set R0
+
+	Get SP
+	I64Const $-1
+	Get R0
+	I64Load b_base+0(FP)
+	I64Sub
+	Get R0
+	I64Eqz $0
+	Select
+	I64Store ret+32(FP)
+
+	RET
+
+TEXT ·IndexByteString(SB), NOSPLIT, $0-32
+	Get SP
+	I64Load s_base+0(FP)
+	I32WrapI64
+	I32Load8U c+16(FP)
+	I64Load s_len+8(FP)
+	I32WrapI64
+	Call memchr<>(SB)
+	I64ExtendI32S
+	Set R0
+
+	I64Const $-1
+	Get R0
+	I64Load s_base+0(FP)
+	I64Sub
+	Get R0
+	I64Eqz $0
+	Select
+	I64Store ret+24(FP)
+
+	RET
+
+// initially compiled with emscripten and then modified over time.
+// params:
+//   R0: s
+//   R1: c
+//   R2: len
+// ret: index
+TEXT memchr<>(SB), NOSPLIT, $0
+	Get R1
+	Set R4
+	Block
+		Block
+			Get R2
+			I32Const $0
+			I32Ne
+			Tee R3
+			Get R0
+			I32Const $3
+			I32And
+			I32Const $0
+			I32Ne
+			I32And
+			If
+				Loop
+					Get R0
+					I32Load8U $0
+					Get R1
+					I32Eq
+					BrIf $2
+					Get R2
+					I32Const $-1
+					I32Add
+					Tee R2
+					I32Const $0
+					I32Ne
+					Tee R3
+					Get R0
+					I32Const $1
+					I32Add
+					Tee R0
+					I32Const $3
+					I32And
+					I32Const $0
+					I32Ne
+					I32And
+					BrIf $0
+				End
+			End
+			Get R3
+			BrIf $0
+			I32Const $0
+			Set R1
+			Br $1
+		End
+		Get R0
+		I32Load8U $0
+		Get R4
+		Tee R3
+		I32Eq
+		If
+			Get R2
+			Set R1
+		Else
+			Get R4
+			I32Const $16843009
+			I32Mul
+			Set R4
+			Block
+				Block
+					Get R2
+					I32Const $3
+					I32GtU
+					If
+						Get R2
+						Set R1
+						Loop
+							Get R0
+							I32Load $0
+							Get R4
+							I32Xor
+							Tee R2
+							I32Const $-2139062144
+							I32And
+							I32Const $-2139062144
+							I32Xor
+							Get R2
+							I32Const $-16843009
+							I32Add
+							I32And
+							I32Eqz
+							If
+								Get R0
+								I32Const $4
+								I32Add
+								Set R0
+								Get R1
+								I32Const $-4
+								I32Add
+								Tee R1
+								I32Const $3
+								I32GtU
+								BrIf $1
+								Br $3
+							End
+						End
+					Else
+						Get R2
+						Set R1
+						Br $1
+					End
+					Br $1
+				End
+				Get R1
+				I32Eqz
+				If
+					I32Const $0
+					Set R1
+					Br $3
+				End
+			End
+			Loop
+				Get R0
+				I32Load8U $0
+				Get R3
+				I32Eq
+				BrIf $2
+				Get R0
+				I32Const $1
+				I32Add
+				Set R0
+				Get R1
+				I32Const $-1
+				I32Add
+				Tee R1
+				BrIf $0
+				I32Const $0
+				Set R1
+			End
+		End
+	End
+	Get R0
+	I32Const $0
+	Get R1
+	Select
+	Return