Adding upstream version 1.16.10.upstream/1.16.10 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 13:14:23 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 13:14:23 +0000
commit: 73df946d56c74384511a194dd01dbe099584fd1a (patch)
tree: fd0bcea490dd81327ddfbb31e215439672c9a068 /src/internal/bytealg
parent: Initial commit. (diff)
download: golang-1.16-73df946d56c74384511a194dd01dbe099584fd1a.tar.xz
golang-1.16-73df946d56c74384511a194dd01dbe099584fd1a.zip
52 files changed, 5175 insertions, 0 deletions
diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go
new file mode 100644
index 0000000..b30c234
--- /dev/null
+++ b/src/internal/bytealg/bytealg.go
@@ -0,0 +1,149 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+	"internal/cpu"
+	"unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+	offsetX86HasSSE2   = unsafe.Offsetof(cpu.X86.HasSSE2)
+	offsetX86HasSSE42  = unsafe.Offsetof(cpu.X86.HasSSE42)
+	offsetX86HasAVX2   = unsafe.Offsetof(cpu.X86.HasAVX2)
+	offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+
+	offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+// If MaxLen is not 0, make sure MaxLen >= 4.
+var MaxLen int
+
+// FIXME: the logic of HashStrBytes, HashStrRevBytes, IndexRabinKarpBytes and HashStr, HashStrRev,
+// IndexRabinKarp are exactly the same, except that the types are different. Can we eliminate
+// three of them without causing allocation?
+
+// PrimeRK is the prime base used in Rabin-Karp algorithm.
+const PrimeRK = 16777619
+
+// HashStrBytes returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStrBytes(sep []byte) (uint32, uint32) {
+	hash := uint32(0)
+	for i := 0; i < len(sep); i++ {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func HashStr(sep string) (uint32, uint32) {
+	hash := uint32(0)
+	for i := 0; i < len(sep); i++ {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStrRevBytes returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRevBytes(sep []byte) (uint32, uint32) {
+	hash := uint32(0)
+	for i := len(sep) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// HashStrRev returns the hash of the reverse of sep and the
+// appropriate multiplicative factor for use in Rabin-Karp algorithm.
+func HashStrRev(sep string) (uint32, uint32) {
+	hash := uint32(0)
+	for i := len(sep) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, PrimeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
+
+// IndexRabinKarpBytes uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarpBytes(s, sep []byte) int {
+	// Rabin-Karp search
+	hashsep, pow := HashStrBytes(sep)
+	n := len(sep)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*PrimeRK + uint32(s[i])
+	}
+	if h == hashsep && Equal(s[:n], sep) {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= PrimeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashsep && Equal(s[i-n:i], sep) {
+			return i - n
+		}
+	}
+	return -1
+}
+
+// IndexRabinKarp uses the Rabin-Karp search algorithm to return the index of the
+// first occurrence of substr in s, or -1 if not present.
+func IndexRabinKarp(s, substr string) int {
+	// Rabin-Karp search
+	hashss, pow := HashStr(substr)
+	n := len(substr)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*PrimeRK + uint32(s[i])
+	}
+	if h == hashss && s[:n] == substr {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= PrimeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashss && s[i-n:i] == substr {
+			return i - n
+		}
+	}
+	return -1
+}
diff --git a/src/internal/bytealg/compare_386.s b/src/internal/bytealg/compare_386.s
new file mode 100644
index 0000000..0981983
--- /dev/null
+++ b/src/internal/bytealg/compare_386.s
@@ -0,0 +1,143 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+	MOVL	a_base+0(FP), SI
+	MOVL	a_len+4(FP), BX
+	MOVL	b_base+12(FP), DI
+	MOVL	b_len+16(FP), DX
+	LEAL	ret+24(FP), AX
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+	MOVL	a_base+0(FP), SI
+	MOVL	a_len+4(FP), BX
+	MOVL	b_base+8(FP), DI
+	MOVL	b_len+12(FP), DX
+	LEAL	ret+16(FP), AX
+	JMP	cmpbody<>(SB)
+
+// input:
+//   SI = a
+//   DI = b
+//   BX = alen
+//   DX = blen
+//   AX = address of return word (set to 1/0/-1)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+	MOVL	DX, BP
+	SUBL	BX, DX // DX = blen-alen
+	JLE	2(PC)
+	MOVL	BX, BP // BP = min(alen, blen)
+	CMPL	SI, DI
+	JEQ	allsame
+	CMPL	BP, $4
+	JB	small
+	CMPB	internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
+	JNE	mediumloop
+largeloop:
+	CMPL	BP, $16
+	JB	mediumloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, BX
+	XORL	$0xffff, BX	// convert EQ to NE
+	JNE	diff16	// branch if at least one byte is not equal
+	ADDL	$16, SI
+	ADDL	$16, DI
+	SUBL	$16, BP
+	JMP	largeloop
+
+diff16:
+	BSFL	BX, BX	// index of first byte that differs
+	XORL	DX, DX
+	MOVB	(SI)(BX*1), CX
+	CMPB	CX, (DI)(BX*1)
+	SETHI	DX
+	LEAL	-1(DX*2), DX	// convert 1/0 to +1/-1
+	MOVL	DX, (AX)
+	RET
+
+mediumloop:
+	CMPL	BP, $4
+	JBE	_0through4
+	MOVL	(SI), BX
+	MOVL	(DI), CX
+	CMPL	BX, CX
+	JNE	diff4
+	ADDL	$4, SI
+	ADDL	$4, DI
+	SUBL	$4, BP
+	JMP	mediumloop
+
+_0through4:
+	MOVL	-4(SI)(BP*1), BX
+	MOVL	-4(DI)(BP*1), CX
+	CMPL	BX, CX
+	JEQ	allsame
+
+diff4:
+	BSWAPL	BX	// reverse order of bytes
+	BSWAPL	CX
+	XORL	BX, CX	// find bit differences
+	BSRL	CX, CX	// index of highest bit difference
+	SHRL	CX, BX	// move a's bit to bottom
+	ANDL	$1, BX	// mask bit
+	LEAL	-1(BX*2), BX // 1/0 => +1/-1
+	MOVL	BX, (AX)
+	RET
+
+	// 0-3 bytes in common
+small:
+	LEAL	(BP*8), CX
+	NEGL	CX
+	JEQ	allsame
+
+	// load si
+	CMPB	SI, $0xfc
+	JA	si_high
+	MOVL	(SI), SI
+	JMP	si_finish
+si_high:
+	MOVL	-4(SI)(BP*1), SI
+	SHRL	CX, SI
+si_finish:
+	SHLL	CX, SI
+
+	// same for di
+	CMPB	DI, $0xfc
+	JA	di_high
+	MOVL	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVL	-4(DI)(BP*1), DI
+	SHRL	CX, DI
+di_finish:
+	SHLL	CX, DI
+
+	BSWAPL	SI	// reverse order of bytes
+	BSWAPL	DI
+	XORL	SI, DI	// find bit differences
+	JEQ	allsame
+	BSRL	DI, CX	// index of highest bit difference
+	SHRL	CX, SI	// move a's bit to bottom
+	ANDL	$1, SI	// mask bit
+	LEAL	-1(SI*2), BX // 1/0 => +1/-1
+	MOVL	BX, (AX)
+	RET
+
+	// all the bytes in common are the same, so we just need
+	// to compare the lengths.
+allsame:
+	XORL	BX, BX
+	XORL	CX, CX
+	TESTL	DX, DX
+	SETLT	BX	// 1 if alen > blen
+	SETEQ	CX	// 1 if alen == blen
+	LEAL	-1(CX)(BX*2), BX	// 1,0,-1 result
+	MOVL	BX, (AX)
+	RET
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
new file mode 100644
index 0000000..900b92a
--- /dev/null
+++ b/src/internal/bytealg/compare_amd64.s
@@ -0,0 +1,228 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+	MOVQ	a_base+0(FP), SI
+	MOVQ	a_len+8(FP), BX
+	MOVQ	b_base+24(FP), DI
+	MOVQ	b_len+32(FP), DX
+	LEAQ	ret+48(FP), R9
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+	MOVQ	a_base+0(FP), SI
+	MOVQ	a_len+8(FP), BX
+	MOVQ	b_base+16(FP), DI
+	MOVQ	b_len+24(FP), DX
+	LEAQ	ret+32(FP), R9
+	JMP	cmpbody<>(SB)
+
+// input:
+//   SI = a
+//   DI = b
+//   BX = alen
+//   DX = blen
+//   R9 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+	CMPQ	SI, DI
+	JEQ	allsame
+	CMPQ	BX, DX
+	MOVQ	DX, R8
+	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
+	CMPQ	R8, $8
+	JB	small
+
+	CMPQ	R8, $63
+	JBE	loop
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JEQ     big_loop_avx2
+	JMP	big_loop
+loop:
+	CMPQ	R8, $16
+	JBE	_0through16
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX	// convert EQ to NE
+	JNE	diff16	// branch if at least one byte is not equal
+	ADDQ	$16, SI
+	ADDQ	$16, DI
+	SUBQ	$16, R8
+	JMP	loop
+
+diff64:
+	ADDQ	$48, SI
+	ADDQ	$48, DI
+	JMP	diff16
+diff48:
+	ADDQ	$32, SI
+	ADDQ	$32, DI
+	JMP	diff16
+diff32:
+	ADDQ	$16, SI
+	ADDQ	$16, DI
+	// AX = bit mask of differences
+diff16:
+	BSFQ	AX, BX	// index of first byte that differs
+	XORQ	AX, AX
+	MOVB	(SI)(BX*1), CX
+	CMPB	CX, (DI)(BX*1)
+	SETHI	AX
+	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
+	MOVQ	AX, (R9)
+	RET
+
+	// 0 through 16 bytes left, alen>=8, blen>=8
+_0through16:
+	CMPQ	R8, $8
+	JBE	_0through8
+	MOVQ	(SI), AX
+	MOVQ	(DI), CX
+	CMPQ	AX, CX
+	JNE	diff8
+_0through8:
+	MOVQ	-8(SI)(R8*1), AX
+	MOVQ	-8(DI)(R8*1), CX
+	CMPQ	AX, CX
+	JEQ	allsame
+
+	// AX and CX contain parts of a and b that differ.
+diff8:
+	BSWAPQ	AX	// reverse order of bytes
+	BSWAPQ	CX
+	XORQ	AX, CX
+	BSRQ	CX, CX	// index of highest bit difference
+	SHRQ	CX, AX	// move a's bit to bottom
+	ANDQ	$1, AX	// mask bit
+	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
+	MOVQ	AX, (R9)
+	RET
+
+	// 0-7 bytes in common
+small:
+	LEAQ	(R8*8), CX	// bytes left -> bits left
+	NEGQ	CX		//  - bits lift (== 64 - bits left mod 64)
+	JEQ	allsame
+
+	// load bytes of a into high bytes of AX
+	CMPB	SI, $0xf8
+	JA	si_high
+	MOVQ	(SI), SI
+	JMP	si_finish
+si_high:
+	MOVQ	-8(SI)(R8*1), SI
+	SHRQ	CX, SI
+si_finish:
+	SHLQ	CX, SI
+
+	// load bytes of b in to high bytes of BX
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(R8*1), DI
+	SHRQ	CX, DI
+di_finish:
+	SHLQ	CX, DI
+
+	BSWAPQ	SI	// reverse order of bytes
+	BSWAPQ	DI
+	XORQ	SI, DI	// find bit differences
+	JEQ	allsame
+	BSRQ	DI, CX	// index of highest bit difference
+	SHRQ	CX, SI	// move a's bit to bottom
+	ANDQ	$1, SI	// mask bit
+	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
+	MOVQ	AX, (R9)
+	RET
+
+allsame:
+	XORQ	AX, AX
+	XORQ	CX, CX
+	CMPQ	BX, DX
+	SETGT	AX	// 1 if alen > blen
+	SETEQ	CX	// 1 if alen == blen
+	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
+	MOVQ	AX, (R9)
+	RET
+
+	// this works for >= 64 bytes of data.
+big_loop:
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff16
+
+	MOVOU	16(SI), X0
+	MOVOU	16(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff32
+
+	MOVOU	32(SI), X0
+	MOVOU	32(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff48
+
+	MOVOU	48(SI), X0
+	MOVOU	48(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff64
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JBE	loop
+	JMP	big_loop
+
+	// Compare 64-bytes per loop iteration.
+	// Loop is unrolled and uses AVX2.
+big_loop_avx2:
+	VMOVDQU	(SI), Y2
+	VMOVDQU	(DI), Y3
+	VMOVDQU	32(SI), Y4
+	VMOVDQU	32(DI), Y5
+	VPCMPEQB Y2, Y3, Y0
+	VPMOVMSKB Y0, AX
+	XORL	$0xffffffff, AX
+	JNE	diff32_avx2
+	VPCMPEQB Y4, Y5, Y6
+	VPMOVMSKB Y6, AX
+	XORL	$0xffffffff, AX
+	JNE	diff64_avx2
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JB	big_loop_avx2_exit
+	JMP	big_loop_avx2
+
+	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+	VZEROUPPER
+	JMP diff16
+
+	// Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+	VZEROUPPER
+	JMP diff48
+
+	// For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+	VZEROUPPER
+	JMP loop
diff --git a/src/internal/bytealg/compare_arm.s b/src/internal/bytealg/compare_arm.s
new file mode 100644
index 0000000..80d01a2
--- /dev/null
+++ b/src/internal/bytealg/compare_arm.s
@@ -0,0 +1,86 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	a_base+0(FP), R2
+	MOVW	a_len+4(FP), R0
+	MOVW	b_base+12(FP), R3
+	MOVW	b_len+16(FP), R1
+	ADD	$28, R13, R7
+	B	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-20
+	MOVW	a_base+0(FP), R2
+	MOVW	a_len+4(FP), R0
+	MOVW	b_base+8(FP), R3
+	MOVW	b_len+12(FP), R1
+	ADD	$20, R13, R7
+	B	cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, R6 and R8 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	R2, R3
+	BEQ	samebytes
+	CMP 	R0, R1
+	MOVW 	R0, R6
+	MOVW.LT	R1, R6		// R6 is min(R0, R1)
+
+	CMP	$0, R6
+	BEQ	samebytes
+	CMP	$4, R6
+	ADD	R2, R6		// R2 is current byte in a, R6 is the end of the range to compare
+	BLT	byte_loop	// length < 4
+	AND	$3, R2, R8
+	CMP	$0, R8
+	BNE	byte_loop	// unaligned a, use byte-wise compare (TODO: try to align a)
+aligned_a:
+	AND	$3, R3, R8
+	CMP	$0, R8
+	BNE	byte_loop	// unaligned b, use byte-wise compare
+	AND	$0xfffffffc, R6, R8
+	// length >= 4
+chunk4_loop:
+	MOVW.P	4(R2), R4
+	MOVW.P	4(R3), R5
+	CMP	R4, R5
+	BNE	cmp
+	CMP	R2, R8
+	BNE	chunk4_loop
+	CMP	R2, R6
+	BEQ	samebytes	// all compared bytes were the same; compare lengths
+byte_loop:
+	MOVBU.P	1(R2), R4
+	MOVBU.P	1(R3), R5
+	CMP	R4, R5
+	BNE	ret
+	CMP	R2, R6
+	BNE	byte_loop
+samebytes:
+	CMP	R0, R1
+	MOVW.LT	$1, R0
+	MOVW.GT	$-1, R0
+	MOVW.EQ	$0, R0
+	MOVW	R0, (R7)
+	RET
+ret:
+	// bytes differed
+	MOVW.LT	$1, R0
+	MOVW.GT	$-1, R0
+	MOVW	R0, (R7)
+	RET
+cmp:
+	SUB	$4, R2, R2
+	SUB	$4, R3, R3
+	B	byte_loop
diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s
new file mode 100644
index 0000000..56d56f2
--- /dev/null
+++ b/src/internal/bytealg/compare_arm64.s
@@ -0,0 +1,125 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	a_base+0(FP), R2
+	MOVD	a_len+8(FP), R0
+	MOVD	b_base+24(FP), R3
+	MOVD	b_len+32(FP), R1
+	MOVD	$ret+48(FP), R7
+	B	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	a_base+0(FP), R2
+	MOVD	a_len+8(FP), R0
+	MOVD	b_base+16(FP), R3
+	MOVD	b_len+24(FP), R1
+	MOVD	$ret+32(FP), R7
+	B	cmpbody<>(SB)
+
+// On entry:
+// R0 is the length of a
+// R1 is the length of b
+// R2 points to the start of a
+// R3 points to the start of b
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R4, R5, R6, R8, R9 and R10 are clobbered
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	R2, R3
+	BEQ	samebytes         // same starting pointers; compare lengths
+	CMP	R0, R1
+	CSEL	LT, R1, R0, R6    // R6 is min(R0, R1)
+
+	CBZ	R6, samebytes
+	BIC	$0xf, R6, R10
+	CBZ	R10, small        // length < 16
+	ADD	R2, R10           // end of chunk16
+	// length >= 16
+chunk16_loop:
+	LDP.P	16(R2), (R4, R8)
+	LDP.P	16(R3), (R5, R9)
+	CMP	R4, R5
+	BNE	cmp
+	CMP	R8, R9
+	BNE	cmpnext
+	CMP	R10, R2
+	BNE	chunk16_loop
+	AND	$0xf, R6, R6
+	CBZ	R6, samebytes
+	SUBS	$8, R6
+	BLT	tail
+	// the length of tail > 8 bytes
+	MOVD.P	8(R2), R4
+	MOVD.P	8(R3), R5
+	CMP	R4, R5
+	BNE	cmp
+	SUB	$8, R6
+	// compare last 8 bytes
+tail:
+	MOVD	(R2)(R6), R4
+	MOVD	(R3)(R6), R5
+	CMP	R4, R5
+	BEQ	samebytes
+cmp:
+	REV	R4, R4
+	REV	R5, R5
+	CMP	R4, R5
+ret:
+	MOVD	$1, R4
+	CNEG	HI, R4, R4
+	MOVD	R4, (R7)
+	RET
+small:
+	TBZ	$3, R6, lt_8
+	MOVD	(R2), R4
+	MOVD	(R3), R5
+	CMP	R4, R5
+	BNE	cmp
+	SUBS	$8, R6
+	BEQ	samebytes
+	ADD	$8, R2
+	ADD	$8, R3
+	SUB	$8, R6
+	B	tail
+lt_8:
+	TBZ	$2, R6, lt_4
+	MOVWU	(R2), R4
+	MOVWU	(R3), R5
+	CMPW	R4, R5
+	BNE	cmp
+	SUBS	$4, R6
+	BEQ	samebytes
+	ADD	$4, R2
+	ADD	$4, R3
+lt_4:
+	TBZ	$1, R6, lt_2
+	MOVHU	(R2), R4
+	MOVHU	(R3), R5
+	CMPW	R4, R5
+	BNE	cmp
+	ADD	$2, R2
+	ADD	$2, R3
+lt_2:
+	TBZ	$0, R6, samebytes
+one:
+	MOVBU	(R2), R4
+	MOVBU	(R3), R5
+	CMPW	R4, R5
+	BNE	ret
+samebytes:
+	CMP	R1, R0
+	CSET	NE, R4
+	CNEG	LO, R4, R4
+	MOVD	R4, (R7)
+	RET
+cmpnext:
+	REV	R8, R4
+	REV	R9, R5
+	CMP	R4, R5
+	B	ret
diff --git a/src/internal/bytealg/compare_generic.go b/src/internal/bytealg/compare_generic.go
new file mode 100644
index 0000000..bd4489a
--- /dev/null
+++ b/src/internal/bytealg/compare_generic.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !386,!amd64,!s390x,!arm,!arm64,!ppc64,!ppc64le,!mips,!mipsle,!wasm,!mips64,!mips64le
+
+package bytealg
+
+import _ "unsafe" // for go:linkname
+
+func Compare(a, b []byte) int {
+	l := len(a)
+	if len(b) < l {
+		l = len(b)
+	}
+	if l == 0 || &a[0] == &b[0] {
+		goto samebytes
+	}
+	for i := 0; i < l; i++ {
+		c1, c2 := a[i], b[i]
+		if c1 < c2 {
+			return -1
+		}
+		if c1 > c2 {
+			return +1
+		}
+	}
+samebytes:
+	if len(a) < len(b) {
+		return -1
+	}
+	if len(a) > len(b) {
+		return +1
+	}
+	return 0
+}
+
+//go:linkname runtime_cmpstring runtime.cmpstring
+func runtime_cmpstring(a, b string) int {
+	l := len(a)
+	if len(b) < l {
+		l = len(b)
+	}
+	for i := 0; i < l; i++ {
+		c1, c2 := a[i], b[i]
+		if c1 < c2 {
+			return -1
+		}
+		if c1 > c2 {
+			return +1
+		}
+	}
+	if len(a) < len(b) {
+		return -1
+	}
+	if len(a) > len(b) {
+		return +1
+	}
+	return 0
+}
diff --git a/src/internal/bytealg/compare_mips64x.s b/src/internal/bytealg/compare_mips64x.s
new file mode 100644
index 0000000..4f05fce
--- /dev/null
+++ b/src/internal/bytealg/compare_mips64x.s
@@ -0,0 +1,88 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+	MOVV	a_base+0(FP), R3
+	MOVV	b_base+24(FP), R4
+	MOVV	a_len+8(FP), R1
+	MOVV	b_len+32(FP), R2
+	MOVV	$ret+48(FP), R9
+	JMP	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+	MOVV	a_base+0(FP), R3
+	MOVV	b_base+16(FP), R4
+	MOVV	a_len+8(FP), R1
+	MOVV	b_len+24(FP), R2
+	MOVV	$ret+32(FP), R9
+	JMP	cmpbody<>(SB)
+
+// On entry:
+// R1 length of a
+// R2 length of b
+// R3 points to the start of a
+// R4 points to the start of b
+// R9 points to the return value (-1/0/1)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
+	BEQ	R3, R4, samebytes // same start of a and b
+
+	SGTU	R1, R2, R7
+	BNE	R0, R7, r2_lt_r1
+	MOVV	R1, R10
+	JMP	entry
+r2_lt_r1:
+	MOVV	R2, R10	// R10 is min(R1, R2)
+entry:
+	ADDV	R3, R10, R8	// R3 start of a, R8 end of a
+	BEQ	R3, R8, samebytes // length is 0
+
+	SRLV	$4, R10		// R10 is number of chunks
+	BEQ	R0, R10, byte_loop
+
+	// make sure both a and b are aligned.
+	OR	R3, R4, R11
+	AND	$7, R11
+	BNE	R0, R11, byte_loop
+
+chunk16_loop:
+	BEQ	R0, R10, byte_loop
+	MOVV	(R3), R6
+	MOVV	(R4), R7
+	BNE	R6, R7, byte_loop
+	MOVV	8(R3), R13
+	MOVV	8(R4), R14
+	ADDV	$16, R3
+	ADDV	$16, R4
+	SUBVU	$1, R10
+	BEQ	R13, R14, chunk16_loop
+	SUBV	$8, R3
+	SUBV	$8, R4
+
+byte_loop:
+	BEQ	R3, R8, samebytes
+	MOVBU	(R3), R6
+	ADDVU	$1, R3
+	MOVBU	(R4), R7
+	ADDVU	$1, R4
+	BEQ	R6, R7, byte_loop
+
+byte_cmp:
+	SGTU	R6, R7, R8 // R8 = 1 if (R6 > R7)
+	BNE	R0, R8, ret
+	MOVV	$-1, R8
+	JMP	ret
+
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBV	R7, R6, R8
+
+ret:
+	MOVV	R8, (R9)
+	RET
diff --git a/src/internal/bytealg/compare_mipsx.s b/src/internal/bytealg/compare_mipsx.s
new file mode 100644
index 0000000..9ac5ba5
--- /dev/null
+++ b/src/internal/bytealg/compare_mipsx.s
@@ -0,0 +1,72 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-28
+	MOVW	a_base+0(FP), R3
+	MOVW	b_base+12(FP), R4
+	MOVW	a_len+4(FP), R1
+	MOVW	b_len+16(FP), R2
+	BEQ	R3, R4, samebytes
+	SGTU	R1, R2, R7
+	MOVW	R1, R8
+	CMOVN	R7, R2, R8	// R8 is min(R1, R2)
+
+	ADDU	R3, R8	// R3 is current byte in a, R8 is last byte in a to compare
+loop:
+	BEQ	R3, R8, samebytes
+
+	MOVBU	(R3), R6
+	ADDU	$1, R3
+	MOVBU	(R4), R7
+	ADDU	$1, R4
+	BEQ	R6, R7 , loop
+
+	SGTU	R6, R7, R8
+	MOVW	$-1, R6
+	CMOVZ	R8, R6, R8
+	JMP	cmp_ret
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBU	R7, R6, R8
+cmp_ret:
+	MOVW	R8, ret+24(FP)
+	RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
+	MOVW	a_base+0(FP), R3
+	MOVW	a_len+4(FP), R1
+	MOVW	b_base+8(FP), R4
+	MOVW	b_len+12(FP), R2
+	BEQ	R3, R4, samebytes
+	SGTU	R1, R2, R7
+	MOVW	R1, R8
+	CMOVN	R7, R2, R8	// R8 is min(R1, R2)
+
+	ADDU	R3, R8	// R3 is current byte in a, R8 is last byte in a to compare
+loop:
+	BEQ	R3, R8, samebytes	// all compared bytes were the same; compare lengths
+
+	MOVBU	(R3), R6
+	ADDU	$1, R3
+	MOVBU	(R4), R7
+	ADDU	$1, R4
+	BEQ	R6, R7 , loop
+	// bytes differed
+	SGTU	R6, R7, R8
+	MOVW	$-1, R6
+	CMOVZ	R8, R6, R8
+	JMP	cmp_ret
+samebytes:
+	SGTU	R1, R2, R6
+	SGTU	R2, R1, R7
+	SUBU	R7, R6, R8
+cmp_ret:
+	MOVW	R8, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/compare_native.go b/src/internal/bytealg/compare_native.go
new file mode 100644
index 0000000..b53ba97
--- /dev/null
+++ b/src/internal/bytealg/compare_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 s390x arm arm64 ppc64 ppc64le mips mipsle wasm mips64 mips64le
+
+package bytealg
+
+import _ "unsafe" // For go:linkname
+
+//go:noescape
+func Compare(a, b []byte) int
+
+// The declaration below generates ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+//go:linkname abigen_runtime_cmpstring runtime.cmpstring
+func abigen_runtime_cmpstring(a, b string) int
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
new file mode 100644
index 0000000..7819da3
--- /dev/null
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -0,0 +1,277 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	a_base+0(FP), R5
+	MOVD	b_base+24(FP), R6
+	MOVD	a_len+8(FP), R3
+	CMP	R5,R6,CR7
+	MOVD	b_len+32(FP), R4
+	MOVD	$ret+48(FP), R7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif
+
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+
+greater:
+	MOVD	R8, (R7)
+	RET
+
+done:
+	MOVD	$0, (R7)
+	RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	a_base+0(FP), R5
+	MOVD	b_base+16(FP), R6
+	MOVD	a_len+8(FP), R3
+	CMP	R5,R6,CR7
+	MOVD	b_len+24(FP), R4
+	MOVD	$ret+32(FP), R7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif
+
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+
+greater:
+	MOVD	R8, (R7)
+	RET
+
+done:
+	MOVD	$0, (R7)
+	RET
+
+// Do an efficient memcmp for ppc64le
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// R7 = addr of return value
+TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+
+        // Special processing for 32 bytes or longer.
+        // Loading this way is faster and correct as long as the
+	// doublewords being compared are equal. Once they
+	// are found unequal, reload them in proper byte order
+	// to determine greater or less than.
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	MOVD	$0,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R14,R15		// bytes match?
+	MOVD	$8,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD    24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	MOVD	$16,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU    R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BNE	cmpne
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVDBR	(R5+R0),R9	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// 1st len > 2nd len must be greater
+simple:
+	MOVBZ	0(R5), R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6), R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9, R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+	BR	less		// must be less
+cmpne:				// only here is not equal
+	MOVDBR	(R5+R16),R8	// reload in reverse order
+	MOVDBR	(R6+R16),R9
+	CMPU	R8,R9		// compare correct endianness
+	BGT	greater		// here only if NE
+less:
+	MOVD	$-1,R3
+	MOVD	R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD	$0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
+
+// Do an efficient memcmp for ppc64 (BE)
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// R7 = addr of return value
+TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R14,R15		// bytes match?
+	BLT	less		// found less
+	BGT	greater		// found greater
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD	24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU	R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BLT	less		// with BE, byte ordering is
+	BGT	greater		// good for compare
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVD	(R5),R9
+	MOVD	(R6),R10
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC 	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// same len, must be equal
+simple:
+	MOVBZ	0(R5),R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6),R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9,R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+less:
+	MOVD	$-1,R3
+	MOVD    R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD    $0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
diff --git a/src/internal/bytealg/compare_s390x.s b/src/internal/bytealg/compare_s390x.s
new file mode 100644
index 0000000..5394548
--- /dev/null
+++ b/src/internal/bytealg/compare_s390x.s
@@ -0,0 +1,69 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	a_base+0(FP), R3
+	MOVD	a_len+8(FP), R4
+	MOVD	b_base+24(FP), R5
+	MOVD	b_len+32(FP), R6
+	LA	ret+48(FP), R7
+	BR	cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	a_base+0(FP), R3
+	MOVD	a_len+8(FP), R4
+	MOVD	b_base+16(FP), R5
+	MOVD	b_len+24(FP), R6
+	LA	ret+32(FP), R7
+	BR	cmpbody<>(SB)
+
+// input:
+//   R3 = a
+//   R4 = alen
+//   R5 = b
+//   R6 = blen
+//   R7 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, cmplengths
+	MOVD	R4, R8
+	CMPBLE	R4, R6, amin
+	MOVD	R6, R8
+amin:
+	CMPBEQ	R8, $0, cmplengths
+	CMP	R8, $256
+	BLE	tail
+loop:
+	CLC	$256, 0(R3), 0(R5)
+	BGT	gt
+	BLT	lt
+	SUB	$256, R8
+	MOVD	$256(R3), R3
+	MOVD	$256(R5), R5
+	CMP	R8, $256
+	BGT	loop
+tail:
+	SUB	$1, R8
+	EXRL	$cmpbodyclc<>(SB), R8
+	BGT	gt
+	BLT	lt
+cmplengths:
+	CMP	R4, R6
+	BEQ	eq
+	BLT	lt
+gt:
+	MOVD	$1, 0(R7)
+	RET
+lt:
+	MOVD	$-1, 0(R7)
+	RET
+eq:
+	MOVD	$0, 0(R7)
+	RET
+
+TEXT cmpbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
diff --git a/src/internal/bytealg/compare_wasm.s b/src/internal/bytealg/compare_wasm.s
new file mode 100644
index 0000000..dc8fb33
--- /dev/null
+++ b/src/internal/bytealg/compare_wasm.s
@@ -0,0 +1,115 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB), NOSPLIT, $0-56
+	Get SP
+	I64Load a_base+0(FP)
+	I64Load a_len+8(FP)
+	I64Load b_base+24(FP)
+	I64Load b_len+32(FP)
+	Call cmpbody<>(SB)
+	I64Store ret+48(FP)
+	RET
+
+TEXT runtime·cmpstring(SB), NOSPLIT, $0-40
+	Get SP
+	I64Load a_base+0(FP)
+	I64Load a_len+8(FP)
+	I64Load b_base+16(FP)
+	I64Load b_len+24(FP)
+	Call cmpbody<>(SB)
+	I64Store ret+32(FP)
+	RET
+
+// params: a, alen, b, blen
+// ret: -1/0/1
+TEXT cmpbody<>(SB), NOSPLIT, $0-0
+	// len = min(alen, blen)
+	Get R1
+	Get R3
+	Get R1
+	Get R3
+	I64LtU
+	Select
+	Set R4
+
+	Get R0
+	I32WrapI64
+	Get R2
+	I32WrapI64
+	Get R4
+	I32WrapI64
+	Call memcmp<>(SB)
+	I64ExtendI32S
+	Tee R5
+
+	I64Eqz
+	If
+		// check length
+		Get R1
+		Get R3
+		I64Sub
+		Set R5
+	End
+
+	I64Const $0
+	I64Const $-1
+	I64Const $1
+	Get R5
+	I64Const $0
+	I64LtS
+	Select
+	Get R5
+	I64Eqz
+	Select
+	Return
+
+// compiled with emscripten
+// params: a, b, len
+// ret: <0/0/>0
+TEXT memcmp<>(SB), NOSPLIT, $0-0
+	Get R2
+	If $1
+	Loop
+	Get R0
+	I32Load8S $0
+	Tee R3
+	Get R1
+	I32Load8S $0
+	Tee R4
+	I32Eq
+	If
+	Get R0
+	I32Const $1
+	I32Add
+	Set R0
+	Get R1
+	I32Const $1
+	I32Add
+	Set R1
+	I32Const $0
+	Get R2
+	I32Const $-1
+	I32Add
+	Tee R2
+	I32Eqz
+	BrIf $3
+	Drop
+	Br $1
+	End
+	End
+	Get R3
+	I32Const $255
+	I32And
+	Get R4
+	I32Const $255
+	I32And
+	I32Sub
+	Else
+	I32Const $0
+	End
+	Return
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
new file mode 100644
index 0000000..fa864c4
--- /dev/null
+++ b/src/internal/bytealg/count_amd64.s
@@ -0,0 +1,201 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGeneric(SB)
+	MOVQ	b_base+0(FP), SI
+	MOVQ	b_len+8(FP), BX
+	MOVB	c+24(FP), AL
+	LEAQ	ret+32(FP), R8
+	JMP	countbody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGenericString(SB)
+	MOVQ	s_base+0(FP), SI
+	MOVQ	s_len+8(FP), BX
+	MOVB	c+16(FP), AL
+	LEAQ	ret+24(FP), R8
+	JMP	countbody<>(SB)
+
+// input:
+//   SI: data
+//   BX: data len
+//   AL: byte sought
+//   R8: address to put result
+// This function requires the POPCNT instruction.
+TEXT countbody<>(SB),NOSPLIT,$0
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
+
+	CMPQ BX, $16
+	JLT small
+
+	MOVQ $0, R12 // Accumulator
+
+	MOVQ SI, DI
+
+	CMPQ BX, $32
+	JA avx2
+sse:
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
+	PMOVMSKB X1, DX
+	// Count number of matching bytes
+	POPCNTL DX, DX
+	// Accumulate into R12
+	ADDQ DX, R12
+	// Advance to next block.
+	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JBE	sseloop
+
+	// Get the number of bytes to consider in the last 16 bytes
+	ANDQ $15, BX
+	JZ end
+
+	// Create mask to ignore overlap between previous 16 byte block
+	// and the next.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Process the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched so we need to mask part of it.
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+end:
+	MOVQ R12, (R8)
+	RET
+
+// handle for lengths < 16
+small:
+	TESTQ	BX, BX
+	JEQ	endzero
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	// We must ignore high bytes as they aren't part of our slice.
+	// Create mask.
+	MOVB BX, CX
+	MOVQ $1, R10
+	SALQ CL, R10
+	SUBQ $1, R10
+
+	// Load data
+	MOVOU	(SI), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	MOVQ	DX, (R8)
+	RET
+endzero:
+	MOVQ $0, (R8)
+	RET
+
+endofpage:
+	// We must ignore low bytes as they aren't part of our slice.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Load data into the high end of X1.
+	MOVOU	-16(SI)(BX*1), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	POPCNTL DX, DX
+	MOVQ	DX, (R8)
+	RET
+
+avx2:
+	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE sse
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, Y1
+avx2_loop:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLE avx2_loop
+
+	// If last block is already processed,
+	// skip to the end.
+	CMPQ DI, R11
+	JEQ endavx
+
+	// Load address of the last 32 bytes.
+	// There is an overlap with the previous block.
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, DX
+	// Exit AVX mode.
+	VZEROUPPER
+
+	// Create mask to ignore overlap between previous 32 byte block
+	// and the next.
+	ANDQ $31, BX
+	MOVQ $32,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFFFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+	MOVQ R12, (R8)
+	RET
+endavx:
+	// Exit AVX mode.
+	VZEROUPPER
+	MOVQ R12, (R8)
+	RET
diff --git a/src/internal/bytealg/count_arm.s b/src/internal/bytealg/count_arm.s
new file mode 100644
index 0000000..f704ea0
--- /dev/null
+++ b/src/internal/bytealg/count_arm.s
@@ -0,0 +1,43 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R0
+	MOVW	b_len+4(FP), R1
+	MOVBU	c+12(FP), R2
+	MOVW	$ret+16(FP), R7
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R0
+	MOVW	s_len+4(FP), R1
+	MOVBU	c+8(FP), R2
+	MOVW	$ret+12(FP), R7
+	B	countbytebody<>(SB)
+
+// Input:
+// R0: data
+// R1: data length
+// R2: byte to find
+// R7: address to put result
+//
+// On exit:
+// R4 and R8 are clobbered
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	MOVW	$0, R8	// R8 = count of byte to search
+	CMP	$0, R1
+	B.EQ	done	// short path to handle 0-byte case
+	ADD	R0, R1	// R1 is the end of the range
+byte_loop:
+	MOVBU.P	1(R0), R4
+	CMP	R4, R2
+	ADD.EQ	$1, R8
+	CMP	R0, R1
+	B.NE	byte_loop
+done:
+	MOVW	R8, (R7)
+	RET
diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s
new file mode 100644
index 0000000..8cd703d
--- /dev/null
+++ b/src/internal/bytealg/count_arm64.s
@@ -0,0 +1,90 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	countbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R2: data len
+//   R1: byte to find
+//   R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+	// R11 = count of byte to search
+	MOVD	$0, R11
+	// short path to handle 0-byte case
+	CBZ	R2, done
+	CMP	$0x20, R2
+	// jump directly to tail if length < 32
+	BLO	tail
+	ANDS	$0x1f, R0, R9
+	BEQ	chunk
+	// Work with not 32-byte aligned head
+	BIC	$0x1f, R0, R3
+	ADD	$0x20, R3
+head_loop:
+	MOVBU.P	1(R0), R5
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	SUB	$1, R2, R2
+	CMP	R0, R3
+	BNE	head_loop
+	// Work with 32-byte aligned chunks
+chunk:
+	BIC	$0x1f, R2, R9
+	// The first chunk can also be the last
+	CBZ	R9, tail
+	// R3 = end of 32-byte chunks
+	ADD	R0, R9, R3
+	MOVD	$1, R5
+	VMOV	R5, V5.B16
+	// R2 = length of tail
+	SUB	R9, R2, R2
+	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
+	VMOV	R1, V0.B16
+	// Clear the low 64-bit element of V7 and V8
+	VEOR	V7.B8, V7.B8, V7.B8
+	VEOR	V8.B8, V8.B8, V8.B8
+	// Count the target byte in 32-byte chunk
+chunk_loop:
+	VLD1.P	(R0), [V1.B16, V2.B16]
+	CMP	R0, R3
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// Clear the higher 7 bits
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	// Count lanes match the requested byte
+	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
+	VUADDLV	V6.B16, V7
+	// Accumulate the count in low 64-bit element of V8 when inside the loop
+	VADD	V7, V8
+	BNE	chunk_loop
+	VMOV	V8.D[0], R6
+	ADD	R6, R11, R11
+	CBZ	R2, done
+tail:
+	// Work with tail shorter than 32 bytes
+	MOVBU.P	1(R0), R5
+	SUB	$1, R2, R2
+	CMP	R5, R1
+	CINC	EQ, R11, R11
+	CBNZ	R2, tail
+done:
+	MOVD	R11, (R8)
+	RET
diff --git a/src/internal/bytealg/count_generic.go b/src/internal/bytealg/count_generic.go
new file mode 100644
index 0000000..5575e81
--- /dev/null
+++ b/src/internal/bytealg/count_generic.go
@@ -0,0 +1,27 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm,!arm64,!ppc64le,!ppc64,!riscv64,!s390x
+
+package bytealg
+
+func Count(b []byte, c byte) int {
+	n := 0
+	for _, x := range b {
+		if x == c {
+			n++
+		}
+	}
+	return n
+}
+
+func CountString(s string, c byte) int {
+	n := 0
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			n++
+		}
+	}
+	return n
+}
diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go
new file mode 100644
index 0000000..b1ff1d2
--- /dev/null
+++ b/src/internal/bytealg/count_native.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm arm64 ppc64le ppc64 riscv64 s390x
+
+package bytealg
+
+//go:noescape
+func Count(b []byte, c byte) int
+
+//go:noescape
+func CountString(s string, c byte) int
+
+// A backup implementation to use by assembly.
+func countGeneric(b []byte, c byte) int {
+	n := 0
+	for _, x := range b {
+		if x == c {
+			n++
+		}
+	}
+	return n
+}
+func countGenericString(s string, c byte) int {
+	n := 0
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			n++
+		}
+	}
+	return n
+}
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
new file mode 100644
index 0000000..a64d7d7
--- /dev/null
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -0,0 +1,97 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64le ppc64
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+	MOVD  b_base+0(FP), R3    // R3 = byte array pointer
+	MOVD  b_len+8(FP), R4     // R4 = length
+	MOVBZ c+24(FP), R5        // R5 = byte
+	MOVD  $ret+32(FP), R14    // R14 = &ret
+	BR    countbytebody<>(SB)
+
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+	MOVD  s_base+0(FP), R3    // R3 = string
+	MOVD  s_len+8(FP), R4     // R4 = length
+	MOVBZ c+16(FP), R5        // R5 = byte
+	MOVD  $ret+24(FP), R14    // R14 = &ret
+	BR    countbytebody<>(SB)
+
+// R3: addr of string
+// R4: len of string
+// R5: byte to count
+// R14: addr for return value
+// endianness shouldn't matter since we are just counting and order
+// is irrelevant
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+	DCBT (R3)    // Prepare cache line.
+	MOVD R0, R18 // byte count
+	MOVD R3, R19 // Save base address for calculating the index later.
+	MOVD R4, R16
+
+	MOVD   R5, R6
+	RLDIMI $8, R6, $48, R6
+	RLDIMI $16, R6, $32, R6
+	RLDIMI $32, R6, $0, R6  // fill reg with the byte to count
+
+	VSPLTISW $3, V4     // used for shift
+	MTVRD    R6, V1     // move compare byte
+	VSPLTB   $7, V1, V1 // replicate byte across V1
+
+	CMPU   R4, $32          // Check if it's a small string (<32 bytes)
+	BLT    tail             // Jump to the small string case
+	XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
+
+cmploop:
+	LXVW4X (R3), VS32 // load bytes from string
+
+	// when the bytes match, the corresponding byte contains all 1s
+	VCMPEQUB V1, V0, V2     // compare bytes
+	VPOPCNTD V2, V3         // each double word contains its count
+	VADDUDM  V3, V5, V5     // accumulate bit count in each double word
+	ADD      $16, R3, R3    // increment pointer
+	SUB      $16, R16, R16  // remaining bytes
+	CMP      R16, $16       // at least 16 remaining?
+	BGE      cmploop
+	VSRD     V5, V4, V5     // shift by 3 to convert bits to bytes
+	VSLDOI   $8, V5, V5, V6 // get the double word values from vector
+	MFVSRD   V5, R9
+	MFVSRD   V6, R10
+	ADD      R9, R10, R9
+	ADD      R9, R18, R18
+
+tail:
+	CMP R16, $8 // 8 bytes left?
+	BLT small
+
+	MOVD    (R3), R12     // load 8 bytes
+	CMPB    R12, R6, R17  // compare bytes
+	POPCNTD R17, R15      // bit count
+	SRD     $3, R15, R15  // byte count
+	ADD     R15, R18, R18 // add to byte count
+
+next1:
+	ADD $8, R3, R3
+	SUB $8, R16, R16 // remaining bytes
+	BR  tail
+
+small:
+	CMP   $0, R16   // any remaining
+	BEQ   done
+	MOVBZ (R3), R12 // check each remaining byte
+	CMP   R12, R5
+	BNE   next2
+	ADD   $1, R18
+
+next2:
+	SUB $1, R16
+	ADD $1, R3  // inc address
+	BR  small
+
+done:
+	MOVD R18, (R14) // return count
+	RET
diff --git a/src/internal/bytealg/count_riscv64.s b/src/internal/bytealg/count_riscv64.s
new file mode 100644
index 0000000..3f4eb23
--- /dev/null
+++ b/src/internal/bytealg/count_riscv64.s
@@ -0,0 +1,44 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+	MOV	b_base+0(FP), A1
+	MOV	b_len+8(FP), A2
+	MOVBU	c+24(FP), A3	// byte to count
+	MOV	ZERO, A4	// count
+	ADD	A1, A2		// end
+
+loop:
+	BEQ	A1, A2, done
+	MOVBU	(A1), A5
+	ADD	$1, A1
+	BNE	A3, A5, loop
+	ADD	$1, A4
+	JMP	loop
+
+done:
+	MOV	A4, ret+32(FP)
+	RET
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+	MOV	s_base+0(FP), A1
+	MOV	s_len+8(FP), A2
+	MOVBU	c+16(FP), A3	// byte to count
+	MOV	ZERO, A4	// count
+	ADD	A1, A2		// end
+
+loop:
+	BEQ	A1, A2, done
+	MOVBU	(A1), A5
+	ADD	$1, A1
+	BNE	A3, A5, loop
+	ADD	$1, A4
+	JMP	loop
+
+done:
+	MOV	A4, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/count_s390x.s b/src/internal/bytealg/count_s390x.s
new file mode 100644
index 0000000..2a3b5c0
--- /dev/null
+++ b/src/internal/bytealg/count_s390x.s
@@ -0,0 +1,169 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// condition code masks
+#define EQ 8
+#define NE 7
+
+// register assignments
+#define R_ZERO R0
+#define R_VAL  R1
+#define R_TMP  R2
+#define R_PTR  R3
+#define R_LEN  R4
+#define R_CHAR R5
+#define R_RET  R6
+#define R_ITER R7
+#define R_CNT  R8
+#define R_MPTR R9
+
+// vector register assignments
+#define V_ZERO V0
+#define V_CHAR V1
+#define V_MASK V2
+#define V_VAL  V3
+#define V_CNT  V4
+
+// mask for trailing bytes in vector implementation
+GLOBL countbytemask<>(SB), RODATA, $16
+DATA countbytemask<>+0(SB)/8, $0x0101010101010101
+DATA countbytemask<>+8(SB)/8, $0x0101010101010101
+
+// func Count(b []byte, c byte) int
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+	LMG   b+0(FP), R_PTR, R_LEN
+	MOVBZ c+24(FP), R_CHAR
+	MOVD  $ret+32(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// func CountString(s string, c byte) int
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+	LMG   s+0(FP), R_PTR, R_LEN
+	MOVBZ c+16(FP), R_CHAR
+	MOVD  $ret+24(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// input:
+// R_PTR  = address of array of bytes
+// R_LEN  = number of bytes in array
+// R_CHAR = byte value to count zero (extended to register width)
+// R_RET  = address of return value
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
+	MOVD  $countbytemask<>(SB), R_MPTR
+	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
+	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
+	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
+	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
+
+	// Start of vector code (have vector facility).
+	//
+	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
+	// vector 'load with length' (VLL). It will be in the range [-1,14].
+	// Also replicate c across a 16-byte vector and initialize V_ZERO.
+	ANDW  $0xf, R_LEN
+	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
+	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
+	ADDW  $-1, R_LEN
+	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
+
+	// Jump to loop if we have more than 15 bytes to process.
+	CGIJ $NE, R_ITER, $0, vxchunks
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+vxchunks:
+	// Load 0x01 into every byte element in the 16-byte mask vector.
+	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
+	VZERO  V_CNT      // initial uint128 count of 0
+
+vxloop:
+	// Load input bytes in 16-byte chunks.
+	VL (R_PTR), V_VAL
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Increment input string address.
+	MOVD $16(R_PTR), R_PTR
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+	// Repeat until all 16-byte chunks are done.
+	BRCTG R_ITER, vxloop
+
+	// Skip to end if there are no trailing bytes.
+	CIJ $EQ, R_LEN, $-1, vxret
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL
+	VN    V_MASK, V_VAL, V_VAL
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+vxret:
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+novx:
+	// Start of non-vector code (the vector facility not available).
+	//
+	// Initialise counter and constant zero.
+	MOVD $0, R_CNT
+	MOVD $0, R_ZERO
+
+loop:
+	// Read 1-byte from input and compare.
+	// Note: avoid putting LOCGR in critical path.
+	MOVBZ (R_PTR), R_VAL
+	MOVD  $1, R_TMP
+	MOVD  $1(R_PTR), R_PTR
+	CMPW  R_VAL, R_CHAR
+	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
+	ADD   R_TMP, R_CNT       // accumulate 64-bit result
+
+	// Repeat until all bytes have been checked.
+	BRCTG R_LEN, loop
+
+ret:
+	MOVD R_CNT, (R_RET)
+	RET
+
+ret0:
+	MOVD $0, (R_RET)
+	RET
diff --git a/src/internal/bytealg/equal_386.s b/src/internal/bytealg/equal_386.s
new file mode 100644
index 0000000..8723363
--- /dev/null
+++ b/src/internal/bytealg/equal_386.s
@@ -0,0 +1,129 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+	MOVL	a+0(FP), SI
+	MOVL	b+4(FP), DI
+	CMPL	SI, DI
+	JEQ	eq
+	MOVL	size+8(FP), BX
+	LEAL	ret+12(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB    $1, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+	MOVL    a+0(FP), SI
+	MOVL    b+4(FP), DI
+	CMPL    SI, DI
+	JEQ     eq
+	MOVL    4(DX), BX    // compiler stores size at offset 4 in the closure
+	LEAL	ret+8(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB    $1, ret+8(FP)
+	RET
+
+// a in SI
+// b in DI
+// count in BX
+// address of result byte in AX
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+	CMPL	BX, $4
+	JB	small
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPL	BX, $64
+	JB	bigloop
+	CMPB	internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
+	JNE	bigloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0
+	PAND	X6, X4
+	PAND	X4, X0
+	PMOVMSKB X0, DX
+	ADDL	$64, SI
+	ADDL	$64, DI
+	SUBL	$64, BX
+	CMPL	DX, $0xffff
+	JEQ	hugeloop
+	MOVB	$0, (AX)
+	RET
+
+	// 4 bytes at a time using 32-bit register
+bigloop:
+	CMPL	BX, $4
+	JBE	leftover
+	MOVL	(SI), CX
+	MOVL	(DI), DX
+	ADDL	$4, SI
+	ADDL	$4, DI
+	SUBL	$4, BX
+	CMPL	CX, DX
+	JEQ	bigloop
+	MOVB	$0, (AX)
+	RET
+
+	// remaining 0-4 bytes
+leftover:
+	MOVL	-4(SI)(BX*1), CX
+	MOVL	-4(DI)(BX*1), DX
+	CMPL	CX, DX
+	SETEQ	(AX)
+	RET
+
+small:
+	CMPL	BX, $0
+	JEQ	equal
+
+	LEAL	0(BX*8), CX
+	NEGL	CX
+
+	MOVL	SI, DX
+	CMPB	DX, $0xfc
+	JA	si_high
+
+	// load at SI won't cross a page boundary.
+	MOVL	(SI), SI
+	JMP	si_finish
+si_high:
+	// address ends in 111111xx. Load up to bytes we want, move to correct position.
+	MOVL	-4(SI)(BX*1), SI
+	SHRL	CX, SI
+si_finish:
+
+	// same for DI.
+	MOVL	DI, DX
+	CMPB	DX, $0xfc
+	JA	di_high
+	MOVL	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVL	-4(DI)(BX*1), DI
+	SHRL	CX, DI
+di_finish:
+
+	SUBL	SI, DI
+	SHLL	CX, DI
+equal:
+	SETEQ	(AX)
+	RET
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
new file mode 100644
index 0000000..c816409
--- /dev/null
+++ b/src/internal/bytealg/equal_amd64.s
@@ -0,0 +1,154 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-25
+	MOVQ	a+0(FP), SI
+	MOVQ	b+8(FP), DI
+	CMPQ	SI, DI
+	JEQ	eq
+	MOVQ	size+16(FP), BX
+	LEAQ	ret+24(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB	$1, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
+	MOVQ	a+0(FP), SI
+	MOVQ	b+8(FP), DI
+	CMPQ	SI, DI
+	JEQ	eq
+	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
+	LEAQ	ret+16(FP), AX
+	JMP	memeqbody<>(SB)
+eq:
+	MOVB	$1, ret+16(FP)
+	RET
+
+// a in SI
+// b in DI
+// count in BX
+// address of result byte in AX
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+	CMPQ	BX, $8
+	JB	small
+	CMPQ	BX, $64
+	JB	bigloop
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JE	hugeloop_avx2
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPQ	BX, $64
+	JB	bigloop
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0
+	PAND	X6, X4
+	PAND	X4, X0
+	PMOVMSKB X0, DX
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffff
+	JEQ	hugeloop
+	MOVB	$0, (AX)
+	RET
+
+	// 64 bytes at a time using ymm registers
+hugeloop_avx2:
+	CMPQ	BX, $64
+	JB	bigloop_avx2
+	VMOVDQU	(SI), Y0
+	VMOVDQU	(DI), Y1
+	VMOVDQU	32(SI), Y2
+	VMOVDQU	32(DI), Y3
+	VPCMPEQB	Y1, Y0, Y4
+	VPCMPEQB	Y2, Y3, Y5
+	VPAND	Y4, Y5, Y6
+	VPMOVMSKB Y6, DX
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffffffff
+	JEQ	hugeloop_avx2
+	VZEROUPPER
+	MOVB	$0, (AX)
+	RET
+
+bigloop_avx2:
+	VZEROUPPER
+
+	// 8 bytes at a time using 64-bit register
+bigloop:
+	CMPQ	BX, $8
+	JBE	leftover
+	MOVQ	(SI), CX
+	MOVQ	(DI), DX
+	ADDQ	$8, SI
+	ADDQ	$8, DI
+	SUBQ	$8, BX
+	CMPQ	CX, DX
+	JEQ	bigloop
+	MOVB	$0, (AX)
+	RET
+
+	// remaining 0-8 bytes
+leftover:
+	MOVQ	-8(SI)(BX*1), CX
+	MOVQ	-8(DI)(BX*1), DX
+	CMPQ	CX, DX
+	SETEQ	(AX)
+	RET
+
+small:
+	CMPQ	BX, $0
+	JEQ	equal
+
+	LEAQ	0(BX*8), CX
+	NEGQ	CX
+
+	CMPB	SI, $0xf8
+	JA	si_high
+
+	// load at SI won't cross a page boundary.
+	MOVQ	(SI), SI
+	JMP	si_finish
+si_high:
+	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
+	MOVQ	-8(SI)(BX*1), SI
+	SHRQ	CX, SI
+si_finish:
+
+	// same for DI.
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(BX*1), DI
+	SHRQ	CX, DI
+di_finish:
+
+	SUBQ	SI, DI
+	SHLQ	CX, DI
+equal:
+	SETEQ	(AX)
+	RET
+
diff --git a/src/internal/bytealg/equal_arm.s b/src/internal/bytealg/equal_arm.s
new file mode 100644
index 0000000..a6c4369
--- /dev/null
+++ b/src/internal/bytealg/equal_arm.s
@@ -0,0 +1,91 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-13
+	MOVW	a+0(FP), R0
+	MOVW	b+4(FP), R2
+	CMP	R0, R2
+	B.EQ	eq
+	MOVW	size+8(FP), R1
+	CMP	$0, R1
+	B.EQ	eq		// short path to handle 0-byte case
+	MOVW	$ret+12(FP), R7
+	B	memeqbody<>(SB)
+eq:
+	MOVW	$1, R0
+	MOVB	R0, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-9
+	MOVW	a+0(FP), R0
+	MOVW	b+4(FP), R2
+	CMP	R0, R2
+	B.EQ	eq
+	MOVW	4(R7), R1	// compiler stores size at offset 4 in the closure
+	CMP	$0, R1
+	B.EQ	eq		// short path to handle 0-byte case
+	MOVW	$ret+8(FP), R7
+	B	memeqbody<>(SB)
+eq:
+	MOVW	$1, R0
+	MOVB	R0, ret+8(FP)
+	RET
+
+// Input:
+// R0: data of a
+// R1: length
+// R2: data of b
+// R7: points to return value
+//
+// On exit:
+// R4, R5 and R6 are clobbered
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	$1, R1
+	B.EQ	one		// 1-byte special case for better performance
+
+	CMP	$4, R1
+	ADD	R0, R1		// R1 is the end of the range to compare
+	B.LT	byte_loop	// length < 4
+	AND	$3, R0, R6
+	CMP	$0, R6
+	B.NE	byte_loop	// unaligned a, use byte-wise compare (TODO: try to align a)
+	AND	$3, R2, R6
+	CMP	$0, R6
+	B.NE	byte_loop	// unaligned b, use byte-wise compare
+	AND	$0xfffffffc, R1, R6
+	// length >= 4
+chunk4_loop:
+	MOVW.P	4(R0), R4
+	MOVW.P	4(R2), R5
+	CMP	R4, R5
+	B.NE	notequal
+	CMP	R0, R6
+	B.NE	chunk4_loop
+	CMP	R0, R1
+	B.EQ	equal		// reached the end
+byte_loop:
+	MOVBU.P	1(R0), R4
+	MOVBU.P	1(R2), R5
+	CMP	R4, R5
+	B.NE	notequal
+	CMP	R0, R1
+	B.NE	byte_loop
+equal:
+	MOVW	$1, R0
+	MOVB	R0, (R7)
+	RET
+one:
+	MOVBU	(R0), R4
+	MOVBU	(R2), R5
+	CMP	R4, R5
+	B.EQ	equal
+notequal:
+	MOVW	$0, R0
+	MOVB	R0, (R7)
+	RET
diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s
new file mode 100644
index 0000000..01aa7b7
--- /dev/null
+++ b/src/internal/bytealg/equal_arm64.s
@@ -0,0 +1,136 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD	size+16(FP), R1
+	// short path to handle 0-byte case
+	CBZ	R1, equal
+	MOVD	a+0(FP), R0
+	MOVD	b+8(FP), R2
+	MOVD	$ret+24(FP), R8
+	B	memeqbody<>(SB)
+equal:
+	MOVD	$1, R0
+	MOVB	R0, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R4
+	CMP	R3, R4
+	BEQ	eq
+	MOVD	8(R26), R5    // compiler stores size at offset 8 in the closure
+	CBZ	R5, eq
+	MOVD	R3, 8(RSP)
+	MOVD	R4, 16(RSP)
+	MOVD	R5, 24(RSP)
+	BL	runtime·memequal(SB)
+	MOVBU	32(RSP), R3
+	MOVB	R3, ret+16(FP)
+	RET
+eq:
+	MOVD	$1, R3
+	MOVB	R3, ret+16(FP)
+	RET
+
+// input:
+// R0: pointer a
+// R1: data len
+// R2: pointer b
+// R8: address to put result
+TEXT memeqbody<>(SB),NOSPLIT,$0
+	CMP	$1, R1
+	// handle 1-byte special case for better performance
+	BEQ	one
+	CMP	$16, R1
+	// handle specially if length < 16
+	BLO	tail
+	BIC	$0x3f, R1, R3
+	CBZ	R3, chunk16
+	// work with 64-byte chunks
+	ADD	R3, R0, R6	// end of chunks
+chunk64_loop:
+	VLD1.P	(R0), [V0.D2, V1.D2, V2.D2, V3.D2]
+	VLD1.P	(R2), [V4.D2, V5.D2, V6.D2, V7.D2]
+	VCMEQ	V0.D2, V4.D2, V8.D2
+	VCMEQ	V1.D2, V5.D2, V9.D2
+	VCMEQ	V2.D2, V6.D2, V10.D2
+	VCMEQ	V3.D2, V7.D2, V11.D2
+	VAND	V8.B16, V9.B16, V8.B16
+	VAND	V8.B16, V10.B16, V8.B16
+	VAND	V8.B16, V11.B16, V8.B16
+	CMP	R0, R6
+	VMOV	V8.D[0], R4
+	VMOV	V8.D[1], R5
+	CBZ	R4, not_equal
+	CBZ	R5, not_equal
+	BNE	chunk64_loop
+	AND	$0x3f, R1, R1
+	CBZ	R1, equal
+chunk16:
+	// work with 16-byte chunks
+	BIC	$0xf, R1, R3
+	CBZ	R3, tail
+	ADD	R3, R0, R6	// end of chunks
+chunk16_loop:
+	LDP.P	16(R0), (R4, R5)
+	LDP.P	16(R2), (R7, R9)
+	EOR	R4, R7
+	CBNZ	R7, not_equal
+	EOR	R5, R9
+	CBNZ	R9, not_equal
+	CMP	R0, R6
+	BNE	chunk16_loop
+	AND	$0xf, R1, R1
+	CBZ	R1, equal
+tail:
+	// special compare of tail with length < 16
+	TBZ	$3, R1, lt_8
+	MOVD	(R0), R4
+	MOVD	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$8, R1, R6	// offset of the last 8 bytes
+	MOVD	(R0)(R6), R4
+	MOVD	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
+lt_8:
+	TBZ	$2, R1, lt_4
+	MOVWU	(R0), R4
+	MOVWU	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$4, R1, R6	// offset of the last 4 bytes
+	MOVWU	(R0)(R6), R4
+	MOVWU	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
+lt_4:
+	TBZ	$1, R1, lt_2
+	MOVHU.P	2(R0), R4
+	MOVHU.P	2(R2), R5
+	CMP	R4, R5
+	BNE	not_equal
+lt_2:
+	TBZ	$0, R1, equal
+one:
+	MOVBU	(R0), R4
+	MOVBU	(R2), R5
+	CMP	R4, R5
+	BNE	not_equal
+equal:
+	MOVD	$1, R0
+	MOVB	R0, (R8)
+	RET
+not_equal:
+	MOVB	ZR, (R8)
+	RET
diff --git a/src/internal/bytealg/equal_generic.go b/src/internal/bytealg/equal_generic.go
new file mode 100644
index 0000000..59bdf8f
--- /dev/null
+++ b/src/internal/bytealg/equal_generic.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Equal reports whether a and b
+// are the same length and contain the same bytes.
+// A nil argument is equivalent to an empty slice.
+//
+// Equal is equivalent to bytes.Equal.
+// It is provided here for convenience,
+// because some packages cannot depend on bytes.
+func Equal(a, b []byte) bool {
+	// Neither cmd/compile nor gccgo allocates for these string conversions.
+	// There is a test for this in package bytes.
+	return string(a) == string(b)
+}
diff --git a/src/internal/bytealg/equal_mips64x.s b/src/internal/bytealg/equal_mips64x.s
new file mode 100644
index 0000000..641e3ff
--- /dev/null
+++ b/src/internal/bytealg/equal_mips64x.s
@@ -0,0 +1,118 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	REGCTXT	R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	size+16(FP), R3
+	ADDV	R1, R3, R4
+
+	// chunk size is 16
+	SGTU	$16, R3, R8
+	BEQ	R0, R8, chunk_entry
+
+byte_loop:
+	BNE	R1, R4, byte_test
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+byte_test:
+	MOVBU	(R1), R6
+	ADDV	$1, R1
+	MOVBU	(R2), R7
+	ADDV	$1, R2
+	BEQ	R6, R7, byte_loop
+	JMP	not_eq
+
+chunk_entry:
+	// make sure both a and b are aligned
+	OR	R1, R2, R9
+	AND	$0x7, R9
+	BNE	R0, R9, byte_loop
+	JMP	chunk_loop_1
+
+chunk_loop:
+	// chunk size is 16
+	SGTU	$16, R3, R8
+	BNE	R0, R8, chunk_tail_8
+chunk_loop_1:
+	MOVV	(R1), R6
+	MOVV	(R2), R7
+	BNE	R6, R7, not_eq
+	MOVV	8(R1), R12
+	MOVV	8(R2), R13
+	ADDV	$16, R1
+	ADDV	$16, R2
+	SUBV	$16, R3
+	BEQ	R12, R13, chunk_loop
+	JMP	not_eq
+
+chunk_tail_8:
+	AND	$8, R3, R14
+	BEQ	R0, R14, chunk_tail_4
+	MOVV	(R1), R6
+	MOVV	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$8, R1
+	ADDV	$8, R2
+
+chunk_tail_4:
+	AND	$4, R3, R14
+	BEQ	R0, R14, chunk_tail_2
+	MOVWU	(R1), R6
+	MOVWU	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$4, R1
+	ADDV	$4, R2
+
+chunk_tail_2:
+	AND	$2, R3, R14
+	BEQ	R0, R14, chunk_tail_1
+	MOVHU	(R1), R6
+	MOVHU	(R2), R7
+	BNE	R6, R7, not_eq
+	ADDV	$2, R1
+	ADDV	$2, R2
+
+chunk_tail_1:
+	AND	$1, R3, R14
+	BEQ	R0, R14, eq
+	MOVBU	(R1), R6
+	MOVBU	(R2), R7
+	BEQ	R6, R7, eq
+
+not_eq:
+	MOVB	R0, ret+24(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	8(REGCTXT), R3    // compiler stores size at offset 8 in the closure
+	MOVV	R1, 8(R29)
+	MOVV	R2, 16(R29)
+	MOVV	R3, 24(R29)
+	JAL	runtime·memequal(SB)
+	MOVBU	32(R29), R1
+	MOVB	R1, ret+16(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/equal_mipsx.s b/src/internal/bytealg/equal_mipsx.s
new file mode 100644
index 0000000..1cabc70
--- /dev/null
+++ b/src/internal/bytealg/equal_mipsx.s
@@ -0,0 +1,62 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	REGCTXT	R22
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
+	MOVW	a+0(FP), R1
+	MOVW	b+4(FP), R2
+	BEQ	R1, R2, eq
+	MOVW	size+8(FP), R3
+	ADDU	R1, R3, R4
+loop:
+	BNE	R1, R4, test
+	MOVW	$1, R1
+	MOVB	R1, ret+12(FP)
+	RET
+test:
+	MOVBU	(R1), R6
+	ADDU	$1, R1
+	MOVBU	(R2), R7
+	ADDU	$1, R2
+	BEQ	R6, R7, loop
+
+	MOVB	R0, ret+12(FP)
+	RET
+eq:
+	MOVW	$1, R1
+	MOVB	R1, ret+12(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
+	MOVW	a+0(FP), R1
+	MOVW	b+4(FP), R2
+	BEQ	R1, R2, eq
+	MOVW	4(REGCTXT), R3	// compiler stores size at offset 4 in the closure
+	ADDU	R1, R3, R4
+loop:
+	BNE	R1, R4, test
+	MOVW	$1, R1
+	MOVB	R1, ret+8(FP)
+	RET
+test:
+	MOVBU	(R1), R6
+	ADDU	$1, R1
+	MOVBU	(R2), R7
+	ADDU	$1, R2
+	BEQ	R6, R7, loop
+
+	MOVB	R0, ret+8(FP)
+	RET
+eq:
+	MOVW	$1, R1
+	MOVB	R1, ret+8(FP)
+	RET
diff --git a/src/internal/bytealg/equal_native.go b/src/internal/bytealg/equal_native.go
new file mode 100644
index 0000000..cf3a245
--- /dev/null
+++ b/src/internal/bytealg/equal_native.go
@@ -0,0 +1,21 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "unsafe"
+
+// The declarations below generate ABI wrappers for functions
+// implemented in assembly in this package but declared in another
+// package.
+
+// The compiler generates calls to runtime.memequal and runtime.memequal_varlen.
+// In addition, the runtime calls runtime.memequal explicitly.
+// Those functions are implemented in this package.
+
+//go:linkname abigen_runtime_memequal runtime.memequal
+func abigen_runtime_memequal(a, b unsafe.Pointer, size uintptr) bool
+
+//go:linkname abigen_runtime_memequal_varlen runtime.memequal_varlen
+func abigen_runtime_memequal_varlen(a, b unsafe.Pointer) bool
diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s
new file mode 100644
index 0000000..18171ea
--- /dev/null
+++ b/src/internal/bytealg/equal_ppc64x.s
@@ -0,0 +1,102 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD    a+0(FP), R3
+	MOVD    b+8(FP), R4
+	MOVD    size+16(FP), R5
+	MOVD    $ret+24(FP), R10
+
+	BR	memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R4
+	CMP	R3, R4
+	BEQ	eq
+	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
+	MOVD    $ret+16(FP), R10
+	BR	memeqbody<>(SB)
+eq:
+	MOVD	$1, R3
+	MOVB	R3, ret+16(FP)
+	RET
+
+// Do an efficient memequal for ppc64
+// R3 = s1
+// R4 = s2
+// R5 = len
+// R10 = addr of return value (byte)
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD    R5,CTR
+	CMP     R5,$8		// only optimize >=8
+	BLT     simplecheck
+	DCBT	(R3)		// cache hint
+	DCBT	(R4)
+	CMP	R5,$32		// optimize >= 32
+	MOVD	R5,R6		// needed if setup8a branch
+	BLT	setup8a		// 8 byte moves only
+setup32a:                       // 8 byte aligned, >= 32 bytes
+	SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
+	MOVD	R6,CTR
+	MOVD	$16,R14		// index for VSX loads and stores
+loop32a:
+	LXVD2X  (R3+R0), VS32	// VS32 = V0
+	LXVD2X  (R4+R0), VS33	// VS33 = V1
+	VCMPEQUBCC V0, V1, V2	// compare, setting CR6
+	BGE     CR6, noteq
+	LXVD2X  (R3+R14), VS32
+	LXVD2X  (R4+R14), VS33
+	VCMPEQUBCC V0, V1, V2
+	BGE     CR6, noteq
+	ADD     $32,R3		// bump up to next 32
+	ADD     $32,R4
+	BC      16, 0, loop32a  // br ctr and cr
+	ANDCC	$24,R5,R6       // Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC  $3,R6,R6        // get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD    R6,CTR
+loop8:
+	MOVD    0(R3),R6        // doublewords to compare
+	ADD	$8,R3
+	MOVD    0(R4),R7
+	ADD     $8,R4
+	CMP     R6,R7           // match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BNE     noteq
+leftover:
+	ANDCC   $7,R5,R6        // check for leftover bytes
+	BEQ     equal
+	MOVD    R6,CTR
+	BR	simple
+simplecheck:
+	CMP	R5,$0
+	BEQ	equal
+simple:
+	MOVBZ   0(R3), R6
+	ADD	$1,R3
+	MOVBZ   0(R4), R7
+	ADD     $1,R4
+	CMP     R6, R7
+	BNE     noteq
+	BC      8,2,simple
+	BNE	noteq
+	BR	equal
+noteq:
+	MOVB    $0, (R10)
+	RET
+equal:
+	MOVD	$1, R3
+	MOVB	R3, (R10)
+	RET
+
diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s
new file mode 100644
index 0000000..22cb4fa
--- /dev/null
+++ b/src/internal/bytealg/equal_riscv64.s
@@ -0,0 +1,49 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define	CTXT	S4
+
+// func memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOV	a+0(FP), A1
+	MOV	b+8(FP), A2
+	BEQ	A1, A2, eq
+	MOV	size+16(FP), A3
+	ADD	A1, A3, A4
+loop:
+	BEQ	A1, A4, eq
+
+	MOVBU	(A1), A6
+	ADD	$1, A1
+	MOVBU	(A2), A7
+	ADD	$1, A2
+	BEQ	A6, A7, loop
+
+	MOVB	ZERO, ret+24(FP)
+	RET
+eq:
+	MOV	$1, A1
+	MOVB	A1, ret+24(FP)
+	RET
+
+// func memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOV	a+0(FP), A1
+	MOV	b+8(FP), A2
+	BEQ	A1, A2, eq
+	MOV	8(CTXT), A3    // compiler stores size at offset 8 in the closure
+	MOV	A1, 8(X2)
+	MOV	A2, 16(X2)
+	MOV	A3, 24(X2)
+	CALL	runtime·memequal(SB)
+	MOVBU	32(X2), A1
+	MOVB	A1, ret+16(FP)
+	RET
+eq:
+	MOV	$1, A1
+	MOVB	A1, ret+16(FP)
+	RET
diff --git a/src/internal/bytealg/equal_s390x.s b/src/internal/bytealg/equal_s390x.s
new file mode 100644
index 0000000..67f814d
--- /dev/null
+++ b/src/internal/bytealg/equal_s390x.s
@@ -0,0 +1,92 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	size+16(FP), R6
+	LA	ret+24(FP), R7
+	BR	memeqbody<>(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	8(R12), R6    // compiler stores size at offset 8 in the closure
+	LA	ret+16(FP), R7
+	BR	memeqbody<>(SB)
+
+// input:
+//   R3 = a
+//   R5 = b
+//   R6 = len
+//   R7 = address of output byte (stores 0 or 1 here)
+//   a and b have the same length
+TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, equal
+loop:
+	CMPBEQ	R6, $0, equal
+	CMPBLT	R6, $32, tiny
+	CMP	R6, $256
+	BLT	tail
+	CLC	$256, 0(R3), 0(R5)
+	BNE	notequal
+	SUB	$256, R6
+	LA	256(R3), R3
+	LA	256(R5), R5
+	BR	loop
+tail:
+	SUB	$1, R6, R8
+	EXRL	$memeqbodyclc<>(SB), R8
+	BEQ	equal
+notequal:
+	MOVB	$0, 0(R7)
+	RET
+equal:
+	MOVB	$1, 0(R7)
+	RET
+tiny:
+	MOVD	$0, R2
+	CMPBLT	R6, $16, lt16
+	MOVD	0(R3), R8
+	MOVD	0(R5), R9
+	CMPBNE	R8, R9, notequal
+	MOVD	8(R3), R8
+	MOVD	8(R5), R9
+	CMPBNE	R8, R9, notequal
+	LA	16(R2), R2
+	SUB	$16, R6
+lt16:
+	CMPBLT	R6, $8, lt8
+	MOVD	0(R3)(R2*1), R8
+	MOVD	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	8(R2), R2
+	SUB	$8, R6
+lt8:
+	CMPBLT	R6, $4, lt4
+	MOVWZ	0(R3)(R2*1), R8
+	MOVWZ	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	4(R2), R2
+	SUB	$4, R6
+lt4:
+#define CHECK(n) \
+	CMPBEQ	R6, $n, equal \
+	MOVB	n(R3)(R2*1), R8 \
+	MOVB	n(R5)(R2*1), R9 \
+	CMPBNE	R8, R9, notequal
+	CHECK(0)
+	CHECK(1)
+	CHECK(2)
+	CHECK(3)
+	BR	equal
+
+TEXT memeqbodyclc<>(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
diff --git a/src/internal/bytealg/equal_wasm.s b/src/internal/bytealg/equal_wasm.s
new file mode 100644
index 0000000..a2b76c1
--- /dev/null
+++ b/src/internal/bytealg/equal_wasm.s
@@ -0,0 +1,77 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB), NOSPLIT, $0-25
+	Get SP
+	I64Load a+0(FP)
+	I64Load b+8(FP)
+	I64Load size+16(FP)
+	Call memeqbody<>(SB)
+	I64Store8 ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB), NOSPLIT, $0-17
+	Get SP
+	I64Load a+0(FP)
+	I64Load b+8(FP)
+	I64Load 8(CTXT) // compiler stores size at offset 8 in the closure
+	Call memeqbody<>(SB)
+	I64Store8 ret+16(FP)
+	RET
+
+// params: a, b, len
+// ret: 0/1
+TEXT memeqbody<>(SB), NOSPLIT, $0-0
+	Get R0
+	Get R1
+	I64Eq
+	If
+		I64Const $1
+		Return
+	End
+
+loop:
+	Loop
+		Get R2
+		I64Eqz
+		If
+			I64Const $1
+			Return
+		End
+
+		Get R0
+		I32WrapI64
+		I64Load8U $0
+		Get R1
+		I32WrapI64
+		I64Load8U $0
+		I64Ne
+		If
+			I64Const $0
+			Return
+		End
+
+		Get R0
+		I64Const $1
+		I64Add
+		Set R0
+
+		Get R1
+		I64Const $1
+		I64Add
+		Set R1
+
+		Get R2
+		I64Const $1
+		I64Sub
+		Set R2
+
+		Br loop
+	End
+	UNDEF
diff --git a/src/internal/bytealg/index_amd64.go b/src/internal/bytealg/index_amd64.go
new file mode 100644
index 0000000..c7a1941
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.go
@@ -0,0 +1,26 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	if cpu.X86.HasAVX2 {
+		MaxLen = 63
+	} else {
+		MaxLen = 31
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
new file mode 100644
index 0000000..6193b57
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.s
@@ -0,0 +1,274 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+24(FP), R8
+	MOVQ b_len+32(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+48(FP), R11
+	JMP  indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVQ a_base+0(FP), DI
+	MOVQ a_len+8(FP), DX
+	MOVQ b_base+16(FP), R8
+	MOVQ b_len+24(FP), AX
+	MOVQ DI, R10
+	LEAQ ret+32(FP), R11
+	JMP  indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// R8: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+	CMPQ AX, DX
+	JA fail
+	CMPQ DX, $16
+	JAE sse42
+no_sse42:
+	CMPQ AX, $2
+	JA   _3_or_more
+	MOVW (R8), R8
+	LEAQ -1(DI)(DX*1), DX
+loop2:
+	MOVW (DI), SI
+	CMPW SI,R8
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop2
+	JMP fail
+_3_or_more:
+	CMPQ AX, $3
+	JA   _4_or_more
+	MOVW 1(R8), BX
+	MOVW (R8), R8
+	LEAQ -2(DI)(DX*1), DX
+loop3:
+	MOVW (DI), SI
+	CMPW SI,R8
+	JZ   partial_success3
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+partial_success3:
+	MOVW 1(DI), SI
+	CMPW SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+_4_or_more:
+	CMPQ AX, $4
+	JA   _5_or_more
+	MOVL (R8), R8
+	LEAQ -3(DI)(DX*1), DX
+loop4:
+	MOVL (DI), SI
+	CMPL SI,R8
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop4
+	JMP fail
+_5_or_more:
+	CMPQ AX, $7
+	JA   _8_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVL -4(R8)(AX*1), BX
+	MOVL (R8), R8
+loop5to7:
+	MOVL (DI), SI
+	CMPL SI,R8
+	JZ   partial_success5to7
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+partial_success5to7:
+	MOVL -4(AX)(DI*1), SI
+	CMPL SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+_8_or_more:
+	CMPQ AX, $8
+	JA   _9_or_more
+	MOVQ (R8), R8
+	LEAQ -7(DI)(DX*1), DX
+loop8:
+	MOVQ (DI), SI
+	CMPQ SI,R8
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop8
+	JMP fail
+_9_or_more:
+	CMPQ AX, $15
+	JA   _16_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVQ -8(R8)(AX*1), BX
+	MOVQ (R8), R8
+loop9to15:
+	MOVQ (DI), SI
+	CMPQ SI,R8
+	JZ   partial_success9to15
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+partial_success9to15:
+	MOVQ -8(AX)(DI*1), SI
+	CMPQ SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+_16_or_more:
+	CMPQ AX, $16
+	JA   _17_or_more
+	MOVOU (R8), X1
+	LEAQ -15(DI)(DX*1), DX
+loop16:
+	MOVOU (DI), X2
+	PCMPEQB X1, X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop16
+	JMP fail
+_17_or_more:
+	CMPQ AX, $31
+	JA   _32_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVOU -16(R8)(AX*1), X0
+	MOVOU (R8), X1
+loop17to31:
+	MOVOU (DI), X2
+	PCMPEQB X1,X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   partial_success17to31
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+partial_success17to31:
+	MOVOU -16(AX)(DI*1), X3
+	PCMPEQB X0, X3
+	PMOVMSKB X3, SI
+	CMPQ  SI, $0xffff
+	JE success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+	CMPQ AX, $32
+	JA   _33_to_63
+	VMOVDQU (R8), Y1
+	LEAQ -31(DI)(DX*1), DX
+loop32:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop32
+	JMP fail_avx2
+_33_to_63:
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	VMOVDQU -32(R8)(AX*1), Y0
+	VMOVDQU (R8), Y1
+loop33to63:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPMOVMSKB Y3, SI
+	CMPL  SI, $0xffffffff
+	JE   partial_success33to63
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+	JMP fail_avx2
+partial_success33to63:
+	VMOVDQU -32(AX)(DI*1), Y3
+	VPCMPEQB Y0, Y3, Y4
+	VPMOVMSKB Y4, SI
+	CMPL  SI, $0xffffffff
+	JE success_avx2
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop33to63
+fail_avx2:
+	VZEROUPPER
+fail:
+	MOVQ $-1, (R11)
+	RET
+success_avx2:
+	VZEROUPPER
+	JMP success
+sse42:
+	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
+	JNE no_sse42
+	CMPQ AX, $12
+	// PCMPESTRI is slower than normal compare,
+	// so using it makes sense only if we advance 4+ bytes per compare
+	// This value was determined experimentally and is the ~same
+	// on Nehalem (first with SSE42) and Haswell.
+	JAE _9_or_more
+	LEAQ 16(R8), SI
+	TESTW $0xff0, SI
+	JEQ no_sse42
+	MOVOU (R8), X1
+	LEAQ -15(DI)(DX*1), SI
+	MOVQ $16, R9
+	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
+	// for equality (bits 2,3 are 11)
+	// result is not masked or inverted (bits 4,5 are 00)
+	// and corresponds to first matching byte (bit 6 is 0)
+	PCMPESTRI $0x0c, (DI), X1
+	// CX == 16 means no match,
+	// CX > R9 means partial match at the end of the string,
+	// otherwise sep is at offset CX from X1 start
+	CMPQ CX, R9
+	JBE sse42_success
+	ADDQ R9, DI
+	CMPQ DI, SI
+	JB loop_sse42
+	PCMPESTRI $0x0c, -1(SI), X1
+	CMPQ CX, R9
+	JA fail
+	LEAQ -1(SI), DI
+sse42_success:
+	ADDQ CX, DI
+success:
+	SUBQ R10, DI
+	MOVQ DI, (R11)
+	RET
diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go
new file mode 100644
index 0000000..e87c109
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.go
@@ -0,0 +1,23 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using Index can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+	// Optimize cases where the length of the substring is less than 32 bytes
+	MaxLen = 32
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 16 characters, plus a few slop to start.
+	return 4 + n>>4
+}
diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s
new file mode 100644
index 0000000..3a551a7
--- /dev/null
+++ b/src/internal/bytealg/index_arm64.s
@@ -0,0 +1,206 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+24(FP), R2
+	MOVD	b_len+32(FP), R3
+	MOVD	$ret+48(FP), R9
+	B	indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+	MOVD	a_base+0(FP), R0
+	MOVD	a_len+8(FP), R1
+	MOVD	b_base+16(FP), R2
+	MOVD	b_len+24(FP), R3
+	MOVD	$ret+32(FP), R9
+	B	indexbody<>(SB)
+
+// input:
+//   R0: haystack
+//   R1: length of haystack
+//   R2: needle
+//   R3: length of needle (2 <= len <= 32)
+//   R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
+	// main idea is to load 'sep' into separate register(s)
+	// to avoid repeatedly re-load it again and again
+	// for sebsequent substring comparisons
+	SUB	R3, R1, R4
+	// R4 contains the start of last substring for comparison
+	ADD	R0, R4, R4
+	ADD	$1, R0, R8
+
+	CMP	$8, R3
+	BHI	greater_8
+	TBZ	$3, R3, len_2_7
+len_8:
+	// R5 contains 8-byte of sep
+	MOVD	(R2), R5
+loop_8:
+	// R6 contains substring for comparison
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_8
+	B	found
+len_2_7:
+	TBZ	$2, R3, len_2_3
+	TBZ	$1, R3, len_4_5
+	TBZ	$0, R3, len_6
+len_7:
+	// R5 and R6 contain 7-byte of sep
+	MOVWU	(R2), R5
+	// 1-byte overlap with R5
+	MOVWU	3(R2), R6
+loop_7:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_7
+	MOVWU	2(R0), R3
+	CMP	R6, R3
+	BNE	loop_7
+	B	found
+len_6:
+	// R5 and R6 contain 6-byte of sep
+	MOVWU	(R2), R5
+	MOVHU	4(R2), R6
+loop_6:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_6
+	MOVHU	3(R0), R3
+	CMP	R6, R3
+	BNE	loop_6
+	B	found
+len_4_5:
+	TBZ	$0, R3, len_4
+len_5:
+	// R5 and R7 contain 5-byte of sep
+	MOVWU	(R2), R5
+	MOVBU	4(R2), R7
+loop_5:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R3
+	CMP	R5, R3
+	BNE	loop_5
+	MOVBU	3(R0), R3
+	CMP	R7, R3
+	BNE	loop_5
+	B	found
+len_4:
+	// R5 contains 4-byte of sep
+	MOVWU	(R2), R5
+loop_4:
+	CMP	R4, R0
+	BHI	not_found
+	MOVWU.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_4
+	B	found
+len_2_3:
+	TBZ	$0, R3, len_2
+len_3:
+	// R6 and R7 contain 3-byte of sep
+	MOVHU	(R2), R6
+	MOVBU	2(R2), R7
+loop_3:
+	CMP	R4, R0
+	BHI	not_found
+	MOVHU.P	1(R0), R3
+	CMP	R6, R3
+	BNE	loop_3
+	MOVBU	1(R0), R3
+	CMP	R7, R3
+	BNE	loop_3
+	B	found
+len_2:
+	// R5 contains 2-byte of sep
+	MOVHU	(R2), R5
+loop_2:
+	CMP	R4, R0
+	BHI	not_found
+	MOVHU.P	1(R0), R6
+	CMP	R5, R6
+	BNE	loop_2
+found:
+	SUB	R8, R0, R0
+	MOVD	R0, (R9)
+	RET
+not_found:
+	MOVD	$-1, R0
+	MOVD	R0, (R9)
+	RET
+greater_8:
+	SUB	$9, R3, R11	// len(sep) - 9, offset of R0 for last 8 bytes
+	CMP	$16, R3
+	BHI	greater_16
+len_9_16:
+	MOVD.P	8(R2), R5	// R5 contains the first 8-byte of sep
+	SUB	$16, R3, R7	// len(sep) - 16, offset of R2 for last 8 bytes
+	MOVD	(R2)(R7), R6	// R6 contains the last 8-byte of sep
+loop_9_16:
+	// search the first 8 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R7
+	CMP	R5, R7
+	BNE	loop_9_16
+	MOVD	(R0)(R11), R7
+	CMP	R6, R7		// compare the last 8 bytes
+	BNE	loop_9_16
+	B	found
+greater_16:
+	CMP	$24, R3
+	BHI	len_25_32
+len_17_24:
+	LDP.P	16(R2), (R5, R6)	// R5 and R6 contain the first 16-byte of sep
+	SUB	$24, R3, R10		// len(sep) - 24
+	MOVD	(R2)(R10), R7		// R7 contains the last 8-byte of sep
+loop_17_24:
+	// search the first 16 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R10
+	CMP	R5, R10
+	BNE	loop_17_24
+	MOVD	7(R0), R10
+	CMP	R6, R10
+	BNE	loop_17_24
+	MOVD	(R0)(R11), R10
+	CMP	R7, R10		// compare the last 8 bytes
+	BNE	loop_17_24
+	B	found
+len_25_32:
+	LDP.P	16(R2), (R5, R6)
+	MOVD.P	8(R2), R7	// R5, R6 and R7 contain the first 24-byte of sep
+	SUB	$32, R3, R12	// len(sep) - 32
+	MOVD	(R2)(R12), R10	// R10 contains the last 8-byte of sep
+loop_25_32:
+	// search the first 24 bytes first
+	CMP	R4, R0
+	BHI	not_found
+	MOVD.P	1(R0), R12
+	CMP	R5, R12
+	BNE	loop_25_32
+	MOVD	7(R0), R12
+	CMP	R6, R12
+	BNE	loop_25_32
+	MOVD	15(R0), R12
+	CMP	R7, R12
+	BNE	loop_25_32
+	MOVD	(R0)(R11), R12
+	CMP	R10, R12	// compare the last 8 bytes
+	BNE	loop_25_32
+	B	found
diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go
new file mode 100644
index 0000000..98e859f
--- /dev/null
+++ b/src/internal/bytealg/index_generic.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64,!s390x
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+	panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+	panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	panic("unimplemented")
+}
diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go
new file mode 100644
index 0000000..fde4214
--- /dev/null
+++ b/src/internal/bytealg/index_native.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 s390x
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
diff --git a/src/internal/bytealg/index_s390x.go b/src/internal/bytealg/index_s390x.go
new file mode 100644
index 0000000..9340cf1
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+	// Note: we're kind of lucky that this flag is available at this point.
+	// The runtime sets HasVX when processing auxv records, and that happens
+	// to happen *before* running the init functions of packages that
+	// the runtime depends on.
+	// TODO: it would really be nicer for internal/cpu to figure out this
+	// flag by itself. Then we wouldn't need to depend on quirks of
+	// early startup initialization order.
+	if cpu.S390X.HasVX {
+		MaxLen = 64
+	}
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+	// 1 error per 8 characters, plus a few slop to start.
+	return (n + 16) / 8
+}
diff --git a/src/internal/bytealg/index_s390x.s b/src/internal/bytealg/index_s390x.s
new file mode 100644
index 0000000..491d5bc
--- /dev/null
+++ b/src/internal/bytealg/index_s390x.s
@@ -0,0 +1,216 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+48(FP), R5
+	BR	indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+	LMG	a_base+0(FP), R1, R2  // R1=&s[0],   R2=len(s)
+	LMG	b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+32(FP), R5
+	BR	indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBGT	R4, R2, notfound
+	ADD	R1, R2
+	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+	CMPBEQ	R4, $0, notfound
+	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
+	VLL	R4, (R3), V0 // contains first 16 bytes of sep
+	MOVD	R1, R7
+index2plus:
+	CMPBNE	R4, $1, index3plus
+	MOVD	$15(R7), R9
+	CMPBGE	R9, R2, index2to16
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	CMPBGE	R9, R2, index2to16
+index2loop:
+	VL	0(R7), V2          // 16 bytes, even indices
+	VL	1(R7), V4          // 16 bytes, odd indices
+	VCEQH	V1, V2, V5         // compare even indices
+	VCEQH	V1, V4, V6         // compare odd indices
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index3plus:
+	CMPBNE	R4, $2, index4plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$1, R0
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	VREPB	$2, V0, V8
+index3loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 2-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
+	VCEQH	V1, V2, V5         // compare 2-byte even indices
+	VCEQH	V1, V4, V6         // compare 2-byte odd indices
+	VCEQB	V8, V9, V10        // compare last bytes
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VN	V7, V10, V7        // AND indices with last byte
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index4plus:
+	CMPBNE	R4, $3, index5plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$2, R0
+	VGBM	$0x8888, V29       // 0xff000000ff000000...
+	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
+	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
+	VONE	V16
+	VREPF	$0, V0, V1
+index4loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 3-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<1
+	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<1
+	VCEQF	V1, V2, V5         // compare index 0, 4, ...
+	VCEQF	V1, V4, V6         // compare index 1, 5, ...
+	VCEQF	V1, V9, V11        // compare index 2, 6, ...
+	VCEQF	V1, V10, V12       // compare index 3, 7, ...
+	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
+	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+	VSEL	V13, V14, V31, V7  // final merge
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index5plus:
+	CMPBGT	R4, $15, index17plus
+index2to16:
+	CMPBGT	R7, R2, notfound
+	MOVD	$1(R7), R8
+	CMPBGT	R8, R2, index2to16tail
+index2to16loop:
+	// unrolled 2x
+	VLL	R4, (R7), V1
+	VLL	R4, 1(R7), V2
+	VCEQGS	V0, V1, V3
+	BEQ	found
+	MOVD	$1(R7), R7
+	VCEQGS	V0, V2, V4
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLT	R7, R2, index2to16loop
+	CMPBGT	R7, R2, notfound
+index2to16tail:
+	VLL	R4, (R7), V1
+	VCEQGS	V0, V1, V2
+	BEQ	found
+	BR	notfound
+
+index17plus:
+	CMPBGT	R4, $31, index33plus
+	SUB	$16, R4, R0
+	VLL	R0, 16(R3), V1
+	VONE	V7
+index17to32loop:
+	VL	(R7), V2
+	VLL	R0, 16(R7), V3
+	VCEQG	V0, V2, V4
+	VCEQG	V1, V3, V5
+	VN	V4, V5, V6
+	VCEQGS	V6, V7, V8
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index17to32loop
+	BR	notfound
+
+index33plus:
+	CMPBGT	R4, $47, index49plus
+	SUB	$32, R4, R0
+	VL	16(R3), V1
+	VLL	R0, 32(R3), V2
+	VONE	V11
+index33to48loop:
+	VL	(R7), V3
+	VL	16(R7), V4
+	VLL	R0, 32(R7), V5
+	VCEQG	V0, V3, V6
+	VCEQG	V1, V4, V7
+	VCEQG	V2, V5, V8
+	VN	V6, V7, V9
+	VN	V8, V9, V10
+	VCEQGS	V10, V11, V12
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index33to48loop
+	BR	notfound
+
+index49plus:
+	CMPBGT	R4, $63, index65plus
+	SUB	$48, R4, R0
+	VL	16(R3), V1
+	VL	32(R3), V2
+	VLL	R0, 48(R3), V3
+	VONE	V15
+index49to64loop:
+	VL	(R7), V4
+	VL	16(R7), V5
+	VL	32(R7), V6
+	VLL	R0, 48(R7), V7
+	VCEQG	V0, V4, V8
+	VCEQG	V1, V5, V9
+	VCEQG	V2, V6, V10
+	VCEQG	V3, V7, V11
+	VN	V8, V9, V12
+	VN	V10, V11, V13
+	VN	V12, V13, V14
+	VCEQGS	V14, V15, V16
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index49to64loop
+notfound:
+	MOVD	$-1, (R5)
+	RET
+
+index65plus:
+	// not implemented
+	MOVD	$0, (R0)
+	RET
+
+foundV17: // index is in doubleword V17[0]
+	VLGVG	$0, V17, R8
+	ADD	R8, R7
+found:
+	SUB	R1, R7
+	MOVD	R7, (R5)
+	RET
diff --git a/src/internal/bytealg/indexbyte_386.s b/src/internal/bytealg/indexbyte_386.s
new file mode 100644
index 0000000..8a03054
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_386.s
@@ -0,0 +1,34 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVL	b_base+0(FP), SI
+	MOVL	b_len+4(FP), CX
+	MOVB	c+12(FP), AL
+	MOVL	SI, DI
+	CLD; REPN; SCASB
+	JZ 3(PC)
+	MOVL	$-1, ret+16(FP)
+	RET
+	SUBL	SI, DI
+	SUBL	$1, DI
+	MOVL	DI, ret+16(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVL	s_base+0(FP), SI
+	MOVL	s_len+4(FP), CX
+	MOVB	c+8(FP), AL
+	MOVL	SI, DI
+	CLD; REPN; SCASB
+	JZ 3(PC)
+	MOVL	$-1, ret+12(FP)
+	RET
+	SUBL	SI, DI
+	SUBL	$1, DI
+	MOVL	DI, ret+12(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
new file mode 100644
index 0000000..f78093c
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -0,0 +1,147 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT	·IndexByte(SB), NOSPLIT, $0-40
+	MOVQ b_base+0(FP), SI
+	MOVQ b_len+8(FP), BX
+	MOVB c+24(FP), AL
+	LEAQ ret+32(FP), R8
+	JMP  indexbytebody<>(SB)
+
+TEXT	·IndexByteString(SB), NOSPLIT, $0-32
+	MOVQ s_base+0(FP), SI
+	MOVQ s_len+8(FP), BX
+	MOVB c+16(FP), AL
+	LEAQ ret+24(FP), R8
+	JMP  indexbytebody<>(SB)
+
+// input:
+//   SI: data
+//   BX: data len
+//   AL: byte sought
+//   R8: address to put result
+TEXT	indexbytebody<>(SB), NOSPLIT, $0
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
+
+	CMPQ BX, $16
+	JLT small
+
+	MOVQ SI, DI
+
+	CMPQ BX, $32
+	JA avx2
+sse:
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
+	PMOVMSKB X1, DX
+	// Find first set bit, if any.
+	BSFL	DX, DX
+	JNZ	ssesuccess
+	// Advance to next block.
+	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JB	sseloop
+
+	// Search the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched, but that's ok.
+	MOVQ	AX, DI
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	BSFL	DX, DX
+	JNZ	ssesuccess
+
+failure:
+	MOVQ $-1, (R8)
+	RET
+
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+	SUBQ SI, DI	// Compute offset of chunk within data.
+	ADDQ DX, DI	// Add offset of byte within chunk.
+	MOVQ DI, (R8)
+	RET
+
+// handle for lengths < 16
+small:
+	TESTQ	BX, BX
+	JEQ	failure
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	MOVOU	(SI), X1 // Load data
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	CMPL	DX, BX
+	JAE	failure	// Match is past end of data.
+	MOVQ	DX, (R8)
+	RET
+
+endofpage:
+	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	MOVL	BX, CX
+	SHLL	CX, DX
+	SHRL	$16, DX	// Shift desired bits down to bottom of register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	MOVQ	DX, (R8)
+	RET
+
+avx2:
+	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE sse
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, Y1
+avx2_loop:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLT avx2_loop
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	VZEROUPPER
+	MOVQ $-1, (R8)
+	RET
+
+avx2success:
+	VPMOVMSKB Y3, DX
+	BSFL DX, DX
+	SUBQ SI, DI
+	ADDQ DI, DX
+	MOVQ DX, (R8)
+	VZEROUPPER
+	RET
diff --git a/src/internal/bytealg/indexbyte_arm.s b/src/internal/bytealg/indexbyte_arm.s
new file mode 100644
index 0000000..faf9797
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm.s
@@ -0,0 +1,46 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R0
+	MOVW	b_len+4(FP), R1
+	MOVBU	c+12(FP), R2	// byte to find
+	MOVW	$ret+16(FP), R5
+	B	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R0
+	MOVW	s_len+4(FP), R1
+	MOVBU	c+8(FP), R2	// byte to find
+	MOVW	$ret+12(FP), R5
+	B	indexbytebody<>(SB)
+
+// input:
+//  R0: data
+//  R1: data length
+//  R2: byte to find
+//  R5: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0-0
+	MOVW	R0, R4		// store base for later
+	ADD	R0, R1		// end
+
+loop:
+	CMP	R0, R1
+	B.EQ	notfound
+	MOVBU.P	1(R0), R3
+	CMP	R2, R3
+	B.NE	loop
+
+	SUB	$1, R0		// R0 will be one beyond the position we want
+	SUB	R4, R0		// remove base
+	MOVW	R0, (R5)
+	RET
+
+notfound:
+	MOVW	$-1, R0
+	MOVW	R0, (R5)
+	RET
diff --git a/src/internal/bytealg/indexbyte_arm64.s b/src/internal/bytealg/indexbyte_arm64.s
new file mode 100644
index 0000000..40843fb
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_arm64.s
@@ -0,0 +1,126 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOVD	b_base+0(FP), R0
+	MOVD	b_len+8(FP), R2
+	MOVBU	c+24(FP), R1
+	MOVD	$ret+32(FP), R8
+	B	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOVD	s_base+0(FP), R0
+	MOVD	s_len+8(FP), R2
+	MOVBU	c+16(FP), R1
+	MOVD	$ret+24(FP), R8
+	B	indexbytebody<>(SB)
+
+// input:
+//   R0: data
+//   R1: byte to search
+//   R2: data len
+//   R8: address to put result
+TEXT indexbytebody<>(SB),NOSPLIT,$0
+	// Core algorithm:
+	// For each 32-byte chunk we calculate a 64-bit syndrome value,
+	// with two bits per byte. For each tuple, bit 0 is set if the
+	// relevant byte matched the requested character and bit 1 is
+	// not used (faster than using a 32bit syndrome). Since the bits
+	// in the syndrome reflect exactly the order in which things occur
+	// in the original string, counting trailing zeros allows to
+	// identify exactly which byte has matched.
+
+	CBZ	R2, fail
+	MOVD	R0, R11
+	// Magic constant 0x40100401 allows us to identify
+	// which lane matches the requested byte.
+	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
+	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
+	MOVD	$0x40100401, R5
+	VMOV	R1, V0.B16
+	// Work with aligned 32-byte chunks
+	BIC	$0x1f, R0, R3
+	VMOV	R5, V5.S4
+	ANDS	$0x1f, R0, R9
+	AND	$0x1f, R2, R10
+	BEQ	loop
+
+	// Input string is not 32-byte aligned. We calculate the
+	// syndrome value for the aligned 32 bytes block containing
+	// the first bytes and mask off the irrelevant part.
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUB	$0x20, R9, R4
+	ADDS	R4, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
+	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
+	VMOV	V6.D[0], R6
+	// Clear the irrelevant lower bits
+	LSL	$1, R9, R4
+	LSR	R4, R6, R6
+	LSL	R4, R6, R6
+	// The first block can also be the last
+	BLS	masklast
+	// Have we found something already?
+	CBNZ	R6, tail
+
+loop:
+	VLD1.P	(R3), [V1.B16, V2.B16]
+	SUBS	$0x20, R2, R2
+	VCMEQ	V0.B16, V1.B16, V3.B16
+	VCMEQ	V0.B16, V2.B16, V4.B16
+	// If we're out of data we finish regardless of the result
+	BLS	end
+	// Use a fast check for the termination condition
+	VORR	V4.B16, V3.B16, V6.B16
+	VADDP	V6.D2, V6.D2, V6.D2
+	VMOV	V6.D[0], R6
+	// We're not out of data, loop if we haven't found the character
+	CBZ	R6, loop
+
+end:
+	// Termination condition found, let's calculate the syndrome value
+	VAND	V5.B16, V3.B16, V3.B16
+	VAND	V5.B16, V4.B16, V4.B16
+	VADDP	V4.B16, V3.B16, V6.B16
+	VADDP	V6.B16, V6.B16, V6.B16
+	VMOV	V6.D[0], R6
+	// Only do the clear for the last possible block with less than 32 bytes
+	// Condition flags come from SUBS in the loop
+	BHS	tail
+
+masklast:
+	// Clear the irrelevant upper bits
+	ADD	R9, R10, R4
+	AND	$0x1f, R4, R4
+	SUB	$0x20, R4, R4
+	NEG	R4<<1, R4
+	LSL	R4, R6, R6
+	LSR	R4, R6, R6
+
+tail:
+	// Check that we have found a character
+	CBZ	R6, fail
+	// Count the trailing zeros using bit reversing
+	RBIT	R6, R6
+	// Compensate the last post-increment
+	SUB	$0x20, R3, R3
+	// And count the leading zeros
+	CLZ	R6, R6
+	// R6 is twice the offset into the fragment
+	ADD	R6>>1, R3, R0
+	// Compute the offset result
+	SUB	R11, R0, R0
+	MOVD	R0, (R8)
+	RET
+
+fail:
+	MOVD	$-1, R0
+	MOVD	R0, (R8)
+	RET
diff --git a/src/internal/bytealg/indexbyte_generic.go b/src/internal/bytealg/indexbyte_generic.go
new file mode 100644
index 0000000..0b012a8
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_generic.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !386,!amd64,!s390x,!arm,!arm64,!ppc64,!ppc64le,!mips,!mipsle,!mips64,!mips64le,!riscv64,!wasm
+
+package bytealg
+
+func IndexByte(b []byte, c byte) int {
+	for i, x := range b {
+		if x == c {
+			return i
+		}
+	}
+	return -1
+}
+
+func IndexByteString(s string, c byte) int {
+	for i := 0; i < len(s); i++ {
+		if s[i] == c {
+			return i
+		}
+	}
+	return -1
+}
diff --git a/src/internal/bytealg/indexbyte_mips64x.s b/src/internal/bytealg/indexbyte_mips64x.s
new file mode 100644
index 0000000..6ebf0de
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mips64x.s
@@ -0,0 +1,54 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOVV	b_base+0(FP), R1
+	MOVV	b_len+8(FP), R2
+	MOVBU	c+24(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+32(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+32(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOVV	s_base+0(FP), R1
+	MOVV	s_len+8(FP), R2
+	MOVBU	c+16(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+24(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_mipsx.s b/src/internal/bytealg/indexbyte_mipsx.s
new file mode 100644
index 0000000..e44440b
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_mipsx.s
@@ -0,0 +1,52 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-20
+	MOVW	b_base+0(FP), R1
+	MOVW	b_len+4(FP), R2
+	MOVBU	c+12(FP), R3	// byte to find
+	ADDU	$1, R1, R4	// store base+1 for later
+	ADDU	R1, R2	// end
+
+loop:
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	ADDU	$1, R1
+	BNE	R3, R5, loop
+
+	SUBU	R4, R1	// R1 will be one beyond the position we want so remove (base+1)
+	MOVW	R1, ret+16(FP)
+	RET
+
+notfound:
+	MOVW	$-1, R1
+	MOVW	R1, ret+16(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-16
+	MOVW	s_base+0(FP), R1
+	MOVW	s_len+4(FP), R2
+	MOVBU	c+8(FP), R3	// byte to find
+	ADDU	$1, R1, R4	// store base+1 for later
+	ADDU	R1, R2	// end
+
+loop:
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	ADDU	$1, R1
+	BNE	R3, R5, loop
+
+	SUBU	R4, R1	// remove (base+1)
+	MOVW	R1, ret+12(FP)
+	RET
+
+notfound:
+	MOVW	$-1, R1
+	MOVW	R1, ret+12(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_native.go b/src/internal/bytealg/indexbyte_native.go
new file mode 100644
index 0000000..f96c5be
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_native.go
@@ -0,0 +1,13 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 s390x arm arm64 ppc64 ppc64le mips mipsle mips64 mips64le riscv64 wasm
+
+package bytealg
+
+//go:noescape
+func IndexByte(b []byte, c byte) int
+
+//go:noescape
+func IndexByteString(s string, c byte) int
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..6e14e80
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,315 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
+	MOVD	b_len+8(FP), R4		// R4 = length
+	MOVBZ	c+24(FP), R5		// R5 = byte
+	MOVD	$ret+32(FP), R14	// R14 = &ret
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	s_base+0(FP), R3  // R3 = string
+	MOVD	s_len+8(FP), R4	  // R4 = length
+	MOVBZ	c+16(FP), R5	  // R5 = byte
+	MOVD	$ret+24(FP), R14  // R14 = &ret
+	BR	indexbytebody<>(SB)
+
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R17		// Save base address for calculating the index later.
+	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
+	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
+	ADD	R4,R3,R7	// Last acceptable address in R7.
+	DCBT	(R8)		// Prepare cache line.
+
+	RLDIMI	$16,R5,$32,R5
+	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
+	MOVD	$-1,R9
+	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
+	ADD	$-1,R7,R7
+#ifdef GOARCH_ppc64le
+	SLD	R6,R9,R9	// Prepare mask for Little Endian
+#else
+	SRD	R6,R9,R9	// Same for Big Endian
+#endif
+	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.
+
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base
+	RLDICL	$0,R7,$61,R6	// length-1
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
+
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
+
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR7
+	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+
+	// Either quadword aligned or 64-byte at this point. We can use LVX.
+qw_align:
+
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB  $0,V0		// Replicate 0 across V0
+	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
+	MTVRD	  R5,V1
+	LVSL	  (R0+R0),V11
+	VSLB	  V11,V10,V10
+	VSPLTB	  $7,V1,V1	// Replicate byte across V1
+	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
+	BLE	  tail
+
+	// We will load 4 quardwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	  $63,R8,R11
+	BEQ	  CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	ADD	    $-16,R4,R4
+
+	ANDCC	    $63,R8,R11
+	BEQ	    CR0,preloop
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	ADD	    $-16,R4,R4
+
+	ANDCC	    $63,R8,R11
+	BEQ	    CR0,preloop
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
+	BNE	    CR6,found_qw_align
+	ADD	    $-16,R4,R4
+	ADD	    $16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail	      // If len ≤ 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9      // Loop counter in R9
+	MOVD	R9,CTR
+
+	ADD	$-64,R8,R8   // Adjust index for loop entry
+	MOVD	$16,R11      // Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
+
+	// Main loop we will load 64 bytes per iteration
+loop:
+	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
+	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
+	LVX	    (R8+R11),V3
+	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
+	VCMPEQUB    V1,V3,V7
+
+	LVX	    (R8+R9),V4
+	LVX	    (R8+R7),V5
+	VCMPEQUB    V1,V4,V8
+	VCMPEQUB    V1,V5,V9
+
+	VOR	    V6,V7,V11	      // Compress the result in a single vector
+	VOR	    V8,V9,V12
+	VOR	    V11,V12,V13
+	VCMPEQUBCC  V0,V13,V14	      // Check for byte
+	BGE	    CR6,found
+	BC	    16,0,loop	      // bdnz loop
+
+	// Handle the tailing bytes or R4 ≤ 64
+	RLDICL	$0,R6,$58,R4
+	ADD	$64,R8,R8
+tail:
+	CMPU	    R4,$0
+	BEQ	    notfound
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+	ADD	    $16,R8,R8
+	CMPU	    R4,$16,CR6
+	BLE	    CR6,notfound
+	ADD	    $-16,R4,R4
+
+	LVX	    (R8+R0),V4
+	VCMPEQUBCC  V1,V4,V6
+	BNE	    CR6,found_qw_align
+
+notfound:
+	MOVD	$-1,R3
+	MOVD	R3,(R14)
+	RET
+
+found:
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
+
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	  V6,V10,V6
+	VBPERMQ	  V7,V10,V7
+	VBPERMQ	  V8,V10,V8
+	VBPERMQ	  V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	  $2,V7,V7,V7
+	VSLDOI	  $4,V8,V8,V8
+	VSLDOI	  $6,V9,V9,V9
+#else
+	VSLDOI	  $6,V6,V6,V6
+	VSLDOI	  $4,V7,V7,V7
+	VSLDOI	  $2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	  $-1,R3,R11
+	ANDN	  R3,R11,R11
+	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17,R3
+	MOVD	R3,(R14)
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	  V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	  V6,R3
+	ADD	  $-1,R3,R11
+	ANDN	  R3,R11,R11
+	POPCNTD	  R11,R11
+#else
+	VSLDOI	  $6,V6,V6,V6
+	MFVRD	  V6,R3
+	CNTLZD	  R3,R11
+#endif
+	ADD	  R8,R11,R3
+	CMPU	  R11,R4
+	BLT	  return
+	BR	  notfound
+
+done:
+	// At this point, R3 has 0xFF in the same position as the byte we are
+	// looking for in the doubleword. Use that to calculate the exact index
+	// of the byte.
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	CMPU	R8,R7		// Check if we are at the last doubleword.
+	SRD	$3,R11		// Convert trailing zeros to bytes.
+	ADD	R11,R8,R3
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
+	BNE	return
+	BLE	CR7,return
+	BR	notfound
+
+small_string:
+	// We unroll this loop for better performance.
+	CMPU	R4,$0		// Check for length=0
+	BEQ	notfound
+
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base.
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
+	RLDICL	$0,R7,$61,R6	// length-1
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
+	CMPU	R8,R7
+	BNE	CR7,done
+	BEQ	notfound	// Hit length.
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	BNE	CR6,done
+	BR	notfound
+
diff --git a/src/internal/bytealg/indexbyte_riscv64.s b/src/internal/bytealg/indexbyte_riscv64.s
new file mode 100644
index 0000000..156c303
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_riscv64.s
@@ -0,0 +1,52 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT,$0-40
+	MOV	b_base+0(FP), A1
+	MOV	b_len+8(FP), A2
+	MOVBU	c+24(FP), A3	// byte to find
+	MOV	A1, A4		// store base for later
+	ADD	A1, A2		// end
+	ADD	$-1, A1
+
+loop:
+	ADD	$1, A1
+	BEQ	A1, A2, notfound
+	MOVBU	(A1), A5
+	BNE	A3, A5, loop
+
+	SUB	A4, A1		// remove base
+	MOV	A1, ret+32(FP)
+	RET
+
+notfound:
+	MOV	$-1, A1
+	MOV	A1, ret+32(FP)
+	RET
+
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	MOV	s_base+0(FP), A1
+	MOV	s_len+8(FP), A2
+	MOVBU	c+16(FP), A3	// byte to find
+	MOV	A1, A4		// store base for later
+	ADD	A1, A2		// end
+	ADD	$-1, A1
+
+loop:
+	ADD	$1, A1
+	BEQ	A1, A2, notfound
+	MOVBU	(A1), A5
+	BNE	A3, A5, loop
+
+	SUB	A4, A1		// remove base
+	MOV	A1, ret+24(FP)
+	RET
+
+notfound:
+	MOV	$-1, A1
+	MOV	A1, ret+24(FP)
+	RET
diff --git a/src/internal/bytealg/indexbyte_s390x.s b/src/internal/bytealg/indexbyte_s390x.s
new file mode 100644
index 0000000..cf88d92
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_s390x.s
@@ -0,0 +1,108 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	b_base+0(FP), R3// b_base => R3
+	MOVD	b_len+8(FP), R4 // b_len => R4
+	MOVBZ	c+24(FP), R5    // c => R5
+	MOVD	$ret+32(FP), R2 // &ret => R9
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	s_base+0(FP), R3// s_base => R3
+	MOVD	s_len+8(FP), R4 // s_len => R4
+	MOVBZ	c+16(FP), R5    // c => R5
+	MOVD	$ret+24(FP), R2 // &ret => R9
+	BR	indexbytebody<>(SB)
+
+// input:
+// R3: s
+// R4: s_len
+// R5: c -- byte sought
+// R2: &ret -- address to put index into
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0
+	CMPBEQ	R4, $0, notfound
+	MOVD	R3, R6          // store base for later
+	ADD	R3, R4, R8      // the address after the end of the string
+	//if the length is small, use loop; otherwise, use vector or srst search
+	CMPBGE	R4, $16, large
+
+residual:
+	CMPBEQ	R3, R8, notfound
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, residual
+
+found:
+	SUB	R6, R3
+	SUB	$1, R3
+	MOVD	R3, 0(R2)
+	RET
+
+notfound:
+	MOVD	$-1, 0(R2)
+	RET
+
+large:
+	MOVBZ	internal∕cpu·S390X+const_offsetS390xHasVX(SB), R1
+	CMPBNE	R1, $0, vectorimpl
+
+srstimpl:                       // no vector facility
+	MOVBZ	R5, R0          // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
+srstloop:
+	WORD	$0xB25E0083     // srst %r8, %r3 (search the range [R3, R8))
+	BVS	srstloop        // interrupted - continue
+	BGT	notfoundr0
+foundr0:
+	XOR	R0, R0          // reset R0
+	SUB	R6, R8          // remove base
+	MOVD	R8, 0(R2)
+	RET
+notfoundr0:
+	XOR	R0, R0          // reset R0
+	MOVD	$-1, 0(R2)
+	RET
+
+vectorimpl:
+	//if the address is not 16byte aligned, use loop for the header
+	MOVD	R3, R8
+	AND	$15, R8
+	CMPBGT	R8, $0, notaligned
+
+aligned:
+	ADD	R6, R4, R8
+	MOVD	R8, R7
+	AND	$-16, R7
+	// replicate c across V17
+	VLVGB	$0, R5, V19
+	VREPB	$0, V19, V17
+
+vectorloop:
+	CMPBGE	R3, R7, residual
+	VL	0(R3), V16    // load string to be searched into V16
+	ADD	$16, R3
+	VFEEBS	V16, V17, V18 // search V17 in V16 and set conditional code accordingly
+	BVS	vectorloop
+
+	// when vector search found c in the string
+	VLGVB	$7, V18, R7   // load 7th element of V18 containing index into R7
+	SUB	$16, R3
+	SUB	R6, R3
+	ADD	R3, R7
+	MOVD	R7, 0(R2)
+	RET
+
+notaligned:
+	MOVD	R3, R8
+	AND	$-16, R8
+	ADD     $16, R8
+notalignedloop:
+	CMPBEQ	R3, R8, aligned
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, notalignedloop
+	BR	found
diff --git a/src/internal/bytealg/indexbyte_wasm.s b/src/internal/bytealg/indexbyte_wasm.s
new file mode 100644
index 0000000..ef4bd93
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_wasm.s
@@ -0,0 +1,195 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB), NOSPLIT, $0-40
+	I64Load b_base+0(FP)
+	I32WrapI64
+	I32Load8U c+24(FP)
+	I64Load b_len+8(FP)
+	I32WrapI64
+	Call memchr<>(SB)
+	I64ExtendI32S
+	Set R0
+
+	Get SP
+	I64Const $-1
+	Get R0
+	I64Load b_base+0(FP)
+	I64Sub
+	Get R0
+	I64Eqz $0
+	Select
+	I64Store ret+32(FP)
+
+	RET
+
+TEXT ·IndexByteString(SB), NOSPLIT, $0-32
+	Get SP
+	I64Load s_base+0(FP)
+	I32WrapI64
+	I32Load8U c+16(FP)
+	I64Load s_len+8(FP)
+	I32WrapI64
+	Call memchr<>(SB)
+	I64ExtendI32S
+	Set R0
+
+	I64Const $-1
+	Get R0
+	I64Load s_base+0(FP)
+	I64Sub
+	Get R0
+	I64Eqz $0
+	Select
+	I64Store ret+24(FP)
+
+	RET
+
+// initially compiled with emscripten and then modified over time.
+// params:
+//   R0: s
+//   R1: c
+//   R2: len
+// ret: index
+TEXT memchr<>(SB), NOSPLIT, $0
+	Get R1
+	Set R4
+	Block
+		Block
+			Get R2
+			I32Const $0
+			I32Ne
+			Tee R3
+			Get R0
+			I32Const $3
+			I32And
+			I32Const $0
+			I32Ne
+			I32And
+			If
+				Loop
+					Get R0
+					I32Load8U $0
+					Get R1
+					I32Eq
+					BrIf $2
+					Get R2
+					I32Const $-1
+					I32Add
+					Tee R2
+					I32Const $0
+					I32Ne
+					Tee R3
+					Get R0
+					I32Const $1
+					I32Add
+					Tee R0
+					I32Const $3
+					I32And
+					I32Const $0
+					I32Ne
+					I32And
+					BrIf $0
+				End
+			End
+			Get R3
+			BrIf $0
+			I32Const $0
+			Set R1
+			Br $1
+		End
+		Get R0
+		I32Load8U $0
+		Get R4
+		Tee R3
+		I32Eq
+		If
+			Get R2
+			Set R1
+		Else
+			Get R4
+			I32Const $16843009
+			I32Mul
+			Set R4
+			Block
+				Block
+					Get R2
+					I32Const $3
+					I32GtU
+					If
+						Get R2
+						Set R1
+						Loop
+							Get R0
+							I32Load $0
+							Get R4
+							I32Xor
+							Tee R2
+							I32Const $-2139062144
+							I32And
+							I32Const $-2139062144
+							I32Xor
+							Get R2
+							I32Const $-16843009
+							I32Add
+							I32And
+							I32Eqz
+							If
+								Get R0
+								I32Const $4
+								I32Add
+								Set R0
+								Get R1
+								I32Const $-4
+								I32Add
+								Tee R1
+								I32Const $3
+								I32GtU
+								BrIf $1
+								Br $3
+							End
+						End
+					Else
+						Get R2
+						Set R1
+						Br $1
+					End
+					Br $1
+				End
+				Get R1
+				I32Eqz
+				If
+					I32Const $0
+					Set R1
+					Br $3
+				End
+			End
+			Loop
+				Get R0
+				I32Load8U $0
+				Get R3
+				I32Eq
+				BrIf $2
+				Get R0
+				I32Const $1
+				I32Add
+				Set R0
+				Get R1
+				I32Const $-1
+				I32Add
+				Tee R1
+				BrIf $0
+				I32Const $0
+				Set R1
+			End
+		End
+	End
+	Get R0
+	I32Const $0
+	Get R1
+	Select
+	Return
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-28 13:14:23 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-28 13:14:23 +0000
commit	73df946d56c74384511a194dd01dbe099584fd1a (patch)
tree	fd0bcea490dd81327ddfbb31e215439672c9a068 /src/internal/bytealg
parent	Initial commit. (diff)
download	golang-1.16-73df946d56c74384511a194dd01dbe099584fd1a.tar.xz golang-1.16-73df946d56c74384511a194dd01dbe099584fd1a.zip