1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare(SB),NOSPLIT,$0-56
MOVQ a_base+0(FP), SI
MOVQ a_len+8(FP), BX
MOVQ b_base+24(FP), DI
MOVQ b_len+32(FP), DX
LEAQ ret+48(FP), R9
JMP cmpbody<>(SB)
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
MOVQ a_base+0(FP), SI
MOVQ a_len+8(FP), BX
MOVQ b_base+16(FP), DI
MOVQ b_len+24(FP), DX
LEAQ ret+32(FP), R9
JMP cmpbody<>(SB)
// input:
// SI = a
// DI = b
// BX = alen
// DX = blen
// R9 = address of output word (stores -1/0/1 here)
TEXT cmpbody<>(SB),NOSPLIT,$0-0
CMPQ SI, DI
JEQ allsame
CMPQ BX, DX
MOVQ DX, R8
CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
CMPQ R8, $8
JB small
CMPQ R8, $63
JBE loop
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JEQ big_loop_avx2
JMP big_loop
loop:
CMPQ R8, $16
JBE _0through16
MOVOU (SI), X0
MOVOU (DI), X1
PCMPEQB X0, X1
PMOVMSKB X1, AX
XORQ $0xffff, AX // convert EQ to NE
JNE diff16 // branch if at least one byte is not equal
ADDQ $16, SI
ADDQ $16, DI
SUBQ $16, R8
JMP loop
diff64:
ADDQ $48, SI
ADDQ $48, DI
JMP diff16
diff48:
ADDQ $32, SI
ADDQ $32, DI
JMP diff16
diff32:
ADDQ $16, SI
ADDQ $16, DI
// AX = bit mask of differences
diff16:
BSFQ AX, BX // index of first byte that differs
XORQ AX, AX
MOVB (SI)(BX*1), CX
CMPB CX, (DI)(BX*1)
SETHI AX
LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
MOVQ AX, (R9)
RET
// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
CMPQ R8, $8
JBE _0through8
MOVQ (SI), AX
MOVQ (DI), CX
CMPQ AX, CX
JNE diff8
_0through8:
MOVQ -8(SI)(R8*1), AX
MOVQ -8(DI)(R8*1), CX
CMPQ AX, CX
JEQ allsame
// AX and CX contain parts of a and b that differ.
diff8:
BSWAPQ AX // reverse order of bytes
BSWAPQ CX
XORQ AX, CX
BSRQ CX, CX // index of highest bit difference
SHRQ CX, AX // move a's bit to bottom
ANDQ $1, AX // mask bit
LEAQ -1(AX*2), AX // 1/0 => +1/-1
MOVQ AX, (R9)
RET
// 0-7 bytes in common
small:
LEAQ (R8*8), CX // bytes left -> bits left
NEGQ CX // - bits lift (== 64 - bits left mod 64)
JEQ allsame
// load bytes of a into high bytes of AX
CMPB SI, $0xf8
JA si_high
MOVQ (SI), SI
JMP si_finish
si_high:
MOVQ -8(SI)(R8*1), SI
SHRQ CX, SI
si_finish:
SHLQ CX, SI
// load bytes of b in to high bytes of BX
CMPB DI, $0xf8
JA di_high
MOVQ (DI), DI
JMP di_finish
di_high:
MOVQ -8(DI)(R8*1), DI
SHRQ CX, DI
di_finish:
SHLQ CX, DI
BSWAPQ SI // reverse order of bytes
BSWAPQ DI
XORQ SI, DI // find bit differences
JEQ allsame
BSRQ DI, CX // index of highest bit difference
SHRQ CX, SI // move a's bit to bottom
ANDQ $1, SI // mask bit
LEAQ -1(SI*2), AX // 1/0 => +1/-1
MOVQ AX, (R9)
RET
allsame:
XORQ AX, AX
XORQ CX, CX
CMPQ BX, DX
SETGT AX // 1 if alen > blen
SETEQ CX // 1 if alen == blen
LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
MOVQ AX, (R9)
RET
// this works for >= 64 bytes of data.
big_loop:
MOVOU (SI), X0
MOVOU (DI), X1
PCMPEQB X0, X1
PMOVMSKB X1, AX
XORQ $0xffff, AX
JNE diff16
MOVOU 16(SI), X0
MOVOU 16(DI), X1
PCMPEQB X0, X1
PMOVMSKB X1, AX
XORQ $0xffff, AX
JNE diff32
MOVOU 32(SI), X0
MOVOU 32(DI), X1
PCMPEQB X0, X1
PMOVMSKB X1, AX
XORQ $0xffff, AX
JNE diff48
MOVOU 48(SI), X0
MOVOU 48(DI), X1
PCMPEQB X0, X1
PMOVMSKB X1, AX
XORQ $0xffff, AX
JNE diff64
ADDQ $64, SI
ADDQ $64, DI
SUBQ $64, R8
CMPQ R8, $64
JBE loop
JMP big_loop
// Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2.
big_loop_avx2:
VMOVDQU (SI), Y2
VMOVDQU (DI), Y3
VMOVDQU 32(SI), Y4
VMOVDQU 32(DI), Y5
VPCMPEQB Y2, Y3, Y0
VPMOVMSKB Y0, AX
XORL $0xffffffff, AX
JNE diff32_avx2
VPCMPEQB Y4, Y5, Y6
VPMOVMSKB Y6, AX
XORL $0xffffffff, AX
JNE diff64_avx2
ADDQ $64, SI
ADDQ $64, DI
SUBQ $64, R8
CMPQ R8, $64
JB big_loop_avx2_exit
JMP big_loop_avx2
// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
diff32_avx2:
VZEROUPPER
JMP diff16
// Same as diff32_avx2, but for last 32 bytes.
diff64_avx2:
VZEROUPPER
JMP diff48
// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
VZEROUPPER
JMP loop
|