1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Index entry point: the argument frame holds a at 0(FP), b at 24(FP) and the
// result word at 48(FP). It only loads the registers that indexbody<> reads
// and then tail-jumps to it:
//   DI, R10 = a_base (string searched in)    DX = a_len
//   R8      = b_base (string searched for)   AX = b_len
//   R11     = address of the result slot
TEXT ·Index(SB),NOSPLIT,$0-56
	LEAQ ret+48(FP), R11     // indexbody<> stores the result through R11
	MOVQ b_len+32(FP), AX
	MOVQ b_base+24(FP), R8
	MOVQ a_len+8(FP), DX
	MOVQ a_base+0(FP), R10   // kept unmodified so the match address can be turned into an offset
	MOVQ R10, DI             // DI is the cursor that indexbody<> advances
	JMP  indexbody<>(SB)
// IndexString entry point: the argument frame holds a at 0(FP), b at 16(FP)
// and the result word at 32(FP). It only loads the registers that
// indexbody<> reads and then tail-jumps to it:
//   DI, R10 = a_base (string searched in)    DX = a_len
//   R8      = b_base (string searched for)   AX = b_len
//   R11     = address of the result slot
TEXT ·IndexString(SB),NOSPLIT,$0-40
	LEAQ ret+32(FP), R11     // indexbody<> stores the result through R11
	MOVQ b_len+24(FP), AX
	MOVQ b_base+16(FP), R8
	MOVQ a_len+8(FP), DX
	MOVQ a_base+0(FP), R10   // kept unmodified so the match address can be turned into an offset
	MOVQ R10, DI             // DI is the cursor that indexbody<> advances
	JMP  indexbody<>(SB)
// indexbody locates the first occurrence of one byte string (the needle)
// inside another (the haystack) and stores the result through R11.
// On entry:
// AX: length of the string that we are searching for (needle)
// DX: length of the string in which we are searching (haystack)
// DI: pointer to the string in which we are searching
// R8: pointer to the string that we are searching for
// R10: value subtracted from the match address to form the result;
//      both entry points in this file set it to the haystack base
// R11: address where the return value is stored
// On exit (R11) holds the match address minus R10, or -1 if nothing matched.
// Note: We want the lengths in DX and AX, because PCMPESTRI implicitly consumes them
TEXT indexbody<>(SB),NOSPLIT,$0
// A needle longer than the haystack can never match.
CMPQ AX, DX
JA fail
// Haystacks of 16 bytes or more are candidates for the PCMPESTRI path.
CMPQ DX, $16
JAE sse42
no_sse42:
// Dispatch on the needle length. Each case compares fixed-width chunks of the
// needle against every candidate start position, advancing DI one byte at a time.
// Any length that is not above 2 takes the case below, which always compares
// a full 16-bit word.
// NOTE(review): presumably callers guarantee a needle of at least 2 bytes - confirm.
CMPQ AX, $2
JA _3_or_more
// R8 = the 2 needle bytes (the needle pointer is no longer needed).
MOVW (R8), R8
// DX = haystack end - 1, i.e. one past the last position where 2 bytes still fit.
LEAQ -1(DI)(DX*1), DX
loop2:
MOVW (DI), SI
CMPW SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop2
JMP fail
_3_or_more:
CMPQ AX, $3
JA _4_or_more
// 3-byte needle: checked as two overlapping words,
// bytes 0-1 (kept in R8) and bytes 1-2 (kept in BX).
MOVW 1(R8), BX
MOVW (R8), R8
// DX = haystack end - 2, one past the last position where 3 bytes still fit.
LEAQ -2(DI)(DX*1), DX
loop3:
MOVW (DI), SI
CMPW SI,R8
JZ partial_success3
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
partial_success3:
// The first word matched; confirm with the overlapping second word.
MOVW 1(DI), SI
CMPW SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop3
JMP fail
_4_or_more:
CMPQ AX, $4
JA _5_or_more
// 4-byte needle: a single 32-bit compare per position.
MOVL (R8), R8
// DX = haystack end - 3, one past the last position where 4 bytes still fit.
LEAQ -3(DI)(DX*1), DX
loop4:
MOVL (DI), SI
CMPL SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop4
JMP fail
_5_or_more:
CMPQ AX, $7
JA _8_or_more
// 5..7-byte needle: checked as two overlapping 32-bit chunks,
// the first 4 bytes (R8) and the last 4 bytes (BX).
// DX = haystack end + 1 - needle length, one past the last candidate start.
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVL -4(R8)(AX*1), BX
MOVL (R8), R8
loop5to7:
MOVL (DI), SI
CMPL SI,R8
JZ partial_success5to7
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
partial_success5to7:
// The first 4 bytes matched; compare the 4 bytes ending at DI+AX.
MOVL -4(AX)(DI*1), SI
CMPL SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop5to7
JMP fail
_8_or_more:
CMPQ AX, $8
JA _9_or_more
// 8-byte needle: a single 64-bit compare per position.
MOVQ (R8), R8
// DX = haystack end - 7, one past the last position where 8 bytes still fit.
LEAQ -7(DI)(DX*1), DX
loop8:
MOVQ (DI), SI
CMPQ SI,R8
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop8
JMP fail
_9_or_more:
// Also entered from the sse42 path when the needle is 12 bytes or longer.
CMPQ AX, $15
JA _16_or_more
// 9..15-byte needle: checked as two overlapping 64-bit chunks,
// the first 8 bytes (R8) and the last 8 bytes (BX).
// DX = haystack end + 1 - needle length, one past the last candidate start.
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVQ -8(R8)(AX*1), BX
MOVQ (R8), R8
loop9to15:
MOVQ (DI), SI
CMPQ SI,R8
JZ partial_success9to15
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
partial_success9to15:
// The first 8 bytes matched; compare the 8 bytes ending at DI+AX.
MOVQ -8(AX)(DI*1), SI
CMPQ SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,DX
JB loop9to15
JMP fail
_16_or_more:
CMPQ AX, $16
JA _17_or_more
// 16-byte needle: one unaligned 128-bit load and bytewise compare per position.
MOVOU (R8), X1
// DX = haystack end - 15, one past the last position where 16 bytes still fit.
LEAQ -15(DI)(DX*1), DX
loop16:
MOVOU (DI), X2
PCMPEQB X1, X2
PMOVMSKB X2, SI
// A mask of 0xffff means all 16 bytes compared equal.
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop16
JMP fail
_17_or_more:
CMPQ AX, $31
JA _32_or_more
// 17..31-byte needle: checked as two overlapping 128-bit chunks,
// the first 16 bytes (X1) and the last 16 bytes (X0).
// DX = haystack end + 1 - needle length, one past the last candidate start.
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVOU -16(R8)(AX*1), X0
MOVOU (R8), X1
loop17to31:
MOVOU (DI), X2
PCMPEQB X1,X2
PMOVMSKB X2, SI
CMPQ SI, $0xffff
JE partial_success17to31
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
partial_success17to31:
// The first 16 bytes matched; compare the 16 bytes ending at DI+AX.
MOVOU -16(AX)(DI*1), X3
PCMPEQB X0, X3
PMOVMSKB X3, SI
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,DX
JB loop17to31
JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
CMPQ AX, $32
JA _33_to_63
// 32-byte needle: one unaligned 256-bit load and bytewise compare per position.
VMOVDQU (R8), Y1
// DX = haystack end - 31, one past the last position where 32 bytes still fit.
LEAQ -31(DI)(DX*1), DX
loop32:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
// A mask of 0xffffffff means all 32 bytes compared equal.
CMPL SI, $0xffffffff
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop32
JMP fail_avx2
_33_to_63:
// Needle longer than 32 bytes: checked as two overlapping 256-bit chunks,
// the first 32 bytes (Y1) and the last 32 bytes (Y0).
// DX = haystack end + 1 - needle length, one past the last candidate start.
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
VMOVDQU -32(R8)(AX*1), Y0
VMOVDQU (R8), Y1
loop33to63:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, SI
CMPL SI, $0xffffffff
JE partial_success33to63
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
JMP fail_avx2
partial_success33to63:
// The first 32 bytes matched; compare the 32 bytes ending at DI+AX.
VMOVDQU -32(AX)(DI*1), Y3
VPCMPEQB Y0, Y3, Y4
VPMOVMSKB Y4, SI
CMPL SI, $0xffffffff
JE success_avx2
ADDQ $1,DI
CMPQ DI,DX
JB loop33to63
// Candidates exhausted: fall through into fail_avx2.
fail_avx2:
// Both AVX2 exits execute VZEROUPPER (clears the upper halves of the YMM
// registers) before joining the common exit code.
VZEROUPPER
fail:
// No match: store -1 as the result.
MOVQ $-1, (R11)
RET
success_avx2:
VZEROUPPER
JMP success
sse42:
// Take the PCMPESTRI path only when the SSE4.2 feature byte equals 1;
// otherwise use the plain compare loops above.
CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
JNE no_sse42
// The flags set by this CMPQ are consumed by the JAE below the comment.
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
// This value was determined experimentally and is the ~same
// on Nehalem (first with SSE42) and Haswell.
JAE _9_or_more
// Bits 4-11 of R8+16 are all zero exactly when R8 mod 4096 lies in [4080, 4095],
// i.e. the needle starts within the last 16 bytes of a 4096-byte-aligned region.
// The needle is shorter than 12 bytes here but MOVOU below reads 16 bytes from it.
// NOTE(review): presumably this avoids reading across a page boundary - confirm.
LEAQ 16(R8), SI
TESTW $0xff0, SI
JEQ no_sse42
MOVOU (R8), X1
// SI = haystack end - 15, one past the last position where a 16-byte window still fits.
LEAQ -15(DI)(DX*1), SI
MOVQ $16, R9
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
// for equality (bits 2,3 are 11)
// result is not masked or inverted (bits 4,5 are 00)
// and corresponds to first matching byte (bit 6 is 0)
PCMPESTRI $0x0c, (DI), X1
// CX == 16 means no match,
// CX > R9 means partial match at the end of the string,
// otherwise sep is at offset CX from X1 start
CMPQ CX, R9
JBE sse42_success
ADDQ R9, DI
CMPQ DI, SI
JB loop_sse42
// Final probe: the window starting at SI-1 covers the last 16 bytes of the haystack.
PCMPESTRI $0x0c, -1(SI), X1
CMPQ CX, R9
JA fail
LEAQ -1(SI), DI
sse42_success:
// Turn the window start plus the in-window offset into the match address.
ADDQ CX, DI
success:
// DI holds the match address; subtract R10 to get the offset and store it.
SUBQ R10, DI
MOVQ DI, (R11)
RET
|