1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
|
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "go_asm.h"
#include "textflag.h"
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
#ifndef GOEXPERIMENT_regabiargs
MOVD b_base+0(FP), R3 // R3 = byte array pointer
MOVD b_len+8(FP), R4 // R4 = length
MOVBZ c+24(FP), R5 // R5 = byte
MOVD $ret+32(FP), R14 // R14 = &ret
#else
MOVD R6, R5
#endif
BR indexbytebody<>(SB)
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
#ifndef GOEXPERIMENT_regabiargs
MOVD s_base+0(FP), R3 // R3 = string
MOVD s_len+8(FP), R4 // R4 = length
MOVBZ c+16(FP), R5 // R5 = byte
MOVD $ret+24(FP), R14 // R14 = &ret
#endif
BR indexbytebody<>(SB)
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// R14 = addr of return value when not regabi
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R17 // Save base address for calculating the index later.
RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
ADD R4,R3,R7 // Last acceptable address in R7.
DCBT (R8) // Prepare cache line.
RLDIMI $16,R5,$32,R5
CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently.
MOVD $-1,R9
WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
RLDIMI $32,R5,$0,R5
MOVD R7,R10 // Save last acceptable address in R10 for later.
ADD $-1,R7,R7
#ifdef GOARCH_ppc64le
SLD R6,R9,R9 // Prepare mask for Little Endian
#else
SRD R6,R9,R9 // Same for Big Endian
#endif
BLE small_string // Jump to the small string case if it's ≤32 bytes.
// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
// in V0, V1 and V10, then branch to the preloop.
ANDCC $63,R3,R11
BEQ CR0,qw_align
RLDICL $0,R3,$61,R11
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base
RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7
CMPU R3,$0,CR7 // If we have a match, jump to the final computation
BNE CR7,done
ADD $8,R8,R8
ADD $-8,R4,R4
ADD R4,R11,R4
// Check for quadword alignment
ANDCC $15,R8,R11
BEQ CR0,qw_align
// Not aligned, so handle the next doubleword
MOVD 0(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR7
BNE CR7,done
ADD $8,R8,R8
ADD $-8,R4,R4
// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:
// Set up auxiliary data for the vectorized algorithm.
VSPLTISB $0,V0 // Replicate 0 across V0
VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
MTVRD R5,V1
LVSL (R0+R0),V11
VSLB V11,V10,V10
VSPLTB $7,V1,V1 // Replicate byte across V1
CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop
BLE tail
// We will load 4 quardwords per iteration in the loop, so check for
// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
ANDCC $63,R8,R11
BEQ CR0,preloop
// Not 64-byte aligned. Load one quadword at a time until aligned.
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $-16,R4,R4
ADD $16,R8,R8
// 64-byte aligned. Prepare for the main loop.
preloop:
CMPU R4,$64
BLE tail // If len ≤ 64, don't use the vectorized loop
// We are now aligned to a 64-byte boundary. We will load 4 quadwords
// per loop iteration. The last doubleword is in R10, so our loop counter
// starts at (R10-R8)/64.
SUB R8,R10,R6
SRD $6,R6,R9 // Loop counter in R9
MOVD R9,CTR
ADD $-64,R8,R8 // Adjust index for loop entry
MOVD $16,R11 // Load offsets for the vector loads
MOVD $32,R9
MOVD $48,R7
// Main loop we will load 64 bytes per iteration
loop:
ADD $64,R8,R8 // Fuse addi+lvx for performance
LVX (R8+R0),V2 // Load 4 16-byte vectors
LVX (R8+R11),V3
VCMPEQUB V1,V2,V6 // Look for byte in each vector
VCMPEQUB V1,V3,V7
LVX (R8+R9),V4
LVX (R8+R7),V5
VCMPEQUB V1,V4,V8
VCMPEQUB V1,V5,V9
VOR V6,V7,V11 // Compress the result in a single vector
VOR V8,V9,V12
VOR V11,V12,V13
VCMPEQUBCC V0,V13,V14 // Check for byte
BGE CR6,found
BC 16,0,loop // bdnz loop
// Handle the tailing bytes or R4 ≤ 64
RLDICL $0,R6,$58,R4
ADD $64,R8,R8
tail:
CMPU R4,$0
BEQ notfound
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
notfound:
MOVD $-1,R3
#ifndef GOEXPERIMENT_regabiargs
MOVD R3,(R14)
#endif
RET
found:
// We will now compress the results into a single doubleword,
// so it can be moved to a GPR for the final index calculation.
// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
// first bit of each byte into bits 48-63.
VBPERMQ V6,V10,V6
VBPERMQ V7,V10,V7
VBPERMQ V8,V10,V8
VBPERMQ V9,V10,V9
// Shift each 16-bit component into its correct position for
// merging into a single doubleword.
#ifdef GOARCH_ppc64le
VSLDOI $2,V7,V7,V7
VSLDOI $4,V8,V8,V8
VSLDOI $6,V9,V9,V9
#else
VSLDOI $6,V6,V6,V6
VSLDOI $4,V7,V7,V7
VSLDOI $2,V8,V8,V8
#endif
// Merge V6-V9 into a single doubleword and move to a GPR.
VOR V6,V7,V11
VOR V8,V9,V4
VOR V4,V11,V4
MFVRD V4,R3
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
ADD R8,R11,R3 // Calculate byte address
return:
SUB R17,R3
#ifndef GOEXPERIMENT_regabiargs
MOVD R3,(R14)
#endif
RET
found_qw_align:
// Use the same algorithm as above. Compress the result into
// a single doubleword and move it to a GPR for the final
// calculation.
VBPERMQ V6,V10,V6
#ifdef GOARCH_ppc64le
MFVRD V6,R3
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11
#else
VSLDOI $6,V6,V6,V6
MFVRD V6,R3
CNTLZD R3,R11
#endif
ADD R8,R11,R3
CMPU R11,R4
BLT return
BR notfound
done:
// At this point, R3 has 0xFF in the same position as the byte we are
// looking for in the doubleword. Use that to calculate the exact index
// of the byte.
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
CMPU R8,R7 // Check if we are at the last doubleword.
SRD $3,R11 // Convert trailing zeros to bytes.
ADD R11,R8,R3
CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
BNE return
BLE CR7,return
BR notfound
small_string:
// We unroll this loop for better performance.
CMPU R4,$0 // Check for length=0
BEQ notfound
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base.
CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7.
CMPU R8,R7
BNE CR7,done
BEQ notfound // Hit length.
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
BNE CR6,done
BR notfound
|