src/internal/bytealg/indexbyte_ppc64x.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	MOVD	R6, R5		// R5 = byte
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR	indexbytebody<>(SB)

TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string
	// R4 = length
	// R5 = byte
	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
	BR	indexbytebody<>(SB)

// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// R16 = 1 if running on a POWER9 system, 0 otherwise
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	ADD	R4,R3,R7	// Last acceptable address in R7.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
	RLDIMI	$32,R5,$0,R5
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLT	small_string	// Jump to the small string case if it's <32 bytes.
	CMP	R16,$1		// optimize for power8 v power9
	BNE	power8
	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
	MTVRD	R5,V1
	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
	VSPLTB	$7,V1,V1	// Replicate byte across V1
	CMP	R4,$64
	MOVD	$16,R11
	MOVD	R3,R8
	BLT	cmp32
	MOVD	$32,R12
	MOVD	$48,R6

loop64:
	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8, jump out

	LXVB16X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out

	LXVB16X	(R8)(R12),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out

	LXVB16X	(R8)(R6),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
	ADD	$64,R8
	ADD	$-64,R4
	CMP	R4,$64		// >=64 bytes left to scan?
	BGE	loop64
	CMP	R4,$32
	BLT	rem		// jump to rem if there are < 32 bytes left
cmp32:
	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// match found at R8

	LXVB16X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// match found at R8+16

	ADD	$32,R8
	ADD	$-32,R4
rem:
	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
	BR	small_string

foundat3:
	ADD	$16,R8
foundat2:
	ADD	$16,R8
foundat1:
	ADD	$16,R8
foundat0:
	// Compress the result into a single doubleword and
	// move it to a GPR for the final calculation.
	VBPERMQ	V6,V10,V6
	MFVRD	V6,R3
	// count leading zeroes upto the match that ends up in low 16 bits
	// in both endian modes, compute index by subtracting the number by 16
	CNTLZW	R3,R11
	ADD	$-16,R11
	ADD	R8,R11,R3	// Calculate byte address
	SUB	R17,R3
	RET
power8:
	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB  $0,V0		// Replicate 0 across V0
	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
	MTVRD	  R5,V1
	LVSL	  (R0+R0),V11
	VSLB	  V11,V10,V10
	VSPLTB	  $7,V1,V1	// Replicate byte across V1
	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
	BLE	  tail

	// We will load 4 quardwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	  $63,R8,R11
	BEQ	  CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	ADD	    $-16,R4,R4

	ANDCC	    $63,R8,R11
	BEQ	    CR0,preloop
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	ADD	    $-16,R4,R4

	ANDCC	    $63,R8,R11
	BEQ	    CR0,preloop
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $-16,R4,R4
	ADD	    $16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail	      // If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last doubleword is in R10, so our loop counter
	// starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9      // Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8   // Adjust index for loop entry
	MOVD	$16,R11      // Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop we will load 64 bytes per iteration
loop:
	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
	LVX	    (R8+R11),V3
	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
	VCMPEQUB    V1,V3,V7

	LVX	    (R8+R9),V4
	LVX	    (R8+R7),V5
	VCMPEQUB    V1,V4,V8
	VCMPEQUB    V1,V5,V9

	VOR	    V6,V7,V11	      // Compress the result in a single vector
	VOR	    V8,V9,V12
	VOR	    V11,V12,V13
	VCMPEQUBCC  V0,V13,V14	      // Check for byte
	BGE	    CR6,found
	BC	    16,0,loop	      // bdnz loop

	// Handle the tailing bytes or R4 ≤ 64
	RLDICL	$0,R6,$58,R4
	ADD	$64,R8,R8
tail:
	CMPU	    R4,$0
	BEQ	    notfound
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align

notfound:
	MOVD	$-1, R3
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	  V6,V10,V6
	VBPERMQ	  V7,V10,V7
	VBPERMQ	  V8,V10,V8
	VBPERMQ	  V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	  $2,V7,V7,V7
	VSLDOI	  $4,V8,V8,V8
	VSLDOI	  $6,V9,V9,V9
#else
	VSLDOI	  $6,V6,V6,V6
	VSLDOI	  $4,V7,V7,V7
	VSLDOI	  $2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	  $-1,R3,R11
	ANDN	  R3,R11,R11
	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17, R3
	RET

found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	  V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	  V6,R3
	ADD	  $-1,R3,R11
	ANDN	  R3,R11,R11
	POPCNTD	  R11,R11
#else
	VSLDOI	  $6,V6,V6,V6
	MFVRD	  V6,R3
	CNTLZD	  R3,R11
#endif
	ADD	  R8,R11,R3
	CMPU	  R11,R4
	BLT	  return
	BR	  notfound
	PCALIGN	  $16

done:
	ADD	$-1,R10,R6
	// Offset of last index for the final
	// doubleword comparison
	RLDICL	$0,R6,$61,R6
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert trailing zeros to bytes.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return
	BLE	CR7,return
	BR	notfound

small_string:
	// process string of length < 32 bytes
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound