1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
|
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// condition code masks
// Passed as the mask operand of the compare-and-branch (CGIJ, CIJ) and
// load-on-condition (LOCGR) instructions in this file: EQ selects the
// "equal" outcome of a comparison, NE selects every other outcome.
#define EQ 8
#define NE 7
// register assignments
// R_ZERO: constant 0, source operand of LOCGR in the scalar loop.
#define R_ZERO R0
// R_VAL: byte most recently loaded from the input in the scalar loop.
#define R_VAL R1
// R_TMP: scratch; first the address and then the value of the vector
// facility flag, later the 0/1 increment in the scalar loop.
#define R_TMP R2
// R_PTR: address of the next input byte(s) to examine.
#define R_PTR R3
// R_LEN: input length in bytes on entry; the vector code overwrites it with
// (length mod 16) - 1, the scalar loop counts it down to zero.
#define R_LEN R4
// R_CHAR: byte value being counted, zero extended.
#define R_CHAR R5
// R_RET: address at which the result is stored.
#define R_RET R6
// R_ITER: number of 16-byte chunks still to process in the vector loop.
#define R_ITER R7
// R_CNT: running match count in the scalar loop.
#define R_CNT R8
// R_MPTR: address of countbytemask<>.
#define R_MPTR R9
// vector register assignments
// V_ZERO: all-zero vector, second operand of the VSUMB/VSUMQF reductions.
#define V_ZERO V0
// V_CHAR: the byte being counted, replicated into all 16 byte elements.
#define V_CHAR V1
// V_MASK: 0x01 in each byte element that takes part in the count.
#define V_MASK V2
// V_VAL: input chunk, then the per-byte match flags and their partial sums.
#define V_VAL V3
// V_CNT: 128-bit accumulator for the match count.
#define V_CNT V4
// countbytemask<> is a read-only block of sixteen 0x01 bytes. The vector
// code loads a prefix of it with 'load with length' (VLL) to build the
// AND-mask applied to the 1-15 trailing bytes of the input.
DATA  countbytemask<>+0(SB)/8, $0x0101010101010101
DATA  countbytemask<>+8(SB)/8, $0x0101010101010101
GLOBL countbytemask<>(SB), RODATA, $16
// func Count(b []byte, c byte) int
//
// Count places its arguments in the registers that countbytebody<> expects
// and branches to it; countbytebody<> stores the result through R_RET.
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
	MOVD  $ret+32(FP), R_RET    // R_RET = address of the result slot
	MOVBZ c+24(FP), R_CHAR      // R_CHAR = c, zero extended
	LMG   b+0(FP), R_PTR, R_LEN // R_PTR = first word of b, R_LEN = second word of b
	BR    countbytebody<>(SB)
// func CountString(s string, c byte) int
//
// CountString places its arguments in the registers that countbytebody<>
// expects and branches to it; countbytebody<> stores the result through R_RET.
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
	MOVD  $ret+24(FP), R_RET    // R_RET = address of the result slot
	MOVBZ c+16(FP), R_CHAR      // R_CHAR = c, zero extended
	LMG   s+0(FP), R_PTR, R_LEN // R_PTR = first word of s, R_LEN = second word of s
	BR    countbytebody<>(SB)
// countbytebody<> counts the bytes equal to R_CHAR among the R_LEN bytes
// starting at R_PTR and stores the count as a 64-bit value at the address
// held in R_RET. Count and CountString branch here (BR) after loading the
// registers listed below.
//
// input:
// R_PTR = address of array of bytes
// R_LEN = number of bytes in array
// R_CHAR = byte value to count (zero extended to register width)
// R_RET = address of return value
//
// Registers written (not all on every path): R_ZERO, R_VAL, R_TMP, R_PTR,
// R_LEN, R_ITER, R_CNT, R_MPTR, V_ZERO, V_CHAR, V_MASK, V_VAL, V_CNT.
//
// Paths:
//   - length 0: store 0 and return (ret0).
//   - vector facility flag non-zero: 16-byte chunks in vxloop, followed by a
//     'load with length' step for any 1-15 trailing bytes.
//   - vector facility flag zero: byte-at-a-time loop (novx).
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
MOVD $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP // R_TMP = address of the vector facility flag
MOVD $countbytemask<>(SB), R_MPTR // R_MPTR = address of the 0x01 byte mask
CGIJ $EQ, R_LEN, $0, ret0 // return if length is 0.
SRD $4, R_LEN, R_ITER // R_ITER is the number of 16-byte chunks
MOVBZ (R_TMP), R_TMP // load bool indicating support for vector facility
CGIJ $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
// Start of vector code (have vector facility).
//
// Set R_LEN to be the length mod 16 minus 1 to use as an index for
// vector 'load with length' (VLL). It will be in the range [-1,14].
// Also replicate c across a 16-byte vector and initialize V_ZERO.
ANDW $0xf, R_LEN
VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
VZERO V_ZERO // V_ZERO = [1]uint128{0}
ADDW $-1, R_LEN
VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
// Jump to loop if we have more than 15 bytes to process.
CGIJ $NE, R_ITER, $0, vxchunks
// Fewer than 16 bytes in total: the whole input is handled by one
// 'load with length' step and the result is returned directly.
//
// Load 1-15 bytes and corresponding mask.
// Note: only the low 32-bits of R_LEN are used for the index.
// The mask is loaded with the same length as the data so that only the
// byte positions that were actually loaded can contribute to the count.
VLL R_LEN, (R_PTR), V_VAL
VLL R_LEN, (R_MPTR), V_MASK
// Compare each byte in input chunk against byte to be counted.
// After the compare and the AND with the mask, each byte element is
// either 0 (no match) or 1 (match).
VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
// Accumulate matched byte count in 128-bit integer value.
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
// Return rightmost (lowest) 64-bit part of accumulator.
VSTEG $1, V_CNT, (R_RET)
RET
vxchunks:
// At least one full 16-byte chunk: R_ITER > 0 here.
//
// Load 0x01 into every byte element in the 16-byte mask vector.
VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
VZERO V_CNT // initial uint128 count of 0
vxloop:
// Load input bytes in 16-byte chunks.
VL (R_PTR), V_VAL
// Compare each byte in input chunk against byte to be counted.
// After the compare and the AND with the mask, each byte element is
// either 0 (no match) or 1 (match).
VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
// Increment input string address.
MOVD $16(R_PTR), R_PTR
// Accumulate matched byte count in 128-bit integer value.
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
VAQ V_VAL, V_CNT, V_CNT // accumulate
// Repeat until all 16-byte chunks are done.
BRCTG R_ITER, vxloop // decrement R_ITER and loop while it is non-zero
// Skip to end if there are no trailing bytes.
CIJ $EQ, R_LEN, $-1, vxret // R_LEN == -1 means length mod 16 was 0
// Load 1-15 bytes and corresponding mask.
// Note: only the low 32-bits of R_LEN are used for the index.
// This overwrites the all-ones-per-byte mask set up at vxchunks, which
// is no longer needed once the chunk loop has finished.
VLL R_LEN, (R_PTR), V_VAL
VLL R_LEN, (R_MPTR), V_MASK
// Compare each byte in input chunk against byte to be counted.
// After the compare and the AND with the mask, each byte element is
// either 0 (no match) or 1 (match).
VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
// Accumulate matched byte count in 128-bit integer value.
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
VAQ V_VAL, V_CNT, V_CNT // accumulate
vxret:
// Return rightmost (lowest) 64-bit part of accumulator.
VSTEG $1, V_CNT, (R_RET)
RET
novx:
// Start of non-vector code (the vector facility not available).
// R_LEN is non-zero here and is used as the loop counter.
//
// Initialise counter and constant zero.
MOVD $0, R_CNT
MOVD $0, R_ZERO
loop:
// Read 1-byte from input and compare.
// Note: avoid putting LOCGR in critical path.
MOVBZ (R_PTR), R_VAL
MOVD $1, R_TMP // assume a match; cleared below on mismatch
MOVD $1(R_PTR), R_PTR // advance to the next input byte
CMPW R_VAL, R_CHAR
LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
ADD R_TMP, R_CNT // accumulate 64-bit result
// Repeat until all bytes have been checked.
BRCTG R_LEN, loop // decrement R_LEN and loop while it is non-zero
ret:
// Reached by falling out of the scalar loop: store the scalar count.
MOVD R_CNT, (R_RET)
RET
ret0:
// Empty input: the count is 0.
MOVD $0, (R_RET)
RET
|