1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
|
/* SPDX-License-Identifier: GPL-2.0 */
/*
*
* Optmized version of the standard do_csum() function
*
* Return: a 64bit quantity containing the 16bit Internet checksum
*
* Inputs:
* in0: address of buffer to checksum (char *)
* in1: length of the buffer (int)
*
* Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
*
* 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
* Data locality study on the checksum buffer.
* More optimization cleanup - remove excessive stop bits.
* 02/04/08 David Mosberger <davidm@hpl.hp.com>
* More cleanup and tuning.
* 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
* Clean up and optimize and the software pipeline, loading two
* back-to-back 8-byte words per loop. Clean up the initialization
* for the loop. Support the cases where load latency = 1 or 2.
* Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
*/
#include <asm/asmmacro.h>
//
// Theory of operations:
// The goal is to go as quickly as possible to the point where
// we can checksum 16 bytes/loop. Before reaching that point we must
// take care of incorrect alignment of first byte.
//
// The code hereafter also takes care of the "tail" part of the buffer
// before entering the core loop, if any. The checksum is a sum so it
// allows us to commute operations. So we do the "head" and "tail"
// first to finish at full speed in the body. Once we get the head and
// tail values, we feed them into the pipeline, very handy initialization.
//
// Of course we deal with the special case where the whole buffer fits
// into one 8 byte word. In this case we have only one entry in the pipeline.
//
// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
// possible load latency and also to accommodate for head and tail.
//
// The end of the function deals with folding the checksum from 64bits
// down to 16bits taking care of the carry.
//
// This version avoids synchronization in the core loop by also using a
// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
//
// wordx[] (x=1,2)
// |---|
// | | 0 : new value loaded in pipeline
// |---|
// | | - : in transit data
// |---|
// | | LOAD_LATENCY : current value to add to checksum
// |---|
// | | LOAD_LATENCY+1 : previous value added to checksum
// |---| (previous iteration)
//
// resultx[] (x=1,2)
// |---|
// | | 0 : initial value
// |---|
// | | LOAD_LATENCY-1 : new checksum
// |---|
// | | LOAD_LATENCY : previous value of checksum
// |---|
// | | LOAD_LATENCY+1 : final checksum when out of the loop
// |---|
//
//
// See RFC1071 "Computing the Internet Checksum" for various techniques for
// calculating the Internet checksum.
//
// NOT YET DONE:
// - Maybe another algorithm which would take care of the folding at the
// end in a different manner
// - Work with people more knowledgeable than me on the network stack
// to figure out if we could not split the function depending on the
// type of packet or alignment we get. Like the ip_fast_csum() routine
// where we know we have at least 20bytes worth of data to checksum.
// - Do a better job of handling small packets.
// - Note on prefetching: it was found that under various load, i.e. ftp read/write,
// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
// on the data that buffer points to (partly because the checksum is often preceded by
// a copy_from_user()). This finding indiate that lfetch will not be beneficial since
// the data is already in the cache.
//
#define saved_pfs r11
#define hmask r16
#define tmask r17
#define first1 r18
#define firstval r19
#define firstoff r20
#define last r21
#define lastval r22
#define lastoff r23
#define saved_lc r24
#define saved_pr r25
#define tmp1 r26
#define tmp2 r27
#define tmp3 r28
#define carry1 r29
#define carry2 r30
#define first2 r31
#define buf in0
#define len in1
#define LOAD_LATENCY 2 // XXX fix me
#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
#endif
#define PIPE_DEPTH (LOAD_LATENCY+2)
#define ELD p[LOAD_LATENCY] // end of load
#define ELD_1 p[LOAD_LATENCY+1] // and next stage
// unsigned long do_csum(unsigned char *buf,long len)
GLOBAL_ENTRY(do_csum)
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,2,16,0,16
.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
mov ret0=r0 // in case we have zero length
cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
;;
add tmp1=buf,len // last byte's address
.save pr, saved_pr
mov saved_pr=pr // preserve predicates (rotation)
(p6) br.ret.spnt.many rp // return if zero or negative length
mov hmask=-1 // initialize head mask
tbit.nz p15,p0=buf,0 // is buf an odd address?
and first1=-8,buf // 8-byte align down address of first1 element
and firstoff=7,buf // how many bytes off for first1 element
mov tmask=-1 // initialize tail mask
;;
adds tmp2=-1,tmp1 // last-1
and lastoff=7,tmp1 // how many bytes off for last element
;;
sub tmp1=8,lastoff // complement to lastoff
and last=-8,tmp2 // address of word containing last byte
;;
sub tmp3=last,first1 // tmp3=distance from first1 to last
.save ar.lc, saved_lc
mov saved_lc=ar.lc // save lc
cmp.eq p8,p9=last,first1 // everything fits in one word ?
ld8 firstval=[first1],8 // load, ahead of time, "first1" word
and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
shl tmp2=firstoff,3 // number of bits
;;
(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
shl tmp1=tmp1,3 // number of bits
(p9) adds tmp3=-8,tmp3 // effectively loaded
;;
(p8) mov lastval=r0 // we don't need lastval if first1==last
shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[
shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
;;
.body
#define count tmp3
(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
(p9) and word2[0]=lastval,tmask // mask last it as appropriate
shr.u count=count,3 // how many 8-byte?
;;
// If count is odd, finish this 8-byte word so that we can
// load two back-to-back 8-byte words per loop thereafter.
and word1[0]=firstval,hmask // and mask it as appropriate
tbit.nz p10,p11=count,0 // if (count is odd)
;;
(p8) mov result1[0]=word1[0]
(p9) add result1[0]=word1[0],word2[0]
;;
cmp.ltu p6,p0=result1[0],word1[0] // check the carry
cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
;;
(p6) adds result1[0]=1,result1[0]
(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
(p11) br.cond.dptk .do_csum16 // if (count is even)
// Here count is odd.
ld8 word1[1]=[first1],8 // load an 8-byte word
cmp.eq p9,p10=1,count // if (count == 1)
adds count=-1,count // loaded an 8-byte word
;;
add result1[0]=result1[0],word1[1]
;;
cmp.ltu p6,p0=result1[0],word1[1]
;;
(p6) adds result1[0]=1,result1[0]
(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
// Fall through to calculate the checksum, feeding result1[0] as
// the initial value in result1[0].
//
// Calculate the checksum loading two 8-byte words per loop.
//
.do_csum16:
add first2=8,first1
shr.u count=count,1 // we do 16 bytes per loop
;;
adds count=-1,count
mov carry1=r0
mov carry2=r0
brp.loop.imp 1f,2f
;;
mov ar.ec=PIPE_DEPTH
mov ar.lc=count // set lc
mov pr.rot=1<<16
// result1[0] must be initialized in advance.
mov result2[0]=r0
;;
.align 32
1:
(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
(pC1[1])adds carry1=1,carry1
(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
(pC2[1])adds carry2=1,carry2
(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
2:
(p[0]) ld8 word1[0]=[first1],16
(p[0]) ld8 word2[0]=[first2],16
br.ctop.sptk 1b
;;
// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
(pC1[1])adds carry1=1,carry1 // since we miss the last one
(pC2[1])adds carry2=1,carry2
;;
add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
;;
cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
;;
(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
;;
add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
;;
cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
;;
(p6) adds result1[0]=1,result1[0]
;;
.do_csum_exit:
//
// now fold 64 into 16 bits taking care of carry
// that's not very good because it has lots of sequentiality
//
mov tmp3=0xffff
zxt4 tmp1=result1[0]
shr.u tmp2=result1[0],32
;;
add result1[0]=tmp1,tmp2
;;
and tmp1=result1[0],tmp3
shr.u tmp2=result1[0],16
;;
add result1[0]=tmp1,tmp2
;;
and tmp1=result1[0],tmp3
shr.u tmp2=result1[0],16
;;
add result1[0]=tmp1,tmp2
;;
and tmp1=result1[0],tmp3
shr.u tmp2=result1[0],16
;;
add ret0=tmp1,tmp2
mov pr=saved_pr,0xffffffffffff0000
;;
// if buf was odd then swap bytes
mov ar.pfs=saved_pfs // restore ar.ec
(p15) mux1 ret0=ret0,@rev // reverse word
;;
mov ar.lc=saved_lc
(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
br.ret.sptk.many rp
// I (Jun Nakajima) wrote an equivalent code (see below), but it was
// not much better than the original. So keep the original there so that
// someone else can challenge.
//
// shr.u word1[0]=result1[0],32
// zxt4 result1[0]=result1[0]
// ;;
// add result1[0]=result1[0],word1[0]
// ;;
// zxt2 result2[0]=result1[0]
// extr.u word1[0]=result1[0],16,16
// shr.u carry1=result1[0],32
// ;;
// add result2[0]=result2[0],word1[0]
// ;;
// add result2[0]=result2[0],carry1
// ;;
// extr.u ret0=result2[0],16,16
// ;;
// add ret0=ret0,result2[0]
// ;;
// zxt2 ret0=ret0
// mov ar.pfs=saved_pfs // restore ar.ec
// mov pr=saved_pr,0xffffffffffff0000
// ;;
// // if buf was odd then swap bytes
// mov ar.lc=saved_lc
//(p15) mux1 ret0=ret0,@rev // reverse word
// ;;
//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
// br.ret.sptk.many rp
END(do_csum)
|