1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
|
; $Id: prfamd64msc.asm 29 2009-07-01 20:30:29Z bird $;
;; @file
; kProfiler Mark 2 - Microsoft C/C++ Compiler Interaction, AMD64.
;
;
; Copyright (c) 2006-2007 Knut St. Osmundsen <bird-kStuff-spamix@anduin.net>
;
; Permission is hereby granted, free of charge, to any person
; obtaining a copy of this software and associated documentation
; files (the "Software"), to deal in the Software without
; restriction, including without limitation the rights to use,
; copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the
; Software is furnished to do so, subject to the following
; conditions:
;
; The above copyright notice and this permission notice shall be
; included in all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
; OTHER DEALINGS IN THE SOFTWARE.
;
[section .data]
; Calibration flag: written with 1 by calibrate, and tested (low byte only)
; by common_return_path to decide whether calibrate must be called.
g_fCalibrated:  dd      0
; Tick adjustment that common_overhead subtracts from the saved entry
; timestamp; tuned up and down by calibrate.
g_OverheadAdj:  dd      0

[section .text]
; The profiler entry points called by the hooks (not defined in this file).
        extern  KPRF_ENTER
        extern  KPRF_LEAVE

; The compiler hook entry points.
        global  _penter
        global  _pexit

; Internal labels that are exported as well.
;ifdef UNDEFINED
        global  common_return_path
        global  common_overhead
        global  common_no_overhead
        global  calibrate
        global  calib_inner_update_minimum
        global  calib_inner_next
        global  calib_outer_dec
        global  calib_outer_inc
        global  calib_done
        global  calib_nullproc
;endif
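;;
; On x86 the call to this function has been observed to be put before
; creating the stack frame, as the very first instruction in the function.
; On AMD64 it has been seen both before and after the 'sub rsp, xxh'.
;
; Thus the stack layout, relative to rsp once all registers have been saved
; and the 28h bytes have been reserved, is as follows:
;       70h     return address of the calling function (if nothing was allocated before the call).
;       68h     our return address - the address of the calling function + 5.
;       60h     rax
;       58h     rdx
;       50h     rflags
;       48h     rcx
;       40h     r8
;       38h     r9
;       30h     r10
;       28h     r11
;       20h     tsc - param 3 (also passed in r8)
;       00h     20h bytes of spill space for the function we call
;
; Param 1 (rcx) is our return address and param 2 (rdx) is the frame pointer,
; i.e. the address of the 68h slot, possibly adjusted by the code below.
;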
align 16
_penter:
        ; Save the volatile integer registers and the flags, sampling the TSC
        ; as early as possible.  rax and rdx go first since rdtsc overwrites them.
        ; NOTE(review): no xmm registers are saved here - presumably KPRF_ENTER
        ; is not expected to touch them; confirm.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 8 pushes leave rsp unaligned; 28h realigns it:
                                        ; 20h of spill space + 8 bytes for the timestamp.

        ; Set up the three KPRF_ENTER parameters.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; param 3 (r8)  - the 64-bit timestamp
        mov     [rsp + 20h], r8         ; keep a copy for common_return_path.
        lea     rdx, [rsp + 8*8 + 28h]  ; param 2 (rdx) - default frame pointer (our return address slot)
        mov     rcx, [rdx]              ; param 1 (rcx) - our return address in the profiled function

        ; MSC places the call to _penter both before and after the usual
        ; 'sub rsp, xxh'.  Look at the bytes just in front of the 5 byte call
        ; instruction and, when they are such a sub, add its immediate to the
        ; frame pointer.  This makes unwinding a bit more accurate wrt
        ; longjmp/throw, but it cannot be perfect as the number of push/pop
        ; around _penter/_pexit varies too.
        cmp     word [rcx - 5 - 4], 08348h      ; sub rsp, imm8 ?
        jne     .try_imm32
        cmp     byte [rcx - 5 - 2], 0ech
        jne     .try_imm32
        movzx   eax, byte [rcx - 5 - 1]         ; imm8
        add     rdx, rax
        jmp     .call_enter

.try_imm32:
        cmp     word [rcx - 5 - 7], 08148h      ; sub rsp, imm32 ?
        jne     .call_enter
        cmp     byte [rcx - 5 - 5], 0ech
        jne     .call_enter
        mov     eax, [rcx - 5 - 4]              ; imm32
        add     rdx, rax

.call_enter:
        call    KPRF_ENTER
        jmp     common_return_path      ; rax = KPRF_ENTER return value.
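;;
; On x86 the call to this function has been observed to be put right before
; the return instruction. This fact matters since we have to calculate the
; same stack address as in _penter.
;
; Thus the stack layout, relative to rsp once all registers have been saved
; and the 28h bytes have been reserved, is as follows:
;       70h     return address of the calling function (if the stack has already been cleaned up).
;       68h     our return address - the address following the call to _pexit.
;       60h     rax
;       58h     rdx
;       50h     rflags
;       48h     rcx
;       40h     r8
;       38h     r9
;       30h     r10
;       28h     r11
;       20h     tsc - param 3 (also passed in r8)
;       00h     20h bytes of spill space for the function we call
;
; Param 1 (rcx) is our return address and param 2 (rdx) is the frame pointer,
; i.e. the address of the 68h slot, possibly adjusted by the code below.
;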
align 16
_pexit:
        ; Save the volatile integer registers and the flags, sampling the TSC
        ; as early as possible.  rax and rdx go first since rdtsc overwrites them.
        ; NOTE(review): no xmm registers are saved here - presumably KPRF_LEAVE
        ; is not expected to touch them (e.g. a return value in xmm0); confirm.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 8 pushes leave rsp unaligned; 28h realigns it:
                                        ; 20h of spill space + 8 bytes for the timestamp.

        ; Set up the three KPRF_LEAVE parameters.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; param 3 (r8)  - the 64-bit timestamp
        mov     [rsp + 20h], r8         ; keep a copy for common_return_path.
        lea     rdx, [rsp + 8*8 + 28h]  ; param 2 (rdx) - frame pointer (our return address slot)
        mov     rcx, [rdx]              ; param 1 (rcx) - our return address in the profiled function

        ; MSC sometimes puts the call to _pexit in front of the 'add rsp, xxh'.
        ; To match any adjustment made by _penter, check whether the
        ; instruction we return to is such an add and, if so, add its
        ; immediate to the frame pointer.
        cmp     word [rcx], 08348h              ; add rsp, imm8 ?
        jne     .try_imm32
        cmp     byte [rcx + 2], 0c4h
        jne     .try_imm32
        movzx   eax, byte [rcx + 3]             ; imm8
        add     rdx, rax
        jmp     .call_leave

.try_imm32:
        cmp     word [rcx], 08148h              ; add rsp, imm32 ?
        jne     .call_leave
        cmp     byte [rcx + 2], 0c4h
        jne     .call_leave
        mov     eax, [rcx + 3]                  ; imm32
        add     rdx, rax

.call_leave:
        call    KPRF_LEAVE
        jmp     common_return_path      ; rax = KPRF_LEAVE return value.
;;
; This is the common return path for both the enter and exit hooks.
; It's kept common because we can then use the same overhead adjustment
; and save some calibration efforts. It also saves space :-)
align 16
common_return_path:
        ; In:  rax         = value returned by KPRF_ENTER / KPRF_LEAVE, used as
        ;                    the address of a 64-bit overhead counter (0 = none).
        ;      [rsp + 20h] = the timestamp taken on entry to the hook.
        ;      Above that: the registers pushed by the hook prolog.
        test    rax, rax
        jz      common_no_overhead      ; nothing to account the overhead to.

        ; First time around?  Then calibrate the overhead adjustment.
        ; calibrate saves and restores every register it uses, rax included.
        cmp     byte [g_fCalibrated wrt rip], 0
        jnz     common_overhead
        call    calibrate

common_overhead:
        ; Entered directly by calib_penter / calib_pexit with rax pointing
        ; into their CALIBDATA.
        mov     rcx, rax                ; rcx = pointer to the overhead counter.
        mov     eax, [g_OverheadAdj wrt rip]    ; zero extended into rax.
        sub     [rsp + 20h], rax        ; move the entry timestamp back by the adjustment
                                        ; before taking the second reading.
        rdtsc
        shl     rdx, 32
        or      rdx, rax                ; rdx = current 64-bit timestamp
        sub     rdx, [rsp + 20h]        ; rdx = ticks elapsed + adjustment
        lock add [rcx], rdx             ; atomically add it to the counter.

common_no_overhead:
        ; Undo the hook prolog: drop the 28h bytes and pop in reverse push order.
        add     rsp, 28h
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rcx
        popfq
        pop     rdx
        pop     rax
        ret                             ; back into the profiled function.
;;
; Data rsi points to while we're calibrating.
struc CALIBDATA
        .Overhead:      resq 1          ; overhead counter handed to common_overhead by calib_penter.
        .Profiled:      resq 1          ; sum of (exit timestamp - enter timestamp), added by calib_pexit.
        .EnterTS:       resq 1          ; timestamp stored by the most recent calib_penter.
        .Min:           resq 1          ; smallest (Profiled - Overhead) seen in the current inner loop.
endstruc
align 16
;;
; Do necessary calibrations.
;
calibrate:
        ;
        ; Tunes g_OverheadAdj by repeatedly profiling calib_nullproc.
        ;
        ; In/Out: nothing; every general purpose register and rflags is
        ;         saved by the prolog and restored by the epilog.
        ; Uses:   rsi = pointer to a CALIBDATA on the stack (read by
        ;               calib_penter / calib_pexit),
        ;         ebx = outer loop counter, ecx = inner loop counter,
        ;         rbp = stack pointer to restore in the epilog.
        ;

        ; prolog - save everything
        push    rbp
        pushfq
        push    rax                     ; pushaq
        push    rbx
        push    rcx
        push    rdx
        push    rdi
        push    rsi
        push    r8
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     rbp, rsp
        sub     rsp, CALIBDATA_size
        mov     rsi, rsp                ; rsi points to the CALIBDATA
        and     rsp, -16                ; CALIBDATA stays at or above the aligned rsp.

        ;
        ; Set the calibrated flag up front, before the loops run, so that
        ; later passes through common_return_path skip the call to calibrate.
        ;
        mov     eax, 1
        xchg    dword [g_fCalibrated wrt rip], eax

        ;
        ; The outer loop - find the right adjustment.
        ;
        mov     ebx, 200h               ; loop counter.
calib_outer_loop:

        ;
        ; The inner loop - calls the function a number of times to establish
        ; a good minimum value.
        ;
        mov     ecx, 200h
        mov     dword [rsi + CALIBDATA.Min], 0ffffffffh
        mov     dword [rsi + CALIBDATA.Min + 4], 07fffffffh     ; Min = largest positive 64-bit value.
calib_inner_loop:

        ; zero the overhead and profiled times.
        xor     eax, eax
        mov     [rsi + CALIBDATA.Overhead], rax
        mov     [rsi + CALIBDATA.Profiled], rax

        call    calib_nullproc          ; preserves rcx/rax (the calibration hooks restore all they push).

        ; subtract the overhead
        mov     rax, [rsi + CALIBDATA.Profiled]
        sub     rax, [rsi + CALIBDATA.Overhead]

        ; update the minimum value.
        bt      rax, 63
        jc      near calib_outer_dec    ; if negative, just simplify and shortcut
        cmp     rax, [rsi + CALIBDATA.Min]
        jge     calib_inner_next
calib_inner_update_minimum:
        mov     [rsi + CALIBDATA.Min], rax
calib_inner_next:
        loop    calib_inner_loop

        ; Is the minimum value acceptable?
        test    dword [rsi + CALIBDATA.Min + 4], 80000000h
        jnz     calib_outer_dec         ; simplify if negative.
        cmp     dword [rsi + CALIBDATA.Min + 4], 0
        jnz     calib_outer_inc         ; this shouldn't be possible
        cmp     dword [rsi + CALIBDATA.Min], 1fh
        jbe     calib_outer_dec         ; too low - 2 ticks per pair is the minimum!
        ;cmp     dword [rsi + CALIBDATA.Min], 30h
        ;jbe     calib_done              ; this is fine!
        cmp     dword [rsi + CALIBDATA.Min], 70h ; - a bit weird...
        jbe     calib_outer_next        ; do the full 200h*200h iteration
calib_outer_inc:
        inc     dword [g_OverheadAdj wrt rip]
        jmp     calib_outer_next

calib_outer_dec:
        ; The adjustment is used as an unsigned value (common_overhead loads
        ; it zero extended), so never let it wrap below zero: stop once it is
        ; 1 or already 0.  The unsigned 'jbe' covers the initial value 0,
        ; which a plain 'je' against 1 would decrement to 0ffffffffh.
        cmp     dword [g_OverheadAdj wrt rip], 1
        jbe     calib_done
        dec     dword [g_OverheadAdj wrt rip]
calib_outer_next:
        dec     ebx
        jnz     calib_outer_loop
calib_done:

        ; epilog - restore it all.
        mov     rsp, rbp
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rsi
        pop     rdi
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
        popfq
        pop     rbp
        ret
;;
; The calibration _penter - this must be identical to the real thing except for the KPRF call.
align 16
calib_penter:
        ; In: rsi = pointer to the CALIBDATA set up by calibrate.
        ;
        ; Same register saving and timestamp sequence as _penter - keep the
        ; two in sync, or the calibration no longer measures the real hook.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 8 pushes leave rsp unaligned; 28h realigns it:
                                        ; 20h of spill space + 8 bytes for the timestamp.

        ; Compose the timestamp and store it in the frame, like _penter does.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; r8 = 64-bit entry timestamp
        mov     [rsp + 20h], r8

        ; In place of the KPRF_ENTER call: remember when we entered and pick
        ; the counter the overhead is to be added to.
        mov     [rsi + CALIBDATA.EnterTS], r8
        lea     rax, [rsi + CALIBDATA.Overhead]
        jmp     common_overhead         ; skips the rax / g_fCalibrated checks.
;;
; The calibration _pexit - this must be identical to the real thing except for the KPRF call.
align 16
calib_pexit:
        ; In: rsi = pointer to the CALIBDATA set up by calibrate.
        ;
        ; Same register saving and timestamp sequence as _pexit - keep the
        ; two in sync, or the calibration no longer measures the real hook.
        push    rax
        push    rdx
        rdtsc
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                ; 8 pushes leave rsp unaligned; 28h realigns it:
                                        ; 20h of spill space + 8 bytes for the timestamp.

        ; Compose the timestamp and store it in the frame, like _pexit does.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                 ; r8 = 64-bit exit timestamp
        mov     [rsp + 20h], r8

        ; In place of the KPRF_LEAVE call: account the time since the
        ; matching calib_penter as profiled time.
        sub     r8, [rsi + CALIBDATA.EnterTS]
        add     [rsi + CALIBDATA.Profiled], r8

        ; The overhead of this hook is added to EnterTS, which the next
        ; calib_penter overwrites.
        ; NOTE(review): presumably a deliberate scratch target so the exit
        ; overhead is not counted in CALIBDATA.Overhead - confirm.
        lea     rax, [rsi + CALIBDATA.EnterTS]
        jmp     common_overhead         ; skips the rax / g_fCalibrated checks.
;;
; The 'function' we're profiling.
; The general idea is that each pair should take something like 2-10 ticks.
;
; (Btw. If we don't use multiple pairs here, we end up with the wrong result.)
align 16
calib_nullproc:
        ; 16 back-to-back enter/exit pairs (0..f) with nothing in between.
        ; Expanded by the preprocessor, so the assembled code is the same
        ; straight line of 32 calls as when written out by hand.
%rep 16
        call    calib_penter
        call    calib_pexit
%endrep
        ret
;
; Dummy stack check function.
;
global __chkstk
__chkstk:
ret
|