1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
|
; $Id: prfx86msc.asm 29 2009-07-01 20:30:29Z bird $
;; @file
; kProfiler Mark 2 - Microsoft C/C++ Compiler Interaction, x86.
;
;
; Copyright (c) 2006-2007 Knut St. Osmundsen <bird-kStuff-spamix@anduin.net>
;
; Permission is hereby granted, free of charge, to any person
; obtaining a copy of this software and associated documentation
; files (the "Software"), to deal in the Software without
; restriction, including without limitation the rights to use,
; copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the
; Software is furnished to do so, subject to the following
; conditions:
;
; The above copyright notice and this permission notice shall be
; included in all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
; OTHER DEALINGS IN THE SOFTWARE.
;
[section .data]
;
; Calibration state shared by the hooks in this file.
;
g_fCalibrated:  dd 0            ; set to 1 by calibrate; common_return_path calls calibrate while the low byte is 0.
g_OverheadAdj:  dd 0            ; tick count common_overhead subtracts from the entry timestamp; tuned by calibrate.
; Code section, imports and exports.
; NOTE(review): the bracketed (primitive) section form is used here; it does
; not update NASM's __SECT__, which endstruc uses to switch back after the
; CALIBDATA struc further down - verify before changing it to `section .text`.
[section .text]
; The profiler core entry points called by the hooks below.
extern KPRF_ENTER
extern KPRF_LEAVE
; The two hook entry points.
global __penter
global __pexit
; The conditional below is commented out, so the internal labels that follow
; are always exported (presumably to make them visible to debuggers and
; symbol listings - confirm before removing).
;ifdef UNDEFINED
global common_return_path
global common_overhead
global common_no_overhead
global calibrate
global calib_inner_update_minimum
global calib_inner_next
global calib_outer_dec
global calib_outer_inc
global calib_done
global calib_nullproc
;endif
;;
; __penter - function-entry profiling hook (presumably the hook MSVC emits
; calls to under /Gh - confirm against the build flags).
;
; On x86 the call to this function has been observed to be put before
; creating the stack frame, as the very first instruction in the function.
;
; Saves eax, edx, eflags and ecx, samples the TSC and calls
; KPRF_ENTER(function address, frame pointer, 64-bit timestamp) with a
; stack-based (cdecl) frame.  KPRF_ENTER's eax result is handed on to
; common_return_path, which pops the frame, restores the saved registers
; and returns to the hooked function.
;
; Thus the stack layout is as follows (hex offsets from esp once the call
; frame below has been set up):
; 24 return address of the calling function.
; 20 our return address - the address of the calling function + 5.
; 1c eax
; 18 edx
; 14 eflags
; 10 ecx
; c tsc high - param 3
; 8 tsc low
; 4 frame pointer - param 2
; 0 function ptr - param 1
;
; NOTE: calib_penter must start with the same instruction sequence; keep the
; two in sync when changing anything here.
;
align 16
__penter:
; save volatile register and get the time stamp.
; (eax and edx are pushed first because rdtsc overwrites them.)
push eax
push edx
rdtsc
pushfd
push ecx
; setting up the enter call frame (cdecl).
; 16 bytes: two dword parameters plus the 64-bit timestamp.
sub esp, 4 + 4 + 8
mov [esp + 0ch], edx ; Param 3 - the timestamp
mov [esp + 08h], eax
lea edx, [esp + 24h] ; Param 2 - frame pointer (pointer to the return address of the function calling us)
mov [esp + 04h], edx
mov eax, [esp + 20h] ; Param 1 - The function address
sub eax, 5 ; call instruction
mov [esp], eax
call KPRF_ENTER
; eax = KPRF_ENTER's return value; common_return_path treats non-zero as a
; pointer to a 64-bit overhead counter.
jmp common_return_path
;;
; __pexit - function-exit profiling hook (presumably the hook MSVC emits
; calls to under /GH - confirm against the build flags).
;
; On x86 the call to this function has been observed to be put right before
; the return instruction. This fact matters since we have to calculate the
; same stack address as in __penter.
;
; Saves eax, edx, eflags and ecx, samples the TSC and calls
; KPRF_LEAVE(address in function, frame pointer, 64-bit timestamp) with a
; stack-based (cdecl) frame.  KPRF_LEAVE's eax result is handed on to
; common_return_path, which pops the frame, restores the saved registers
; and returns to the hooked function.
;
; Thus the stack layout is as follows (hex offsets from esp once the call
; frame below has been set up):
; 24 return address of the calling function.
; 20 our return address - an address inside the calling function (just past the call to us).
; 1c eax
; 18 edx
; 14 eflags
; 10 ecx
; c tsc high - param 3
; 8 tsc low
; 4 frame pointer - param 2
; 0 function ptr - param 1
;
; NOTE: calib_pexit must start with the same instruction sequence; keep the
; two in sync when changing anything here.
;
align 16
__pexit:
; save volatile register and get the time stamp.
; (eax and edx are pushed first because rdtsc overwrites them.)
push eax
push edx
rdtsc
pushfd
push ecx
; setting up the leave call frame (cdecl).
; 16 bytes: two dword parameters plus the 64-bit timestamp.
sub esp, 4 + 4 + 8
mov [esp + 0ch], edx ; Param 3 - the timestamp
mov [esp + 08h], eax
lea edx, [esp + 24h] ; Param 2 - frame pointer (pointer to the return address of the function calling us)
mov [esp + 04h], edx
mov eax, [esp + 20h] ; Param 1 - Some address in the function.
sub eax, 5 ; call instruction
mov [esp], eax
call KPRF_LEAVE
; eax = KPRF_LEAVE's return value; common_return_path treats non-zero as a
; pointer to a 64-bit overhead counter.
jmp common_return_path
;;
; This is the common return path for both the enter and exit hooks.
; It's kept common because we can then use the same overhead adjustment
; and save some calibration efforts. It also saves space :-)
;
; Entry points:
;   common_return_path - eax = value returned by KPRF_ENTER / KPRF_LEAVE.
;                        Zero skips the overhead accounting; non-zero is
;                        used as a pointer to a 64-bit (lo, hi) counter.
;   common_overhead    - eax = pointer to the 64-bit counter; calib_penter
;                        and calib_pexit jump here directly.
;
; Stack on entry: the 16-byte call frame (entry timestamp at +08h/+0ch),
; then the saved ecx, eflags, edx, eax and the hook's return address.
;
; The overhead added to the counter is
;   (tsc now) - ((tsc at hook entry) - g_OverheadAdj).
;
align 16
common_return_path:
; Update overhead
test eax, eax
jz common_no_overhead
; Only the low byte of the g_fCalibrated dword is tested.
cmp byte [g_fCalibrated], 0
jnz common_overhead
; calibrate saves and restores all general registers and eflags, so eax
; still holds the counter pointer when it returns.
call calibrate
common_overhead:
mov ecx, eax ; ecx <- pointer to overhead counter.
mov eax, [g_OverheadAdj] ; apply the adjustment before reading tsc
; 64-bit subtract of the adjustment from the saved entry timestamp.
sub [esp + 08h], eax
sbb dword [esp + 0ch], 0
rdtsc
; edx:eax = elapsed ticks since the (adjusted) entry timestamp.
sub eax, [esp + 08h]
sbb edx, [esp + 0ch]
; accumulate into the 64-bit counter.
add [ecx], eax
adc [ecx + 4], edx
common_no_overhead:
; drop the 16-byte call frame.
add esp, 4 + 4 + 8
; restore volatile registers.
; (popfd comes after the add above, so the hooked function's flags survive.)
pop ecx
popfd
pop edx
pop eax
ret
;;
; Scratch record used during calibration; esi holds its address for the
; whole run.  Every value is 64 bits wide, kept as a lo/hi dword pair.
struc CALIBDATA
    .OverheadLo:    resd 1      ; overhead accumulated by calib_penter (via common_overhead)
    .OverheadHi:    resd 1
    .ProfiledLo:    resd 1      ; sum of (exit tsc - enter tsc), added up by calib_pexit
    .ProfiledHi:    resd 1
    .EnterTSLo:     resd 1      ; timestamp stored by the most recent calib_penter
    .EnterTSHi:     resd 1
    .MinLo:         resd 1      ; smallest Profiled - Overhead seen in the current outer round
    .MinHi:         resd 1
endstruc
;;
; Do necessary calibrations.
;
; Tunes g_OverheadAdj so that, for the calibration routine calib_nullproc
; (16 calib_penter/calib_pexit pairs), the profiled time minus the recorded
; overhead ends up in the range 20h..30h ticks, i.e. 2-3 ticks per pair.
;
; Each outer round runs calib_nullproc up to 200h times and keeps the
; smallest 64-bit (Profiled - Overhead) value, then:
;   - negative, or <= 1fh      -> decrement g_OverheadAdj (stop once it is <= 1)
;   - 20h..30h                 -> done
;   - anything larger          -> increment g_OverheadAdj
; At most 200h outer rounds are made.
;
; In:    nothing.
; Out:   g_fCalibrated = 1, g_OverheadAdj tuned.
; Clobb: nothing - eflags and all general registers are saved and restored.
; Stack: CALIBDATA_size bytes of scratch, addressed through esi.
;
        align   16
calibrate:
        ; prolog
        push    ebp
        mov     ebp, esp
        pushfd
        pushad
        sub     esp, CALIBDATA_size
        mov     esi, esp                ; esi points to the CALIBDATA

        ;
        ; Mark calibration as done before doing the work;
        ; common_return_path only calls us while the flag is zero.
        ;
        mov     eax, 1
        xchg    dword [g_fCalibrated], eax

        ;
        ; The outer loop - find the right adjustment.
        ;
        mov     ebx, 200h               ; loop counter.
calib_outer_loop:

        ;
        ; The inner loop - calls the function a number of times to establish
        ;                  a good minimum value.
        ;
        mov     ecx, 200h
        mov     dword [esi + CALIBDATA.MinLo], 0ffffffffh
        mov     dword [esi + CALIBDATA.MinHi], 07fffffffh ; largest positive 64-bit value
calib_inner_loop:

        ; zero the overhead and profiled times.
        xor     eax, eax
        mov     [esi + CALIBDATA.OverheadLo], eax
        mov     [esi + CALIBDATA.OverheadHi], eax
        mov     [esi + CALIBDATA.ProfiledLo], eax
        mov     [esi + CALIBDATA.ProfiledHi], eax
        call    calib_nullproc

        ; subtract the overhead: edx:eax = Profiled - Overhead
        mov     eax, [esi + CALIBDATA.ProfiledLo]
        mov     edx, [esi + CALIBDATA.ProfiledHi]
        sub     eax, [esi + CALIBDATA.OverheadLo]
        sbb     edx, [esi + CALIBDATA.OverheadHi]

        ; update the minimum value.
        test    edx, 080000000h
        jnz near calib_outer_dec        ; if negative, just simplify and shortcut
        ; 64-bit compare: the high dwords carry the sign (signed compare),
        ; the low dwords are plain magnitude and must be compared unsigned.
        cmp     edx, [esi + CALIBDATA.MinHi]
        jg      calib_inner_next
        jl      calib_inner_update_minimum
        cmp     eax, [esi + CALIBDATA.MinLo]
        jae     calib_inner_next
calib_inner_update_minimum:
        mov     [esi + CALIBDATA.MinLo], eax
        mov     [esi + CALIBDATA.MinHi], edx
calib_inner_next:
        loop    calib_inner_loop

        ; Is the minimum value acceptable?
        test    dword [esi + CALIBDATA.MinHi], 80000000h
        jnz     calib_outer_dec         ; simplify if negative.
        cmp     dword [esi + CALIBDATA.MinHi], 0
        jnz     calib_outer_inc         ; this shouldn't be possible
        cmp     dword [esi + CALIBDATA.MinLo], 1fh
        jbe     calib_outer_dec         ; too low - 2 ticks per pair is the minimum!
        cmp     dword [esi + CALIBDATA.MinLo], 30h
        jbe     calib_done              ; this is fine!
calib_outer_inc:
        inc     dword [g_OverheadAdj]
        jmp     calib_outer_next
calib_outer_dec:
        ; Stop at the floor.  The unsigned <= also covers an adjustment of 0
        ; (its initial value), which a decrement would wrap to 0ffffffffh.
        cmp     dword [g_OverheadAdj], 1
        jbe     calib_done
        dec     dword [g_OverheadAdj]
calib_outer_next:
        dec     ebx
        jnz     calib_outer_loop
calib_done:

        ; epilog
        add     esp, CALIBDATA_size
        popad
        popfd
        leave
        ret
;;
; The calibration __penter - this must be identical to the real thing except for the KPRF call.
;
; In:  esi = pointer to the CALIBDATA set up by calibrate.
; Records the entry timestamp in CALIBDATA.EnterTS, builds the same 16-byte
; frame layout __penter uses and continues at common_overhead with
; eax = &CALIBDATA.Overhead, so the overhead of this hook is accumulated there.
; common_overhead restores the saved registers and returns to calib_nullproc.
align 16
calib_penter:
; This part must be identical
push eax
push edx
rdtsc
pushfd
push ecx
; store the entry
mov [esi + CALIBDATA.EnterTSLo], eax
mov [esi + CALIBDATA.EnterTSHi], edx
; create the call frame
; (timestamp at +08h/+0ch, two zero dwords in the parameter slots.)
push edx
push eax
push 0
push 0
; eax = pointer to the 64-bit overhead counter expected by common_overhead.
lea eax, [esi + CALIBDATA.OverheadLo]
jmp common_overhead
;;
; The calibration __pexit - this must be identical to the real thing except for the KPRF call.
;
; In:  esi = pointer to the CALIBDATA set up by calibrate.
; Adds (exit tsc - CALIBDATA.EnterTS) to CALIBDATA.Profiled, builds the same
; 16-byte frame layout __pexit uses and continues at common_overhead.
; common_overhead restores the saved registers and returns to calib_nullproc.
align 16
calib_pexit:
; This part must be identical
push eax
push edx
rdtsc
pushfd
push ecx
; update the time
; (the timestamp is saved around the arithmetic so it can go into the frame below.)
push eax
push edx
sub eax, [esi + CALIBDATA.EnterTSLo]
sbb edx, [esi + CALIBDATA.EnterTSHi]
add [esi + CALIBDATA.ProfiledLo], eax
adc [esi + CALIBDATA.ProfiledHi], edx
pop edx
pop eax
; create the call frame
; (timestamp at +08h/+0ch, two zero dwords in the parameter slots.)
push edx
push eax
push 0
push 0
; The exit-side overhead is accumulated into the EnterTS slot, not Overhead.
; calibrate never reads EnterTS and the next calib_penter overwrites it.
; NOTE(review): presumably a deliberate scratch sink so the exit overhead
; does not count against the pair being measured - confirm.
lea eax, [esi + CALIBDATA.EnterTSLo]
jmp common_overhead
;;
; The 'function' we're profiling during calibration: 16 back-to-back
; calib_penter / calib_pexit pairs followed by a return.
; The general idea is that each pair should take something like 2-10 ticks.
;
; (Btw. If we don't use multiple pairs here, we end up with the wrong result.)
;
; In:  esi = pointer to CALIBDATA (used by the calib_* hooks).
        align   16
calib_nullproc:
%rep 16                                 ; pairs 0..f
        call    calib_penter
        call    calib_pexit
%endrep
        ret
|