1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
|
; $Id: ASMMemFirstMismatchingU8.asm $
;; @file
; IPRT - ASMMemFirstMismatchingU8().
;
;
; Copyright (C) 2006-2020 Oracle Corporation
;
; This file is part of VirtualBox Open Source Edition (OSE), as
; available from http://www.virtualbox.org. This file is free software;
; you can redistribute it and/or modify it under the terms of the GNU
; General Public License (GPL) as published by the Free Software
; Foundation, in version 2 as it comes in the "COPYING" file of the
; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL) only, as it comes in the "COPYING.CDDL" file of the
; VirtualBox OSE distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;
;*******************************************************************************
;* Header Files *
;*******************************************************************************
%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"
BEGINCODE
;;
; Variant of ASMMemFirstMismatchingU8 with a fixed @a u8 value.
; We repeat the prolog and join the generic function.
;
;-----------------------------------------------------------------------
; void RT_FAR *ASMMemFirstNonZero(void const RT_FAR *pv, size_t cb);
;
; Returns a pointer to the first non-zero byte in the block, or NULL when
; all cb bytes are zero.  Implemented as a u8==0 specialization: this
; prologue normalizes the inputs (xAX = 0 = the replicated compare value)
; and then jumps into ASMMemFirstMismatchingU8 at its
; .is_all_zero_joining label, so the register/flag state here must match
; what that label expects (xDI=pv, xCX=cb, xAX=replicated value, DF=?).
;-----------------------------------------------------------------------
BEGINPROC_EXPORTED ASMMemFirstNonZero
;
; Prologue.  Must mirror ASMMemFirstMismatchingU8's prologue exactly,
; since we return through that function's epilogue(s).
;
%if ARCH_BITS != 64
push xBP
mov xBP, xSP
push xDI ; xDI is callee-saved in the 16/32-bit conventions and is clobbered by scas.
%if ARCH_BITS == 16
push es ; scasb/scasw address es:di; es is loaded from the far pv below.
%endif
%elifdef ASM_CALL64_MSC
mov r9, rdi ; save rdi in r9 (rdi is callee-saved in the Microsoft x64 ABI)
%endif
SEH64_END_PROLOGUE
;
; Normalize input; rdi=pv, rcx=cb, rax=0
; An empty block returns NULL via the shared .return_all_same /
; .return16_all_same exit inside ASMMemFirstMismatchingU8.
;
%if ARCH_BITS == 64
%ifdef ASM_CALL64_MSC
mov rdi, rcx ; pv
mov rcx, rdx ; cb
jrcxz RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same) ; cb == 0 => NULL
xor eax, eax ; compare value: eight zero bytes
%else
mov rcx, rsi ; cb (pv is already in rdi per the SysV ABI)
jrcxz RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same) ; cb == 0 => NULL
xor eax, eax ; compare value: eight zero bytes
%endif
%elif ARCH_BITS == 32
mov ecx, [ebp + 0ch] ; cb
jecxz RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same) ; cb == 0 => NULL
mov edi, [ebp + 08h] ; pv
xor eax, eax ; compare value: four zero bytes
%elif ARCH_BITS == 16
mov cx, [bp + 08h] ; cb
jcxz RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return16_all_same) ; cb == 0 => NULL (0:0)
les di, [bp + 04h] ; pv (far)
xor ax, ax ; compare value: two zero bytes
%else
%error "Invalid ARCH_BITS value"
%endif
;
; Join ASMMemFirstMismatchingU8 right after its compare-value setup.
;
jmp RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.is_all_zero_joining)
ENDPROC ASMMemFirstNonZero
;;
; Inverted memchr.
;
; @returns Pointer to the byte which doesn't equal u8.
; @returns NULL if all equal to u8.
;
; @param msc:rcx gcc:rdi pv Pointer to the memory block.
; @param msc:rdx gcc:rsi cb Number of bytes in the block. (NOTE(review): the implementation below handles any cb and any pointer alignment via its tail/realignment scans; the former "MUST be aligned on 32-bit" claim looks copied from a fill routine - confirm and drop.)
; @param msc:r8b gcc:dl u8 The value it's supposed to be filled with.
;
; @cproto DECLINLINE(void *) ASMMemFirstMismatchingU8(void const *pv, size_t cb, uint8_t u8)
;
;-----------------------------------------------------------------------
; Inverted memchr: returns a pointer to the first byte != u8, or NULL
; when all cb bytes equal u8 (see the @cproto comment above).
;
; The buffer must be treated as volatile, so on a dword/qword mismatch
; we rewind and re-scan that region byte-by-byte instead of dissecting
; the register value.
;
; Register roles (32/64-bit path):
;   xDI = scan position (scas), xCX = remaining element count,
;   xAX = u8 replicated into every byte, xDX = tail byte count.
;-----------------------------------------------------------------------
BEGINPROC_EXPORTED ASMMemFirstMismatchingU8
;
; Prologue.
;
%if ARCH_BITS != 64
push xBP
mov xBP, xSP
push xDI ; xDI is callee-saved in the 16/32-bit conventions and is clobbered by scas.
%if ARCH_BITS == 16
push es ; scasb/scasw address es:di; es is loaded from the far pv below.
%endif
%elifdef ASM_CALL64_MSC
mov r9, rdi ; save rdi in r9 (rdi is callee-saved in the Microsoft x64 ABI)
%endif
SEH64_END_PROLOGUE
%if ARCH_BITS != 16
;
; The 32-bit and 64-bit variant of the code.
;
; Normalize input; rdi=pv, rcx=cb, rax=eight-times-u8
%if ARCH_BITS == 64
%ifdef ASM_CALL64_MSC
mov rdi, rcx ; pv
mov rcx, rdx ; cb
jrcxz .return_all_same ; empty block => NULL
movzx r8d, r8b ; isolate u8
mov rax, qword 0101010101010101h
imul rax, r8 ; replicate u8 into all eight bytes of rax
%else
mov rcx, rsi ; cb (pv is already in rdi per the SysV ABI)
jrcxz .return_all_same ; empty block => NULL
movzx edx, dl ; isolate u8
mov rax, qword 0101010101010101h
imul rax, rdx ; replicate u8 into all eight bytes of rax
%endif
%elif ARCH_BITS == 32
mov ecx, [ebp + 0ch] ; cb
jecxz .return_all_same ; empty block => NULL
mov edi, [ebp + 08h] ; pv
movzx eax, byte [ebp + 10h] ; eax = 000000u8
mov ah, al ; eax = 0000u8u8
movzx edx, ax
shl eax, 16
or eax, edx ; eax = u8u8u8u8
%else
%error "Invalid ARCH_BITS value"
%endif
.is_all_zero_joining: ; ASMMemFirstNonZero joins here with xAX = 0.
cld ; scas must move forward.
; Unaligned pointer? Align it (elsewhere).
test edi, xCB - 1 ; low 2/3 bits suffice for the alignment check
jnz .unaligned_pv
.aligned_pv:
; Do the dword/qword scan.
mov edx, xCB - 1 ; (writing edx zero-extends, so xDX is fully defined)
and edx, ecx ; Remaining bytes for tail scan
%if ARCH_BITS == 64
shr xCX, 3 ; qword count; also yields ZF=1 when cb < 8, so the
repe scasq ; 'jne' below falls through if repe ran zero iterations.
%else
shr xCX, 2 ; dword count; also yields ZF=1 when cb < 4 (see above).
repe scasd
%endif
jne .multibyte_mismatch
; Prep for tail scan.
mov ecx, edx ; 0..xCB-1 tail bytes; mov preserves ZF=1 for the jne below.
;
; Byte by byte scan.
;
.byte_by_byte:
repe scasb ; leaves the prior ZF=1 untouched when xCX is zero on entry
jne .return_xDI
.return_all_same:
xor eax, eax ; return NULL
%ifdef ASM_CALL64_MSC
mov rdi, r9 ; restore rdi
%elif ARCH_BITS == 32
pop edi
leave
%endif
ret
; Return after byte scan mismatch.
.return_xDI:
lea xAX, [xDI - 1] ; scasb already advanced past the mismatching byte
%ifdef ASM_CALL64_MSC
mov rdi, r9 ; restore rdi
%elif ARCH_BITS == 32
pop edi
leave
%endif
ret
;
; Multibyte mismatch. We rewind and do a byte scan of the remainder.
; (can't just search the qword as the buffer must be considered volatile).
;
.multibyte_mismatch:
lea xDI, [xDI - xCB] ; back up to the mismatching dword/qword
lea xCX, [xCX * xCB + xCB] ; bytes left, incl. the mismatching element
; Add the tail bytes; xCX is a multiple of xCB here, so OR == ADD.
; This must use the full-width registers: a 32-bit 'or ecx, edx' would
; zero the upper half of rcx and truncate the remaining byte count for
; blocks >= 4 GiB on 64-bit hosts.
or xCX, xDX
jmp .byte_by_byte
;
; Unaligned pointer. If it's worth it, align the pointer, but if the
; memory block is too small do the byte scan variant.
;
.unaligned_pv:
cmp xCX, 4*xCB ; 4 steps seems reasonable.
jbe .byte_by_byte
; Unrolled buffer realignment: up to xCB-1 single byte steps, stopping
; early on a mismatch or once xDI reaches xCB alignment.
%if ARCH_BITS == 64
dec xCX
scasb
jne .return_xDI
test edi, xCB - 1
jz .aligned_pv
dec xCX
scasb
jne .return_xDI
test edi, xCB - 1
jz .aligned_pv
dec xCX
scasb
jne .return_xDI
test edi, xCB - 1
jz .aligned_pv
dec xCX
scasb
jne .return_xDI
test edi, xCB - 1
jz .aligned_pv
%endif
dec xCX
scasb
jne .return_xDI
test edi, xCB - 1
jz .aligned_pv
dec xCX
scasb
jne .return_xDI
test edi, xCB - 1
jz .aligned_pv
dec xCX
scasb
jne .return_xDI
jmp .aligned_pv ; after xCB-1 steps the pointer must be aligned
%else ; ARCH_BITS == 16
;
; The 16-bit variant of the code is a little simpler since we're
; working with two byte words in the 'fast' scan. We also keep
; this separate from the 32-bit/64-bit code because that allows
; avoid a few rex prefixes here and there by using extended
; registers (e??) where we don't care about the whole register.
;
CPU 8086
; Load input parameters.
mov cx, [bp + 08h] ; cb
jcxz .return16_all_same ; empty block => NULL (0:0)
les di, [bp + 04h] ; pv (far)
mov al, [bp + 0ah] ; u8
mov ah, al ; ax = u8 in both bytes for scasw
.is_all_zero_joining: ; ASMMemFirstNonZero joins here with ax = 0.
cld ; scas must move forward.
; Align the pointer.
test di, 1
jz .word_scan
dec cx
scasb
jne .return16_di
jcxz .return16_all_same ; that one byte was the whole block
; Scan word-by-word.
.word_scan:
mov dx, cx ; remember the byte count for the tail-byte parity check
shr cx, 1
repe scasw
jne .word_mismatch
; do we have a tail byte?
test dl, 1
jz .return16_all_same
scasb
jne .return16_di
.return16_all_same:
xor ax, ax ; return NULL as far pointer dx:ax = 0:0
xor dx, dx
.return16:
pop es
pop di
pop bp
ret
.word_mismatch:
; back up a word.
inc cx ; include the mismatching word again
sub di, 2
; Do byte-by-byte scanning of the rest of the buffer.
shl cx, 1 ; words -> bytes
mov dl, 1
and dl, [bp + 08h] ; cb - odd cb => one extra tail byte.
; NOTE(review): this takes the parity of the original cb; when the
; alignment step above consumed a byte the remaining count is cb-1,
; whose parity differs - dx held that count before being clobbered.
; Confirm whether the tail bit should come from the saved dx instead.
or cl, dl ; cx is even here, so OR == ADD
repe scasb
je .return16_all_same ; possible: the buffer is volatile and may have changed
.return16_di:
mov ax, di ; return far pointer es:(di - 1) in dx:ax
dec ax
mov dx, es
jmp .return16
%endif ; ARCH_BITS == 16
ENDPROC ASMMemFirstMismatchingU8
|