; $Id: ASMMemFirstMismatchingU8.asm $
;; @file
; IPRT - ASMMemFirstMismatchingU8().
;

;
; Copyright (C) 2006-2020 Oracle Corporation
;
; This file is part of VirtualBox Open Source Edition (OSE), as
; available from http://www.virtualbox.org. This file is free software;
; you can redistribute it and/or modify it under the terms of the GNU
; General Public License (GPL) as published by the Free Software
; Foundation, in version 2 as it comes in the "COPYING" file of the
; VirtualBox OSE distribution. VirtualBox OSE is distributed in the
; hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL) only, as it comes in the "COPYING.CDDL" file of the
; VirtualBox OSE distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;


;*******************************************************************************
;* Header Files                                                                *
;*******************************************************************************
%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"


BEGINCODE

;;
; Variant of ASMMemFirstMismatchingU8 with a fixed @a u8 value.
; We repeat the prologue and join the generic function.
;
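; Roughly equivalent C (an illustrative sketch only, not taken from the
; IPRT headers):
;   void *ASMMemFirstNonZero(void const *pv, size_t cb)
;   {
;       return ASMMemFirstMismatchingU8(pv, cb, 0);
;   }
;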
BEGINPROC_EXPORTED  ASMMemFirstNonZero
        ;
        ; Prologue.
        ;
%if ARCH_BITS != 64
        push    xBP
        mov     xBP, xSP
        push    xDI
 %if ARCH_BITS == 16
        push    es
 %endif
%elifdef ASM_CALL64_MSC
        mov     r9, rdi                 ; save rdi in r9
%endif
SEH64_END_PROLOGUE

        ;
        ; Normalize input; rdi=pv, rcx=cb, rax=0
        ;
 %if ARCH_BITS == 64
  %ifdef ASM_CALL64_MSC
        mov     rdi, rcx
        mov     rcx, rdx
        jrcxz   RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same)
        xor     eax, eax
  %else
        mov     rcx, rsi
        jrcxz   RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same)
        xor     eax, eax
  %endif

 %elif ARCH_BITS == 32
        mov     ecx, [ebp + 0ch]
        jecxz   RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return_all_same)
        mov     edi, [ebp + 08h]
        xor     eax, eax

 %elif ARCH_BITS == 16
        mov     cx, [bp + 08h]          ; cb
        jcxz    RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.return16_all_same)
        les     di, [bp + 04h]          ; pv (far)
        xor     ax, ax

 %else
  %error "Invalid ARCH_BITS value"
 %endif

        ;
        ; Join ASMMemFirstMismatchingU8
        ;
        jmp     RT_CONCAT(NAME(ASMMemFirstMismatchingU8),.is_all_zero_joining)
ENDPROC    ASMMemFirstNonZero


;;
; Inverted memchr.
;
; @returns Pointer to the byte which doesn't equal u8.
; @returns NULL if all equal to u8.
;
; @param   msc:rcx gcc:rdi  pv      Pointer to the memory block.
; @param   msc:rdx gcc:rsi  cb      Number of bytes in the block.
; @param   msc:r8b gcc:dl   u8      The value it's supposed to be filled with.
;
; @cproto DECLINLINE(void *) ASMMemFirstMismatchingU8(void const *pv, size_t cb, uint8_t u8)
;
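; For reference, a rough C equivalent of the scan below (an illustrative
; sketch only; the real code additionally aligns the pointer and uses a
; word/dword/qword 'repe scas' fast path):
;
;   void *ASMMemFirstMismatchingU8(void const *pv, size_t cb, uint8_t u8)
;   {
;       uint8_t const *pb = (uint8_t const *)pv;
;       for (; cb > 0; cb--, pb++)
;           if (*pb != u8)
;               return (void *)pb;
;       return NULL;
;   }
;
; Typical use: ASMMemFirstMismatchingU8(pv, cb, 0xff) returns NULL if and
; only if the whole block is filled with 0xff bytes.
;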
BEGINPROC_EXPORTED ASMMemFirstMismatchingU8
        ;
        ; Prologue.
        ;
%if ARCH_BITS != 64
        push    xBP
        mov     xBP, xSP
        push    xDI
 %if ARCH_BITS == 16
        push    es
 %endif
%elifdef ASM_CALL64_MSC
        mov     r9, rdi                 ; save rdi in r9
%endif
SEH64_END_PROLOGUE

%if ARCH_BITS != 16
        ;
        ; The 32-bit and 64-bit variant of the code.
        ;

        ; Normalize input; rdi=pv, rcx=cb, rax=eight-times-u8
 %if ARCH_BITS == 64
  %ifdef ASM_CALL64_MSC
        mov     rdi, rcx
        mov     rcx, rdx
        jrcxz   .return_all_same
        movzx   r8d, r8b
        mov     rax, qword 0101010101010101h
        imul    rax, r8
  %else
        mov     rcx, rsi
        jrcxz   .return_all_same
        movzx   edx, dl
        mov     rax, qword 0101010101010101h
        imul    rax, rdx
  %endif

 %elif ARCH_BITS == 32
        mov     ecx, [ebp + 0ch]
        jecxz   .return_all_same
        mov     edi, [ebp + 08h]
        movzx   eax, byte [ebp + 10h]
        mov     ah, al
        movzx   edx, ax
        shl     eax, 16
        or      eax, edx
 %else
  %error "Invalid ARCH_BITS value"
 %endif
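
        ; The code above broadcasts u8 into every byte of the accumulator so
        ; that the 'repe scasd'/'repe scasq' below compares xCB bytes per
        ; iteration.  Roughly in C (an illustrative sketch; 'uPattern' is
        ; just a placeholder name):
        ;   uint64_t uPattern = (uint64_t)u8 * UINT64_C(0x0101010101010101);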

.is_all_zero_joining:
        cld

        ; Unaligned pointer? Align it (elsewhere).
        test    edi, xCB - 1
        jnz     .unaligned_pv
.aligned_pv:

        ; Do the dword/qword scan.
        mov     edx, xCB - 1
        and     edx, ecx                ; Remaining bytes for tail scan
 %if ARCH_BITS == 64
        shr     xCX, 3
        repe scasq
 %else
        shr     xCX, 2
        repe scasd
 %endif
        jne     .multibyte_mismatch

        ; Prep for tail scan.
        mov     ecx, edx

        ;
        ; Byte by byte scan.
        ;
.byte_by_byte:
        repe scasb
        jne     .return_xDI

.return_all_same:
        xor     eax, eax
 %ifdef ASM_CALL64_MSC
        mov     rdi, r9                 ; restore rdi
 %elif ARCH_BITS == 32
        pop     edi
        leave
 %endif
        ret

        ; Return after byte scan mismatch.
.return_xDI:
        lea     xAX, [xDI - 1]          ; scasb advanced xDI past the mismatching byte
 %ifdef ASM_CALL64_MSC
        mov     rdi, r9                 ; restore rdi
 %elif ARCH_BITS == 32
        pop     edi
        leave
 %endif
        ret

        ;
        ; Multibyte mismatch.  We rewind and do a byte scan of the remainder.
        ; (can't just search the qword as the buffer must be considered volatile).
        ;
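        ; At this point xCX holds the number of whole words left after the
        ; mismatching one, edx holds the 0..xCB-1 tail bytes saved before the
        ; scan, and xDI points just past the mismatching word.  The two LEAs
        ; below rewind xDI onto that word and rebuild the byte count as
        ; xCX * xCB + xCB, and the OR then folds in the tail bytes (it acts
        ; as an add since xCX * xCB + xCB has the low bits clear).
        ;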
.multibyte_mismatch:
        lea     xDI, [xDI - xCB]
        lea     xCX, [xCX * xCB + xCB]
        or      ecx, edx
        jmp     .byte_by_byte

        ;
        ; Unaligned pointer.  If it's worth it, align the pointer, but if the
        ; memory block is too small do the byte scan variant.
        ;
.unaligned_pv:
        cmp     xCX, 4*xCB              ; 4 steps seems reasonable.
        jbe     .byte_by_byte

        ; Unrolled buffer realignment.
 %if ARCH_BITS == 64
        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv
 %endif

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        test    edi, xCB - 1
        jz      .aligned_pv

        dec     xCX
        scasb
        jne     .return_xDI
        jmp     .aligned_pv


%else ; ARCH_BITS == 16

        ;
        ; The 16-bit variant of the code is a little simpler since we're
        ; working with two-byte words in the 'fast' scan.  We also keep
        ; this separate from the 32-bit/64-bit code because that allows
        ; avoiding a few rex prefixes here and there by using extended
        ; registers (e??) where we don't care about the whole register.
        ;
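        ; (For example, with u8 = 05ah the word pattern built in AX below is
        ;  5a5ah, letting 'repe scasw' compare two bytes per iteration.)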
CPU 8086

        ; Load input parameters.
        mov     cx, [bp + 08h]          ; cb
        jcxz    .return16_all_same
        les     di, [bp + 04h]          ; pv (far)
        mov     al, [bp + 0ah]          ; u8
        mov     ah, al

.is_all_zero_joining:
        cld

        ; Align the pointer.
        test    di, 1
        jz      .word_scan

        dec     cx
        scasb
        jne     .return16_di
        jcxz    .return16_all_same

        ; Scan word-by-word.
.word_scan:
        mov     dx, cx
        shr     cx, 1
        repe scasw
        jne     .word_mismatch

        ; do we have a tail byte?
        test    dl, 1
        jz      .return16_all_same
        scasb
        jne     .return16_di

.return16_all_same:
        xor     ax, ax
        xor     dx, dx
.return16:
        pop     es
        pop     di
        pop     bp
        ret

.word_mismatch:
        ; back up a word.
        inc     cx
        sub     di, 2

        ; Do byte-by-byte scanning of the rest of the buffer, picking up the
        ; odd tail byte if cb was odd.
        shl     cx, 1                   ; remaining words -> bytes
        mov     dl, 1
        and     dl, [bp + 08h]          ; dl = cb & 1 (odd tail byte)
        or      cl, dl                  ; cx is even here, so this adds it
        repe scasb
        je      .return16_all_same

.return16_di:
        mov     ax, di
        dec     ax
        mov     dx, es
        jmp     .return16

%endif  ; ARCH_BITS == 16
ENDPROC ASMMemFirstMismatchingU8