use core::intrinsics::likely;
const WORD_SIZE: usize = core::mem::size_of::<usize>();
const WORD_MASK: usize = WORD_SIZE - 1;
// If the number of bytes involved exceeds this threshold we opt for word-wise copy.
// The value selected here is max(2 * WORD_SIZE, 16):
// * We need at least 2 * WORD_SIZE bytes to guarantee that at least one whole word is copied
//   by the word-wise loop.
// * The word-wise copy logic needs to perform some checks, so it carries a small fixed
//   overhead. Requiring at least 16 bytes ensures that even on 32-bit platforms at least
//   8 bytes are copied word-wise, so the savings outweigh that fixed overhead.
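// (With the usual 4- or 8-byte `usize`, this evaluates to 16 on both 32-bit and 64-bit
// targets, since max(2 * WORD_SIZE, 16) == 16 whenever WORD_SIZE <= 8.)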
const WORD_COPY_THRESHOLD: usize = if 2 * WORD_SIZE > 16 {
2 * WORD_SIZE
} else {
16
};
#[cfg(feature = "mem-unaligned")]
unsafe fn read_usize_unaligned(x: *const usize) -> usize {
// Do not use `core::ptr::read_unaligned` here, since it calls `copy_nonoverlapping` which
// is translated to memcpy in LLVM.
let x_read = (x as *const [u8; core::mem::size_of::<usize>()]).read();
core::mem::transmute(x_read)
}
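// `copy_forward` proceeds in three phases: copy single bytes until `dest` is word-aligned,
// then copy as many whole words as possible (using the aligned or misaligned word loop
// depending on whether `src` shares `dest`'s alignment), and finally copy any trailing bytes.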
#[inline(always)]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, mut n: usize) {
#[inline(always)]
unsafe fn copy_forward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
let dest_end = dest.add(n);
while dest < dest_end {
*dest = *src;
dest = dest.add(1);
src = src.add(1);
}
}
#[inline(always)]
unsafe fn copy_forward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
let mut dest_usize = dest as *mut usize;
let mut src_usize = src as *mut usize;
let dest_end = dest.add(n) as *mut usize;
while dest_usize < dest_end {
*dest_usize = *src_usize;
dest_usize = dest_usize.add(1);
src_usize = src_usize.add(1);
}
}
#[cfg(not(feature = "mem-unaligned"))]
#[inline(always)]
unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
let mut dest_usize = dest as *mut usize;
let dest_end = dest.add(n) as *mut usize;
        // Calculate the misalignment offset and the shift needed to reassemble the value.
let offset = src as usize & WORD_MASK;
let shift = offset * 8;
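        // Each destination word is stitched together from two consecutive aligned source
        // words. For example, on a 64-bit little-endian target with offset == 3 (shift == 24),
        // the low 5 bytes of the output come from the high 5 bytes of `prev_word` and the
        // high 3 bytes come from the low 3 bytes of `cur_word`.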
// Realign src
let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
        // This will read (but won't use) bytes out of bounds.
        // The cfg is needed because on some targets atomic loads cannot be lowered
        // (e.g. BPF, MSP430) or are only provided by an external library (e.g. RV32I).
#[cfg(target_has_atomic_load_store = "ptr")]
let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);
#[cfg(not(target_has_atomic_load_store = "ptr"))]
let mut prev_word = core::ptr::read_volatile(src_aligned);
while dest_usize < dest_end {
src_aligned = src_aligned.add(1);
let cur_word = *src_aligned;
#[cfg(target_endian = "little")]
let resembled = prev_word >> shift | cur_word << (WORD_SIZE * 8 - shift);
#[cfg(target_endian = "big")]
let resembled = prev_word << shift | cur_word >> (WORD_SIZE * 8 - shift);
prev_word = cur_word;
*dest_usize = resembled;
dest_usize = dest_usize.add(1);
}
}
#[cfg(feature = "mem-unaligned")]
#[inline(always)]
unsafe fn copy_forward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
let mut dest_usize = dest as *mut usize;
let mut src_usize = src as *mut usize;
let dest_end = dest.add(n) as *mut usize;
while dest_usize < dest_end {
*dest_usize = read_usize_unaligned(src_usize);
dest_usize = dest_usize.add(1);
src_usize = src_usize.add(1);
}
}
if n >= WORD_COPY_THRESHOLD {
        // Align dest:
        // because n >= 2 * WORD_SIZE, dest_misalignment < n.
        // `wrapping_neg() & WORD_MASK` is the number of bytes needed to reach the next word
        // boundary (0 if `dest` is already word-aligned).
let dest_misalignment = (dest as usize).wrapping_neg() & WORD_MASK;
copy_forward_bytes(dest, src, dest_misalignment);
dest = dest.add(dest_misalignment);
src = src.add(dest_misalignment);
n -= dest_misalignment;
let n_words = n & !WORD_MASK;
let src_misalignment = src as usize & WORD_MASK;
if likely(src_misalignment == 0) {
copy_forward_aligned_words(dest, src, n_words);
} else {
copy_forward_misaligned_words(dest, src, n_words);
}
dest = dest.add(n_words);
src = src.add(n_words);
n -= n_words;
}
copy_forward_bytes(dest, src, n);
}
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, mut n: usize) {
    // The following backward copy helpers take pointers one past the end of their
    // buffers as inputs instead of pointers to the start!
#[inline(always)]
unsafe fn copy_backward_bytes(mut dest: *mut u8, mut src: *const u8, n: usize) {
let dest_start = dest.sub(n);
while dest_start < dest {
dest = dest.sub(1);
src = src.sub(1);
*dest = *src;
}
}
#[inline(always)]
unsafe fn copy_backward_aligned_words(dest: *mut u8, src: *const u8, n: usize) {
let mut dest_usize = dest as *mut usize;
let mut src_usize = src as *mut usize;
let dest_start = dest.sub(n) as *mut usize;
while dest_start < dest_usize {
dest_usize = dest_usize.sub(1);
src_usize = src_usize.sub(1);
*dest_usize = *src_usize;
}
}
#[cfg(not(feature = "mem-unaligned"))]
#[inline(always)]
unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
let mut dest_usize = dest as *mut usize;
let dest_start = dest.sub(n) as *mut usize;
        // Calculate the misalignment offset and the shift needed to reassemble the value.
        let offset = src as usize & WORD_MASK;
        let shift = offset * 8;
        // Realign src
        let mut src_aligned = (src as usize & !WORD_MASK) as *mut usize;
        // This will read (but won't use) bytes out of bounds.
        // The cfg is needed because on some targets atomic loads cannot be lowered
        // (e.g. BPF, MSP430) or are only provided by an external library (e.g. RV32I).
#[cfg(target_has_atomic_load_store = "ptr")]
let mut prev_word = core::intrinsics::atomic_load_unordered(src_aligned);
#[cfg(not(target_has_atomic_load_store = "ptr"))]
let mut prev_word = core::ptr::read_volatile(src_aligned);
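        // Mirror image of the forward case: walking towards lower addresses, each output
        // word combines the current aligned word with the previously visited (higher) one,
        // so the shift directions are swapped relative to `copy_forward_misaligned_words`.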
while dest_start < dest_usize {
src_aligned = src_aligned.sub(1);
let cur_word = *src_aligned;
#[cfg(target_endian = "little")]
let resembled = prev_word << (WORD_SIZE * 8 - shift) | cur_word >> shift;
#[cfg(target_endian = "big")]
let resembled = prev_word >> (WORD_SIZE * 8 - shift) | cur_word << shift;
prev_word = cur_word;
dest_usize = dest_usize.sub(1);
*dest_usize = resembled;
}
}
#[cfg(feature = "mem-unaligned")]
#[inline(always)]
unsafe fn copy_backward_misaligned_words(dest: *mut u8, src: *const u8, n: usize) {
let mut dest_usize = dest as *mut usize;
let mut src_usize = src as *mut usize;
let dest_start = dest.sub(n) as *mut usize;
while dest_start < dest_usize {
dest_usize = dest_usize.sub(1);
src_usize = src_usize.sub(1);
*dest_usize = read_usize_unaligned(src_usize);
}
}
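    // Advance both pointers one past the end of their buffers; the helpers above walk
    // backwards from these past-the-end pointers.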
let mut dest = dest.add(n);
let mut src = src.add(n);
if n >= WORD_COPY_THRESHOLD {
        // Align dest:
        // because n >= 2 * WORD_SIZE, dest_misalignment < n.
let dest_misalignment = dest as usize & WORD_MASK;
copy_backward_bytes(dest, src, dest_misalignment);
dest = dest.sub(dest_misalignment);
src = src.sub(dest_misalignment);
n -= dest_misalignment;
let n_words = n & !WORD_MASK;
let src_misalignment = src as usize & WORD_MASK;
if likely(src_misalignment == 0) {
copy_backward_aligned_words(dest, src, n_words);
} else {
copy_backward_misaligned_words(dest, src, n_words);
}
dest = dest.sub(n_words);
src = src.sub(n_words);
n -= n_words;
}
copy_backward_bytes(dest, src, n);
}
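// `set_bytes` mirrors the structure of `copy_forward`: set single bytes until `s` is
// word-aligned, store a broadcast word for as many whole words as fit, then finish the
// remaining tail byte by byte.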
#[inline(always)]
pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
#[inline(always)]
pub unsafe fn set_bytes_bytes(mut s: *mut u8, c: u8, n: usize) {
let end = s.add(n);
while s < end {
*s = c;
s = s.add(1);
}
}
#[inline(always)]
pub unsafe fn set_bytes_words(s: *mut u8, c: u8, n: usize) {
let mut broadcast = c as usize;
let mut bits = 8;
while bits < WORD_SIZE * 8 {
broadcast |= broadcast << bits;
bits *= 2;
}
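        // e.g. with c == 0xAB on a 64-bit target, broadcast == 0xABAB_ABAB_ABAB_ABAB.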
let mut s_usize = s as *mut usize;
let end = s.add(n) as *mut usize;
while s_usize < end {
*s_usize = broadcast;
s_usize = s_usize.add(1);
}
}
if likely(n >= WORD_COPY_THRESHOLD) {
        // Align s:
        // because n >= 2 * WORD_SIZE, misalignment < n.
let misalignment = (s as usize).wrapping_neg() & WORD_MASK;
set_bytes_bytes(s, c, misalignment);
s = s.add(misalignment);
n -= misalignment;
let n_words = n & !WORD_MASK;
set_bytes_words(s, c, n_words);
s = s.add(n_words);
n -= n_words;
}
set_bytes_bytes(s, c, n);
}
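// A minimal dispatch sketch, added for illustration only (the crate's real `memmove`-style
// entry point lives elsewhere and may differ): given two possibly overlapping buffers, pick
// the copy direction so that source bytes are never overwritten before they are read.
#[allow(dead_code)]
#[inline(always)]
unsafe fn copy_dispatch_example(dest: *mut u8, src: *const u8, n: usize) {
    // If `dest` starts at least `n` bytes past `src` (or anywhere before it, in which case
    // the wrapping subtraction yields a huge value), a forward copy never clobbers source
    // bytes that still need to be read.
    let delta = (dest as usize).wrapping_sub(src as usize);
    if delta >= n {
        copy_forward(dest, src, n);
    } else {
        // `dest` overlaps the not-yet-read tail of `src`: copy from the end backwards.
        copy_backward(dest, src, n);
    }
}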