path: root/deps/jemalloc/include/jemalloc/internal/bit_util.h
#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H

#include "jemalloc/internal/assert.h"

/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
#  error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif

/*
 * Unlike the builtin and POSIX ffs functions, our ffs requires a non-zero
 * input and returns the position of the lowest set bit (as opposed to the
 * POSIX versions, which return 1 larger than that position and use a return
 * value of zero as a sentinel).  This tends to simplify logic in callers, and
 * allows for consistency with the builtins we build fls on top of.
 */
static inline unsigned
ffs_llu(unsigned long long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

static inline unsigned
ffs_lu(unsigned long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

static inline unsigned
ffs_u(unsigned x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFS(x) - 1;
}
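
/*
 * Semantics sketch (not part of jemalloc; the helper name below is made up):
 * for 0x28 (binary 101000) the lowest set bit is bit 3, so ffs_u returns 3,
 * whereas the POSIX ffs() would return 4 and reserve 0 for a zero argument,
 * which these wrappers disallow outright.
 */
static inline void
ffs_example(void) {
	assert(ffs_u(0x28U) == 3);
	assert(ffs_lu(1UL) == 0);
	assert(ffs_llu(1ULL << 40) == 40);
}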

#define DO_FLS_SLOW(x, suffix) do {					\
	util_assume(x != 0);						\
	x |= (x >> 1);							\
	x |= (x >> 2);							\
	x |= (x >> 4);							\
	x |= (x >> 8);							\
	x |= (x >> 16);							\
	if (sizeof(x) > 4) {						\
		/*							\
		 * If sizeof(x) is 4, then the expression "x >> 32"	\
		 * will generate compiler warnings even if the code	\
		 * never executes.  This circumvents the warning, and	\
		 * gets compiled out in optimized builds.		\
		 */							\
		int constant_32 = sizeof(x) * 4;			\
		x |= (x >> constant_32);				\
	}								\
	x++;								\
	if (x == 0) {							\
		return 8 * sizeof(x) - 1;				\
	}								\
	return ffs_##suffix(x) - 1;					\
} while(0)

static inline unsigned
fls_llu_slow(unsigned long long x) {
	DO_FLS_SLOW(x, llu);
}

static inline unsigned
fls_lu_slow(unsigned long x) {
	DO_FLS_SLOW(x, lu);
}

static inline unsigned
fls_u_slow(unsigned x) {
	DO_FLS_SLOW(x, u);
}

#undef DO_FLS_SLOW
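
/*
 * Worked example of the slow path above (illustrative; the self-check helper
 * is hypothetical): for x = 0x50 (binary 01010000) the shifts smear the top
 * set bit downward to give 0x7f, the increment produces 0x80, and
 * ffs(0x80) - 1 yields 6, the index of the highest set bit.  When the top
 * bit of the type is already set, the increment wraps to 0 and the early
 * return reports the last valid bit index instead.
 */
static inline void
fls_slow_example(void) {
	assert(fls_u_slow(0x50U) == 6);
	assert(fls_u_slow(0x80000000U) == 31);
}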

#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
static inline unsigned
fls_llu(unsigned long long x) {
	util_assume(x != 0);
	/*
	 * Note that the xor here is more naturally written as subtraction;
	 * the index of the last set bit is the number of bits in the type,
	 * minus one, minus the number of leading zero bits.  But GCC
	 * implements that as:
	 *    bsr     edi, edi
	 *    mov     eax, 31
	 *    xor     edi, 31
	 *    sub     eax, edi
	 * If we write it as xor instead, then we get
	 *    bsr     eax, edi
	 * as desired.
	 */
	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
}

static inline unsigned
fls_u(unsigned x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
}
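
/*
 * Worked instance of the identity above (a sketch; the helper is not part of
 * jemalloc): 8 * sizeof(x) - 1 is all ones in its low bits, so xor-ing the
 * leading-zero count into it is the same as subtracting it; e.g. for 64-bit
 * values, 63 ^ 5 == 0b111111 ^ 0b000101 == 0b111010 == 58 == 63 - 5.
 */
static inline void
fls_xor_identity_example(void) {
	unsigned clz = __builtin_clzll(1ULL << 58);	/* clz == 5 */
	assert((63 - clz) == (63 ^ clz));
	assert(fls_llu(1ULL << 58) == 58);
}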
#elif defined(_MSC_VER)

#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
/*
 * This never actually runs; we're just dodging a compiler error for the
 * never-taken branch where sizeof(void *) == 8.
 */
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif

#define DO_FLS(x) do {							\
	if (x == 0) {							\
		return 8 * sizeof(x);					\
	}								\
	unsigned long bit;						\
	if (sizeof(x) == 4) {						\
		_BitScanReverse(&bit, (unsigned)x);			\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
		DO_BSR64(bit, x);					\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
		/* Dodge a compiler warning, as above. */		\
		int constant_32 = sizeof(x) * 4;			\
		if (_BitScanReverse(&bit,				\
		    (unsigned)(x >> constant_32))) {			\
			return 32 + (unsigned)bit;			\
		} else {						\
			_BitScanReverse(&bit, (unsigned)x);		\
			return (unsigned)bit;				\
		}							\
	}								\
	unreachable();							\
} while (0)

static inline unsigned
fls_llu(unsigned long long x) {
	DO_FLS(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	DO_FLS(x);
}

static inline unsigned
fls_u(unsigned x) {
	DO_FLS(x);
}

#undef DO_FLS
#undef DO_BSR64
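
/*
 * Illustrative check (the helper is hypothetical): on 32-bit targets, 64-bit
 * values are scanned one 32-bit half at a time, and the result must still
 * match the portable slow path.
 */
static inline void
fls_msvc_example(void) {
	assert(fls_llu(1ULL << 32) == 32);	/* found in the high word */
	assert(fls_llu(1ULL << 31) == 31);	/* falls through to the low word */
	assert(fls_llu(0x5ULL) == fls_llu_slow(0x5ULL));
}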
#else

static inline unsigned
fls_llu(unsigned long long x) {
	return fls_llu_slow(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	return fls_lu_slow(x);
}

static inline unsigned
fls_u(unsigned x) {
	return fls_u_slow(x);
}
#endif
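
/*
 * Cross-check sketch (hypothetical helper): whichever branch above was
 * selected, the fast fls implementations must agree with the portable
 * *_slow fallbacks.
 */
static inline void
fls_example(void) {
	assert(fls_u(1U) == 0);
	assert(fls_lu(0x50UL) == fls_lu_slow(0x50UL));
	assert(fls_llu(1ULL << 40) == fls_llu_slow(1ULL << 40));
}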

#if LG_SIZEOF_LONG_LONG > 3
#  error "Haven't implemented popcount for 16-byte ints."
#endif

#define DO_POPCOUNT(x, type) do {					\
	/*								\
	 * Algorithm from an old AMD optimization reference manual.	\
	 * We're putting a little bit more work than you might expect	\
	 * into the no-intrinsic case, since we only support the	\
	 * GCC intrinsics spelling of popcount (for now).  Detecting	\
	 * whether or not the popcount builtin is actually usable in	\
	 * MSVC is nontrivial.						\
	 */								\
									\
	type bmul = (type)0x0101010101010101ULL;			\
									\
	/*								\
	 * Replace each 2 bits with the sideways sum of the original	\
	 * values.  0x5 = 0b0101.					\
	 *								\
	 * You might expect this to be:					\
	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...).			\
	 * That costs an extra mask relative to this, though.		\
	 */								\
	x = x - ((x >> 1) & (0x55U * bmul));				\
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */\
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));		\
	/*								\
	 * Replace each 8 bits with their sideways sum.  Note that we	\
	 * can't overflow within each 4-bit sum here, so we can skip	\
	 * the initial mask.						\
	 */								\
	x = (x + (x >> 4)) & (bmul * 0x0FU);				\
	/*								\
	 * None of the partial sums in this multiplication (viewed in	\
	 * base-256) can overflow into the next digit.  So the least	\
	 * significant byte of the product will be the least		\
	 * significant byte of the original value, the second least	\
	 * significant byte will be the sum of the two least		\
	 * significant bytes of the original value, and so on.		\
	 * Importantly, the high byte will be the byte-wise sum of all	\
	 * the bytes of the original value.				\
	 */								\
	x = x * bmul;							\
	x >>= ((sizeof(x) - 1) * 8);					\
	return (unsigned)x;						\
} while(0)

static inline unsigned
popcount_u_slow(unsigned bitmap) {
	DO_POPCOUNT(bitmap, unsigned);
}

static inline unsigned
popcount_lu_slow(unsigned long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long);
}

static inline unsigned
popcount_llu_slow(unsigned long long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long long);
}

#undef DO_POPCOUNT
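
/*
 * Worked example of the SWAR reduction above (the check helper is made up):
 * in 0xF0F0F0F0 each byte contributes four set bits, the pair/nibble/byte
 * steps collect those into a count of 4 per byte, and the final
 * multiply-and-shift adds the per-byte counts into 16 in the top byte.
 */
static inline void
popcount_slow_example(void) {
	assert(popcount_u_slow(0U) == 0);
	assert(popcount_u_slow(0xF0F0F0F0U) == 16);
	assert(popcount_llu_slow(~0ULL) == 64);
}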

static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}

static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}

static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}

/*
 * Clears the lowest set bit in *bitmap and returns that bit's position.
 * *bitmap must not be 0.
 */

static inline size_t
cfs_lu(unsigned long* bitmap) {
	util_assume(*bitmap != 0);
	size_t bit = ffs_lu(*bitmap);
	*bitmap ^= ZU(1) << bit;
	return bit;
}
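
/*
 * Usage sketch (hypothetical helper, not part of jemalloc): cfs_lu makes it
 * easy to visit every set bit of a mask from lowest to highest, clearing
 * each one as it is reported.
 */
static inline unsigned
cfs_lu_example(unsigned long mask) {
	unsigned nbits = 0;
	while (mask != 0) {
		size_t bit = cfs_lu(&mask);	/* bit is now clear in mask */
		(void)bit;			/* a real caller would act on it */
		nbits++;
	}
	return nbits;				/* equals popcount_lu(original mask) */
}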

static inline unsigned
ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}

static inline unsigned
fls_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return fls_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return fls_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return fls_llu(x);
#else
#error No implementation for size_t fls()
#endif
}


static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}

static inline unsigned
fls_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return fls_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return fls_llu(x);
#else
#error No implementation for 64-bit fls()
#endif
}

static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
}

static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
}

static inline uint64_t
pow2_ceil_u64(uint64_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u64(x - 1);
	/*
	 * Range-check; it's on the callers to ensure that the result of this
	 * call won't overflow.
	 */
	assert(msb_on_index < 63);
	return 1ULL << (msb_on_index + 1);
}

static inline uint32_t
pow2_ceil_u32(uint32_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u32(x - 1);
	/* As above. */
	assert(msb_on_index < 31);
	return 1U << (msb_on_index + 1);
}

/* Compute the smallest power of 2 that is >= x. */
static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
	return pow2_ceil_u64(x);
#else
	return pow2_ceil_u32(x);
#endif
}
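
/*
 * Illustrative values (hypothetical self-check): pow2_ceil_* round up to the
 * next power of two and leave exact powers of two (and 0 or 1) unchanged.
 */
static inline void
pow2_ceil_example(void) {
	assert(pow2_ceil_u32(5) == 8);
	assert(pow2_ceil_u32(8) == 8);
	assert(pow2_ceil_u64(1) == 1);
	assert(pow2_ceil_zu((size_t)4097) == 8192);
}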

static inline unsigned
lg_floor(size_t x) {
	util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
	return fls_u64(x);
#else
	return fls_u32(x);
#endif
}

static inline unsigned
lg_ceil(size_t x) {
	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}
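
/*
 * Illustrative values (hypothetical self-check): lg_floor truncates, while
 * lg_ceil rounds up unless x is already a power of two.
 */
static inline void
lg_example(void) {
	assert(lg_floor(12) == 3);
	assert(lg_ceil(12) == 4);
	assert(lg_floor(16) == 4);
	assert(lg_ceil(16) == 4);
}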

/* Compile-time versions of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
#  define LG_FLOOR(x) LG_FLOOR_32((x))
#else
#  define LG_FLOOR(x) LG_FLOOR_64((x))
#endif

#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
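
/*
 * Compile-time usage sketch (the identifiers below are hypothetical):
 * LG_FLOOR and LG_CEIL expand to integer constant expressions, so they can
 * size arrays and appear in other contexts that require constants known at
 * compile time.
 */
static inline void
lg_compile_time_example(void) {
	/* One slot per power of two from 2^0 up through 2^12. */
	unsigned char example_buckets[LG_CEIL(4096) + 1];
	(void)example_buckets;
	assert(LG_FLOOR(4096) == 12);
	assert(LG_CEIL(4097) == 13);
}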

#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */