diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/ck_backoff.h | 2 | ||||
-rw-r--r-- | include/ck_cc.h | 57 | ||||
-rw-r--r-- | include/ck_ec.h | 945 | ||||
-rw-r--r-- | include/ck_epoch.h | 96 | ||||
-rw-r--r-- | include/ck_fifo.h | 2 | ||||
-rw-r--r-- | include/ck_hs.h | 12 | ||||
-rw-r--r-- | include/ck_md.h.in | 10 | ||||
-rw-r--r-- | include/ck_pr.h | 66 | ||||
-rw-r--r-- | include/ck_queue.h | 230 | ||||
-rw-r--r-- | include/ck_ring.h | 723 | ||||
-rw-r--r-- | include/freebsd/ck_md.h.in | 133 | ||||
-rw-r--r-- | include/gcc/aarch64/ck_pr.h | 12 | ||||
-rw-r--r-- | include/gcc/aarch64/ck_pr_llsc.h | 106 | ||||
-rw-r--r-- | include/gcc/aarch64/ck_pr_lse.h | 37 | ||||
-rw-r--r-- | include/gcc/ck_cc.h | 28 | ||||
-rw-r--r-- | include/gcc/ck_pr.h | 4 | ||||
-rw-r--r-- | include/gcc/ppc/ck_pr.h | 32 | ||||
-rw-r--r-- | include/gcc/s390x/ck_f_pr.h | 97 | ||||
-rw-r--r-- | include/gcc/s390x/ck_pr.h | 373 | ||||
-rw-r--r-- | include/gcc/sparcv9/ck_pr.h | 32 | ||||
-rw-r--r-- | include/gcc/x86/ck_pr.h | 157 | ||||
-rw-r--r-- | include/gcc/x86_64/ck_pr.h | 132 | ||||
-rw-r--r-- | include/spinlock/dec.h | 3 | ||||
-rw-r--r-- | include/spinlock/fas.h | 9 | ||||
-rw-r--r-- | include/spinlock/hclh.h | 12 |
25 files changed, 2746 insertions, 564 deletions
diff --git a/include/ck_backoff.h b/include/ck_backoff.h index 82a4f21..a1f7616 100644 --- a/include/ck_backoff.h +++ b/include/ck_backoff.h @@ -50,7 +50,7 @@ ck_backoff_eb(unsigned int *c) for (i = 0; i < ceiling; i++) ck_pr_barrier(); - *c = ceiling <<= ceiling < CK_BACKOFF_CEILING; + *c = ceiling << (ceiling < CK_BACKOFF_CEILING); return; } diff --git a/include/ck_cc.h b/include/ck_cc.h index e17dc7b..1b4ff46 100644 --- a/include/ck_cc.h +++ b/include/ck_cc.h @@ -50,6 +50,7 @@ * Container function. * This relies on (compiler) implementation-defined behavior. */ +#ifndef CK_CC_CONTAINER #define CK_CC_CONTAINER(F, T, M, N) \ CK_CC_INLINE static T * \ N(F *p) \ @@ -57,6 +58,7 @@ F *n = p; \ return (T *)(void *)(((char *)n) - ((size_t)&((T *)0)->M)); \ } +#endif #define CK_CC_PAD(x) union { char pad[x]; } @@ -104,41 +106,35 @@ #define CK_CC_TYPEOF(X, DEFAULT) (DEFAULT) #endif -#ifndef CK_F_CC_FFS -#define CK_F_CC_FFS -CK_CC_INLINE static int -ck_cc_ffs(unsigned int x) -{ - unsigned int i; - - if (x == 0) - return 0; - - for (i = 1; (x & 1) == 0; i++, x >>= 1); - - return i; +#define CK_F_CC_FFS_G(L, T) \ +CK_CC_INLINE static int \ +ck_cc_##L(T v) \ +{ \ + unsigned int i; \ + \ + if (v == 0) \ + return 0; \ + \ + for (i = 1; (v & 1) == 0; i++, v >>= 1); \ + return i; \ } -#endif - -#ifndef CK_F_CC_CLZ -#define CK_F_CC_CLZ -#include <ck_limits.h> -CK_CC_INLINE static int -ck_cc_clz(unsigned int x) -{ - unsigned int count, i; +#ifndef CK_F_CC_FFS +#define CK_F_CC_FFS +CK_F_CC_FFS_G(ffs, unsigned int) +#endif /* CK_F_CC_FFS */ - for (count = 0, i = sizeof(unsigned int) * CHAR_BIT; i > 0; count++) { - unsigned int bit = 1U << --i; +#ifndef CK_F_CC_FFSL +#define CK_F_CC_FFSL +CK_F_CC_FFS_G(ffsl, unsigned long) +#endif /* CK_F_CC_FFSL */ - if (x & bit) - break; - } +#ifndef CK_F_CC_FFSLL +#define CK_F_CC_FFSLL +CK_F_CC_FFS_G(ffsll, unsigned long long) +#endif /* CK_F_CC_FFSLL */ - return count; -} -#endif +#undef CK_F_CC_FFS_G #ifndef CK_F_CC_CTZ #define CK_F_CC_CTZ @@ -151,7 +147,6 @@ ck_cc_ctz(unsigned int x) return 0; for (i = 0; (x & 1) == 0; i++, x >>= 1); - return i; } #endif diff --git a/include/ck_ec.h b/include/ck_ec.h new file mode 100644 index 0000000..cd2a368 --- /dev/null +++ b/include/ck_ec.h @@ -0,0 +1,945 @@ +/* + * Copyright 2018 Paul Khuong, Google LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Overview + * ======== + * + * ck_ec implements 32- and 64- bit event counts. Event counts let us + * easily integrate OS-level blocking (e.g., futexes) in lock-free + * protocols. Waiters block conditionally, if the event count's value + * is still equal to some old value. + * + * Event counts come in four variants: 32 and 64 bit (with one bit + * stolen for internal signaling, so 31 and 63 bit counters), and + * single or multiple producers (wakers). Waiters are always multiple + * consumers. The 32 bit variants are smaller, and more efficient, + * especially in single producer mode. The 64 bit variants are larger, + * but practically invulnerable to ABA. + * + * The 32 bit variant is always available. The 64 bit variant is only + * available if CK supports 64-bit atomic operations. Currently, + * specialization for single producer is only implemented for x86 and + * x86-64, on compilers that support GCC extended inline assembly; + * other platforms fall back to the multiple producer code path. + * + * A typical usage pattern is: + * + * 1. On the producer side: + * + * - Make changes to some shared data structure, without involving + * the event count at all. + * - After each change, call ck_ec_inc on the event count. The call + * acts as a write-write barrier, and wakes up any consumer blocked + * on the event count (waiting for new changes). + * + * 2. On the consumer side: + * + * - Snapshot ck_ec_value of the event count. The call acts as a + * read barrier. + * - Read and process the shared data structure. + * - Wait for new changes by calling ck_ec_wait with the snapshot value. + * + * Some data structures may opt for tighter integration with their + * event count. For example, an SPMC ring buffer or disruptor might + * use the event count's value as the write pointer. If the buffer is + * regularly full, it might also make sense to store the read pointer + * in an MP event count. + * + * This event count implementation supports tighter integration in two + * ways. + * + * Producers may opt to increment by an arbitrary value (less than + * INT32_MAX / INT64_MAX), in order to encode, e.g., byte + * offsets. Larger increment values make wraparound more likely, so + * the increments should still be relatively small. + * + * Consumers may pass a predicate to ck_ec_wait_pred. This predicate + * can make `ck_ec_wait_pred` return early, before the event count's + * value changes, and can override the deadline passed to futex_wait. + * This lets consumer block on one eventcount, while optimistically + * looking at other waking conditions. + * + * API Reference + * ============= + * + * When compiled as C11 or later, this header defines type-generic + * macros for ck_ec32 and ck_ec64; the reference describes this + * type-generic API. + * + * ck_ec needs additional OS primitives to determine the current time, + * to wait on an address, and to wake all threads waiting on a given + * address. These are defined with fields in a struct ck_ec_ops. Each + * ck_ec_ops may additionally define the number of spin loop + * iterations in the slow path, as well as the initial wait time in + * the internal exponential backoff, the exponential scale factor, and + * the right shift count (< 32). + * + * The ops, in addition to the single/multiple producer flag, are + * encapsulated in a struct ck_ec_mode, passed to most ck_ec + * operations. + * + * ec is a struct ck_ec32 *, or a struct ck_ec64 *. + * + * value is an uint32_t for ck_ec32, and an uint64_t for ck_ec64. It + * never exceeds INT32_MAX and INT64_MAX respectively. + * + * mode is a struct ck_ec_mode *. + * + * deadline is either NULL, or a `const struct timespec *` that will + * be treated as an absolute deadline. + * + * `void ck_ec_init(ec, value)`: initializes the event count to value. + * + * `value ck_ec_value(ec)`: returns the current value of the event + * counter. This read acts as a read (acquire) barrier. + * + * `bool ck_ec_has_waiters(ec)`: returns whether some thread has + * marked the event count as requiring an OS wakeup. + * + * `void ck_ec_inc(ec, mode)`: increments the value of the event + * counter by one. This writes acts as a write barrier. Wakes up + * any waiting thread. + * + * `value ck_ec_add(ec, mode, value)`: increments the event counter by + * `value`, and returns the event counter's previous value. This + * write acts as a write barrier. Wakes up any waiting thread. + * + * `int ck_ec_deadline(struct timespec *new_deadline, + * mode, + * const struct timespec *timeout)`: + * computes a deadline `timeout` away from the current time. If + * timeout is NULL, computes a deadline in the infinite future. The + * resulting deadline is written to `new_deadline`. Returns 0 on + * success, and -1 if ops->gettime failed (without touching errno). + * + * `int ck_ec_wait(ec, mode, value, deadline)`: waits until the event + * counter's value differs from `value`, or, if `deadline` is + * provided and non-NULL, until the current time is after that + * deadline. Use a deadline with tv_sec = 0 for a non-blocking + * execution. Returns 0 if the event counter has changed, and -1 on + * timeout. This function acts as a read (acquire) barrier. + * + * `int ck_ec_wait_pred(ec, mode, value, pred, data, deadline)`: waits + * until the event counter's value differs from `value`, or until + * `pred` returns non-zero, or, if `deadline` is provided and + * non-NULL, until the current time is after that deadline. Use a + * deadline with tv_sec = 0 for a non-blocking execution. Returns 0 if + * the event counter has changed, `pred`'s return value if non-zero, + * and -1 on timeout. This function acts as a read (acquire) barrier. + * + * `pred` is always called as `pred(data, iteration_deadline, now)`, + * where `iteration_deadline` is a timespec of the deadline for this + * exponential backoff iteration, and `now` is the current time. If + * `pred` returns a non-zero value, that value is immediately returned + * to the waiter. Otherwise, `pred` is free to modify + * `iteration_deadline` (moving it further in the future is a bad + * idea). + * + * Implementation notes + * ==================== + * + * The multiple producer implementation is a regular locked event + * count, with a single flag bit to denote the need to wake up waiting + * threads. + * + * The single producer specialization is heavily tied to + * [x86-TSO](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf), and + * to non-atomic read-modify-write instructions (e.g., `inc mem`); + * these non-atomic RMW let us write to the same memory locations with + * atomic and non-atomic instructions, without suffering from process + * scheduling stalls. + * + * The reason we can mix atomic and non-atomic writes to the `counter` + * word is that every non-atomic write obviates the need for the + * atomically flipped flag bit: we only use non-atomic writes to + * update the event count, and the atomic flag only informs the + * producer that we would like a futex_wake, because of the update. + * We only require the non-atomic RMW counter update to prevent + * preemption from introducing arbitrarily long worst case delays. + * + * Correctness does not rely on the usual ordering argument: in the + * absence of fences, there is no strict ordering between atomic and + * non-atomic writes. The key is instead x86-TSO's guarantee that a + * read is satisfied from the most recent buffered write in the local + * store queue if there is one, or from memory if there is no write to + * that address in the store queue. + * + * x86-TSO's constraint on reads suffices to guarantee that the + * producer will never forget about a counter update. If the last + * update is still queued, the new update will be based on the queued + * value. Otherwise, the new update will be based on the value in + * memory, which may or may not have had its flag flipped. In either + * case, the value of the counter (modulo flag) is correct. + * + * When the producer forwards the counter's value from its store + * queue, the new update might not preserve a flag flip. Any waiter + * thus has to check from time to time to determine if it wasn't + * woken up because the flag bit was silently cleared. + * + * In reality, the store queue in x86-TSO stands for in-flight + * instructions in the chip's out-of-order backend. In the vast + * majority of cases, instructions will only remain in flight for a + * few hundred or thousand of cycles. That's why ck_ec_wait spins on + * the `counter` word for ~100 iterations after flipping its flag bit: + * if the counter hasn't changed after that many iterations, it is + * very likely that the producer's next counter update will observe + * the flag flip. + * + * That's still not a hard guarantee of correctness. Conservatively, + * we can expect that no instruction will remain in flight for more + * than 1 second... if only because some interrupt will have forced + * the chip to store its architectural state in memory, at which point + * an instruction is either fully retired or rolled back. Interrupts, + * particularly the pre-emption timer, are why single-producer updates + * must happen in a single non-atomic read-modify-write instruction. + * Having a single instruction as the critical section means we only + * have to consider the worst-case execution time for that + * instruction. That's easier than doing the same for a pair of + * instructions, which an unlucky pre-emption could delay for + * arbitrarily long. + * + * Thus, after a short spin loop, ck_ec_wait enters an exponential + * backoff loop, where each "sleep" is instead a futex_wait. The + * backoff is only necessary to handle rare cases where the flag flip + * was overwritten after the spin loop. Eventually, more than one + * second will have elapsed since the flag flip, and the sleep timeout + * becomes infinite: since the flag bit has been set for much longer + * than the time for which an instruction may remain in flight, the + * flag will definitely be observed at the next counter update. + * + * The 64 bit ck_ec_wait pulls another trick: futexes only handle 32 + * bit ints, so we must treat the 64 bit counter's low 32 bits as an + * int in futex_wait. That's a bit dodgy, but fine in practice, given + * that the OS's futex code will always read whatever value is + * currently in memory: even if the producer thread were to wait on + * its own event count, the syscall and ring transition would empty + * the store queue (the out-of-order execution backend). + * + * Finally, what happens when the producer is migrated to another core + * or otherwise pre-empted? Migration must already incur a barrier, so + * that thread always sees its own writes, so that's safe. As for + * pre-emption, that requires storing the architectural state, which + * means every instruction must either be executed fully or not at + * all when pre-emption happens. + */ + +#ifndef CK_EC_H +#define CK_EC_H +#include <ck_cc.h> +#include <ck_pr.h> +#include <ck_stdbool.h> +#include <ck_stdint.h> +#include <ck_stddef.h> +#include <sys/time.h> + +/* + * If we have ck_pr_faa_64 (and, presumably, ck_pr_load_64), we + * support 63 bit counters. + */ +#ifdef CK_F_PR_FAA_64 +#define CK_F_EC64 +#endif /* CK_F_PR_FAA_64 */ + +/* + * GCC inline assembly lets us exploit non-atomic read-modify-write + * instructions on x86/x86_64 for a fast single-producer mode. + * + * If we CK_F_EC_SP is not defined, CK_EC always uses the slower + * multiple producer code. + */ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define CK_F_EC_SP +#endif /* GNUC && (__i386__ || __x86_64__) */ + +struct ck_ec_ops; + +struct ck_ec_wait_state { + struct timespec start; /* Time when we entered ck_ec_wait. */ + struct timespec now; /* Time now. */ + const struct ck_ec_ops *ops; + void *data; /* Opaque pointer for the predicate's internal state. */ + +}; + +/* + * ck_ec_ops define system-specific functions to get the current time, + * atomically wait on an address if it still has some expected value, + * and to wake all threads waiting on an address. + * + * Each platform is expected to have few (one) opaque pointer to a + * const ops struct, and reuse it for all ck_ec_mode structs. + */ +struct ck_ec_ops { + /* Populates out with the current time. Returns non-zero on failure. */ + int (*gettime)(const struct ck_ec_ops *, struct timespec *out); + + /* + * Waits on address if its value is still `expected`. If + * deadline is non-NULL, stops waiting once that deadline is + * reached. May return early for any reason. + */ + void (*wait32)(const struct ck_ec_wait_state *, const uint32_t *, + uint32_t expected, const struct timespec *deadline); + + /* + * Same as wait32, but for a 64 bit counter. Only used if + * CK_F_EC64 is defined. + * + * If underlying blocking primitive only supports 32 bit + * control words, it should be safe to block on the least + * significant half of the 64 bit address. + */ + void (*wait64)(const struct ck_ec_wait_state *, const uint64_t *, + uint64_t expected, const struct timespec *deadline); + + /* Wakes up all threads waiting on address. */ + void (*wake32)(const struct ck_ec_ops *, const uint32_t *address); + + /* + * Same as wake32, but for a 64 bit counter. Only used if + * CK_F_EC64 is defined. + * + * When wait64 truncates the control word at address to `only` + * consider its least significant half, wake64 should perform + * any necessary fixup (e.g., on big endian platforms). + */ + void (*wake64)(const struct ck_ec_ops *, const uint64_t *address); + + /* + * Number of iterations for the initial busy wait. 0 defaults + * to 100 (not ABI stable). + */ + uint32_t busy_loop_iter; + + /* + * Delay in nanoseconds for the first iteration of the + * exponential backoff. 0 defaults to 2 ms (not ABI stable). + */ + uint32_t initial_wait_ns; + + /* + * Scale factor for the exponential backoff. 0 defaults to 8x + * (not ABI stable). + */ + uint32_t wait_scale_factor; + + /* + * Right shift count for the exponential backoff. The update + * after each iteration is + * wait_ns = (wait_ns * wait_scale_factor) >> wait_shift_count, + * until one second has elapsed. After that, the deadline goes + * to infinity. + */ + uint32_t wait_shift_count; +}; + +/* + * ck_ec_mode wraps the ops table, and informs the fast path whether + * it should attempt to specialize for single producer mode. + * + * mode structs are expected to be exposed by value, e.g., + * + * extern const struct ck_ec_ops system_ec_ops; + * + * static const struct ck_ec_mode ec_sp = { + * .ops = &system_ec_ops, + * .single_producer = true + * }; + * + * static const struct ck_ec_mode ec_mp = { + * .ops = &system_ec_ops, + * .single_producer = false + * }; + * + * ck_ec_mode structs are only passed to inline functions defined in + * this header, and never escape to their slow paths, so they should + * not result in any object file size increase. + */ +struct ck_ec_mode { + const struct ck_ec_ops *ops; + /* + * If single_producer is true, the event count has a unique + * incrementer. The implementation will specialize ck_ec_inc + * and ck_ec_add if possible (if CK_F_EC_SP is defined). + */ + bool single_producer; +}; + +struct ck_ec32 { + /* Flag is "sign" bit, value in bits 0:30. */ + uint32_t counter; +}; + +typedef struct ck_ec32 ck_ec32_t; + +#ifdef CK_F_EC64 +struct ck_ec64 { + /* + * Flag is bottom bit, value in bits 1:63. Eventcount only + * works on x86-64 (i.e., little endian), so the futex int + * lies in the first 4 (bottom) bytes. + */ + uint64_t counter; +}; + +typedef struct ck_ec64 ck_ec64_t; +#endif /* CK_F_EC64 */ + +#define CK_EC_INITIALIZER { .counter = 0 } + +/* + * Initializes the event count to `value`. The value must not + * exceed INT32_MAX. + */ +static void ck_ec32_init(struct ck_ec32 *ec, uint32_t value); + +#ifndef CK_F_EC64 +#define ck_ec_init ck_ec32_init +#else +/* + * Initializes the event count to `value`. The value must not + * exceed INT64_MAX. + */ +static void ck_ec64_init(struct ck_ec64 *ec, uint64_t value); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_init(EC, VALUE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_init, \ + struct ck_ec64 : ck_ec64_init)((EC), (VALUE))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Returns the counter value in the event count. The value is at most + * INT32_MAX. + */ +static uint32_t ck_ec32_value(const struct ck_ec32* ec); + +#ifndef CK_F_EC64 +#define ck_ec_value ck_ec32_value +#else +/* + * Returns the counter value in the event count. The value is at most + * INT64_MAX. + */ +static uint64_t ck_ec64_value(const struct ck_ec64* ec); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_value(EC) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_value, \ + struct ck_ec64 : ck_ec64_value)((EC))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Returns whether there may be slow pathed waiters that need an + * explicit OS wakeup for this event count. + */ +static bool ck_ec32_has_waiters(const struct ck_ec32 *ec); + +#ifndef CK_F_EC64 +#define ck_ec_has_waiters ck_ec32_has_waiters +#else +static bool ck_ec64_has_waiters(const struct ck_ec64 *ec); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_has_waiters(EC) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_has_waiters, \ + struct ck_ec64 : ck_ec64_has_waiters)((EC))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Increments the counter value in the event count by one, and wakes + * up any waiter. + */ +static void ck_ec32_inc(struct ck_ec32 *ec, const struct ck_ec_mode *mode); + +#ifndef CK_F_EC64 +#define ck_ec_inc ck_ec32_inc +#else +static void ck_ec64_inc(struct ck_ec64 *ec, const struct ck_ec_mode *mode); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_inc(EC, MODE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_inc, \ + struct ck_ec64 : ck_ec64_inc)((EC), (MODE))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Increments the counter value in the event count by delta, wakes + * up any waiter, and returns the previous counter value. + */ +static uint32_t ck_ec32_add(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta); + +#ifndef CK_F_EC64 +#define ck_ec_add ck_ec32_add +#else +static uint64_t ck_ec64_add(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_add(EC, MODE, DELTA) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_add, \ + struct ck_ec64 : ck_ec64_add)((EC), (MODE), (DELTA))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Populates `new_deadline` with a deadline `timeout` in the future. + * Returns 0 on success, and -1 if clock_gettime failed, in which + * case errno is left as is. + */ +static int ck_ec_deadline(struct timespec *new_deadline, + const struct ck_ec_mode *mode, + const struct timespec *timeout); + +/* + * Waits until the counter value in the event count differs from + * old_value, or, if deadline is non-NULL, until CLOCK_MONOTONIC is + * past the deadline. + * + * Returns 0 on success, and -1 on timeout. + */ +static int ck_ec32_wait(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + const struct timespec *deadline); + +#ifndef CK_F_EC64 +#define ck_ec_wait ck_ec32_wait +#else +static int ck_ec64_wait(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + const struct timespec *deadline); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_wait(EC, MODE, OLD_VALUE, DEADLINE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_wait, \ + struct ck_ec64 : ck_ec64_wait)((EC), (MODE), \ + (OLD_VALUE), (DEADLINE))) + +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Waits until the counter value in the event count differs from + * old_value, pred returns non-zero, or, if deadline is non-NULL, + * until CLOCK_MONOTONIC is past the deadline. + * + * Returns 0 on success, -1 on timeout, and the return value of pred + * if it returns non-zero. + * + * A NULL pred represents a function that always returns 0. + */ +static int ck_ec32_wait_pred(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + int (*pred)(const struct ck_ec_wait_state *, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + +#ifndef CK_F_EC64 +#define ck_ec_wait_pred ck_ec32_wait_pred +#else +static int ck_ec64_wait_pred(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + int (*pred)(const struct ck_ec_wait_state *, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_wait_pred(EC, MODE, OLD_VALUE, PRED, DATA, DEADLINE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_wait_pred, \ + struct ck_ec64 : ck_ec64_wait_pred) \ + ((EC), (MODE), (OLD_VALUE), (PRED), (DATA), (DEADLINE))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Inline implementation details. 32 bit first, then 64 bit + * conditionally. + */ +CK_CC_FORCE_INLINE void ck_ec32_init(struct ck_ec32 *ec, uint32_t value) +{ + ec->counter = value & ~(1UL << 31); + return; +} + +CK_CC_FORCE_INLINE uint32_t ck_ec32_value(const struct ck_ec32 *ec) +{ + uint32_t ret = ck_pr_load_32(&ec->counter) & ~(1UL << 31); + + ck_pr_fence_acquire(); + return ret; +} + +CK_CC_FORCE_INLINE bool ck_ec32_has_waiters(const struct ck_ec32 *ec) +{ + return ck_pr_load_32(&ec->counter) & (1UL << 31); +} + +/* Slow path for ck_ec{32,64}_{inc,add} */ +void ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops); + +CK_CC_FORCE_INLINE void ck_ec32_inc(struct ck_ec32 *ec, + const struct ck_ec_mode *mode) +{ +#if !defined(CK_F_EC_SP) + /* Nothing to specialize if we don't have EC_SP. */ + ck_ec32_add(ec, mode, 1); + return; +#else + char flagged; + +#if __GNUC__ >= 6 + /* + * We don't want to wake if the sign bit is 0. We do want to + * wake if the sign bit just flipped from 1 to 0. We don't + * care what happens when our increment caused the sign bit to + * flip from 0 to 1 (that's once per 2^31 increment). + * + * This leaves us with four cases: + * + * old sign bit | new sign bit | SF | OF | ZF + * ------------------------------------------- + * 0 | 0 | 0 | 0 | ? + * 0 | 1 | 1 | 0 | ? + * 1 | 1 | 1 | 0 | ? + * 1 | 0 | 0 | 0 | 1 + * + * In the first case, we don't want to hit ck_ec32_wake. In + * the last two cases, we do want to call ck_ec32_wake. In the + * second case, we don't care, so we arbitrarily choose to + * call ck_ec32_wake. + * + * The "le" condition checks if SF != OF, or ZF == 1, which + * meets our requirements. + */ +#define CK_EC32_INC_ASM(PREFIX) \ + __asm__ volatile(PREFIX " incl %0" \ + : "+m"(ec->counter), "=@ccle"(flagged) \ + :: "cc", "memory") +#else +#define CK_EC32_INC_ASM(PREFIX) \ + __asm__ volatile(PREFIX " incl %0; setle %1" \ + : "+m"(ec->counter), "=r"(flagged) \ + :: "cc", "memory") +#endif /* __GNUC__ */ + + if (mode->single_producer == true) { + ck_pr_fence_store(); + CK_EC32_INC_ASM(""); + } else { + ck_pr_fence_store_atomic(); + CK_EC32_INC_ASM("lock"); + } +#undef CK_EC32_INC_ASM + + if (CK_CC_UNLIKELY(flagged)) { + ck_ec32_wake(ec, mode->ops); + } + + return; +#endif /* CK_F_EC_SP */ +} + +CK_CC_FORCE_INLINE uint32_t ck_ec32_add_epilogue(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old) +{ + const uint32_t flag_mask = 1U << 31; + uint32_t ret; + + ret = old & ~flag_mask; + /* These two only differ if the flag bit is set. */ + if (CK_CC_UNLIKELY(old != ret)) { + ck_ec32_wake(ec, mode->ops); + } + + return ret; +} + +static CK_CC_INLINE uint32_t ck_ec32_add_mp(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta) +{ + uint32_t old; + + ck_pr_fence_store_atomic(); + old = ck_pr_faa_32(&ec->counter, delta); + return ck_ec32_add_epilogue(ec, mode, old); +} + +#ifdef CK_F_EC_SP +static CK_CC_INLINE uint32_t ck_ec32_add_sp(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta) +{ + uint32_t old; + + /* + * Correctness of this racy write depends on actually + * having an update to write. Exit here if the update + * is a no-op. + */ + if (CK_CC_UNLIKELY(delta == 0)) { + return ck_ec32_value(ec); + } + + ck_pr_fence_store(); + old = delta; + __asm__ volatile("xaddl %1, %0" + : "+m"(ec->counter), "+r"(old) + :: "cc", "memory"); + return ck_ec32_add_epilogue(ec, mode, old); +} +#endif /* CK_F_EC_SP */ + +CK_CC_FORCE_INLINE uint32_t ck_ec32_add(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta) +{ +#ifdef CK_F_EC_SP + if (mode->single_producer == true) { + return ck_ec32_add_sp(ec, mode, delta); + } +#endif + + return ck_ec32_add_mp(ec, mode, delta); +} + +int ck_ec_deadline_impl(struct timespec *new_deadline, + const struct ck_ec_ops *ops, + const struct timespec *timeout); + +CK_CC_FORCE_INLINE int ck_ec_deadline(struct timespec *new_deadline, + const struct ck_ec_mode *mode, + const struct timespec *timeout) +{ + return ck_ec_deadline_impl(new_deadline, mode->ops, timeout); +} + + +int ck_ec32_wait_slow(struct ck_ec32 *ec, + const struct ck_ec_ops *ops, + uint32_t old_value, + const struct timespec *deadline); + +CK_CC_FORCE_INLINE int ck_ec32_wait(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + const struct timespec *deadline) +{ + if (ck_ec32_value(ec) != old_value) { + return 0; + } + + return ck_ec32_wait_slow(ec, mode->ops, old_value, deadline); +} + +int ck_ec32_wait_pred_slow(struct ck_ec32 *ec, + const struct ck_ec_ops *ops, + uint32_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + +CK_CC_FORCE_INLINE int +ck_ec32_wait_pred(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline) +{ + if (ck_ec32_value(ec) != old_value) { + return 0; + } + + return ck_ec32_wait_pred_slow(ec, mode->ops, old_value, + pred, data, deadline); +} + +#ifdef CK_F_EC64 +CK_CC_FORCE_INLINE void ck_ec64_init(struct ck_ec64 *ec, uint64_t value) +{ + ec->counter = value << 1; + return; +} + +CK_CC_FORCE_INLINE uint64_t ck_ec64_value(const struct ck_ec64 *ec) +{ + uint64_t ret = ck_pr_load_64(&ec->counter) >> 1; + + ck_pr_fence_acquire(); + return ret; +} + +CK_CC_FORCE_INLINE bool ck_ec64_has_waiters(const struct ck_ec64 *ec) +{ + return ck_pr_load_64(&ec->counter) & 1; +} + +void ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops); + +CK_CC_FORCE_INLINE void ck_ec64_inc(struct ck_ec64 *ec, + const struct ck_ec_mode *mode) +{ + /* We always xadd, so there's no special optimization here. */ + (void)ck_ec64_add(ec, mode, 1); + return; +} + +CK_CC_FORCE_INLINE uint64_t ck_ec_add64_epilogue(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old) +{ + uint64_t ret = old >> 1; + + if (CK_CC_UNLIKELY(old & 1)) { + ck_ec64_wake(ec, mode->ops); + } + + return ret; +} + +static CK_CC_INLINE uint64_t ck_ec64_add_mp(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta) +{ + uint64_t inc = 2 * delta; /* The low bit is the flag bit. */ + + ck_pr_fence_store_atomic(); + return ck_ec_add64_epilogue(ec, mode, ck_pr_faa_64(&ec->counter, inc)); +} + +#ifdef CK_F_EC_SP +/* Single-producer specialisation. */ +static CK_CC_INLINE uint64_t ck_ec64_add_sp(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta) +{ + uint64_t old; + + /* + * Correctness of this racy write depends on actually + * having an update to write. Exit here if the update + * is a no-op. + */ + if (CK_CC_UNLIKELY(delta == 0)) { + return ck_ec64_value(ec); + } + + ck_pr_fence_store(); + old = 2 * delta; /* The low bit is the flag bit. */ + __asm__ volatile("xaddq %1, %0" + : "+m"(ec->counter), "+r"(old) + :: "cc", "memory"); + return ck_ec_add64_epilogue(ec, mode, old); +} +#endif /* CK_F_EC_SP */ + +/* + * Dispatch on mode->single_producer in this FORCE_INLINE function: + * the end result is always small, but not all compilers have enough + * foresight to inline and get the reduction. + */ +CK_CC_FORCE_INLINE uint64_t ck_ec64_add(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta) +{ +#ifdef CK_F_EC_SP + if (mode->single_producer == true) { + return ck_ec64_add_sp(ec, mode, delta); + } +#endif + + return ck_ec64_add_mp(ec, mode, delta); +} + +int ck_ec64_wait_slow(struct ck_ec64 *ec, + const struct ck_ec_ops *ops, + uint64_t old_value, + const struct timespec *deadline); + +CK_CC_FORCE_INLINE int ck_ec64_wait(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + const struct timespec *deadline) +{ + if (ck_ec64_value(ec) != old_value) { + return 0; + } + + return ck_ec64_wait_slow(ec, mode->ops, old_value, deadline); +} + +int ck_ec64_wait_pred_slow(struct ck_ec64 *ec, + const struct ck_ec_ops *ops, + uint64_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + + +CK_CC_FORCE_INLINE int +ck_ec64_wait_pred(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline) +{ + if (ck_ec64_value(ec) != old_value) { + return 0; + } + + return ck_ec64_wait_pred_slow(ec, mode->ops, old_value, + pred, data, deadline); +} +#endif /* CK_F_EC64 */ +#endif /* !CK_EC_H */ diff --git a/include/ck_epoch.h b/include/ck_epoch.h index e7ce5bc..58f3d28 100644 --- a/include/ck_epoch.h +++ b/include/ck_epoch.h @@ -83,6 +83,7 @@ struct ck_epoch_ref { }; struct ck_epoch_record { + ck_stack_entry_t record_next; struct ck_epoch *global; unsigned int state; unsigned int epoch; @@ -92,17 +93,16 @@ struct ck_epoch_record { } local CK_CC_CACHELINE; unsigned int n_pending; unsigned int n_peak; - unsigned long n_dispatch; + unsigned int n_dispatch; + void *ct; ck_stack_t pending[CK_EPOCH_LENGTH]; - ck_stack_entry_t record_next; } CK_CC_CACHELINE; typedef struct ck_epoch_record ck_epoch_record_t; struct ck_epoch { unsigned int epoch; - char pad[CK_MD_CACHELINE - sizeof(unsigned int)]; - ck_stack_t records; unsigned int n_free; + ck_stack_t records; }; typedef struct ck_epoch ck_epoch_t; @@ -110,7 +110,14 @@ typedef struct ck_epoch ck_epoch_t; * Internal functions. */ void _ck_epoch_addref(ck_epoch_record_t *, ck_epoch_section_t *); -void _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *); +bool _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *); + +CK_CC_FORCE_INLINE static void * +ck_epoch_record_ct(const ck_epoch_record_t *record) +{ + + return ck_pr_load_ptr(&record->ct); +} /* * Marks the beginning of an epoch-protected section. @@ -160,9 +167,10 @@ ck_epoch_begin(ck_epoch_record_t *record, ck_epoch_section_t *section) } /* - * Marks the end of an epoch-protected section. + * Marks the end of an epoch-protected section. Returns true if no more + * sections exist for the caller. */ -CK_CC_FORCE_INLINE static void +CK_CC_FORCE_INLINE static bool ck_epoch_end(ck_epoch_record_t *record, ck_epoch_section_t *section) { @@ -170,15 +178,19 @@ ck_epoch_end(ck_epoch_record_t *record, ck_epoch_section_t *section) ck_pr_store_uint(&record->active, record->active - 1); if (section != NULL) - _ck_epoch_delref(record, section); + return _ck_epoch_delref(record, section); - return; + return record->active == 0; } /* * Defers the execution of the function pointed to by the "cb" * argument until an epoch counter loop. This allows for a * non-blocking deferral. + * + * We can get away without a fence here due to the monotonic nature + * of the epoch counter. Worst case, this will result in some delays + * before object destruction. */ CK_CC_FORCE_INLINE static void ck_epoch_call(ck_epoch_record_t *record, @@ -195,13 +207,75 @@ ck_epoch_call(ck_epoch_record_t *record, return; } +/* + * Same as ck_epoch_call, but allows for records to be shared and is reentrant. + */ +CK_CC_FORCE_INLINE static void +ck_epoch_call_strict(ck_epoch_record_t *record, + ck_epoch_entry_t *entry, + ck_epoch_cb_t *function) +{ + struct ck_epoch *epoch = record->global; + unsigned int e = ck_pr_load_uint(&epoch->epoch); + unsigned int offset = e & (CK_EPOCH_LENGTH - 1); + + ck_pr_inc_uint(&record->n_pending); + entry->function = function; + + /* Store fence is implied by push operation. */ + ck_stack_push_upmc(&record->pending[offset], &entry->stack_entry); + return; +} + +/* + * This callback is used for synchronize_wait to allow for custom blocking + * behavior. + */ +typedef void ck_epoch_wait_cb_t(ck_epoch_t *, ck_epoch_record_t *, + void *); + +/* + * Return latest epoch value. This operation provides load ordering. + */ +CK_CC_FORCE_INLINE static unsigned int +ck_epoch_value(const ck_epoch_t *ep) +{ + + ck_pr_fence_load(); + return ck_pr_load_uint(&ep->epoch); +} + void ck_epoch_init(ck_epoch_t *); -ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *); -void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *); + +/* + * Attempts to recycle an unused epoch record. If one is successfully + * allocated, the record context pointer is also updated. + */ +ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *, void *); + +/* + * Registers an epoch record. An optional context pointer may be passed that + * is retrievable with ck_epoch_record_ct. + */ +void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *, void *); + +/* + * Marks a record as available for re-use by a subsequent recycle operation. + * Note that the record cannot be physically destroyed. + */ void ck_epoch_unregister(ck_epoch_record_t *); + bool ck_epoch_poll(ck_epoch_record_t *); +bool ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred); void ck_epoch_synchronize(ck_epoch_record_t *); +void ck_epoch_synchronize_wait(ck_epoch_t *, ck_epoch_wait_cb_t *, void *); void ck_epoch_barrier(ck_epoch_record_t *); +void ck_epoch_barrier_wait(ck_epoch_record_t *, ck_epoch_wait_cb_t *, void *); + +/* + * Reclaim entries associated with a record. This is safe to call only on + * the caller's record or records that are using call_strict. + */ void ck_epoch_reclaim(ck_epoch_record_t *); #endif /* CK_EPOCH_H */ diff --git a/include/ck_fifo.h b/include/ck_fifo.h index 6d50070..c9a6f3d 100644 --- a/include/ck_fifo.h +++ b/include/ck_fifo.h @@ -115,7 +115,7 @@ CK_CC_INLINE static void ck_fifo_spsc_deinit(struct ck_fifo_spsc *fifo, struct ck_fifo_spsc_entry **garbage) { - *garbage = fifo->head; + *garbage = fifo->garbage; fifo->head = fifo->tail = NULL; return; } diff --git a/include/ck_hs.h b/include/ck_hs.h index b3eb046..cd3e5da 100644 --- a/include/ck_hs.h +++ b/include/ck_hs.h @@ -100,18 +100,28 @@ struct ck_hs_stat { struct ck_hs_iterator { void **cursor; unsigned long offset; + struct ck_hs_map *map; }; typedef struct ck_hs_iterator ck_hs_iterator_t; -#define CK_HS_ITERATOR_INITIALIZER { NULL, 0 } +#define CK_HS_ITERATOR_INITIALIZER { NULL, 0, NULL } /* Convenience wrapper to table hash function. */ #define CK_HS_HASH(T, F, K) F((K), (T)->seed) +/* Computes the hash of n bytes of k for the specified hash map. */ +static inline unsigned long +ck_hs_hash(const struct ck_hs *hs, const void *k) +{ + + return hs->hf(k, hs->seed); +} + typedef void *ck_hs_apply_fn_t(void *, void *); bool ck_hs_apply(ck_hs_t *, unsigned long, const void *, ck_hs_apply_fn_t *, void *); void ck_hs_iterator_init(ck_hs_iterator_t *); bool ck_hs_next(ck_hs_t *, ck_hs_iterator_t *, void **); +bool ck_hs_next_spmc(ck_hs_t *, ck_hs_iterator_t *, void **); bool ck_hs_move(ck_hs_t *, ck_hs_t *, ck_hs_hash_cb_t *, ck_hs_compare_cb_t *, struct ck_malloc *); bool ck_hs_init(ck_hs_t *, unsigned int, ck_hs_hash_cb_t *, diff --git a/include/ck_md.h.in b/include/ck_md.h.in index cb5783e..feae35b 100644 --- a/include/ck_md.h.in +++ b/include/ck_md.h.in @@ -47,7 +47,15 @@ #define @POINTER_PACK_ENABLE@ #endif /* @POINTER_PACK_ENABLE@ */ -#ifndef @VMA_BITS@ +#ifndef @SSE_DISABLE@ +#define @SSE_DISABLE@ +#endif /* @SSE_DISABLE@ */ + +#ifndef @PPC32_LWSYNC_ENABLE@ +#define @PPC32_LWSYNC_ENABLE@ +#endif /* @PPC32_LWSYNC_ENABLE@ */ + +#ifndef @VMA_BITS@ #define @VMA_BITS@ @VMA_BITS_VALUE@ #endif /* @VMA_BITS@ */ diff --git a/include/ck_pr.h b/include/ck_pr.h index 9b7fc42..8ebf855 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -34,7 +34,20 @@ #include <ck_stdint.h> #include <ck_stdbool.h> -#ifndef CK_USE_CC_BUILTINS +/* + * Default to using builtins for clang analyzer, coverity, and sparse: + * inline assembly is often too opaque for useful analysis. Override + * the defaults by defining CK_USE_CC_BUILTINS=0 or 1. + */ +#if !defined(CK_USE_CC_BUILTINS) +#if defined(__clang_analyzer__) || defined(__COVERITY__) || defined(__CHECKER__) +#define CK_USE_CC_BUILTINS 1 +#else +#define CK_USE_CC_BUILTINS 0 +#endif +#endif + +#if !CK_USE_CC_BUILTINS #if defined(__x86_64__) #include "gcc/x86_64/ck_pr.h" #elif defined(__x86__) @@ -43,6 +56,8 @@ #include "gcc/sparcv9/ck_pr.h" #elif defined(__ppc64__) #include "gcc/ppc64/ck_pr.h" +#elif defined(__s390x__) +#include "gcc/s390x/ck_pr.h" #elif defined(__ppc__) #include "gcc/ppc/ck_pr.h" #elif defined(__arm__) @@ -613,8 +628,8 @@ CK_PR_BTX_S(bts, 16, uint16_t, |,) } #define CK_PR_UNARY_Z(K, S, M, T, P, C, Z) \ - CK_CC_INLINE static void \ - ck_pr_##K##_##S##_zero(M *target, bool *zero) \ + CK_CC_INLINE static bool \ + ck_pr_##K##_##S##_is_zero(M *target) \ { \ T previous; \ C punt; \ @@ -625,12 +640,21 @@ CK_PR_BTX_S(bts, 16, uint16_t, |,) (C)(previous P 1), \ &previous) == false) \ ck_pr_stall(); \ - *zero = previous == (T)Z; \ + return previous == (T)Z; \ + } + +#define CK_PR_UNARY_Z_STUB(K, S, M) \ + CK_CC_INLINE static void \ + ck_pr_##K##_##S##_zero(M *target, bool *zero) \ + { \ + *zero = ck_pr_##K##_##S##_is_zero(target); \ return; \ } #define CK_PR_UNARY_S(K, X, S, M) CK_PR_UNARY(K, X, S, M, M) -#define CK_PR_UNARY_Z_S(K, S, M, P, Z) CK_PR_UNARY_Z(K, S, M, M, P, M, Z) +#define CK_PR_UNARY_Z_S(K, S, M, P, Z) \ + CK_PR_UNARY_Z(K, S, M, M, P, M, Z) \ + CK_PR_UNARY_Z_STUB(K, S, M) #if defined(CK_F_PR_LOAD_CHAR) && defined(CK_F_PR_CAS_CHAR_VALUE) @@ -642,6 +666,8 @@ CK_PR_UNARY_S(inc, add, char, char) #ifndef CK_F_PR_INC_CHAR_ZERO #define CK_F_PR_INC_CHAR_ZERO CK_PR_UNARY_Z_S(inc, char, char, +, -1) +#else +CK_PR_UNARY_Z_STUB(inc, char, char) #endif /* CK_F_PR_INC_CHAR_ZERO */ #ifndef CK_F_PR_DEC_CHAR @@ -652,6 +678,8 @@ CK_PR_UNARY_S(dec, sub, char, char) #ifndef CK_F_PR_DEC_CHAR_ZERO #define CK_F_PR_DEC_CHAR_ZERO CK_PR_UNARY_Z_S(dec, char, char, -, 1) +#else +CK_PR_UNARY_Z_STUB(dec, char, char) #endif /* CK_F_PR_DEC_CHAR_ZERO */ #endif /* CK_F_PR_LOAD_CHAR && CK_F_PR_CAS_CHAR_VALUE */ @@ -666,6 +694,8 @@ CK_PR_UNARY_S(inc, add, int, int) #ifndef CK_F_PR_INC_INT_ZERO #define CK_F_PR_INC_INT_ZERO CK_PR_UNARY_Z_S(inc, int, int, +, -1) +#else +CK_PR_UNARY_Z_STUB(inc, int, int) #endif /* CK_F_PR_INC_INT_ZERO */ #ifndef CK_F_PR_DEC_INT @@ -676,6 +706,8 @@ CK_PR_UNARY_S(dec, sub, int, int) #ifndef CK_F_PR_DEC_INT_ZERO #define CK_F_PR_DEC_INT_ZERO CK_PR_UNARY_Z_S(dec, int, int, -, 1) +#else +CK_PR_UNARY_Z_STUB(dec, int, int) #endif /* CK_F_PR_DEC_INT_ZERO */ #endif /* CK_F_PR_LOAD_INT && CK_F_PR_CAS_INT_VALUE */ @@ -705,6 +737,8 @@ CK_PR_UNARY_S(inc, add, uint, unsigned int) #ifndef CK_F_PR_INC_UINT_ZERO #define CK_F_PR_INC_UINT_ZERO CK_PR_UNARY_Z_S(inc, uint, unsigned int, +, UINT_MAX) +#else +CK_PR_UNARY_Z_STUB(inc, uint, unsigned int) #endif /* CK_F_PR_INC_UINT_ZERO */ #ifndef CK_F_PR_DEC_UINT @@ -715,6 +749,8 @@ CK_PR_UNARY_S(dec, sub, uint, unsigned int) #ifndef CK_F_PR_DEC_UINT_ZERO #define CK_F_PR_DEC_UINT_ZERO CK_PR_UNARY_Z_S(dec, uint, unsigned int, -, 1) +#else +CK_PR_UNARY_Z_STUB(dec, uint, unsigned int) #endif /* CK_F_PR_DEC_UINT_ZERO */ #endif /* CK_F_PR_LOAD_UINT && CK_F_PR_CAS_UINT_VALUE */ @@ -729,6 +765,8 @@ CK_PR_UNARY(inc, add, ptr, void, uintptr_t) #ifndef CK_F_PR_INC_PTR_ZERO #define CK_F_PR_INC_PTR_ZERO CK_PR_UNARY_Z(inc, ptr, void, uintptr_t, +, void *, UINT_MAX) +#else +CK_PR_UNARY_Z_STUB(inc, ptr, void) #endif /* CK_F_PR_INC_PTR_ZERO */ #ifndef CK_F_PR_DEC_PTR @@ -739,6 +777,8 @@ CK_PR_UNARY(dec, sub, ptr, void, uintptr_t) #ifndef CK_F_PR_DEC_PTR_ZERO #define CK_F_PR_DEC_PTR_ZERO CK_PR_UNARY_Z(dec, ptr, void, uintptr_t, -, void *, 1) +#else +CK_PR_UNARY_Z_STUB(dec, ptr, void) #endif /* CK_F_PR_DEC_PTR_ZERO */ #endif /* CK_F_PR_LOAD_PTR && CK_F_PR_CAS_PTR_VALUE */ @@ -753,6 +793,8 @@ CK_PR_UNARY_S(inc, add, 64, uint64_t) #ifndef CK_F_PR_INC_64_ZERO #define CK_F_PR_INC_64_ZERO CK_PR_UNARY_Z_S(inc, 64, uint64_t, +, UINT64_MAX) +#else +CK_PR_UNARY_Z_STUB(inc, 64, uint64_t) #endif /* CK_F_PR_INC_64_ZERO */ #ifndef CK_F_PR_DEC_64 @@ -763,6 +805,8 @@ CK_PR_UNARY_S(dec, sub, 64, uint64_t) #ifndef CK_F_PR_DEC_64_ZERO #define CK_F_PR_DEC_64_ZERO CK_PR_UNARY_Z_S(dec, 64, uint64_t, -, 1) +#else +CK_PR_UNARY_Z_STUB(dec, 64, uint64_t) #endif /* CK_F_PR_DEC_64_ZERO */ #endif /* CK_F_PR_LOAD_64 && CK_F_PR_CAS_64_VALUE */ @@ -777,6 +821,8 @@ CK_PR_UNARY_S(inc, add, 32, uint32_t) #ifndef CK_F_PR_INC_32_ZERO #define CK_F_PR_INC_32_ZERO CK_PR_UNARY_Z_S(inc, 32, uint32_t, +, UINT32_MAX) +#else +CK_PR_UNARY_Z_STUB(inc, 32, uint32_t) #endif /* CK_F_PR_INC_32_ZERO */ #ifndef CK_F_PR_DEC_32 @@ -787,6 +833,8 @@ CK_PR_UNARY_S(dec, sub, 32, uint32_t) #ifndef CK_F_PR_DEC_32_ZERO #define CK_F_PR_DEC_32_ZERO CK_PR_UNARY_Z_S(dec, 32, uint32_t, -, 1) +#else +CK_PR_UNARY_Z_STUB(dec, 32, uint32_t) #endif /* CK_F_PR_DEC_32_ZERO */ #endif /* CK_F_PR_LOAD_32 && CK_F_PR_CAS_32_VALUE */ @@ -801,6 +849,8 @@ CK_PR_UNARY_S(inc, add, 16, uint16_t) #ifndef CK_F_PR_INC_16_ZERO #define CK_F_PR_INC_16_ZERO CK_PR_UNARY_Z_S(inc, 16, uint16_t, +, UINT16_MAX) +#else +CK_PR_UNARY_Z_STUB(inc, 16, uint16_t) #endif /* CK_F_PR_INC_16_ZERO */ #ifndef CK_F_PR_DEC_16 @@ -811,6 +861,8 @@ CK_PR_UNARY_S(dec, sub, 16, uint16_t) #ifndef CK_F_PR_DEC_16_ZERO #define CK_F_PR_DEC_16_ZERO CK_PR_UNARY_Z_S(dec, 16, uint16_t, -, 1) +#else +CK_PR_UNARY_Z_STUB(dec, 16, uint16_t) #endif /* CK_F_PR_DEC_16_ZERO */ #endif /* CK_F_PR_LOAD_16 && CK_F_PR_CAS_16_VALUE */ @@ -825,6 +877,8 @@ CK_PR_UNARY_S(inc, add, 8, uint8_t) #ifndef CK_F_PR_INC_8_ZERO #define CK_F_PR_INC_8_ZERO CK_PR_UNARY_Z_S(inc, 8, uint8_t, +, UINT8_MAX) +#else +CK_PR_UNARY_Z_STUB(inc, 8, uint8_t) #endif /* CK_F_PR_INC_8_ZERO */ #ifndef CK_F_PR_DEC_8 @@ -835,6 +889,8 @@ CK_PR_UNARY_S(dec, sub, 8, uint8_t) #ifndef CK_F_PR_DEC_8_ZERO #define CK_F_PR_DEC_8_ZERO CK_PR_UNARY_Z_S(dec, 8, uint8_t, -, 1) +#else +CK_PR_UNARY_Z_STUB(dec, 8, uint8_t) #endif /* CK_F_PR_DEC_8_ZERO */ #endif /* CK_F_PR_LOAD_8 && CK_F_PR_CAS_8_VALUE */ diff --git a/include/ck_queue.h b/include/ck_queue.h index c1e9872..fd38d8a 100644 --- a/include/ck_queue.h +++ b/include/ck_queue.h @@ -125,7 +125,7 @@ */ #define CK_SLIST_HEAD(name, type) \ struct name { \ - struct type *slh_first; /* first element */ \ + struct type *cslh_first; /* first element */ \ } #define CK_SLIST_HEAD_INITIALIZER(head) \ @@ -133,85 +133,95 @@ struct name { \ #define CK_SLIST_ENTRY(type) \ struct { \ - struct type *sle_next; /* next element */ \ + struct type *csle_next; /* next element */ \ } /* * Singly-linked List functions. */ #define CK_SLIST_EMPTY(head) \ - (ck_pr_load_ptr(&(head)->slh_first) == NULL) + (ck_pr_load_ptr(&(head)->cslh_first) == NULL) #define CK_SLIST_FIRST(head) \ - (ck_pr_load_ptr(&(head)->slh_first)) + (ck_pr_load_ptr(&(head)->cslh_first)) #define CK_SLIST_NEXT(elm, field) \ - ck_pr_load_ptr(&((elm)->field.sle_next)) + ck_pr_load_ptr(&((elm)->field.csle_next)) #define CK_SLIST_FOREACH(var, head, field) \ for ((var) = CK_SLIST_FIRST((head)); \ - (var) && (ck_pr_fence_load(), 1); \ + (var); \ (var) = CK_SLIST_NEXT((var), field)) -#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = CK_SLIST_FIRST(head); \ - (var) && (ck_pr_fence_load(), (tvar) = CK_SLIST_NEXT(var, field), 1);\ +#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = CK_SLIST_FIRST(head); \ + (var) && ((tvar) = CK_SLIST_NEXT(var, field), 1); \ (var) = (tvar)) #define CK_SLIST_FOREACH_PREVPTR(var, varp, head, field) \ - for ((varp) = &(head)->slh_first; \ - ((var) = ck_pr_load_ptr(varp)) != NULL && (ck_pr_fence_load(), 1); \ - (varp) = &(var)->field.sle_next) + for ((varp) = &(head)->cslh_first; \ + ((var) = ck_pr_load_ptr(varp)) != NULL; \ + (varp) = &(var)->field.csle_next) #define CK_SLIST_INIT(head) do { \ - ck_pr_store_ptr(&(head)->slh_first, NULL); \ + ck_pr_store_ptr(&(head)->cslh_first, NULL); \ ck_pr_fence_store(); \ } while (0) #define CK_SLIST_INSERT_AFTER(a, b, field) do { \ - (b)->field.sle_next = (a)->field.sle_next; \ + (b)->field.csle_next = (a)->field.csle_next; \ ck_pr_fence_store(); \ - ck_pr_store_ptr(&(a)->field.sle_next, b); \ + ck_pr_store_ptr(&(a)->field.csle_next, b); \ } while (0) #define CK_SLIST_INSERT_HEAD(head, elm, field) do { \ - (elm)->field.sle_next = (head)->slh_first; \ + (elm)->field.csle_next = (head)->cslh_first; \ ck_pr_fence_store(); \ - ck_pr_store_ptr(&(head)->slh_first, elm); \ + ck_pr_store_ptr(&(head)->cslh_first, elm); \ +} while (0) + +#define CK_SLIST_INSERT_PREVPTR(prevp, slistelm, elm, field) do { \ + (elm)->field.csle_next = (slistelm); \ + ck_pr_fence_store(); \ + ck_pr_store_ptr(prevp, elm); \ } while (0) #define CK_SLIST_REMOVE_AFTER(elm, field) do { \ - ck_pr_store_ptr(&(elm)->field.sle_next, \ - (elm)->field.sle_next->field.sle_next); \ + ck_pr_store_ptr(&(elm)->field.csle_next, \ + (elm)->field.csle_next->field.csle_next); \ } while (0) #define CK_SLIST_REMOVE(head, elm, type, field) do { \ - if ((head)->slh_first == (elm)) { \ + if ((head)->cslh_first == (elm)) { \ CK_SLIST_REMOVE_HEAD((head), field); \ } else { \ - struct type *curelm = (head)->slh_first; \ - while (curelm->field.sle_next != (elm)) \ - curelm = curelm->field.sle_next; \ + struct type *curelm = (head)->cslh_first; \ + while (curelm->field.csle_next != (elm)) \ + curelm = curelm->field.csle_next; \ CK_SLIST_REMOVE_AFTER(curelm, field); \ } \ } while (0) #define CK_SLIST_REMOVE_HEAD(head, field) do { \ - ck_pr_store_ptr(&(head)->slh_first, \ - (head)->slh_first->field.sle_next); \ + ck_pr_store_ptr(&(head)->cslh_first, \ + (head)->cslh_first->field.csle_next); \ +} while (0) + +#define CK_SLIST_REMOVE_PREVPTR(prevp, elm, field) do { \ + ck_pr_store_ptr(prevptr, (elm)->field.csle_next); \ } while (0) #define CK_SLIST_MOVE(head1, head2, field) do { \ - ck_pr_store_ptr(&(head1)->slh_first, (head2)->slh_first); \ + ck_pr_store_ptr(&(head1)->cslh_first, (head2)->cslh_first); \ } while (0) /* * This operation is not applied atomically. */ #define CK_SLIST_SWAP(a, b, type) do { \ - struct type *swap_first = (a)->slh_first; \ - (a)->slh_first = (b)->slh_first; \ - (b)->slh_first = swap_first; \ + struct type *swap_first = (a)->cslh_first; \ + (a)->cslh_first = (b)->cslh_first; \ + (b)->cslh_first = swap_first; \ } while (0) /* @@ -219,107 +229,107 @@ struct { \ */ #define CK_STAILQ_HEAD(name, type) \ struct name { \ - struct type *stqh_first;/* first element */ \ - struct type **stqh_last;/* addr of last next element */ \ + struct type *cstqh_first;/* first element */ \ + struct type **cstqh_last;/* addr of last next element */ \ } #define CK_STAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).stqh_first } + { NULL, &(head).cstqh_first } #define CK_STAILQ_ENTRY(type) \ struct { \ - struct type *stqe_next; /* next element */ \ + struct type *cstqe_next; /* next element */ \ } /* * Singly-linked Tail queue functions. */ #define CK_STAILQ_CONCAT(head1, head2) do { \ - if ((head2)->stqh_first == NULL) { \ - ck_pr_store_ptr((head1)->stqh_last, (head2)->stqh_first); \ + if ((head2)->cstqh_first != NULL) { \ + ck_pr_store_ptr((head1)->cstqh_last, (head2)->cstqh_first); \ ck_pr_fence_store(); \ - (head1)->stqh_last = (head2)->stqh_last; \ + (head1)->cstqh_last = (head2)->cstqh_last; \ CK_STAILQ_INIT((head2)); \ } \ } while (0) -#define CK_STAILQ_EMPTY(head) (ck_pr_load_ptr(&(head)->stqh_first) == NULL) +#define CK_STAILQ_EMPTY(head) (ck_pr_load_ptr(&(head)->cstqh_first) == NULL) -#define CK_STAILQ_FIRST(head) (ck_pr_load_ptr(&(head)->stqh_first)) +#define CK_STAILQ_FIRST(head) (ck_pr_load_ptr(&(head)->cstqh_first)) #define CK_STAILQ_FOREACH(var, head, field) \ for((var) = CK_STAILQ_FIRST((head)); \ - (var) && (ck_pr_fence_load(), 1); \ + (var); \ (var) = CK_STAILQ_NEXT((var), field)) #define CK_STAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = CK_STAILQ_FIRST((head)); \ - (var) && (ck_pr_fence_load(), (tvar) = \ + (var) && ((tvar) = \ CK_STAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define CK_STAILQ_INIT(head) do { \ - ck_pr_store_ptr(&(head)->stqh_first, NULL); \ + ck_pr_store_ptr(&(head)->cstqh_first, NULL); \ ck_pr_fence_store(); \ - (head)->stqh_last = &(head)->stqh_first; \ + (head)->cstqh_last = &(head)->cstqh_first; \ } while (0) #define CK_STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ - (elm)->field.stqe_next = (tqelm)->field.stqe_next; \ + (elm)->field.cstqe_next = (tqelm)->field.cstqe_next; \ ck_pr_fence_store(); \ - ck_pr_store_ptr(&(tqelm)->field.stqe_next, elm); \ - if ((elm)->field.stqe_next == NULL) \ - (head)->stqh_last = &(elm)->field.stqe_next; \ + ck_pr_store_ptr(&(tqelm)->field.cstqe_next, elm); \ + if ((elm)->field.cstqe_next == NULL) \ + (head)->cstqh_last = &(elm)->field.cstqe_next; \ } while (0) #define CK_STAILQ_INSERT_HEAD(head, elm, field) do { \ - (elm)->field.stqe_next = (head)->stqh_first; \ + (elm)->field.cstqe_next = (head)->cstqh_first; \ ck_pr_fence_store(); \ - ck_pr_store_ptr(&(head)->stqh_first, elm); \ - if ((elm)->field.stqe_next == NULL) \ - (head)->stqh_last = &(elm)->field.stqe_next; \ + ck_pr_store_ptr(&(head)->cstqh_first, elm); \ + if ((elm)->field.cstqe_next == NULL) \ + (head)->cstqh_last = &(elm)->field.cstqe_next; \ } while (0) #define CK_STAILQ_INSERT_TAIL(head, elm, field) do { \ - (elm)->field.stqe_next = NULL; \ + (elm)->field.cstqe_next = NULL; \ ck_pr_fence_store(); \ - ck_pr_store_ptr((head)->stqh_last, (elm)); \ - (head)->stqh_last = &(elm)->field.stqe_next; \ + ck_pr_store_ptr((head)->cstqh_last, (elm)); \ + (head)->cstqh_last = &(elm)->field.cstqe_next; \ } while (0) #define CK_STAILQ_NEXT(elm, field) \ - (ck_pr_load_ptr(&(elm)->field.stqe_next)) + (ck_pr_load_ptr(&(elm)->field.cstqe_next)) #define CK_STAILQ_REMOVE(head, elm, type, field) do { \ - if ((head)->stqh_first == (elm)) { \ + if ((head)->cstqh_first == (elm)) { \ CK_STAILQ_REMOVE_HEAD((head), field); \ } else { \ - struct type *curelm = (head)->stqh_first; \ - while (curelm->field.stqe_next != (elm)) \ - curelm = curelm->field.stqe_next; \ + struct type *curelm = (head)->cstqh_first; \ + while (curelm->field.cstqe_next != (elm)) \ + curelm = curelm->field.cstqe_next; \ CK_STAILQ_REMOVE_AFTER(head, curelm, field); \ } \ } while (0) #define CK_STAILQ_REMOVE_AFTER(head, elm, field) do { \ - ck_pr_store_ptr(&(elm)->field.stqe_next, \ - (elm)->field.stqe_next->field.stqe_next); \ - if ((elm)->field.stqe_next == NULL) \ - (head)->stqh_last = &(elm)->field.stqe_next; \ + ck_pr_store_ptr(&(elm)->field.cstqe_next, \ + (elm)->field.cstqe_next->field.cstqe_next); \ + if ((elm)->field.cstqe_next == NULL) \ + (head)->cstqh_last = &(elm)->field.cstqe_next; \ } while (0) #define CK_STAILQ_REMOVE_HEAD(head, field) do { \ - ck_pr_store_ptr(&(head)->stqh_first, \ - (head)->stqh_first->field.stqe_next); \ - if ((head)->stqh_first == NULL) \ - (head)->stqh_last = &(head)->stqh_first; \ + ck_pr_store_ptr(&(head)->cstqh_first, \ + (head)->cstqh_first->field.cstqe_next); \ + if ((head)->cstqh_first == NULL) \ + (head)->cstqh_last = &(head)->cstqh_first; \ } while (0) #define CK_STAILQ_MOVE(head1, head2, field) do { \ - ck_pr_store_ptr(&(head1)->stqh_first, (head2)->stqh_first); \ - (head1)->stqh_last = (head2)->stqh_last; \ - if ((head2)->stqh_last == &(head2)->stqh_first) \ - (head1)->stqh_last = &(head1)->stqh_first; \ + ck_pr_store_ptr(&(head1)->cstqh_first, (head2)->cstqh_first); \ + (head1)->cstqh_last = (head2)->cstqh_last; \ + if ((head2)->cstqh_last == &(head2)->cstqh_first) \ + (head1)->cstqh_last = &(head1)->cstqh_first; \ } while (0) /* @@ -327,15 +337,15 @@ struct { \ */ #define CK_STAILQ_SWAP(head1, head2, type) do { \ struct type *swap_first = CK_STAILQ_FIRST(head1); \ - struct type **swap_last = (head1)->stqh_last; \ + struct type **swap_last = (head1)->cstqh_last; \ CK_STAILQ_FIRST(head1) = CK_STAILQ_FIRST(head2); \ - (head1)->stqh_last = (head2)->stqh_last; \ + (head1)->cstqh_last = (head2)->cstqh_last; \ CK_STAILQ_FIRST(head2) = swap_first; \ - (head2)->stqh_last = swap_last; \ + (head2)->cstqh_last = swap_last; \ if (CK_STAILQ_EMPTY(head1)) \ - (head1)->stqh_last = &(head1)->stqh_first; \ + (head1)->cstqh_last = &(head1)->cstqh_first; \ if (CK_STAILQ_EMPTY(head2)) \ - (head2)->stqh_last = &(head2)->stqh_first; \ + (head2)->cstqh_last = &(head2)->cstqh_first; \ } while (0) /* @@ -343,7 +353,7 @@ struct { \ */ #define CK_LIST_HEAD(name, type) \ struct name { \ - struct type *lh_first; /* first element */ \ + struct type *clh_first; /* first element */ \ } #define CK_LIST_HEAD_INITIALIZER(head) \ @@ -351,78 +361,78 @@ struct name { \ #define CK_LIST_ENTRY(type) \ struct { \ - struct type *le_next; /* next element */ \ - struct type **le_prev; /* address of previous next element */ \ + struct type *cle_next; /* next element */ \ + struct type **cle_prev; /* address of previous next element */ \ } -#define CK_LIST_FIRST(head) ck_pr_load_ptr(&(head)->lh_first) +#define CK_LIST_FIRST(head) ck_pr_load_ptr(&(head)->clh_first) #define CK_LIST_EMPTY(head) (CK_LIST_FIRST(head) == NULL) -#define CK_LIST_NEXT(elm, field) ck_pr_load_ptr(&(elm)->field.le_next) +#define CK_LIST_NEXT(elm, field) ck_pr_load_ptr(&(elm)->field.cle_next) #define CK_LIST_FOREACH(var, head, field) \ for ((var) = CK_LIST_FIRST((head)); \ - (var) && (ck_pr_fence_load(), 1); \ + (var); \ (var) = CK_LIST_NEXT((var), field)) #define CK_LIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = CK_LIST_FIRST((head)); \ - (var) && (ck_pr_fence_load(), (tvar) = CK_LIST_NEXT((var), field), 1);\ + (var) && ((tvar) = CK_LIST_NEXT((var), field), 1); \ (var) = (tvar)) #define CK_LIST_INIT(head) do { \ - ck_pr_store_ptr(&(head)->lh_first, NULL); \ + ck_pr_store_ptr(&(head)->clh_first, NULL); \ ck_pr_fence_store(); \ } while (0) #define CK_LIST_INSERT_AFTER(listelm, elm, field) do { \ - (elm)->field.le_next = (listelm)->field.le_next; \ - (elm)->field.le_prev = &(listelm)->field.le_next; \ + (elm)->field.cle_next = (listelm)->field.cle_next; \ + (elm)->field.cle_prev = &(listelm)->field.cle_next; \ ck_pr_fence_store(); \ - if ((listelm)->field.le_next != NULL) \ - (listelm)->field.le_next->field.le_prev = &(elm)->field.le_next;\ - ck_pr_store_ptr(&(listelm)->field.le_next, elm); \ + if ((listelm)->field.cle_next != NULL) \ + (listelm)->field.cle_next->field.cle_prev = &(elm)->field.cle_next;\ + ck_pr_store_ptr(&(listelm)->field.cle_next, elm); \ } while (0) #define CK_LIST_INSERT_BEFORE(listelm, elm, field) do { \ - (elm)->field.le_prev = (listelm)->field.le_prev; \ - (elm)->field.le_next = (listelm); \ + (elm)->field.cle_prev = (listelm)->field.cle_prev; \ + (elm)->field.cle_next = (listelm); \ ck_pr_fence_store(); \ - ck_pr_store_ptr((listelm)->field.le_prev, (elm)); \ - (listelm)->field.le_prev = &(elm)->field.le_next; \ + ck_pr_store_ptr((listelm)->field.cle_prev, (elm)); \ + (listelm)->field.cle_prev = &(elm)->field.cle_next; \ } while (0) #define CK_LIST_INSERT_HEAD(head, elm, field) do { \ - (elm)->field.le_next = (head)->lh_first; \ + (elm)->field.cle_next = (head)->clh_first; \ ck_pr_fence_store(); \ - if ((elm)->field.le_next != NULL) \ - (head)->lh_first->field.le_prev = &(elm)->field.le_next; \ - ck_pr_store_ptr(&(head)->lh_first, elm); \ - (elm)->field.le_prev = &(head)->lh_first; \ + if ((elm)->field.cle_next != NULL) \ + (head)->clh_first->field.cle_prev = &(elm)->field.cle_next; \ + ck_pr_store_ptr(&(head)->clh_first, elm); \ + (elm)->field.cle_prev = &(head)->clh_first; \ } while (0) #define CK_LIST_REMOVE(elm, field) do { \ - ck_pr_store_ptr((elm)->field.le_prev, (elm)->field.le_next); \ - if ((elm)->field.le_next != NULL) \ - (elm)->field.le_next->field.le_prev = (elm)->field.le_prev; \ + ck_pr_store_ptr((elm)->field.cle_prev, (elm)->field.cle_next); \ + if ((elm)->field.cle_next != NULL) \ + (elm)->field.cle_next->field.cle_prev = (elm)->field.cle_prev; \ } while (0) #define CK_LIST_MOVE(head1, head2, field) do { \ - ck_pr_store_ptr(&(head1)->lh_first, (head2)->lh_first); \ - if ((head1)->lh_first != NULL) \ - (head1)->lh_first->field.le_prev = &(head1)->lh_first; \ + ck_pr_store_ptr(&(head1)->clh_first, (head2)->clh_first); \ + if ((head1)->clh_first != NULL) \ + (head1)->clh_first->field.cle_prev = &(head1)->clh_first; \ } while (0) /* * This operation is not applied atomically. */ #define CK_LIST_SWAP(head1, head2, type, field) do { \ - struct type *swap_tmp = (head1)->lh_first; \ - (head1)->lh_first = (head2)->lh_first; \ - (head2)->lh_first = swap_tmp; \ - if ((swap_tmp = (head1)->lh_first) != NULL) \ - swap_tmp->field.le_prev = &(head1)->lh_first; \ - if ((swap_tmp = (head2)->lh_first) != NULL) \ - swap_tmp->field.le_prev = &(head2)->lh_first; \ + struct type *swap_tmp = (head1)->clh_first; \ + (head1)->clh_first = (head2)->clh_first; \ + (head2)->clh_first = swap_tmp; \ + if ((swap_tmp = (head1)->clh_first) != NULL) \ + swap_tmp->field.cle_prev = &(head1)->clh_first; \ + if ((swap_tmp = (head2)->clh_first) != NULL) \ + swap_tmp->field.cle_prev = &(head2)->clh_first; \ } while (0) #endif /* CK_QUEUE_H */ diff --git a/include/ck_ring.h b/include/ck_ring.h index 8a2a791..9f6754e 100644 --- a/include/ck_ring.h +++ b/include/ck_ring.h @@ -66,9 +66,56 @@ ck_ring_size(const struct ck_ring *ring) CK_CC_INLINE static unsigned int ck_ring_capacity(const struct ck_ring *ring) { + return ring->size; } +/* + * This function is only safe to call when there are no concurrent operations + * on the ring. This is primarily meant for persistent ck_ring use-cases. The + * function returns true if any mutations were performed on the ring. + */ +CK_CC_INLINE static bool +ck_ring_repair(struct ck_ring *ring) +{ + bool r = false; + + if (ring->p_tail != ring->p_head) { + ring->p_tail = ring->p_head; + r = true; + } + + return r; +} + +/* + * This can be called when no concurrent updates are occurring on the ring + * structure to check for consistency. This is primarily meant to be used for + * persistent storage of the ring. If this functions returns false, the ring + * is in an inconsistent state. + */ +CK_CC_INLINE static bool +ck_ring_valid(const struct ck_ring *ring) +{ + unsigned int size = ring->size; + unsigned int c_head = ring->c_head; + unsigned int p_head = ring->p_head; + + /* The ring must be a power of 2. */ + if (size & (size - 1)) + return false; + + /* The consumer counter must always be smaller than the producer. */ + if (c_head > p_head) + return false; + + /* The producer may only be up to size slots ahead of consumer. */ + if (p_head - c_head >= size) + return false; + + return true; +} + CK_CC_INLINE static void ck_ring_init(struct ck_ring *ring, unsigned int size) { @@ -84,6 +131,45 @@ ck_ring_init(struct ck_ring *ring, unsigned int size) /* * The _ck_ring_* namespace is internal only and must not used externally. */ + +/* + * This function will return a region of memory to write for the next value + * for a single producer. + */ +CK_CC_FORCE_INLINE static void * +_ck_ring_enqueue_reserve_sp(struct ck_ring *ring, + void *CK_CC_RESTRICT buffer, + unsigned int ts, + unsigned int *size) +{ + const unsigned int mask = ring->mask; + unsigned int consumer, producer, delta; + + consumer = ck_pr_load_uint(&ring->c_head); + producer = ring->p_tail; + delta = producer + 1; + if (size != NULL) + *size = (producer - consumer) & mask; + + if (CK_CC_UNLIKELY((delta & mask) == (consumer & mask))) + return NULL; + + return (char *)buffer + ts * (producer & mask); +} + +/* + * This is to be called to commit and make visible a region of previously + * reserved with reverse_sp. + */ +CK_CC_FORCE_INLINE static void +_ck_ring_enqueue_commit_sp(struct ck_ring *ring) +{ + + ck_pr_fence_store(); + ck_pr_store_uint(&ring->p_tail, ring->p_tail + 1); + return; +} + CK_CC_FORCE_INLINE static bool _ck_ring_enqueue_sp(struct ck_ring *ring, void *CK_CC_RESTRICT buffer, @@ -163,6 +249,65 @@ _ck_ring_dequeue_sc(struct ck_ring *ring, return true; } +CK_CC_FORCE_INLINE static void * +_ck_ring_enqueue_reserve_mp(struct ck_ring *ring, + void *buffer, + unsigned int ts, + unsigned int *ticket, + unsigned int *size) +{ + const unsigned int mask = ring->mask; + unsigned int producer, consumer, delta; + + producer = ck_pr_load_uint(&ring->p_head); + + for (;;) { + ck_pr_fence_load(); + consumer = ck_pr_load_uint(&ring->c_head); + + delta = producer + 1; + + if (CK_CC_LIKELY((producer - consumer) < mask)) { + if (ck_pr_cas_uint_value(&ring->p_head, + producer, delta, &producer) == true) { + break; + } + } else { + unsigned int new_producer; + + ck_pr_fence_load(); + new_producer = ck_pr_load_uint(&ring->p_head); + + if (producer == new_producer) { + if (size != NULL) + *size = (producer - consumer) & mask; + + return false; + } + + producer = new_producer; + } + } + + *ticket = producer; + if (size != NULL) + *size = (producer - consumer) & mask; + + return (char *)buffer + ts * (producer & mask); +} + +CK_CC_FORCE_INLINE static void +_ck_ring_enqueue_commit_mp(struct ck_ring *ring, unsigned int producer) +{ + + while (ck_pr_load_uint(&ring->p_tail) != producer) + ck_pr_stall(); + + ck_pr_fence_store(); + ck_pr_store_uint(&ring->p_tail, producer + 1); + return; +} + CK_CC_FORCE_INLINE static bool _ck_ring_enqueue_mp(struct ck_ring *ring, void *buffer, @@ -176,23 +321,54 @@ _ck_ring_enqueue_mp(struct ck_ring *ring, producer = ck_pr_load_uint(&ring->p_head); - do { + for (;;) { /* - * The snapshot of producer must be up to date with - * respect to consumer. + * The snapshot of producer must be up to date with respect to + * consumer. */ ck_pr_fence_load(); consumer = ck_pr_load_uint(&ring->c_head); delta = producer + 1; - if (CK_CC_UNLIKELY((delta & mask) == (consumer & mask))) { - r = false; - goto leave; + + /* + * Only try to CAS if the producer is not clearly stale (not + * less than consumer) and the buffer is definitely not full. + */ + if (CK_CC_LIKELY((producer - consumer) < mask)) { + if (ck_pr_cas_uint_value(&ring->p_head, + producer, delta, &producer) == true) { + break; + } + } else { + unsigned int new_producer; + + /* + * Slow path. Either the buffer is full or we have a + * stale snapshot of p_head. Execute a second read of + * p_read that must be ordered wrt the snapshot of + * c_head. + */ + ck_pr_fence_load(); + new_producer = ck_pr_load_uint(&ring->p_head); + + /* + * Only fail if we haven't made forward progress in + * production: the buffer must have been full when we + * read new_producer (or we wrapped around UINT_MAX + * during this iteration). + */ + if (producer == new_producer) { + r = false; + goto leave; + } + + /* + * p_head advanced during this iteration. Try again. + */ + producer = new_producer; } - } while (ck_pr_cas_uint_value(&ring->p_head, - producer, - delta, - &producer) == false); + } buffer = (char *)buffer + ts * (producer & mask); memcpy(buffer, entry, ts); @@ -323,6 +499,33 @@ ck_ring_enqueue_spsc(struct ck_ring *ring, &entry, sizeof(entry), NULL); } +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spsc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), + size); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spsc(struct ck_ring *ring, + struct ck_ring_buffer *buffer) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), + NULL); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_spsc(struct ck_ring *ring) +{ + + _ck_ring_enqueue_commit_sp(ring); + return; +} + CK_CC_INLINE static bool ck_ring_dequeue_spsc(struct ck_ring *ring, const struct ck_ring_buffer *buffer, @@ -344,8 +547,7 @@ ck_ring_enqueue_mpmc(struct ck_ring *ring, const void *entry) { - return _ck_ring_enqueue_mp(ring, buffer, &entry, - sizeof(entry), NULL); + return _ck_ring_enqueue_mp(ring, buffer, &entry, sizeof(entry), NULL); } CK_CC_INLINE static bool @@ -355,8 +557,37 @@ ck_ring_enqueue_mpmc_size(struct ck_ring *ring, unsigned int *size) { - return _ck_ring_enqueue_mp_size(ring, buffer, &entry, - sizeof(entry), size); + return _ck_ring_enqueue_mp_size(ring, buffer, &entry, sizeof(entry), + size); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpmc(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, NULL); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpmc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, size); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_mpmc(struct ck_ring *ring, unsigned int ticket) +{ + + _ck_ring_enqueue_commit_mp(ring, ticket); + return; } CK_CC_INLINE static bool @@ -384,6 +615,31 @@ ck_ring_dequeue_mpmc(struct ck_ring *ring, * ring buffer containing pointers. Correctness is provided for any number of * consumers with up to one concurrent producer. */ +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spmc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), size); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spmc(struct ck_ring *ring, + struct ck_ring_buffer *buffer) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), NULL); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_spmc(struct ck_ring *ring) +{ + + _ck_ring_enqueue_commit_sp(ring); + return; +} + CK_CC_INLINE static bool ck_ring_enqueue_spmc_size(struct ck_ring *ring, struct ck_ring_buffer *buffer, @@ -428,6 +684,35 @@ ck_ring_dequeue_spmc(struct ck_ring *ring, * ring buffer containing pointers. Correctness is provided for any number of * producers with up to one concurrent consumers. */ +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpsc(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, NULL); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpsc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, size); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_mpsc(struct ck_ring *ring, unsigned int ticket) +{ + + _ck_ring_enqueue_commit_mp(ring, ticket); + return; +} + CK_CC_INLINE static bool ck_ring_enqueue_mpsc(struct ck_ring *ring, struct ck_ring_buffer *buffer, @@ -463,194 +748,290 @@ ck_ring_dequeue_mpsc(struct ck_ring *ring, * CK_RING_PROTOTYPE is used to define a type-safe interface for inlining * values of a particular type in the ring the buffer. */ -#define CK_RING_PROTOTYPE(name, type) \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_sp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_sp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_spsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_sc(a, b, c, \ - sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_sp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_sp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_trydequeue_mc(a, \ - b, c, sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_spmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_mc(a, b, c, \ - sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_mp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_mp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_sc(a, b, c, \ - sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_mp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_mp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_trydequeue_mc(a, \ - b, c, sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_mc(a, b, c, \ - sizeof(struct type)); \ +#define CK_RING_PROTOTYPE(name, type) \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spsc_##name(struct ck_ring *a, \ + struct type *b) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), c); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_sp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_sp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_spsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_sc(a, b, c, \ + sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spmc_##name(struct ck_ring *a, \ + struct type *b) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), c); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_sp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_sp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_trydequeue_mc(a, \ + b, c, sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_spmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_mc(a, b, c, \ + sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpsc_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_mp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_mp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_sc(a, b, c, \ + sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_mp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_mp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_trydequeue_mc(a, \ + b, c, sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_mc(a, b, c, \ + sizeof(struct type)); \ } /* * A single producer with one concurrent consumer. */ -#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \ ck_ring_enqueue_spsc_##name(a, b, c) -#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_spsc_size_##name(a, b, c, d) -#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_SPSC(name, a, b, c) \ + ck_ring_enqueue_reserve_spsc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_SPSC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_spsc_size_##name(a, b, c, d) +#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \ ck_ring_dequeue_spsc_##name(a, b, c) /* * A single producer with any number of concurrent consumers. */ -#define CK_RING_ENQUEUE_SPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_SPMC(name, a, b, c) \ ck_ring_enqueue_spmc_##name(a, b, c) -#define CK_RING_ENQUEUE_SPMC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_SPMC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_spmc_size_##name(a, b, c, d) -#define CK_RING_TRYDEQUEUE_SPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_SPMC(name, a, b, c) \ + ck_ring_enqueue_reserve_spmc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_SPMC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_spmc_size_##name(a, b, c, d) +#define CK_RING_TRYDEQUEUE_SPMC(name, a, b, c) \ ck_ring_trydequeue_spmc_##name(a, b, c) -#define CK_RING_DEQUEUE_SPMC(name, a, b, c) \ +#define CK_RING_DEQUEUE_SPMC(name, a, b, c) \ ck_ring_dequeue_spmc_##name(a, b, c) /* * Any number of concurrent producers with up to one * concurrent consumer. */ -#define CK_RING_ENQUEUE_MPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_MPSC(name, a, b, c) \ ck_ring_enqueue_mpsc_##name(a, b, c) -#define CK_RING_ENQUEUE_MPSC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_MPSC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_mpsc_size_##name(a, b, c, d) -#define CK_RING_DEQUEUE_MPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_MPSC(name, a, b, c) \ + ck_ring_enqueue_reserve_mpsc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_MPSC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_mpsc_size_##name(a, b, c, d) +#define CK_RING_DEQUEUE_MPSC(name, a, b, c) \ ck_ring_dequeue_mpsc_##name(a, b, c) /* * Any number of concurrent producers and consumers. */ -#define CK_RING_ENQUEUE_MPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_MPMC(name, a, b, c) \ ck_ring_enqueue_mpmc_##name(a, b, c) -#define CK_RING_ENQUEUE_MPMC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_MPMC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_mpmc_size_##name(a, b, c, d) -#define CK_RING_TRYDEQUEUE_MPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_MPMC(name, a, b, c) \ + ck_ring_enqueue_reserve_mpmc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_MPMC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_mpmc_size_##name(a, b, c, d) +#define CK_RING_TRYDEQUEUE_MPMC(name, a, b, c) \ ck_ring_trydequeue_mpmc_##name(a, b, c) -#define CK_RING_DEQUEUE_MPMC(name, a, b, c) \ +#define CK_RING_DEQUEUE_MPMC(name, a, b, c) \ ck_ring_dequeue_mpmc_##name(a, b, c) #endif /* CK_RING_H */ diff --git a/include/freebsd/ck_md.h.in b/include/freebsd/ck_md.h.in new file mode 100644 index 0000000..7fb6c64 --- /dev/null +++ b/include/freebsd/ck_md.h.in @@ -0,0 +1,133 @@ +/* + * Copyright 2018 Samy Al Bahra. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This header file is meant for use of Concurrency Kit in the FreeBSD kernel. + */ + +#ifndef CK_MD_H +#define CK_MD_H + +#include <sys/param.h> + +#ifndef _KERNEL +#error This header file is meant for the FreeBSD kernel. +#endif /* _KERNEL */ + +#ifndef CK_MD_CACHELINE +/* + * FreeBSD's CACHE_LINE macro is a compile-time maximum cache-line size for an + * architecture, defined to be 128 bytes by default on x86*. Even in presence + * of adjacent sector prefetch, this doesn't make sense from a modeling + * perspective. + */ +#if defined(__amd64__) || defined(__i386__) +#define CK_MD_CACHELINE (64) +#else +#define CK_MD_CACHELINE (CACHE_LINE_SIZE) +#endif /* !__amd64__ && !__i386__ */ +#endif /* CK_MD_CACHELINE */ + +#ifndef CK_MD_PAGESIZE +#define CK_MD_PAGESIZE (PAGE_SIZE) +#endif + +/* + * Once FreeBSD has a mechanism to detect RTM, this can be enabled and RTM + * facilities can be called. These facilities refer to TSX. + */ +#ifndef CK_MD_RTM_DISABLE +#define CK_MD_RTM_DISABLE +#endif /* CK_MD_RTM_DISABLE */ + +/* + * Do not enable pointer-packing-related (VMA) optimizations in kernel-space. + */ +#ifndef CK_MD_POINTER_PACK_DISABLE +#define CK_MD_POINTER_PACK_DISABLE +#endif /* CK_MD_POINTER_PACK_DISABLE */ + +/* + * The following would be used for pointer-packing tricks, disabled for the + * kernel. + */ +#ifndef CK_MD_VMA_BITS_UNKNOWN +#define CK_MD_VMA_BITS_UNKNOWN +#endif /* CK_MD_VMA_BITS_UNKNOWN */ + +/* + * Do not enable double operations in kernel-space. + */ +#ifndef CK_PR_DISABLE_DOUBLE +#define CK_PR_DISABLE_DOUBLE +#endif /* CK_PR_DISABLE_DOUBLE */ + +/* + * If building for a uni-processor target, then enable the uniprocessor + * feature flag. This, among other things, will remove the lock prefix. + */ +#ifndef SMP +#define CK_MD_UMP +#endif /* SMP */ + +/* + * Disable the use of compiler builtin functions. + */ +#define CK_MD_CC_BUILTIN_DISABLE 1 + +/* + * CK expects those, which are normally defined by the build system. + */ +#if defined(__i386__) && !defined(__x86__) +#define __x86__ +/* + * If x86 becomes more relevant, we may want to consider importing in + * __mbk() to avoid potential issues around false sharing. + */ +#define CK_MD_TSO +#define CK_MD_SSE_DISABLE 1 +#elif defined(__amd64__) +#define CK_MD_TSO +#elif defined(__sparc64__) && !defined(__sparcv9__) +#define __sparcv9__ +#define CK_MD_TSO +#elif defined(__powerpc64__) && !defined(__ppc64__) +#define __ppc64__ +#elif defined(__powerpc__) && !defined(__ppc__) +#define __ppc__ +#endif + +/* If no memory model has been defined, assume RMO. */ +#if !defined(CK_MD_RMO) && !defined(CK_MD_TSO) && !defined(CK_MD_PSO) +#define CK_MD_RMO +#endif + +#define CK_VERSION "@VERSION@" +#define CK_GIT_SHA "@GIT_SHA@" + +#endif /* CK_MD_H */ diff --git a/include/gcc/aarch64/ck_pr.h b/include/gcc/aarch64/ck_pr.h index e739c4d..0a47307 100644 --- a/include/gcc/aarch64/ck_pr.h +++ b/include/gcc/aarch64/ck_pr.h @@ -92,7 +92,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY) ck_pr_md_load_##S(const M *target) \ { \ long r = 0; \ - __asm__ __volatile__(I " %w0, [%1];" \ + __asm__ __volatile__(I " %w0, [%1]\n" \ : "=r" (r) \ : "r" (target) \ : "memory"); \ @@ -103,7 +103,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY) ck_pr_md_load_##S(const M *target) \ { \ long r = 0; \ - __asm__ __volatile__(I " %0, [%1];" \ + __asm__ __volatile__(I " %0, [%1]\n" \ : "=r" (r) \ : "r" (target) \ : "memory"); \ @@ -195,10 +195,10 @@ CK_PR_STORE_S_64(double, double, "str") T previous = 0; \ T tmp = 0; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - "neg %" R "0, %" R "0;" \ - "stxr" W " %w1, %" R "0, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + "neg %" R "0, %" R "0\n" \ + "stxr" W " %w1, %" R "0, [%2]\n" \ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target) \ diff --git a/include/gcc/aarch64/ck_pr_llsc.h b/include/gcc/aarch64/ck_pr_llsc.h index aa4e309..6500d96 100644 --- a/include/gcc/aarch64/ck_pr_llsc.h +++ b/include/gcc/aarch64/ck_pr_llsc.h @@ -38,17 +38,17 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u uint64_t tmp1, tmp2; __asm__ __volatile__("1:" - "ldxp %0, %1, [%4];" - "mov %2, %0;" - "mov %3, %1;" - "eor %0, %0, %5;" - "eor %1, %1, %6;" - "orr %1, %0, %1;" - "mov %w0, #0;" - "cbnz %1, 2f;" - "stxp %w0, %7, %8, [%4];" - "cbnz %w0, 1b;" - "mov %w0, #1;" + "ldxp %0, %1, [%4]\n" + "mov %2, %0\n" + "mov %3, %1\n" + "eor %0, %0, %5\n" + "eor %1, %1, %6\n" + "orr %1, %0, %1\n" + "mov %w0, #0\n" + "cbnz %1, 2f\n" + "stxp %w0, %7, %8, [%4]\n" + "cbnz %w0, 1b\n" + "mov %w0, #1\n" "2:" : "=&r" (tmp1), "=&r" (tmp2), "=&r" (value[0]), "=&r" (value[1]) : "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1]) @@ -72,15 +72,15 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2]) uint64_t tmp1, tmp2; __asm__ __volatile__("1:" - "ldxp %0, %1, [%2];" - "eor %0, %0, %3;" - "eor %1, %1, %4;" - "orr %1, %0, %1;" - "mov %w0, #0;" - "cbnz %1, 2f;" - "stxp %w0, %5, %6, [%2];" - "cbnz %w0, 1b;" - "mov %w0, #1;" + "ldxp %0, %1, [%2]\n" + "eor %0, %0, %3\n" + "eor %1, %1, %4\n" + "orr %1, %0, %1\n" + "mov %w0, #0\n" + "cbnz %1, 2f\n" + "stxp %w0, %5, %6, [%2]\n" + "cbnz %w0, 1b\n" + "mov %w0, #1\n" "2:" : "=&r" (tmp1), "=&r" (tmp2) : "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1]) @@ -103,12 +103,12 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) { \ T previous; \ T tmp; \ - __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - "cmp %" R "0, %" R "4;" \ - "b.ne 2f;" \ - "stxr" W " %w1, %" R "3, [%2];" \ - "cbnz %w1, 1b;" \ + __asm__ __volatile__("1:\n" \ + "ldxr" W " %" R "0, [%2]\n" \ + "cmp %" R "0, %" R "4\n" \ + "b.ne 2f\n" \ + "stxr" W " %w1, %" R "3, [%2]\n" \ + "cbnz %w1, 1b\n" \ "2:" \ : "=&r" (previous), \ "=&r" (tmp) \ @@ -126,11 +126,11 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) T tmp; \ __asm__ __volatile__( \ "1:" \ - "ldxr" W " %" R "0, [%2];" \ - "cmp %" R "0, %" R "4;" \ - "b.ne 2f;" \ - "stxr" W " %w1, %" R "3, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n" \ + "cmp %" R "0, %" R "4\n" \ + "b.ne 2f\n" \ + "stxr" W " %w1, %" R "3, [%2]\n" \ + "cbnz %w1, 1b\n" \ "2:" \ : "=&r" (previous), \ "=&r" (tmp) \ @@ -167,9 +167,9 @@ CK_PR_CAS_S(char, char, "b", "w") T previous; \ T tmp; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - "stxr" W " %w1, %" R "3, [%2];"\ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + "stxr" W " %w1, %" R "3, [%2]\n"\ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target), \ @@ -198,10 +198,10 @@ CK_PR_FAS(char, char, char, "b", "w") T previous = 0; \ T tmp = 0; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - I ";" \ - "stxr" W " %w1, %" R "0, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + I "\n" \ + "stxr" W " %w1, %" R "0, [%2]\n" \ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target) \ @@ -239,10 +239,10 @@ CK_PR_UNARY_S(char, char, "b") T previous; \ T tmp; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];"\ - I " %" R "0, %" R "0, %" R "3;" \ - "stxr" W " %w1, %" R "0, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + I " %" R "0, %" R "0, %" R "3\n" \ + "stxr" W " %w1, %" R "0, [%2]\n" \ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target), \ @@ -286,10 +286,10 @@ ck_pr_faa_ptr(void *target, uintptr_t delta) uintptr_t previous, r, tmp; __asm__ __volatile__("1:" - "ldxr %0, [%3];" - "add %1, %4, %0;" - "stxr %w2, %1, [%3];" - "cbnz %w2, 1b;" + "ldxr %0, [%3]\n" + "add %1, %4, %0\n" + "stxr %w2, %1, [%3]\n" + "cbnz %w2, 1b\n" : "=&r" (previous), "=&r" (r), "=&r" (tmp) @@ -306,9 +306,9 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) uint64_t previous, r, tmp; __asm__ __volatile__("1:" - "ldxr %0, [%3];" - "add %1, %4, %0;" - "stxr %w2, %1, [%3];" + "ldxr %0, [%3]\n" + "add %1, %4, %0\n" + "stxr %w2, %1, [%3]\n" "cbnz %w2, 1b;" : "=&r" (previous), "=&r" (r), @@ -326,10 +326,10 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) { \ T previous, r, tmp; \ __asm__ __volatile__("1:" \ - "ldxr" W " %w0, [%3];" \ - "add %w1, %w4, %w0;" \ - "stxr" W " %w2, %w1, [%3];" \ - "cbnz %w2, 1b;" \ + "ldxr" W " %w0, [%3]\n" \ + "add %w1, %w4, %w0\n" \ + "stxr" W " %w2, %w1, [%3]\n" \ + "cbnz %w2, 1b\n" \ : "=&r" (previous), \ "=&r" (r), \ "=&r" (tmp) \ diff --git a/include/gcc/aarch64/ck_pr_lse.h b/include/gcc/aarch64/ck_pr_lse.h index e2c9554..e450e72 100644 --- a/include/gcc/aarch64/ck_pr_lse.h +++ b/include/gcc/aarch64/ck_pr_lse.h @@ -29,6 +29,7 @@ #ifndef CK_PR_AARCH64_LSE_H #define CK_PR_AARCH64_LSE_H +#error bite #ifndef CK_PR_H #error Do not include this file directly, use ck_pr.h #endif @@ -43,10 +44,10 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u register uint64_t x2 __asm__ ("x2") = set[0]; register uint64_t x3 __asm__ ("x3") = set[1]; - __asm__ __volatile__("casp %0, %1, %4, %5, [%6];" - "eor %2, %0, %7;" - "eor %3, %1, %8;" - "orr %2, %2, %3;" + __asm__ __volatile__("casp %0, %1, %4, %5, [%6]\n" + "eor %2, %0, %7\n" + "eor %3, %1, %8\n" + "orr %2, %2, %3\n" : "+&r" (x0), "+&r" (x1), "=&r" (tmp1), "=&r" (tmp2) : "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1]) : "memory"); @@ -74,10 +75,10 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2]) register uint64_t x2 __asm__ ("x2") = set[0]; register uint64_t x3 __asm__ ("x3") = set[1]; - __asm__ __volatile__("casp %0, %1, %2, %3, [%4];" - "eor %0, %0, %5;" - "eor %1, %1, %6;" - "orr %0, %0, %1;" + __asm__ __volatile__("casp %0, %1, %2, %3, [%4]\n" + "eor %0, %0, %5\n" + "eor %1, %1, %6\n" + "orr %0, %0, %1\n" : "+&r" (x0), "+&r" (x1) : "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1]) : "memory"); @@ -99,7 +100,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) { \ *(T *)value = compare; \ __asm__ __volatile__( \ - "cas" W " %" R "0, %" R "2, [%1];" \ + "cas" W " %" R "0, %" R "2, [%1]\n"\ : "+&r" (*(T *)value) \ : "r" (target), \ "r" (set) \ @@ -111,7 +112,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) { \ T previous = compare; \ __asm__ __volatile__( \ - "cas" W " %" R "0, %" R "2, [%1];" \ + "cas" W " %" R "0, %" R "2, [%1]\n"\ : "+&r" (previous) \ : "r" (target), \ "r" (set) \ @@ -144,7 +145,7 @@ CK_PR_CAS_S(char, char, "b", "w") { \ T previous; \ __asm__ __volatile__( \ - "swp" W " %" R "2, %" R "0, [%1];" \ + "swp" W " %" R "2, %" R "0, [%1]\n"\ : "=&r" (previous) \ : "r" (target), \ "r" (v) \ @@ -169,8 +170,8 @@ CK_PR_FAS(char, char, char, "b", "w") CK_CC_INLINE static void \ ck_pr_##O##_##N(M *target) \ { \ - __asm__ __volatile__(I ";" \ - "st" S W " " R "0, [%0];" \ + __asm__ __volatile__(I "\n" \ + "st" S W " " R "0, [%0]\n" \ : \ : "r" (target) \ : "x0", "memory"); \ @@ -204,8 +205,8 @@ CK_PR_UNARY_S(char, char, "b") CK_CC_INLINE static void \ ck_pr_##O##_##N(M *target, T delta) \ { \ - __asm__ __volatile__(I ";" \ - "st" S W " %" R "0, [%1];" \ + __asm__ __volatile__(I "\n" \ + "st" S W " %" R "0, [%1]\n"\ : "+&r" (delta) \ : "r" (target) \ : "memory"); \ @@ -247,7 +248,7 @@ ck_pr_faa_ptr(void *target, uintptr_t delta) uintptr_t previous; __asm__ __volatile__( - "ldadd %2, %0, [%1];" + "ldadd %2, %0, [%1]\n" : "=r" (previous) : "r" (target), "r" (delta) @@ -262,7 +263,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) uint64_t previous; __asm__ __volatile__( - "ldadd %2, %0, [%1];" + "ldadd %2, %0, [%1]\n" : "=r" (previous) : "r" (target), "r" (delta) @@ -277,7 +278,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) { \ T previous; \ __asm__ __volatile__( \ - "ldadd" W " %w2, %w0, [%1];" \ + "ldadd" W " %w2, %w0, [%1]\n" \ : "=r" (previous) \ : "r" (target), \ "r" (delta) \ diff --git a/include/gcc/ck_cc.h b/include/gcc/ck_cc.h index a14a4b5..0a6d17b 100644 --- a/include/gcc/ck_cc.h +++ b/include/gcc/ck_cc.h @@ -39,6 +39,15 @@ #define CK_CC_UNUSED __attribute__((unused)) #define CK_CC_USED __attribute__((used)) #define CK_CC_IMM "i" + +#define CK_CC_CONTAINER(F, T, M, N) \ + CK_CC_INLINE static T * \ + N(F *p) \ + { \ + \ + return (T *)(void *)((char *)p - __builtin_offsetof(T, M)); \ + } + #if defined(__x86_64__) || defined(__x86__) #define CK_CC_IMM_U32 "Z" #define CK_CC_IMM_S32 "e" @@ -103,28 +112,26 @@ #define CK_CC_TYPEOF(X, DEFAULT) __typeof__(X) /* - * Portability wrappers for bitwise ops. + * Portability wrappers for bitwise operations. */ - +#ifndef CK_MD_CC_BUILTIN_DISABLE #define CK_F_CC_FFS -#define CK_F_CC_CLZ -#define CK_F_CC_CTZ -#define CK_F_CC_POPCOUNT - CK_CC_INLINE static int ck_cc_ffs(unsigned int x) { - return __builtin_ffs(x); + return __builtin_ffsl(x); } +#define CK_F_CC_FFSL CK_CC_INLINE static int -ck_cc_clz(unsigned int x) +ck_cc_ffsl(unsigned long x) { - return __builtin_clz(x); + return __builtin_ffsll(x); } +#define CK_F_CC_CTZ CK_CC_INLINE static int ck_cc_ctz(unsigned int x) { @@ -132,11 +139,12 @@ ck_cc_ctz(unsigned int x) return __builtin_ctz(x); } +#define CK_F_CC_POPCOUNT CK_CC_INLINE static int ck_cc_popcount(unsigned int x) { return __builtin_popcount(x); } - +#endif /* CK_MD_CC_BUILTIN_DISABLE */ #endif /* CK_GCC_CC_H */ diff --git a/include/gcc/ck_pr.h b/include/gcc/ck_pr.h index 084d423..108e983 100644 --- a/include/gcc/ck_pr.h +++ b/include/gcc/ck_pr.h @@ -80,7 +80,7 @@ ck_pr_md_load_ptr(const void *target) void *r; ck_pr_barrier(); - r = CK_CC_DECONST_PTR(CK_PR_ACCESS(target)); + r = CK_CC_DECONST_PTR(*(volatile void *const*)(target)); ck_pr_barrier(); return r; @@ -91,7 +91,7 @@ ck_pr_md_store_ptr(void *target, const void *v) { ck_pr_barrier(); - CK_PR_ACCESS(target) = CK_CC_DECONST_PTR(v); + *(volatile void **)target = CK_CC_DECONST_PTR(v); ck_pr_barrier(); return; } diff --git a/include/gcc/ppc/ck_pr.h b/include/gcc/ppc/ck_pr.h index cd7935d..73f0cb7 100644 --- a/include/gcc/ppc/ck_pr.h +++ b/include/gcc/ppc/ck_pr.h @@ -67,21 +67,29 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } -CK_PR_FENCE(atomic, "lwsync") -CK_PR_FENCE(atomic_store, "lwsync") +#ifdef CK_MD_PPC32_LWSYNC +#define CK_PR_LWSYNCOP "lwsync" +#else /* CK_MD_PPC32_LWSYNC_DISABLE */ +#define CK_PR_LWSYNCOP "sync" +#endif + +CK_PR_FENCE(atomic, CK_PR_LWSYNCOP) +CK_PR_FENCE(atomic_store, CK_PR_LWSYNCOP) CK_PR_FENCE(atomic_load, "sync") -CK_PR_FENCE(store_atomic, "lwsync") -CK_PR_FENCE(load_atomic, "lwsync") -CK_PR_FENCE(store, "lwsync") +CK_PR_FENCE(store_atomic, CK_PR_LWSYNCOP) +CK_PR_FENCE(load_atomic, CK_PR_LWSYNCOP) +CK_PR_FENCE(store, CK_PR_LWSYNCOP) CK_PR_FENCE(store_load, "sync") -CK_PR_FENCE(load, "lwsync") -CK_PR_FENCE(load_store, "lwsync") +CK_PR_FENCE(load, CK_PR_LWSYNCOP) +CK_PR_FENCE(load_store, CK_PR_LWSYNCOP) CK_PR_FENCE(memory, "sync") -CK_PR_FENCE(acquire, "lwsync") -CK_PR_FENCE(release, "lwsync") -CK_PR_FENCE(acqrel, "lwsync") -CK_PR_FENCE(lock, "lwsync") -CK_PR_FENCE(unlock, "lwsync") +CK_PR_FENCE(acquire, CK_PR_LWSYNCOP) +CK_PR_FENCE(release, CK_PR_LWSYNCOP) +CK_PR_FENCE(acqrel, CK_PR_LWSYNCOP) +CK_PR_FENCE(lock, CK_PR_LWSYNCOP) +CK_PR_FENCE(unlock, CK_PR_LWSYNCOP) + +#undef CK_PR_LWSYNCOP #undef CK_PR_FENCE diff --git a/include/gcc/s390x/ck_f_pr.h b/include/gcc/s390x/ck_f_pr.h new file mode 100644 index 0000000..cd54a28 --- /dev/null +++ b/include/gcc/s390x/ck_f_pr.h @@ -0,0 +1,97 @@ +/* DO NOT EDIT. This is auto-generated from feature.sh */ +#define CK_F_PR_ADD_32 +#define CK_F_PR_ADD_64 +#define CK_F_PR_ADD_INT +#define CK_F_PR_ADD_PTR +#define CK_F_PR_ADD_UINT +#define CK_F_PR_AND_32 +#define CK_F_PR_AND_64 +#define CK_F_PR_AND_INT +#define CK_F_PR_AND_PTR +#define CK_F_PR_AND_UINT +#define CK_F_PR_CAS_32 +#define CK_F_PR_CAS_32_VALUE +#define CK_F_PR_CAS_64 +#define CK_F_PR_CAS_64_VALUE +#define CK_F_PR_CAS_INT +#define CK_F_PR_CAS_INT_VALUE +#define CK_F_PR_CAS_PTR +#define CK_F_PR_CAS_PTR_VALUE +#define CK_F_PR_CAS_UINT +#define CK_F_PR_CAS_UINT_VALUE +#define CK_F_PR_DEC_32 +#define CK_F_PR_DEC_64 +#define CK_F_PR_DEC_INT +#define CK_F_PR_DEC_PTR +#define CK_F_PR_DEC_UINT +#define CK_F_PR_FAA_32 +#define CK_F_PR_FAA_64 +#define CK_F_PR_FAA_INT +#define CK_F_PR_FAA_PTR +#define CK_F_PR_FAA_UINT +#define CK_F_PR_FAS_32 +#define CK_F_PR_FAS_64 +#define CK_F_PR_FAS_INT +#define CK_F_PR_FAS_PTR +#define CK_F_PR_FAS_UINT +#define CK_F_PR_FAS_DOUBLE +#define CK_F_PR_FENCE_LOAD +#define CK_F_PR_FENCE_LOAD_DEPENDS +#define CK_F_PR_FENCE_MEMORY +#define CK_F_PR_FENCE_STORE +#define CK_F_PR_FENCE_STRICT_LOAD +#define CK_F_PR_FENCE_STRICT_LOAD_DEPENDS +#define CK_F_PR_FENCE_STRICT_MEMORY +#define CK_F_PR_FENCE_STRICT_STORE +#define CK_F_PR_INC_32 +#define CK_F_PR_INC_64 +#define CK_F_PR_INC_INT +#define CK_F_PR_INC_PTR +#define CK_F_PR_INC_UINT +#define CK_F_PR_LOAD_16 +#define CK_F_PR_LOAD_32 +#define CK_F_PR_LOAD_64 +#define CK_F_PR_LOAD_8 +#define CK_F_PR_LOAD_CHAR +#define CK_F_PR_LOAD_DOUBLE +#define CK_F_PR_LOAD_INT +#define CK_F_PR_LOAD_PTR +#define CK_F_PR_LOAD_SHORT +#define CK_F_PR_LOAD_UINT +#define CK_F_PR_NEG_32 +#define CK_F_PR_NEG_64 +#define CK_F_PR_NEG_INT +#define CK_F_PR_NEG_PTR +#define CK_F_PR_NEG_UINT +#define CK_F_PR_NOT_32 +#define CK_F_PR_NOT_64 +#define CK_F_PR_NOT_INT +#define CK_F_PR_NOT_PTR +#define CK_F_PR_NOT_UINT +#define CK_F_PR_OR_32 +#define CK_F_PR_OR_64 +#define CK_F_PR_OR_INT +#define CK_F_PR_OR_PTR +#define CK_F_PR_OR_UINT +#define CK_F_PR_STALL +#define CK_F_PR_STORE_16 +#define CK_F_PR_STORE_32 +#define CK_F_PR_STORE_64 +#define CK_F_PR_STORE_8 +#define CK_F_PR_STORE_CHAR +#define CK_F_PR_STORE_DOUBLE +#define CK_F_PR_STORE_INT +#define CK_F_PR_STORE_PTR +#define CK_F_PR_STORE_SHORT +#define CK_F_PR_STORE_UINT +#define CK_F_PR_SUB_32 +#define CK_F_PR_SUB_64 +#define CK_F_PR_SUB_INT +#define CK_F_PR_SUB_PTR +#define CK_F_PR_SUB_UINT +#define CK_F_PR_XOR_32 +#define CK_F_PR_XOR_64 +#define CK_F_PR_XOR_INT +#define CK_F_PR_XOR_PTR +#define CK_F_PR_XOR_UINT + diff --git a/include/gcc/s390x/ck_pr.h b/include/gcc/s390x/ck_pr.h new file mode 100644 index 0000000..8ad22b2 --- /dev/null +++ b/include/gcc/s390x/ck_pr.h @@ -0,0 +1,373 @@ +/* + * Copyright 2009-2015 Samy Al Bahra. + * Copyright 2017 Neale Ferguson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef CK_PR_S390X_H +#define CK_PR_S390X_H + +#ifndef CK_PR_H +#error Do not include this file directly, use ck_pr.h +#endif + +#include <ck_cc.h> +#include <ck_md.h> + +/* + * The following represent supported atomic operations. + * These operations may be emulated. + */ +#include "ck_f_pr.h" + +/* + * Minimum interface requirement met. + */ +#define CK_F_PR + +/* + * This bounces the hardware thread from low to medium + * priority. I am unsure of the benefits of this approach + * but it is used by the Linux kernel. + */ +CK_CC_INLINE static void +ck_pr_stall(void) +{ + __sync_synchronize(); + return; +} + +#define CK_PR_FENCE(T) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __sync_synchronize(); \ + } + +/* + * These are derived from: + * http://www.ibm.com/developerworks/systems/articles/powerpc.html + */ +CK_PR_FENCE(atomic) +CK_PR_FENCE(atomic_store) +CK_PR_FENCE(atomic_load) +CK_PR_FENCE(store_atomic) +CK_PR_FENCE(load_atomic) +CK_PR_FENCE(store) +CK_PR_FENCE(store_load) +CK_PR_FENCE(load) +CK_PR_FENCE(load_store) +CK_PR_FENCE(memory) +CK_PR_FENCE(acquire) +CK_PR_FENCE(release) +CK_PR_FENCE(acqrel) +CK_PR_FENCE(lock) +CK_PR_FENCE(unlock) + +#undef CK_PR_FENCE + +#define CK_PR_LOAD(S, M, T, C, I) \ + CK_CC_INLINE static T \ + ck_pr_md_load_##S(const M *target) \ + { \ + T r; \ + __asm__ __volatile__(I "\t%0, %1\n" \ + : "=r" (r) \ + : "Q" (*(const C *)target) \ + : "memory"); \ + return (r); \ + } + +CK_PR_LOAD(ptr, void, void *, uint64_t, "lg") + +#define CK_PR_LOAD_S(S, T, I) CK_PR_LOAD(S, T, T, T, I) + +CK_PR_LOAD_S(64, uint64_t, "lg") +CK_PR_LOAD_S(32, uint32_t, "llgf") +CK_PR_LOAD_S(16, uint16_t, "llgh") +CK_PR_LOAD_S(8, uint8_t, "llgc") +CK_PR_LOAD_S(uint, unsigned int, "llgf") +CK_PR_LOAD_S(int, int, "llgf") +CK_PR_LOAD_S(short, short, "lgh") +CK_PR_LOAD_S(char, char, "lgb") +#ifndef CK_PR_DISABLE_DOUBLE +CK_CC_INLINE static double +ck_pr_md_load_double(const double *target) +{ + double r; + __asm__ __volatile__("ld %0, %1\n" + : "=f" (r) + : "Q" (*(const double *)target) + : "memory"); + return (r); +} +#endif + +#undef CK_PR_LOAD_S +#undef CK_PR_LOAD + +#define CK_PR_STORE(S, M, T, C, I) \ + CK_CC_INLINE static void \ + ck_pr_md_store_##S(M *target, T v) \ + { \ + __asm__ __volatile__(I "\t%1, %0\n" \ + : "=Q" (*(C *)target) \ + : "r" (v) \ + : "memory"); \ + return; \ + } + +CK_PR_STORE(ptr, void, const void *, uint64_t, "stg") + +#define CK_PR_STORE_S(S, T, I) CK_PR_STORE(S, T, T, T, I) + +CK_PR_STORE_S(64, uint64_t, "stg") +CK_PR_STORE_S(32, uint32_t, "st") +CK_PR_STORE_S(16, uint16_t, "sth") +CK_PR_STORE_S(8, uint8_t, "stc") +CK_PR_STORE_S(uint, unsigned int, "st") +CK_PR_STORE_S(int, int, "st") +CK_PR_STORE_S(short, short, "sth") +CK_PR_STORE_S(char, char, "stc") +#ifndef CK_PR_DISABLE_DOUBLE +CK_CC_INLINE static void +ck_pr_md_store_double(double *target, double v) +{ + __asm__ __volatile__(" std %1, %0\n" + : "=Q" (*(double *)target) + : "f" (v) + : "0", "memory"); +} +#endif + +#undef CK_PR_STORE_S +#undef CK_PR_STORE + +CK_CC_INLINE static bool +ck_pr_cas_64_value(uint64_t *target, uint64_t compare, uint64_t set, uint64_t *value) +{ + *value = __sync_val_compare_and_swap(target,compare,set); + return (*value == compare); +} + +CK_CC_INLINE static bool +ck_pr_cas_ptr_value(void *target, void *compare, void *set, void *value) +{ + uintptr_t previous; + + previous = __sync_val_compare_and_swap((uintptr_t *) target, + (uintptr_t) compare, + (uintptr_t) set); + *((uintptr_t *) value) = previous; + return (previous == (uintptr_t) compare); +} + +CK_CC_INLINE static bool +ck_pr_cas_64(uint64_t *target, uint64_t compare, uint64_t set) +{ + return(__sync_bool_compare_and_swap(target,compare,set)); +} + +CK_CC_INLINE static bool +ck_pr_cas_ptr(void *target, void *compare, void *set) +{ + return(__sync_bool_compare_and_swap((uintptr_t *) target, + (uintptr_t) compare, + (uintptr_t) set)); +} + +#define CK_PR_CAS(N, T) \ + CK_CC_INLINE static bool \ + ck_pr_cas_##N##_value(T *target, T compare, T set, T *value) \ + { \ + *value = __sync_val_compare_and_swap(target, \ + compare, \ + set); \ + return(*value == compare); \ + } \ + CK_CC_INLINE static bool \ + ck_pr_cas_##N(T *target, T compare, T set) \ + { \ + return(__sync_bool_compare_and_swap(target, \ + compare, \ + set)); \ + } + +CK_PR_CAS(32, uint32_t) +CK_PR_CAS(uint, unsigned int) +CK_PR_CAS(int, int) + +#undef CK_PR_CAS + +CK_CC_INLINE static void * +ck_pr_fas_ptr(void *target, void *v) +{ + return((void *)__atomic_exchange_n((uintptr_t *) target, (uintptr_t) v, __ATOMIC_ACQUIRE)); +} + +#define CK_PR_FAS(N, M, T) \ + CK_CC_INLINE static T \ + ck_pr_fas_##N(M *target, T v) \ + { \ + return(__atomic_exchange_n(target, v, __ATOMIC_ACQUIRE)); \ + } + +CK_PR_FAS(64, uint64_t, uint64_t) +CK_PR_FAS(32, uint32_t, uint32_t) +CK_PR_FAS(int, int, int) +CK_PR_FAS(uint, unsigned int, unsigned int) + +#ifndef CK_PR_DISABLE_DOUBLE +CK_CC_INLINE static double +ck_pr_fas_double(double *target, double *v) +{ + double previous; + + __asm__ __volatile__ (" lg 1,%2\n" + "0: lg 0,%1\n" + " csg 0,1,%1\n" + " jnz 0b\n" + " ldgr %0,0\n" + : "=f" (previous) + : "Q" (target), "Q" (v) + : "0", "1", "cc", "memory"); + return (previous); +} +#endif + +#undef CK_PR_FAS + +/* + * Atomic store-only binary operations. + */ +#define CK_PR_BINARY(K, S, M, T) \ + CK_CC_INLINE static void \ + ck_pr_##K##_##S(M *target, T d) \ + { \ + d = __sync_fetch_and_##K((T *)target, d); \ + return; \ + } + +#define CK_PR_BINARY_S(K, S, T) CK_PR_BINARY(K, S, T, T) + +#define CK_PR_GENERATE(K) \ + CK_PR_BINARY(K, ptr, void, void *) \ + CK_PR_BINARY_S(K, char, char) \ + CK_PR_BINARY_S(K, int, int) \ + CK_PR_BINARY_S(K, uint, unsigned int) \ + CK_PR_BINARY_S(K, 64, uint64_t) \ + CK_PR_BINARY_S(K, 32, uint32_t) \ + CK_PR_BINARY_S(K, 16, uint16_t) \ + CK_PR_BINARY_S(K, 8, uint8_t) + +CK_PR_GENERATE(add) +CK_PR_GENERATE(sub) +CK_PR_GENERATE(and) +CK_PR_GENERATE(or) +CK_PR_GENERATE(xor) + +#undef CK_PR_GENERATE +#undef CK_PR_BINARY_S +#undef CK_PR_BINARY + +#define CK_PR_UNARY(S, M, T) \ + CK_CC_INLINE static void \ + ck_pr_inc_##S(M *target) \ + { \ + ck_pr_add_##S(target, (T)1); \ + return; \ + } \ + CK_CC_INLINE static void \ + ck_pr_dec_##S(M *target) \ + { \ + ck_pr_sub_##S(target, (T)1); \ + return; \ + } + +#define CK_PR_UNARY_X(S, M) \ + CK_CC_INLINE static void \ + ck_pr_not_##S(M *target) \ + { \ + M newval; \ + do { \ + newval = ~(*target); \ + } while (!__sync_bool_compare_and_swap(target, \ + *target, \ + newval)); \ + } \ + CK_CC_INLINE static void \ + ck_pr_neg_##S(M *target) \ + { \ + M newval; \ + do { \ + newval = -(*target); \ + } while (!__sync_bool_compare_and_swap(target, \ + *target, \ + newval)); \ + } + +#define CK_PR_UNARY_S(S, M) CK_PR_UNARY(S, M, M) \ + CK_PR_UNARY_X(S, M) + +CK_PR_UNARY(ptr, void, void *) +CK_PR_UNARY_S(char, char) +CK_PR_UNARY_S(int, int) +CK_PR_UNARY_S(uint, unsigned int) +CK_PR_UNARY_S(64, uint64_t) +CK_PR_UNARY_S(32, uint32_t) +CK_PR_UNARY_S(16, uint16_t) +CK_PR_UNARY_S(8, uint8_t) + +#undef CK_PR_UNARY_S +#undef CK_PR_UNARY + +CK_CC_INLINE static void * +ck_pr_faa_ptr(void *target, uintptr_t delta) +{ + uintptr_t previous; + + previous = __sync_fetch_and_add((uintptr_t *) target, delta); + + return (void *)(previous); +} + +#define CK_PR_FAA(S, T) \ + CK_CC_INLINE static T \ + ck_pr_faa_##S(T *target, T delta) \ + { \ + T previous; \ + \ + previous = __sync_fetch_and_add(target, delta); \ + \ + return (previous); \ + } + +CK_PR_FAA(64, uint64_t) +CK_PR_FAA(32, uint32_t) +CK_PR_FAA(uint, unsigned int) +CK_PR_FAA(int, int) + +#undef CK_PR_FAA + +#endif /* CK_PR_S390X_H */ diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index 767af6a..b60e199 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ -76,7 +76,7 @@ CK_PR_FENCE(store, "membar #StoreStore") CK_PR_FENCE(store_load, "membar #StoreLoad") CK_PR_FENCE(load, "membar #LoadLoad") CK_PR_FENCE(load_store, "membar #LoadStore") -CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad") +CK_PR_FENCE(memory, "membar #MemIssue") CK_PR_FENCE(acquire, "membar #LoadLoad | #LoadStore") CK_PR_FENCE(release, "membar #LoadStore | #StoreStore") CK_PR_FENCE(acqrel, "membar #LoadLoad | #LoadStore | #StoreStore") @@ -136,11 +136,26 @@ CK_PR_STORE_S(int, int, "stsw") #undef CK_PR_STORE_S #undef CK_PR_STORE +/* Use the appropriate address space for atomics within the FreeBSD kernel. */ +#if defined(__FreeBSD__) && defined(_KERNEL) +#include <sys/cdefs.h> +#include <machine/atomic.h> +#define CK_PR_INS_CAS "casa" +#define CK_PR_INS_CASX "casxa" +#define CK_PR_INS_SWAP "swapa" +#define CK_PR_ASI_ATOMIC __XSTRING(__ASI_ATOMIC) +#else +#define CK_PR_INS_CAS "cas" +#define CK_PR_INS_CASX "casx" +#define CK_PR_INS_SWAP "swap" +#define CK_PR_ASI_ATOMIC "" +#endif + CK_CC_INLINE static bool ck_pr_cas_64_value(uint64_t *target, uint64_t compare, uint64_t set, uint64_t *value) { - __asm__ __volatile__("casx [%1], %2, %0" + __asm__ __volatile__(CK_PR_INS_CASX " [%1] " CK_PR_ASI_ATOMIC ", %2, %0" : "+&r" (set) : "r" (target), "r" (compare) @@ -154,7 +169,7 @@ CK_CC_INLINE static bool ck_pr_cas_64(uint64_t *target, uint64_t compare, uint64_t set) { - __asm__ __volatile__("casx [%1], %2, %0" + __asm__ __volatile__(CK_PR_INS_CASX " [%1] " CK_PR_ASI_ATOMIC ", %2, %0" : "+&r" (set) : "r" (target), "r" (compare) @@ -181,7 +196,7 @@ ck_pr_cas_ptr_value(void *target, void *compare, void *set, void *previous) CK_CC_INLINE static bool \ ck_pr_cas_##N##_value(T *target, T compare, T set, T *value) \ { \ - __asm__ __volatile__("cas [%1], %2, %0" \ + __asm__ __volatile__(CK_PR_INS_CAS " [%1] " CK_PR_ASI_ATOMIC ", %2, %0" \ : "+&r" (set) \ : "r" (target), \ "r" (compare) \ @@ -192,7 +207,7 @@ ck_pr_cas_ptr_value(void *target, void *compare, void *set, void *previous) CK_CC_INLINE static bool \ ck_pr_cas_##N(T *target, T compare, T set) \ { \ - __asm__ __volatile__("cas [%1], %2, %0" \ + __asm__ __volatile__(CK_PR_INS_CAS " [%1] " CK_PR_ASI_ATOMIC ", %2, %0" \ : "+&r" (set) \ : "r" (target), \ "r" (compare) \ @@ -211,7 +226,7 @@ CK_PR_CAS(int, int) ck_pr_fas_##N(T *target, T update) \ { \ \ - __asm__ __volatile__("swap [%1], %0" \ + __asm__ __volatile__(CK_PR_INS_SWAP " [%1] " CK_PR_ASI_ATOMIC ", %0" \ : "+&r" (update) \ : "r" (target) \ : "memory"); \ @@ -224,5 +239,10 @@ CK_PR_FAS(32, uint32_t) #undef CK_PR_FAS +#undef CK_PR_INS_CAS +#undef CK_PR_INS_CASX +#undef CK_PR_INS_SWAP +#undef CK_PR_ASI_ATOMIC + #endif /* CK_PR_SPARCV9_H */ diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index a04cebf..5194dee 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -45,15 +45,9 @@ /* Minimum requirements for the CK_PR interface are met. */ #define CK_F_PR -#ifdef CK_MD_UMP -#define CK_PR_LOCK_PREFIX -#else -#define CK_PR_LOCK_PREFIX "lock " -#endif - /* - * Prevent speculative execution in busy-wait loops (P4 <=) - * or "predefined delay". + * Prevent speculative execution in busy-wait loops (P4 <=) or "predefined + * delay". */ CK_CC_INLINE static void ck_pr_stall(void) @@ -62,28 +56,52 @@ ck_pr_stall(void) return; } +#ifdef CK_MD_UMP +#define CK_PR_LOCK_PREFIX +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__("" ::: "memory"); \ + return; \ + } +#else +#define CK_PR_LOCK_PREFIX "lock " #define CK_PR_FENCE(T, I) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ { \ __asm__ __volatile__(I ::: "memory"); \ + return; \ } +#endif /* CK_MD_UMP */ -CK_PR_FENCE(atomic, "sfence") -CK_PR_FENCE(atomic_store, "sfence") -CK_PR_FENCE(atomic_load, "mfence") -CK_PR_FENCE(store_atomic, "sfence") -CK_PR_FENCE(load_atomic, "mfence") -CK_PR_FENCE(load, "lfence") -CK_PR_FENCE(load_store, "mfence") -CK_PR_FENCE(store, "sfence") -CK_PR_FENCE(store_load, "mfence") -CK_PR_FENCE(memory, "mfence") -CK_PR_FENCE(release, "mfence") -CK_PR_FENCE(acquire, "mfence") -CK_PR_FENCE(acqrel, "mfence") -CK_PR_FENCE(lock, "mfence") -CK_PR_FENCE(unlock, "mfence") +#if defined(CK_MD_SSE_DISABLE) +/* If SSE is disabled, then use atomic operations for serialization. */ +#define CK_MD_X86_MFENCE "lock addl $0, (%%esp)" +#define CK_MD_X86_SFENCE CK_MD_X86_MFENCE +#define CK_MD_X86_LFENCE CK_MD_X86_MFENCE +#else +#define CK_MD_X86_SFENCE "sfence" +#define CK_MD_X86_LFENCE "lfence" +#define CK_MD_X86_MFENCE "mfence" +#endif /* !CK_MD_SSE_DISABLE */ + +CK_PR_FENCE(atomic, "") +CK_PR_FENCE(atomic_store, "") +CK_PR_FENCE(atomic_load, "") +CK_PR_FENCE(store_atomic, "") +CK_PR_FENCE(load_atomic, "") +CK_PR_FENCE(load, CK_MD_X86_LFENCE) +CK_PR_FENCE(load_store, CK_MD_X86_MFENCE) +CK_PR_FENCE(store, CK_MD_X86_SFENCE) +CK_PR_FENCE(store_load, CK_MD_X86_MFENCE) +CK_PR_FENCE(memory, CK_MD_X86_MFENCE) +CK_PR_FENCE(release, CK_MD_X86_MFENCE) +CK_PR_FENCE(acquire, CK_MD_X86_MFENCE) +CK_PR_FENCE(acqrel, CK_MD_X86_MFENCE) +CK_PR_FENCE(lock, CK_MD_X86_MFENCE) +CK_PR_FENCE(unlock, CK_MD_X86_MFENCE) #undef CK_PR_FENCE @@ -215,18 +233,18 @@ CK_PR_FAA_S(8, uint8_t, "xaddb") } #define CK_PR_UNARY_V(K, S, T, C, I) \ - CK_CC_INLINE static void \ - ck_pr_##K##_##S##_zero(T *target, bool *r) \ + CK_CC_INLINE static bool \ + ck_pr_##K##_##S##_is_zero(T *target) \ { \ + bool ret; \ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0; setz %1" \ : "+m" (*(C *)target), \ - "=m" (*r) \ + "=qm" (ret) \ : \ : "memory", "cc"); \ - return; \ + return ret; \ } - #define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I) #define CK_PR_GENERATE(K) \ @@ -289,8 +307,38 @@ CK_PR_GENERATE(xor) #undef CK_PR_BINARY /* - * Atomic compare and swap. + * Atomic compare and swap, with a variant that sets *v to the old value of target. */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ +#define CK_PR_CAS(S, M, T, C, I) \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S(M *target, T compare, T set) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + /* RAX is clobbered by cmpxchg. */ \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ + } +#else #define CK_PR_CAS(S, M, T, C, I) \ CK_CC_INLINE static bool \ ck_pr_cas_##S(M *target, T compare, T set) \ @@ -303,7 +351,23 @@ CK_PR_GENERATE(xor) "a" (compare) \ : "memory", "cc"); \ return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + "setz %1;" \ + : "+m" (*(C *)target), \ + "=q" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ } +#endif CK_PR_CAS(ptr, void, void *, char, "cmpxchgl") @@ -320,41 +384,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb") #undef CK_PR_CAS /* - * Compare and swap, set *v to old value of target. - */ -#define CK_PR_CAS_O(S, M, T, C, I, R) \ - CK_CC_INLINE static bool \ - ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ - { \ - bool z; \ - __asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \ - "mov %% " R ", %2;" \ - "setz %1;" \ - : "+m" (*(C *)target), \ - "=a" (z), \ - "=m" (*(C *)v) \ - : "q" (set), \ - "a" (compare) \ - : "memory", "cc"); \ - return (bool)z; \ - } - -CK_PR_CAS_O(ptr, void, void *, char, "l", "eax") - -#define CK_PR_CAS_O_S(S, T, I, R) \ - CK_PR_CAS_O(S, T, T, T, I, R) - -CK_PR_CAS_O_S(char, char, "b", "al") -CK_PR_CAS_O_S(int, int, "l", "eax") -CK_PR_CAS_O_S(uint, unsigned int, "l", "eax") -CK_PR_CAS_O_S(32, uint32_t, "l", "eax") -CK_PR_CAS_O_S(16, uint16_t, "w", "ax") -CK_PR_CAS_O_S(8, uint8_t, "b", "al") - -#undef CK_PR_CAS_O_S -#undef CK_PR_CAS_O - -/* * Atomic bit test operations. */ #define CK_PR_BT(K, S, T, P, C, I) \ diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h index 532d593..4222729 100644 --- a/include/gcc/x86_64/ck_pr.h +++ b/include/gcc/x86_64/ck_pr.h @@ -58,8 +58,8 @@ #endif /* - * Prevent speculative execution in busy-wait loops (P4 <=) - * or "predefined delay". + * Prevent speculative execution in busy-wait loops (P4 <=) or "predefined + * delay". */ CK_CC_INLINE static void ck_pr_stall(void) @@ -75,18 +75,39 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } -CK_PR_FENCE(atomic, "sfence") -CK_PR_FENCE(atomic_store, "sfence") -CK_PR_FENCE(atomic_load, "mfence") -CK_PR_FENCE(store_atomic, "sfence") -CK_PR_FENCE(load_atomic, "mfence") +/* Atomic operations are always serializing. */ +CK_PR_FENCE(atomic, "") +CK_PR_FENCE(atomic_store, "") +CK_PR_FENCE(atomic_load, "") +CK_PR_FENCE(store_atomic, "") +CK_PR_FENCE(load_atomic, "") + +/* Traditional fence interface. */ CK_PR_FENCE(load, "lfence") CK_PR_FENCE(load_store, "mfence") CK_PR_FENCE(store, "sfence") CK_PR_FENCE(store_load, "mfence") CK_PR_FENCE(memory, "mfence") + +/* Below are stdatomic-style fences. */ + +/* + * Provides load-store and store-store ordering. However, Intel specifies that + * the WC memory model is relaxed. It is likely an sfence *is* sufficient (in + * particular, stores are not re-ordered with respect to prior loads and it is + * really just the stores that are subject to re-ordering). However, we take + * the conservative route as the manuals are too ambiguous for my taste. + */ CK_PR_FENCE(release, "mfence") + +/* + * Provides load-load and load-store ordering. The lfence instruction ensures + * all prior load operations are complete before any subsequent instructions + * actually begin execution. However, the manual also ends up going to describe + * WC memory as a relaxed model. + */ CK_PR_FENCE(acquire, "mfence") + CK_PR_FENCE(acqrel, "mfence") CK_PR_FENCE(lock, "mfence") CK_PR_FENCE(unlock, "mfence") @@ -311,18 +332,18 @@ CK_PR_FAA_S(8, uint8_t, "xaddb") } #define CK_PR_UNARY_V(K, S, T, C, I) \ - CK_CC_INLINE static void \ - ck_pr_##K##_##S##_zero(T *target, bool *r) \ + CK_CC_INLINE static bool \ + ck_pr_##K##_##S##_is_zero(T *target) \ { \ + bool ret; \ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0; setz %1" \ : "+m" (*(C *)target), \ - "=m" (*r) \ + "=rm" (ret) \ : \ : "memory", "cc"); \ - return; \ + return ret; \ } - #define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I) #define CK_PR_GENERATE(K) \ @@ -387,8 +408,38 @@ CK_PR_GENERATE(xor) #undef CK_PR_BINARY /* - * Atomic compare and swap. + * Atomic compare and swap, with a variant that sets *v to the old value of target. */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ +#define CK_PR_CAS(S, M, T, C, I) \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S(M *target, T compare, T set) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + /* RAX is clobbered by cmpxchg. */ \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ + } +#else #define CK_PR_CAS(S, M, T, C, I) \ CK_CC_INLINE static bool \ ck_pr_cas_##S(M *target, T compare, T set) \ @@ -401,7 +452,23 @@ CK_PR_GENERATE(xor) "a" (compare) \ : "memory", "cc"); \ return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + "setz %1;" \ + : "+m" (*(C *)target), \ + "=q" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ } +#endif CK_PR_CAS(ptr, void, void *, char, "cmpxchgq") @@ -422,45 +489,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb") #undef CK_PR_CAS /* - * Compare and swap, set *v to old value of target. - */ -#define CK_PR_CAS_O(S, M, T, C, I, R) \ - CK_CC_INLINE static bool \ - ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ - { \ - bool z; \ - __asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \ - "mov %% " R ", %2;" \ - "setz %1;" \ - : "+m" (*(C *)target), \ - "=a" (z), \ - "=m" (*(C *)v) \ - : "q" (set), \ - "a" (compare) \ - : "memory", "cc"); \ - return z; \ - } - -CK_PR_CAS_O(ptr, void, void *, char, "q", "rax") - -#define CK_PR_CAS_O_S(S, T, I, R) \ - CK_PR_CAS_O(S, T, T, T, I, R) - -CK_PR_CAS_O_S(char, char, "b", "al") -CK_PR_CAS_O_S(int, int, "l", "eax") -CK_PR_CAS_O_S(uint, unsigned int, "l", "eax") -#ifndef CK_PR_DISABLE_DOUBLE -CK_PR_CAS_O_S(double, double, "q", "rax") -#endif -CK_PR_CAS_O_S(64, uint64_t, "q", "rax") -CK_PR_CAS_O_S(32, uint32_t, "l", "eax") -CK_PR_CAS_O_S(16, uint16_t, "w", "ax") -CK_PR_CAS_O_S(8, uint8_t, "b", "al") - -#undef CK_PR_CAS_O_S -#undef CK_PR_CAS_O - -/* * Contrary to C-interface, alignment requirements are that of uint64_t[2]. */ CK_CC_INLINE static bool diff --git a/include/spinlock/dec.h b/include/spinlock/dec.h index 11d36dd..3e36bf7 100644 --- a/include/spinlock/dec.h +++ b/include/spinlock/dec.h @@ -111,7 +111,8 @@ ck_spinlock_dec_lock_eb(struct ck_spinlock_dec *lock) if (r == true) break; - ck_backoff_eb(&backoff); + while (ck_pr_load_uint(&lock->value) != 1) + ck_backoff_eb(&backoff); } ck_pr_fence_lock(); diff --git a/include/spinlock/fas.h b/include/spinlock/fas.h index 4e6c123..bfe91fe 100644 --- a/include/spinlock/fas.h +++ b/include/spinlock/fas.h @@ -77,10 +77,11 @@ CK_CC_INLINE static void ck_spinlock_fas_lock(struct ck_spinlock_fas *lock) { - while (ck_pr_fas_uint(&lock->value, true) == true) { - while (ck_pr_load_uint(&lock->value) == true) - ck_pr_stall(); - } + while (CK_CC_UNLIKELY(ck_pr_fas_uint(&lock->value, true) == true)) { + do { + ck_pr_stall(); + } while (ck_pr_load_uint(&lock->value) == true); + } ck_pr_fence_lock(); return; diff --git a/include/spinlock/hclh.h b/include/spinlock/hclh.h index 296448b..ece56c6 100644 --- a/include/spinlock/hclh.h +++ b/include/spinlock/hclh.h @@ -81,6 +81,8 @@ ck_spinlock_hclh_lock(struct ck_spinlock_hclh **glob_queue, thread->wait = true; thread->splice = false; thread->cluster_id = (*local_queue)->cluster_id; + /* Make sure previous->previous doesn't appear to be NULL */ + thread->previous = *local_queue; /* Serialize with respect to update of local queue. */ ck_pr_fence_store_atomic(); @@ -91,13 +93,15 @@ ck_spinlock_hclh_lock(struct ck_spinlock_hclh **glob_queue, /* Wait until previous thread from the local queue is done with lock. */ ck_pr_fence_load(); - if (previous->previous != NULL && - previous->cluster_id == thread->cluster_id) { - while (ck_pr_load_uint(&previous->wait) == true) + if (previous->previous != NULL) { + while (ck_pr_load_uint(&previous->wait) == true && + ck_pr_load_int(&previous->cluster_id) == thread->cluster_id && + ck_pr_load_uint(&previous->splice) == false) ck_pr_stall(); /* We're head of the global queue, we're done */ - if (ck_pr_load_uint(&previous->splice) == false) + if (ck_pr_load_int(&previous->cluster_id) == thread->cluster_id && + ck_pr_load_uint(&previous->splice) == false) return; } |