summaryrefslogtreecommitdiffstats
path: root/src/include/port/atomics/arch-x86.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/port/atomics/arch-x86.h')
-rw-r--r--src/include/port/atomics/arch-x86.h252
1 files changed, 252 insertions, 0 deletions
diff --git a/src/include/port/atomics/arch-x86.h b/src/include/port/atomics/arch-x86.h
new file mode 100644
index 0000000..13cc4f5
--- /dev/null
+++ b/src/include/port/atomics/arch-x86.h
@@ -0,0 +1,252 @@
+/*-------------------------------------------------------------------------
+ *
+ * arch-x86.h
+ * Atomic operations considerations specific to intel x86
+ *
+ * Note that we actually require a 486 upwards because the 386 doesn't have
+ * support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
+ * anymore that's not much of a restriction luckily.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * NOTES:
+ *
+ * src/include/port/atomics/arch-x86.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
+ * or stores to be reordered with other stores, but a load can be performed
+ * before a subsequent store.
+ *
+ * Technically, some x86-ish chips support uncached memory access and/or
+ * special instructions that are weakly ordered. In those cases we'd need
+ * the read and write barriers to be lfence and sfence. But since we don't
+ * do those things, a compiler barrier should be enough.
+ *
+ * "lock; addl" has worked for longer than "mfence". It's also rumored to be
+ * faster in many scenarios.
+ */
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#if defined(__i386__) || defined(__i386)
+#define pg_memory_barrier_impl() \
+ __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
+#elif defined(__x86_64__)
+#define pg_memory_barrier_impl() \
+ __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
+#endif
+#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+
+#define pg_read_barrier_impl() pg_compiler_barrier_impl()
+#define pg_write_barrier_impl() pg_compiler_barrier_impl()
+
+/*
+ * Provide implementation for atomics using inline assembly on x86 gcc. It's
+ * nice to support older gcc's and the compare/exchange implementation here is
+ * actually more efficient than the * __sync variant.
+ */
+#if defined(HAVE_ATOMICS)
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+
+#define PG_HAVE_ATOMIC_FLAG_SUPPORT
+typedef struct pg_atomic_flag
+{
+ volatile char value;
+} pg_atomic_flag;
+
+#define PG_HAVE_ATOMIC_U32_SUPPORT
+typedef struct pg_atomic_uint32
+{
+ volatile uint32 value;
+} pg_atomic_uint32;
+
+/*
+ * It's too complicated to write inline asm for 64bit types on 32bit and the
+ * 486 can't do it anyway.
+ */
+#ifdef __x86_64__
+#define PG_HAVE_ATOMIC_U64_SUPPORT
+typedef struct pg_atomic_uint64
+{
+ /* alignment guaranteed due to being on a 64bit platform */
+ volatile uint64 value;
+} pg_atomic_uint64;
+#endif /* __x86_64__ */
+
+#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+
+#endif /* defined(HAVE_ATOMICS) */
+
+#if !defined(PG_HAVE_SPIN_DELAY)
+/*
+ * This sequence is equivalent to the PAUSE instruction ("rep" is
+ * ignored by old IA32 processors if the following instruction is
+ * not a string operation); the IA-32 Architecture Software
+ * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
+ * PAUSE in the inner loop of a spin lock is necessary for good
+ * performance:
+ *
+ * The PAUSE instruction improves the performance of IA-32
+ * processors supporting Hyper-Threading Technology when
+ * executing spin-wait loops and other routines where one
+ * thread is accessing a shared lock or semaphore in a tight
+ * polling loop. When executing a spin-wait loop, the
+ * processor can suffer a severe performance penalty when
+ * exiting the loop because it detects a possible memory order
+ * violation and flushes the core processor's pipeline. The
+ * PAUSE instruction provides a hint to the processor that the
+ * code sequence is a spin-wait loop. The processor uses this
+ * hint to avoid the memory order violation and prevent the
+ * pipeline flush. In addition, the PAUSE instruction
+ * de-pipelines the spin-wait loop to prevent it from
+ * consuming execution resources excessively.
+ */
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define PG_HAVE_SPIN_DELAY
+static __inline__ void
+pg_spin_delay_impl(void)
+{
+ __asm__ __volatile__(" rep; nop \n");
+}
+#elif defined(_MSC_VER) && defined(__x86_64__)
+#define PG_HAVE_SPIN_DELAY
+static __forceinline void
+pg_spin_delay_impl(void)
+{
+ _mm_pause();
+}
+#elif defined(_MSC_VER)
+#define PG_HAVE_SPIN_DELAY
+static __forceinline void
+pg_spin_delay_impl(void)
+{
+ /* See comment for gcc code. Same code, MASM syntax */
+ __asm rep nop;
+}
+#endif
+#endif /* !defined(PG_HAVE_SPIN_DELAY) */
+
+
+#if defined(HAVE_ATOMICS)
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+
+#define PG_HAVE_ATOMIC_TEST_SET_FLAG
+static inline bool
+pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
+{
+ register char _res = 1;
+
+ __asm__ __volatile__(
+ " lock \n"
+ " xchgb %0,%1 \n"
+: "+q"(_res), "+m"(ptr->value)
+:
+: "memory");
+ return _res == 0;
+}
+
+#define PG_HAVE_ATOMIC_CLEAR_FLAG
+static inline void
+pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
+{
+ /*
+ * On a TSO architecture like x86 it's sufficient to use a compiler
+ * barrier to achieve release semantics.
+ */
+ __asm__ __volatile__("" ::: "memory");
+ ptr->value = 0;
+}
+
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
+static inline bool
+pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
+ uint32 *expected, uint32 newval)
+{
+ char ret;
+
+ /*
+ * Perform cmpxchg and use the zero flag which it implicitly sets when
+ * equal to measure the success.
+ */
+ __asm__ __volatile__(
+ " lock \n"
+ " cmpxchgl %4,%5 \n"
+ " setz %2 \n"
+: "=a" (*expected), "=m"(ptr->value), "=q" (ret)
+: "a" (*expected), "r" (newval), "m"(ptr->value)
+: "memory", "cc");
+ return (bool) ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U32
+static inline uint32
+pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
+{
+ uint32 res;
+ __asm__ __volatile__(
+ " lock \n"
+ " xaddl %0,%1 \n"
+: "=q"(res), "=m"(ptr->value)
+: "0" (add_), "m"(ptr->value)
+: "memory", "cc");
+ return res;
+}
+
+#ifdef __x86_64__
+
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
+static inline bool
+pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
+ uint64 *expected, uint64 newval)
+{
+ char ret;
+
+ /*
+ * Perform cmpxchg and use the zero flag which it implicitly sets when
+ * equal to measure the success.
+ */
+ __asm__ __volatile__(
+ " lock \n"
+ " cmpxchgq %4,%5 \n"
+ " setz %2 \n"
+: "=a" (*expected), "=m"(ptr->value), "=q" (ret)
+: "a" (*expected), "r" (newval), "m"(ptr->value)
+: "memory", "cc");
+ return (bool) ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U64
+static inline uint64
+pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
+{
+ uint64 res;
+ __asm__ __volatile__(
+ " lock \n"
+ " xaddq %0,%1 \n"
+: "=q"(res), "=m"(ptr->value)
+: "0" (add_), "m"(ptr->value)
+: "memory", "cc");
+ return res;
+}
+
+#endif /* __x86_64__ */
+
+#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+
+/*
+ * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
+ * since at least the 586. As well as on all x86-64 cpus.
+ */
+#if defined(__i568__) || defined(__i668__) || /* gcc i586+ */ \
+ (defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
+ defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
+#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
+#endif /* 8 byte single-copy atomicity */
+
+#endif /* HAVE_ATOMICS */