1 files changed, 252 insertions, 0 deletions
diff --git a/src/include/port/atomics/arch-x86.h b/src/include/port/atomics/arch-x86.h
new file mode 100644
index 0000000..13cc4f5
--- /dev/null
+++ b/src/include/port/atomics/arch-x86.h
@@ -0,0 +1,252 @@
+/*-------------------------------------------------------------------------
+ *
+ * arch-x86.h
+ *	  Atomic operations considerations specific to intel x86
+ *
+ * Note that we actually require a 486 upwards because the 386 doesn't have
+ * support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
+ * anymore that's not much of a restriction luckily.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * NOTES:
+ *
+ * src/include/port/atomics/arch-x86.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
+ * or stores to be reordered with other stores, but a load can be performed
+ * before a subsequent store.
+ *
+ * Technically, some x86-ish chips support uncached memory access and/or
+ * special instructions that are weakly ordered.  In those cases we'd need
+ * the read and write barriers to be lfence and sfence.  But since we don't
+ * do those things, a compiler barrier should be enough.
+ *
+ * "lock; addl" has worked for longer than "mfence". It's also rumored to be
+ * faster in many scenarios.
+ */
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#if defined(__i386__) || defined(__i386)
+#define pg_memory_barrier_impl()		\
+	__asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
+#elif defined(__x86_64__)
+#define pg_memory_barrier_impl()		\
+	__asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
+#endif
+#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+
+#define pg_read_barrier_impl()		pg_compiler_barrier_impl()
+#define pg_write_barrier_impl()		pg_compiler_barrier_impl()
+
+/*
+ * Provide implementation for atomics using inline assembly on x86 gcc. It's
+ * nice to support older gcc's and the compare/exchange implementation here is
+ * actually more efficient than the * __sync variant.
+ */
+#if defined(HAVE_ATOMICS)
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+
+#define PG_HAVE_ATOMIC_FLAG_SUPPORT
+typedef struct pg_atomic_flag
+{
+	volatile char value;
+} pg_atomic_flag;
+
+#define PG_HAVE_ATOMIC_U32_SUPPORT
+typedef struct pg_atomic_uint32
+{
+	volatile uint32 value;
+} pg_atomic_uint32;
+
+/*
+ * It's too complicated to write inline asm for 64bit types on 32bit and the
+ * 486 can't do it anyway.
+ */
+#ifdef __x86_64__
+#define PG_HAVE_ATOMIC_U64_SUPPORT
+typedef struct pg_atomic_uint64
+{
+	/* alignment guaranteed due to being on a 64bit platform */
+	volatile uint64 value;
+} pg_atomic_uint64;
+#endif	/* __x86_64__ */
+
+#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+
+#endif /* defined(HAVE_ATOMICS) */
+
+#if !defined(PG_HAVE_SPIN_DELAY)
+/*
+ * This sequence is equivalent to the PAUSE instruction ("rep" is
+ * ignored by old IA32 processors if the following instruction is
+ * not a string operation); the IA-32 Architecture Software
+ * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
+ * PAUSE in the inner loop of a spin lock is necessary for good
+ * performance:
+ *
+ *     The PAUSE instruction improves the performance of IA-32
+ *     processors supporting Hyper-Threading Technology when
+ *     executing spin-wait loops and other routines where one
+ *     thread is accessing a shared lock or semaphore in a tight
+ *     polling loop. When executing a spin-wait loop, the
+ *     processor can suffer a severe performance penalty when
+ *     exiting the loop because it detects a possible memory order
+ *     violation and flushes the core processor's pipeline. The
+ *     PAUSE instruction provides a hint to the processor that the
+ *     code sequence is a spin-wait loop. The processor uses this
+ *     hint to avoid the memory order violation and prevent the
+ *     pipeline flush. In addition, the PAUSE instruction
+ *     de-pipelines the spin-wait loop to prevent it from
+ *     consuming execution resources excessively.
+ */
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define PG_HAVE_SPIN_DELAY
+static __inline__ void
+pg_spin_delay_impl(void)
+{
+	__asm__ __volatile__(" rep; nop			\n");
+}
+#elif defined(_MSC_VER) && defined(__x86_64__)
+#define PG_HAVE_SPIN_DELAY
+static __forceinline void
+pg_spin_delay_impl(void)
+{
+	_mm_pause();
+}
+#elif defined(_MSC_VER)
+#define PG_HAVE_SPIN_DELAY
+static __forceinline void
+pg_spin_delay_impl(void)
+{
+	/* See comment for gcc code. Same code, MASM syntax */
+	__asm rep nop;
+}
+#endif
+#endif /* !defined(PG_HAVE_SPIN_DELAY) */
+
+
+#if defined(HAVE_ATOMICS)
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+
+#define PG_HAVE_ATOMIC_TEST_SET_FLAG
+static inline bool
+pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
+{
+	register char _res = 1;
+
+	__asm__ __volatile__(
+		"	lock			\n"
+		"	xchgb	%0,%1	\n"
+:		"+q"(_res), "+m"(ptr->value)
+:
+:		"memory");
+	return _res == 0;
+}
+
+#define PG_HAVE_ATOMIC_CLEAR_FLAG
+static inline void
+pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
+{
+	/*
+	 * On a TSO architecture like x86 it's sufficient to use a compiler
+	 * barrier to achieve release semantics.
+	 */
+	__asm__ __volatile__("" ::: "memory");
+	ptr->value = 0;
+}
+
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
+static inline bool
+pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
+									uint32 *expected, uint32 newval)
+{
+	char	ret;
+
+	/*
+	 * Perform cmpxchg and use the zero flag which it implicitly sets when
+	 * equal to measure the success.
+	 */
+	__asm__ __volatile__(
+		"	lock				\n"
+		"	cmpxchgl	%4,%5	\n"
+		"   setz		%2		\n"
+:		"=a" (*expected), "=m"(ptr->value), "=q" (ret)
+:		"a" (*expected), "r" (newval), "m"(ptr->value)
+:		"memory", "cc");
+	return (bool) ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U32
+static inline uint32
+pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
+{
+	uint32 res;
+	__asm__ __volatile__(
+		"	lock				\n"
+		"	xaddl	%0,%1		\n"
+:		"=q"(res), "=m"(ptr->value)
+:		"0" (add_), "m"(ptr->value)
+:		"memory", "cc");
+	return res;
+}
+
+#ifdef __x86_64__
+
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
+static inline bool
+pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
+									uint64 *expected, uint64 newval)
+{
+	char	ret;
+
+	/*
+	 * Perform cmpxchg and use the zero flag which it implicitly sets when
+	 * equal to measure the success.
+	 */
+	__asm__ __volatile__(
+		"	lock				\n"
+		"	cmpxchgq	%4,%5	\n"
+		"   setz		%2		\n"
+:		"=a" (*expected), "=m"(ptr->value), "=q" (ret)
+:		"a" (*expected), "r" (newval), "m"(ptr->value)
+:		"memory", "cc");
+	return (bool) ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U64
+static inline uint64
+pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
+{
+	uint64 res;
+	__asm__ __volatile__(
+		"	lock				\n"
+		"	xaddq	%0,%1		\n"
+:		"=q"(res), "=m"(ptr->value)
+:		"0" (add_), "m"(ptr->value)
+:		"memory", "cc");
+	return res;
+}
+
+#endif /* __x86_64__ */
+
+#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+
+/*
+ * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
+ * since at least the 586. As well as on all x86-64 cpus.
+ */
+#if defined(__i568__) || defined(__i668__) || /* gcc i586+ */  \
+	(defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
+	defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
+#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
+#endif /* 8 byte single-copy atomicity */
+
+#endif /* HAVE_ATOMICS */