/*-------------------------------------------------------------------------
 *
 * arch-x86.h
 *	  Atomic operations considerations specific to Intel x86
 *
 * Note that we actually require a 486 or newer, because the 386 lacks
 * support for xadd and cmpxchg.  Given that the 386 isn't supported
 * anywhere anymore, that's fortunately not much of a restriction.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES:
 *
 * src/include/port/atomics/arch-x86.h
 *
 *-------------------------------------------------------------------------
 */
/*
 * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
 * or stores to be reordered with other stores, but a store may be delayed
 * past a subsequent load (i.e. the load can be performed before an earlier
 * store becomes globally visible).
 *
 * Technically, some x86-ish chips support uncached memory access and/or
 * special instructions that are weakly ordered.  In those cases we'd need
 * the read and write barriers to be lfence and sfence.  But since we don't
 * do those things, a compiler barrier should be enough.
 *
 * "lock; addl" has been supported for longer than "mfence".  It's also
 * rumored to be faster in many scenarios.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#if defined(__i386__) || defined(__i386)
#define pg_memory_barrier_impl()		\
	__asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__x86_64__)
#define pg_memory_barrier_impl()		\
	__asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
#define pg_read_barrier_impl() pg_compiler_barrier_impl()
#define pg_write_barrier_impl() pg_compiler_barrier_impl()
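/*
 * Illustrative sketch, not part of the original header: on x86's TSO model
 * the read and write barriers above are only compiler barriers, which is
 * already enough for the classic publish/consume pattern shown below, while
 * pg_memory_barrier() still has to emit a locked instruction.  The sketch
 * uses the generic pg_write_barrier()/pg_read_barrier() wrappers that
 * atomics.h defines on top of the _impl macros; the variable and function
 * names are invented for illustration only.
 */
#if 0
static int	example_payload;
static volatile int example_ready;

static void
example_publish(void)
{
	example_payload = 42;		/* write the data first */
	pg_write_barrier();			/* compiler barrier only, on x86 */
	example_ready = 1;			/* then publish the flag */
}

static void
example_consume(void)
{
	if (example_ready)
	{
		pg_read_barrier();		/* compiler barrier only, on x86 */
		(void) example_payload;	/* data is guaranteed visible here */
	}
}
#endif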
/*
 * Provide implementations for atomics using inline assembly on x86 gcc.
 * It's nice to support older versions of gcc, and the compare/exchange
 * implementation here is actually more efficient than the __sync variant.
 */
#if defined(HAVE_ATOMICS)
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
	volatile char value;
} pg_atomic_flag;
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
	volatile uint32 value;
} pg_atomic_uint32;
/*
 * It's too complicated to write inline asm for 64 bit types on 32 bit
 * platforms, and the 486 can't do it anyway.
 */
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
	/* alignment guaranteed due to being on a 64bit platform */
	volatile uint64 value;
} pg_atomic_uint64;
#endif /* __x86_64__ */
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
#endif /* defined(HAVE_ATOMICS) */
#if !defined(PG_HAVE_SPIN_DELAY)
/*
 * This sequence is equivalent to the PAUSE instruction ("rep" is
 * ignored by old IA32 processors if the following instruction is
 * not a string operation); the IA-32 Architecture Software
 * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
 * PAUSE in the inner loop of a spin lock is necessary for good
 * performance:
 *
 *     The PAUSE instruction improves the performance of IA-32
 *     processors supporting Hyper-Threading Technology when
 *     executing spin-wait loops and other routines where one
 *     thread is accessing a shared lock or semaphore in a tight
 *     polling loop. When executing a spin-wait loop, the
 *     processor can suffer a severe performance penalty when
 *     exiting the loop because it detects a possible memory order
 *     violation and flushes the core processor's pipeline. The
 *     PAUSE instruction provides a hint to the processor that the
 *     code sequence is a spin-wait loop. The processor uses this
 *     hint to avoid the memory order violation and prevent the
 *     pipeline flush. In addition, the PAUSE instruction
 *     de-pipelines the spin-wait loop to prevent it from
 *     consuming execution resources excessively.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
	__asm__ __volatile__(" rep; nop \n");
}
#elif defined(_MSC_VER) && defined(__x86_64__)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
	_mm_pause();
}
#elif defined(_MSC_VER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
	/* See comment for gcc code. Same code, MASM syntax */
	__asm rep nop;
}
#endif
#endif /* !defined(PG_HAVE_SPIN_DELAY) */
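/*
 * Illustrative sketch, not part of the original header: the kind of
 * spin-wait loop the PAUSE hint is meant for.  Real callers go through the
 * spinlock layer rather than calling the _impl function directly; the lock
 * word and function name here are invented for illustration only.
 */
#if 0
static void
example_spin_until_set(volatile int *lock_word)
{
	/* busy-wait until another process sets the word to nonzero */
	while (*lock_word == 0)
		pg_spin_delay_impl();	/* emits rep;nop (PAUSE) on x86 */
}
#endif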
#if defined(HAVE_ATOMICS)
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
	register char _res = 1;

	__asm__ __volatile__(
		" lock \n"
		" xchgb %0,%1 \n"
		: "+q"(_res), "+m"(ptr->value)
		:
		: "memory");
	return _res == 0;
}
#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
	/*
	 * On a TSO architecture like x86 it's sufficient to use a compiler
	 * barrier to achieve release semantics.
	 */
	__asm__ __volatile__("" ::: "memory");
	ptr->value = 0;
}
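/*
 * Illustrative sketch, not part of the original header: a minimal
 * test-and-set spinlock built on the two flag primitives above.  The locked
 * xchgb gives acquire semantics, and, as the comment in the clear function
 * notes, a compiler barrier plus a plain store is enough for release on x86.
 * Real code uses the generic pg_atomic_flag API and the spinlock layer's
 * backoff logic, not this naive loop; the function names are invented.
 */
#if 0
static void
example_flag_lock(volatile pg_atomic_flag *lock)
{
	/* spin until we are the caller that flipped the flag from 0 to 1 */
	while (!pg_atomic_test_set_flag_impl(lock))
		pg_spin_delay_impl();
}

static void
example_flag_unlock(volatile pg_atomic_flag *lock)
{
	pg_atomic_clear_flag_impl(lock);	/* release: compiler barrier + store */
}
#endif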
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
	char		ret;

	/*
	 * Perform cmpxchg and use the zero flag, which it sets when the old
	 * value matches *expected, to report success.
	 */
	__asm__ __volatile__(
		" lock \n"
		" cmpxchgl %4,%5 \n"
		" setz %2 \n"
		: "=a" (*expected), "=m"(ptr->value), "=q" (ret)
		: "a" (*expected), "r" (newval), "m"(ptr->value)
		: "memory", "cc");
	return (bool) ret;
}
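/*
 * Illustrative sketch, not part of the original header: the canonical retry
 * loop built on the compare-and-exchange above.  On failure, cmpxchg leaves
 * the value currently stored in memory in %eax, so *expected is refreshed
 * automatically and the loop does not need to re-read ptr->value itself.
 * The "add while below a limit" operation and its name are invented purely
 * for illustration.
 */
#if 0
static bool
example_add_if_below(volatile pg_atomic_uint32 *ptr, uint32 limit, uint32 add)
{
	uint32		old = ptr->value;	/* initial snapshot */

	while (old < limit)
	{
		/* on failure, old is updated to the value currently stored */
		if (pg_atomic_compare_exchange_u32_impl(ptr, &old, old + add))
			return true;
	}
	return false;
}
#endif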
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
	uint32		res;

	__asm__ __volatile__(
		" lock \n"
		" xaddl %0,%1 \n"
		: "=q"(res), "=m"(ptr->value)
		: "0" (add_), "m"(ptr->value)
		: "memory", "cc");
	return res;
}
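/*
 * Illustrative sketch, not part of the original header: xadd returns the
 * value the counter held *before* the addition, which is exactly what is
 * needed for handing out unique slot or ticket numbers.  The function name
 * is invented for illustration only.
 */
#if 0
static uint32
example_claim_next_slot(volatile pg_atomic_uint32 *counter)
{
	/* returns 0 for the first caller, 1 for the second, and so on */
	return pg_atomic_fetch_add_u32_impl(counter, 1);
}
#endif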
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
                                    uint64 *expected, uint64 newval)
{
	char		ret;

	/*
	 * Perform cmpxchg and use the zero flag, which it sets when the old
	 * value matches *expected, to report success.
	 */
	__asm__ __volatile__(
		" lock \n"
		" cmpxchgq %4,%5 \n"
		" setz %2 \n"
		: "=a" (*expected), "=m"(ptr->value), "=q" (ret)
		: "a" (*expected), "r" (newval), "m"(ptr->value)
		: "memory", "cc");
	return (bool) ret;
}
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
	uint64		res;

	__asm__ __volatile__(
		" lock \n"
		" xaddq %0,%1 \n"
		: "=q"(res), "=m"(ptr->value)
		: "0" (add_), "m"(ptr->value)
		: "memory", "cc");
	return res;
}
#endif /* __x86_64__ */
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
/*
 * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
 * since at least the 586, as well as on all x86-64 CPUs.
 */
#if defined(__i586__) || defined(__i686__) || /* gcc i586+ */ \
	(defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
	defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
#endif /* 8 byte single-copy atomicity */
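/*
 * Illustrative sketch, not part of the original header: what single-copy
 * atomicity buys us.  On the platforms selected above, an aligned 8-byte
 * load or store cannot be torn, so a reader polling a 64-bit value written
 * by another process never observes a half-updated result and no lock or
 * cmpxchg loop is needed for a plain read.  The rest of the atomics code can
 * key off this macro to decide whether such plain accesses are safe; the
 * function name here is invented for illustration only.
 */
#if 0
static uint64
example_read_u64_unlocked(volatile pg_atomic_uint64 *ptr)
{
	/* safe only because PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY is defined */
	return ptr->value;
}
#endif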
#endif /* HAVE_ATOMICS */