1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
|
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_TARGETS_H_
#define HIGHWAY_HWY_TARGETS_H_
// Allows opting out of C++ standard library usage, which is not available in
// some Compiler Explorer environments.
#ifndef HWY_NO_LIBCXX
#include <vector>
#endif
// For SIMD module implementations and their callers. Defines which targets to
// generate and call.
#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"
#if !HWY_ARCH_RVV && !defined(HWY_NO_LIBCXX)
#include <atomic>
#endif
namespace hwy {
// Returns a bitfield of the enabled targets that are supported on this CPU.
// There is always at least one such target, hence the return value is never 0.
// The targets returned may change after calling DisableTargets. This function
// is always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
// calls to it if there is only a single target enabled.
HWY_DLLEXPORT int64_t SupportedTargets();
// Evaluates to a function call, or to the literal HWY_TARGETS if there is a
// single target. `x & (x - 1)` clears the lowest set bit of x, so the condition
// below holds exactly when HWY_TARGETS has at most one bit set.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
// After this call, SupportedTargets will not return targets whose bit(s) are
// set in `disabled_targets`. Exception: if SupportedTargets would otherwise
// return 0, it will instead return HWY_STATIC_TARGET (there must always be one
// target to call).
//
// This function is useful for disabling targets known to be buggy, or if the
// best available target is undesirable (perhaps due to throttling or memory
// bandwidth limitations). To iteratively enable specific targets for testing,
// use SetSupportedTargetsForTest instead of this function.
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
// Installs a mock: subsequent SupportedTargets calls will return the given set
// of `targets`, except those disabled via DisableTargets. Call with a mask of 0
// to remove the mock and return to the normal SupportedTargets behavior. Used
// to run tests for all targets.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
#ifndef HWY_NO_LIBCXX
// Returns the targets that are both present in HWY_TARGETS and reported by
// SupportedTargets(), as a vector holding one single-bit HWY_* target value
// (such as HWY_SCALAR or HWY_NEON) per element, in order from the lowest set
// bit to the highest. The result is affected by the current
// SetSupportedTargetsForTest() mock, if any.
HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
  std::vector<int64_t> result;
  int64_t remaining = SupportedTargets() & HWY_TARGETS;
  while (remaining != 0) {
    // Clearing the lowest set bit leaves all other bits untouched, hence XOR
    // with the previous value isolates exactly that bit.
    const int64_t without_lowest = remaining & (remaining - 1);
    result.push_back(remaining ^ without_lowest);
    remaining = without_lowest;
  }
  return result;
}
#endif // HWY_NO_LIBCXX
// Returns a printable name for `target`, which is compared against the
// individual HWY_* target values. Only the targets of the architecture being
// compiled for, plus EMU128 and SCALAR, are recognized; any other value yields
// "Unknown".
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
#if HWY_ARCH_X86
  if (target == HWY_SSSE3) return "SSSE3";
  if (target == HWY_SSE4) return "SSE4";
  if (target == HWY_AVX2) return "AVX2";
  if (target == HWY_AVX3) return "AVX3";
  if (target == HWY_AVX3_DL) return "AVX3_DL";
#endif

#if HWY_ARCH_ARM
  if (target == HWY_SVE2_128) return "SVE2_128";
  if (target == HWY_SVE_256) return "SVE_256";
  if (target == HWY_SVE2) return "SVE2";
  if (target == HWY_SVE) return "SVE";
  if (target == HWY_NEON) return "NEON";
#endif

#if HWY_ARCH_PPC
  if (target == HWY_PPC8) return "PPC8";
#endif

#if HWY_ARCH_WASM
  if (target == HWY_WASM) return "WASM";
  if (target == HWY_WASM_EMU256) return "WASM_EMU256";
#endif

#if HWY_ARCH_RVV
  if (target == HWY_RVV) return "RVV";
#endif

  // Fallback targets, recognized on every architecture.
  if (target == HWY_EMU128) return "EMU128";
  if (target == HWY_SCALAR) return "SCALAR";

  return "Unknown";  // must satisfy gtest IsValidParamName()
}
// The maximum number of dynamic targets on any architecture is defined by
// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
// For the ChosenTarget mask and index we use a different bit arrangement than
// in the HWY_TARGETS mask. Only the targets involved in the current
// architecture are used in this mask, and therefore only the least significant
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
// significant bit is set when the mask is not initialized, the next
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
// that position and the next more significant bit is used for HWY_SCALAR (if
// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
// define equivalent values for HWY_TARGETS in this representation.
// This mask representation allows applying ctz() to the mask to obtain a small
// number that is used as an index into the table for dynamic dispatch. In this
// way, the first entry is used when the mask is uninitialized, the following
// HWY_MAX_DYNAMIC_TARGETS entries are for dynamic dispatch, and the last one
// is for scalar.
// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format. It sits
// directly above bit 0 and the HWY_MAX_DYNAMIC_TARGETS dynamic-target bits.
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
// Converts X from the HWY_TARGETS mask format to the ChosenTarget mask format
// for the current architecture: shifts X right so that bit
// (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS) lands on bit 0, keeps
// only the lowest HWY_MAX_DYNAMIC_TARGETS bits, then shifts left by one so
// that bit 0 of the result is always clear.
#define HWY_CHOSEN_TARGET_SHIFT(X) \
((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
<< 1)
// The HWY_TARGETS mask in the ChosenTarget mask format, with the
// HWY_CHOSEN_TARGET_MASK_SCALAR bit and bit 0 also set.
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
(HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
#if HWY_ARCH_X86
// Maximum number of dynamic targets. Changing this value is an
// ABI-incompatible change.
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// Entries of the dynamic dispatch table. These must match the order in which
// the HWY_TARGETS are defined, starting with the least significant
// (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS) bit. This list must
// contain exactly HWY_MAX_DYNAMIC_TARGETS elements and does not include
// SCALAR. The first entry corresponds to the best target. Don't include a ","
// at the end of the list.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
nullptr , /* reserved - SSE3? */ \
nullptr /* reserved - SSE2? */
#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details. The list below has exactly
// HWY_MAX_DYNAMIC_TARGETS (15) entries and no trailing comma.
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
HWY_CHOOSE_SVE(func_name), /* SVE */ \
HWY_CHOOSE_NEON(func_name), /* NEON */ \
nullptr /* reserved - Helium? */
#elif HWY_ARCH_RVV
// See HWY_ARCH_X86 above for details. The list below has exactly
// HWY_MAX_DYNAMIC_TARGETS (9) entries and no trailing comma.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_RVV(func_name), /* RVV */ \
nullptr /* reserved */
#elif HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details. The list below has exactly
// HWY_MAX_DYNAMIC_TARGETS (9) entries and no trailing comma.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
nullptr, /* reserved (VSX or AltiVec) */ \
nullptr /* reserved (VSX or AltiVec) */
#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details. The list below has exactly
// HWY_MAX_DYNAMIC_TARGETS (9) entries and no trailing comma.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
HWY_CHOOSE_WASM(func_name), /* WASM */ \
nullptr /* reserved */
#else
// Unknown architecture: will use HWY_SCALAR without dynamic dispatch, though
// still creating single-entry tables in HWY_EXPORT to ensure portability.
// Note that HWY_CHOOSE_TARGET_LIST is not defined in this branch.
#define HWY_MAX_DYNAMIC_TARGETS 1
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
#endif
// Monostate (see GetChosenTarget) holding the bitfield of supported and
// enabled targets. The format differs from that of HWY_TARGETS: the lowest bit
// governs the first function pointer in the tables created by HWY_EXPORT,
// which is special in that it calls FunctionCache, then Update, then
// dispatches to the actual implementation. The mask is stored in a
// std::atomic, except when compiling for RVV or with HWY_NO_LIBCXX defined,
// where it is a plain int64_t.
struct ChosenTarget {
 public:
  // Replaces the mask with bits derived from `targets` (typically the return
  // value of SupportedTargets()). Postcondition: IsInitialized() == true.
  void Update(int64_t targets) {
    // `targets` shifted downwards into the ChosenTarget format, see above.
    const int64_t dynamic_bits = HWY_CHOSEN_TARGET_SHIFT(targets);
    // SCALAR corresponds to the last entry in the function table and is always
    // included as a fallback.
    StoreMask(dynamic_bits | HWY_CHOSEN_TARGET_MASK_SCALAR);
  }

  // Returns to the uninitialized state, so that FunctionCache will call Update
  // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
  void DeInit() {
    const int64_t uninitialized = 1;
    StoreMask(uninitialized);
  }

  // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
  // function was called, which we check in tests.
  bool IsInitialized() const {
    const int64_t current = LoadMask();
    return current != 1;
  }

  // Returns the index in the dynamic dispatch table to be used by the current
  // CPU. This method must live in the header so that it uses the value of
  // HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that calls
  // it, which may differ from that of other translation units. As a result,
  // only targets that were actually compiled in this module are enabled.
  size_t HWY_INLINE GetIndex() const {
    const int64_t candidates = LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS;
    return hwy::Num0BitsBelowLS1Bit_Nonzero64(
        static_cast<uint64_t>(candidates));
  }

 private:
  // TODO(janwas): remove RVV once <atomic> is available
#if HWY_ARCH_RVV || defined(HWY_NO_LIBCXX)
  // Plain (non-atomic) storage.
  int64_t LoadMask() const { return mask_; }
  void StoreMask(int64_t new_mask) { mask_ = new_mask; }

  int64_t mask_{1};  // Initialized to 1 so GetIndex() returns 0.
#else
  // Atomic storage; seq_cst is also the default ordering of load()/store().
  int64_t LoadMask() const { return mask_.load(std::memory_order_seq_cst); }
  void StoreMask(int64_t new_mask) {
    mask_.store(new_mask, std::memory_order_seq_cst);
  }

  std::atomic<int64_t> mask_{1};  // Initialized to 1 so GetIndex() returns 0.
#endif  // HWY_ARCH_RVV || defined(HWY_NO_LIBCXX)
};
// Returns a reference to the ChosenTarget monostate. For internal use (e.g. by
// FunctionCache and DisableTargets).
HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
} // namespace hwy
#endif // HIGHWAY_HWY_TARGETS_H_
|