summaryrefslogtreecommitdiffstats
path: root/third_party/rust/packed_simd/src/lib.rs
blob: 867cc10e9eb642708f92da603baa6d2ad967af4a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
//! # Portable packed SIMD vectors
//!
//! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366:
//! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) .
//!
//! The examples available in the
//! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples)
//! sub-directory of the crate showcase how to use the library in practice.
//!
//! ## Table of contents
//!
//! - [Introduction](#introduction)
//! - [Vector types](#vector-types)
//! - [Conditional operations](#conditional-operations)
//! - [Conversions](#conversions)
//! - [Hardware Features](#hardware-features)
//! - [Performance guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)
//!
//! ## Introduction
//!
//! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N`
//! elements of type `T` as well as many type aliases for this type: for
//! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`.
//!
//! The operations on packed vectors are, by default, "vertical", that is, they
//! are applied to each vector lane in isolation of the others:
//!
//! ```
//! # use packed_simd::*;
//! let a = i32x4::new(1, 2, 3, 4);
//! let b = i32x4::new(5, 6, 7, 8);
//! assert_eq!(a + b, i32x4::new(6, 8, 10, 12));
//! ```
//!
//! Many "horizontal" operations are also provided:
//!
//! ```
//! # use packed_simd::*;
//! # let a = i32x4::new(1, 2, 3, 4);
//! assert_eq!(a.wrapping_sum(), 10);
//! ```
//!
//! In virtually all architectures vertical operations are fast, while
//! horizontal operations are, by comparison, much slower. That is, the
//! most portably-efficient way of performing a reduction over a slice
//! is to collect the results into a vector using vertical operations,
//! and performing a single horizontal operation at the end:
//!
//! ```
//! # use packed_simd::*;
//! fn reduce(x: &[i32]) -> i32 {
//!     assert_eq!(x.len() % 4, 0);
//!     let mut sum = i32x4::splat(0); // [0, 0, 0, 0]
//!     for i in (0..x.len()).step_by(4) {
//!         sum += i32x4::from_slice_unaligned(&x[i..]);
//!     }
//!     sum.wrapping_sum()
//! }
//!
//! let x = [0, 1, 2, 3, 4, 5, 6, 7];
//! assert_eq!(reduce(&x), 28);
//! ```
//!
//! ## Vector types
//!
//! The vector type aliases are named according to the following scheme:
//!
//! > `{element_type}x{number_of_lanes} == Simd<[element_type;
//! number_of_lanes]>`
//!
//! where the following element types are supported:
//!
//! * `i{element_width}`: signed integer
//! * `u{element_width}`: unsigned integer
//! * `f{element_width}`: float
//! * `m{element_width}`: mask (see below)
//! * `*{const,mut} T`: `const` and `mut` pointers
//!
//! ## Basic operations
//!
//! ```
//! # use packed_simd::*;
//! // Sets all elements to `0`:
//! let a = i32x4::splat(0);
//!
//! // Reads a vector from a slice:
//! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5];
//! let b = i32x4::from_slice_unaligned(&arr);
//!
//! // Reads the 4-th element of a vector:
//! assert_eq!(b.extract(3), 1);
//!
//! // Returns a new vector where the 4-th element is replaced with `1`:
//! let a = a.replace(3, 1);
//! assert_eq!(a, b);
//!
//! // Writes a vector to a slice:
//! let a = a.replace(2, 1);
//! a.write_to_slice_unaligned(&mut arr[4..]);
//! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]);
//! ```
//!
//! ## Conditional operations
//!
//! One often needs to perform an operation on some lanes of the vector. Vector
//! masks, like `m32x4`, allow selecting on which vector lanes an operation is
//! to be performed:
//!
//! ```
//! # use packed_simd::*;
//! let a = i32x4::new(1, 1, 2, 2);
//!
//! // Add `1` to the first two lanes of the vector.
//! let m = m16x4::new(true, true, false, false);
//! let a = m.select(a + 1, a);
//! assert_eq!(a, i32x4::splat(2));
//! ```
//!
//! The elements of a vector mask are either `true` or `false`. Here `true`
//! means that a lane is "selected", while `false` means that a lane is not
//! selected.
//!
//! All vector masks implement a `mask.select(a: T, b: T) -> T` method that
//! works on all vectors that have the same number of lanes as the mask. The
//! resulting vector contains the elements of `a` for those lanes for which the
//! mask is `true`, and the elements of `b` otherwise.
//!
//! The example constructs a mask with the first two lanes set to `true` and
//! the last two lanes set to `false`. This selects the first two lanes of `a +
//! 1` and the last two lanes of `a`, producing a vector where the first two
//! lanes have been incremented by `1`.
//!
//! > note: mask `select` can be used on vector types that have the same number
//! > of lanes as the mask. The example shows this by using [`m16x4`] instead
//! > of [`m32x4`]. It is _typically_ more performant to use a mask element
//! > width equal to the element width of the vectors being operated upon.
//! > This is, however, not true for 512-bit wide vectors when targeting
//! > AVX-512, where the most efficient masks use only 1-bit per element.
//!
//! All vertical comparison operations returns masks:
//!
//! ```
//! # use packed_simd::*;
//! let a = i32x4::new(1, 1, 3, 3);
//! let b = i32x4::new(2, 2, 0, 0);
//!
//! // ge: >= (Greater Eequal; see also lt, le, gt, eq, ne).
//! let m = a.ge(i32x4::splat(2));
//!
//! if m.any() {
//!     // all / any / none allow coherent control flow
//!     let d = m.select(a, b);
//!     assert_eq!(d, i32x4::new(2, 2, 3, 3));
//! }
//! ```
//!
//! ## Conversions
//!
//! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for
//!   vectors with the same number of lanes when the conversion is value
//! preserving   (same as in `std`).
//!
//! * **safe bitwise conversions**: The cargo feature `into_bits` provides the
//!   `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise
//!   `transmute`s when all bit patterns of the source type are valid bit
//!   patterns of the target type and are also implemented for the
//!   architecture-specific vector types of `std::arch`. For example, `let x:
//!   u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit
//!   patterns are valid `u8x8` bit patterns. However, the opposite is not
//! true,   not all `u8x8` bit patterns are valid `m8x8` bit-patterns, so this
//!   operation cannot be performed safely using `x.into_bits()`; one needs to
//!   use `unsafe { crate::mem::transmute(x) }` for that, making sure that the
//!   value in the `u8x8` is a valid bit-pattern of `m8x8`.
//!
//! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`]
//! (`x.cast()`), just like `as`:
//!
//!   * casting integer vectors whose lane types have the same size (e.g.
//! `i32xN`     -> `u32xN`) is a **no-op**,
//!
//!   * casting from a larger integer to a smaller integer (e.g. `u32xN` ->
//! `u8xN`)     will **truncate**,
//!
//!   * casting from a smaller integer to a larger integer     (e.g. `u8xN` ->
//!     `u32xN`) will:
//!        * **zero-extend** if the source is unsigned, or
//!        * **sign-extend** if the source is signed,
//!
//!   * casting from a float to an integer will **round the float towards
//! zero**,
//!
//!   * casting from an integer to float will produce the floating point
//!     representation of the integer, **rounding to nearest, ties to even**,
//!
//!   * casting from an `f32` to an `f64` is perfect and lossless,
//!
//!   * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.
//!
//!   Numeric casts are not very "precise": sometimes lossy, sometimes value
//!   preserving, etc.
//!
//! ## Hardware Features
//!
//! This crate can use different hardware features based on your configured
//! `RUSTFLAGS`. For example, with no configured `RUSTFLAGS`, `u64x8` on
//! x86_64 will use SSE2 operations like `PCMPEQD`. If you configure
//! `RUSTFLAGS='-C target-feature=+avx2,+avx'` on supported x86_64 hardware
//! the same `u64x8` may use wider AVX2 operations like `VPCMPEQQ`. It is
//! important for performance and for hardware support requirements that
//! you choose an appropriate set of `target-feature` and `target-cpu`
//! options during builds. For more information, see the [Performance
//! guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)

#![feature(
    adt_const_params,
    repr_simd,
    rustc_attrs,
    platform_intrinsics,
    stdsimd,
    arm_target_feature,
    link_llvm_intrinsics,
    core_intrinsics,
    stmt_expr_attributes,
    custom_inner_attributes,
)]
#![allow(non_camel_case_types, non_snake_case,
        // FIXME: these types are unsound in C FFI already
        // See https://github.com/rust-lang/rust/issues/53346
        improper_ctypes_definitions,
        incomplete_features,
        clippy::cast_possible_truncation,
        clippy::cast_lossless,
        clippy::cast_possible_wrap,
        clippy::cast_precision_loss,
        // TODO: manually add the `#[must_use]` attribute where appropriate
        clippy::must_use_candidate,
        // This lint is currently broken for generic code
        // See https://github.com/rust-lang/rust-clippy/issues/3410
        clippy::use_self,
        clippy::wrong_self_convention,
        clippy::from_over_into,
)]
#![cfg_attr(test, feature(hashmap_internals))]
#![cfg_attr(doc_cfg, feature(doc_cfg))]
#![deny(rust_2018_idioms, clippy::missing_inline_in_public_items)]
#![no_std]

use cfg_if::cfg_if;

cfg_if! {
    if #[cfg(feature = "core_arch")] {
        #[allow(unused_imports)]
        use core_arch as arch;
    } else {
        #[allow(unused_imports)]
        use core::arch;
    }
}

#[cfg(all(target_arch = "wasm32", test))]
use wasm_bindgen_test::*;

#[allow(unused_imports)]
use core::{
    /* arch (handled above), */ cmp, f32, f64, fmt, hash, hint, i128, i16, i32, i64, i8, intrinsics,
    isize, iter, marker, mem, ops, ptr, slice, u128, u16, u32, u64, u8, usize,
};

#[macro_use]
mod testing;
#[macro_use]
mod api;
mod codegen;
mod sealed;

pub use crate::sealed::{Mask, Shuffle, Simd as SimdVector, SimdArray};

/// Packed SIMD vector type.
///
/// # Examples
///
/// ```
/// # use packed_simd::Simd;
/// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3);
/// assert_eq!(v.extract(2), 2);
/// ```
#[repr(transparent)]
#[derive(Copy, Clone)]
pub struct Simd<A: sealed::SimdArray>(
    // FIXME: this type should be private,
    // but it currently must be public for the
    // `shuffle!` macro to work: it needs to
    // access the internal `repr(simd)` type
    // to call the shuffle intrinsics.
    #[doc(hidden)] pub <A as sealed::SimdArray>::Tuple,
);

impl<A: sealed::SimdArray> sealed::Seal for Simd<A> {}

/// Wrapper over `T` implementing a lexicoraphical order via the `PartialOrd`
/// and/or `Ord` traits.
#[repr(transparent)]
#[derive(Copy, Clone, Debug)]
#[allow(clippy::missing_inline_in_public_items)]
pub struct LexicographicallyOrdered<T>(T);

mod masks;
pub use self::masks::*;

mod v16;
pub use self::v16::*;

mod v32;
pub use self::v32::*;

mod v64;
pub use self::v64::*;

mod v128;
pub use self::v128::*;

mod v256;
pub use self::v256::*;

mod v512;
pub use self::v512::*;

mod vSize;
pub use self::vSize::*;

mod vPtr;
pub use self::vPtr::*;

pub use self::api::cast::*;

#[cfg(feature = "into_bits")]
pub use self::api::into_bits::*;

// Re-export the shuffle intrinsics required by the `shuffle!` macro.
#[doc(hidden)]
pub use self::codegen::llvm::{
    __shuffle_vector16, __shuffle_vector2, __shuffle_vector32, __shuffle_vector4, __shuffle_vector64,
    __shuffle_vector8,
};

pub(crate) mod llvm {
    pub(crate) use crate::codegen::llvm::*;
}