Diffstat (limited to 'third_party/rust/ppv-lite86')
-rw-r--r-- | third_party/rust/ppv-lite86/.cargo-checksum.json | 1
-rw-r--r-- | third_party/rust/ppv-lite86/CHANGELOG.md | 10
-rw-r--r-- | third_party/rust/ppv-lite86/Cargo.toml | 39
-rw-r--r-- | third_party/rust/ppv-lite86/LICENSE-APACHE | 201
-rw-r--r-- | third_party/rust/ppv-lite86/LICENSE-MIT | 25
-rw-r--r-- | third_party/rust/ppv-lite86/src/generic.rs | 865
-rw-r--r-- | third_party/rust/ppv-lite86/src/lib.rs | 22
-rw-r--r-- | third_party/rust/ppv-lite86/src/soft.rs | 472
-rw-r--r-- | third_party/rust/ppv-lite86/src/types.rs | 298
-rw-r--r-- | third_party/rust/ppv-lite86/src/x86_64/mod.rs | 437
-rw-r--r-- | third_party/rust/ppv-lite86/src/x86_64/sse2.rs | 1703
11 files changed, 4073 insertions, 0 deletions
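For orientation before the file-by-file diff: ppv-lite86 exposes its SIMD backends through the `Machine` trait defined in src/types.rs, so downstream code is written once against `Machine` and monomorphized per backend (generic, SSE2, SSSE3/SSE4.1, AVX2), typically via the `dispatch!` macros in src/generic.rs. The sketch below is illustrative only and is not part of the vendored diff: `mix` is a hypothetical helper, and running `main` as written assumes a build where the generic (non-SIMD) backend is selected, since on SSE2-capable x86_64 the crate exports the `x86_64` machines instead.

// Hypothetical downstream sketch, written against the crate's public `Machine` API.
use ppv_lite86::{BSwap, Machine, MultiLane, RotateEachWord32};

/// Rotate each 32-bit word right by 16 and byte-swap it, on any backend.
fn mix<M: Machine>(m: M, words: [u32; 4]) -> [u32; 4] {
    let v: M::u32x4 = m.vec(words);       // pack the scalar lanes into the machine's u32x4
    let v = v.rotate_each_word_right16(); // word-wise rotate (RotateEachWord32)
    v.bswap().to_lanes()                  // byte-swap, then unpack back to scalars
}

fn main() {
    // SAFETY: GenericMachine makes no CPU-feature assumptions, so constructing it is
    // always sound. (This path assumes the generic backend is the one compiled in.)
    let m = unsafe { ppv_lite86::generic::GenericMachine::instance() };
    let out = mix(m, [0x0102_0304, 0, 0, 0]);
    // Each word goes through exactly rotate_right(16) followed by swap_bytes.
    assert_eq!(out[0], 0x0102_0304u32.rotate_right(16).swap_bytes());
}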
diff --git a/third_party/rust/ppv-lite86/.cargo-checksum.json b/third_party/rust/ppv-lite86/.cargo-checksum.json
new file mode 100644
index 0000000000..8453a22d06
--- /dev/null
+++ b/third_party/rust/ppv-lite86/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"CHANGELOG.md":"0bd1d2bdb4a940a0d867a782644eb007e79611be0a8d74d4ba106e83597716df","Cargo.toml":"cff623f02fcb28f62506f25ca2d6898619b42b9fce4fd02bdd8b6f50b074e09c","LICENSE-APACHE":"0218327e7a480793ffdd4eb792379a9709e5c135c7ba267f709d6f6d4d70af0a","LICENSE-MIT":"4cada0bd02ea3692eee6f16400d86c6508bbd3bafb2b65fed0419f36d4f83e8f","src/generic.rs":"a49f9f8fbe3d9e67d67861e77ae9e69cc9f8181edad578be99b19cdf05bd8046","src/lib.rs":"ed340fd5f2c7f8a5dc1ed3666768b2131685b632e5c02b31ce1e847152d876c0","src/soft.rs":"11d7c36036444d3ad1267564b0913e4301d9ba485a7bb596eb39bf2a5973ff57","src/types.rs":"a1c9e993f85a99d1762597193d72ee8ff00c3f1116885040b4e4ecfbdedabf0a","src/x86_64/mod.rs":"145200e7f2dae24e4e0fd1020269132dddd652f30373f70a6b8dd40bf8327fea","src/x86_64/sse2.rs":"a7395837200b4eb03c178c762f3269ce9030187718b8ca62e15070c5c19cba96"},"package":"5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"}
\ No newline at end of file diff --git a/third_party/rust/ppv-lite86/CHANGELOG.md b/third_party/rust/ppv-lite86/CHANGELOG.md new file mode 100644 index 0000000000..6e34be3958 --- /dev/null +++ b/third_party/rust/ppv-lite86/CHANGELOG.md @@ -0,0 +1,10 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.2.16] +### Added +- add [u64; 4] conversion for generic vec256, to support BLAKE on non-x86. +- impl `From` (rather than just `Into`) for conversions between `*_storage` types and arrays. diff --git a/third_party/rust/ppv-lite86/Cargo.toml b/third_party/rust/ppv-lite86/Cargo.toml new file mode 100644 index 0000000000..4b5b14e044 --- /dev/null +++ b/third_party/rust/ppv-lite86/Cargo.toml @@ -0,0 +1,39 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "ppv-lite86" +version = "0.2.17" +authors = ["The CryptoCorrosion Contributors"] +description = "Implementation of the crypto-simd API for x86" +keywords = [ + "crypto", + "simd", + "x86", +] +categories = [ + "cryptography", + "no-std", +] +license = "MIT/Apache-2.0" +repository = "https://github.com/cryptocorrosion/cryptocorrosion" + +[dependencies] + +[features] +default = ["std"] +no_simd = [] +simd = [] +std = [] + +[badges.travis-ci] +repository = "cryptocorrosion/cryptocorrosion" diff --git a/third_party/rust/ppv-lite86/LICENSE-APACHE b/third_party/rust/ppv-lite86/LICENSE-APACHE new file mode 100644 index 0000000000..1eb3215354 --- /dev/null +++ b/third_party/rust/ppv-lite86/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright 2019 The CryptoCorrosion Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/third_party/rust/ppv-lite86/LICENSE-MIT b/third_party/rust/ppv-lite86/LICENSE-MIT new file mode 100644 index 0000000000..d78c961bca --- /dev/null +++ b/third_party/rust/ppv-lite86/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2019 The CryptoCorrosion Contributors + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/third_party/rust/ppv-lite86/src/generic.rs b/third_party/rust/ppv-lite86/src/generic.rs new file mode 100644 index 0000000000..add6c48560 --- /dev/null +++ b/third_party/rust/ppv-lite86/src/generic.rs @@ -0,0 +1,865 @@ +#![allow(non_camel_case_types)] + +use crate::soft::{x2, x4}; +use crate::types::*; +use core::ops::*; + +#[repr(C)] +#[derive(Clone, Copy)] +pub union vec128_storage { + d: [u32; 4], + q: [u64; 2], +} +impl From<[u32; 4]> for vec128_storage { + #[inline(always)] + fn from(d: [u32; 4]) -> Self { + Self { d } + } +} +impl From<vec128_storage> for [u32; 4] { + #[inline(always)] + fn from(d: vec128_storage) -> Self { + unsafe { d.d } + } +} +impl From<[u64; 2]> for vec128_storage { + #[inline(always)] + fn from(q: [u64; 2]) -> Self { + Self { q } + } +} +impl From<vec128_storage> for [u64; 2] { + #[inline(always)] + fn from(q: vec128_storage) -> Self { + unsafe { q.q } + } +} +impl Default for vec128_storage { + #[inline(always)] + fn default() -> Self { + Self { q: [0, 0] } + } +} +impl Eq for vec128_storage {} +impl PartialEq<vec128_storage> for vec128_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.q == rhs.q } + } +} +#[derive(Clone, Copy, PartialEq, Eq, Default)] +pub struct vec256_storage { + v128: [vec128_storage; 2], +} +impl vec256_storage { + #[inline(always)] + pub fn new128(v128: [vec128_storage; 2]) -> Self { + Self { v128 } + } + #[inline(always)] + pub fn split128(self) -> [vec128_storage; 2] { + self.v128 + } +} +impl From<vec256_storage> for [u64; 4] { + #[inline(always)] + fn from(q: vec256_storage) -> Self { + let [a, b]: [u64; 2] = q.v128[0].into(); + let [c, d]: [u64; 2] = q.v128[1].into(); + [a, b, c, d] + } +} +impl From<[u64; 4]> for vec256_storage { + #[inline(always)] + fn from([a, b, c, d]: [u64; 4]) -> Self { + Self { + v128: [[a, b].into(), [c, d].into()], + } + } +} +#[derive(Clone, Copy, PartialEq, Eq, Default)] +pub struct vec512_storage { + v128: [vec128_storage; 4], +} +impl vec512_storage { + #[inline(always)] + pub fn new128(v128: [vec128_storage; 4]) -> Self { + Self { v128 } + } + #[inline(always)] + pub fn split128(self) -> [vec128_storage; 4] { + self.v128 + } +} + +#[inline(always)] +fn dmap<T, F>(t: T, f: F) -> T +where + T: Store<vec128_storage> + Into<vec128_storage>, + F: Fn(u32) -> u32, +{ + let t: vec128_storage = t.into(); + let d = unsafe { t.d }; + let d = vec128_storage { + d: [f(d[0]), f(d[1]), f(d[2]), f(d[3])], + }; + unsafe { T::unpack(d) } +} + +fn dmap2<T, F>(a: T, b: T, f: F) -> T +where + T: Store<vec128_storage> + Into<vec128_storage>, + F: Fn(u32, u32) -> u32, +{ + let a: vec128_storage = a.into(); + let b: vec128_storage = b.into(); + let ao = unsafe { a.d }; + let bo = unsafe { b.d }; + let d = vec128_storage { + d: [ + f(ao[0], bo[0]), + f(ao[1], bo[1]), + f(ao[2], bo[2]), + f(ao[3], bo[3]), + ], + }; + unsafe { T::unpack(d) } +} + +#[inline(always)] +fn qmap<T, F>(t: T, f: F) -> T +where + T: Store<vec128_storage> + Into<vec128_storage>, + F: Fn(u64) -> u64, +{ + let t: vec128_storage = t.into(); + let 
q = unsafe { t.q }; + let q = vec128_storage { + q: [f(q[0]), f(q[1])], + }; + unsafe { T::unpack(q) } +} + +#[inline(always)] +fn qmap2<T, F>(a: T, b: T, f: F) -> T +where + T: Store<vec128_storage> + Into<vec128_storage>, + F: Fn(u64, u64) -> u64, +{ + let a: vec128_storage = a.into(); + let b: vec128_storage = b.into(); + let ao = unsafe { a.q }; + let bo = unsafe { b.q }; + let q = vec128_storage { + q: [f(ao[0], bo[0]), f(ao[1], bo[1])], + }; + unsafe { T::unpack(q) } +} + +#[inline(always)] +fn o_of_q(q: [u64; 2]) -> u128 { + u128::from(q[0]) | (u128::from(q[1]) << 64) +} + +#[inline(always)] +fn q_of_o(o: u128) -> [u64; 2] { + [o as u64, (o >> 64) as u64] +} + +#[inline(always)] +fn omap<T, F>(a: T, f: F) -> T +where + T: Store<vec128_storage> + Into<vec128_storage>, + F: Fn(u128) -> u128, +{ + let a: vec128_storage = a.into(); + let ao = o_of_q(unsafe { a.q }); + let o = vec128_storage { q: q_of_o(f(ao)) }; + unsafe { T::unpack(o) } +} + +#[inline(always)] +fn omap2<T, F>(a: T, b: T, f: F) -> T +where + T: Store<vec128_storage> + Into<vec128_storage>, + F: Fn(u128, u128) -> u128, +{ + let a: vec128_storage = a.into(); + let b: vec128_storage = b.into(); + let ao = o_of_q(unsafe { a.q }); + let bo = o_of_q(unsafe { b.q }); + let o = vec128_storage { + q: q_of_o(f(ao, bo)), + }; + unsafe { T::unpack(o) } +} + +impl RotateEachWord128 for u128x1_generic {} +impl BitOps128 for u128x1_generic {} +impl BitOps64 for u128x1_generic {} +impl BitOps64 for u64x2_generic {} +impl BitOps32 for u128x1_generic {} +impl BitOps32 for u64x2_generic {} +impl BitOps32 for u32x4_generic {} +impl BitOps0 for u128x1_generic {} +impl BitOps0 for u64x2_generic {} +impl BitOps0 for u32x4_generic {} + +macro_rules! impl_bitops { + ($vec:ident) => { + impl Not for $vec { + type Output = Self; + #[inline(always)] + fn not(self) -> Self::Output { + omap(self, |x| !x) + } + } + impl BitAnd for $vec { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: Self) -> Self::Output { + omap2(self, rhs, |x, y| x & y) + } + } + impl BitOr for $vec { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + omap2(self, rhs, |x, y| x | y) + } + } + impl BitXor for $vec { + type Output = Self; + #[inline(always)] + fn bitxor(self, rhs: Self) -> Self::Output { + omap2(self, rhs, |x, y| x ^ y) + } + } + impl AndNot for $vec { + type Output = Self; + #[inline(always)] + fn andnot(self, rhs: Self) -> Self::Output { + omap2(self, rhs, |x, y| !x & y) + } + } + impl BitAndAssign for $vec { + #[inline(always)] + fn bitand_assign(&mut self, rhs: Self) { + *self = *self & rhs + } + } + impl BitOrAssign for $vec { + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + *self = *self | rhs + } + } + impl BitXorAssign for $vec { + #[inline(always)] + fn bitxor_assign(&mut self, rhs: Self) { + *self = *self ^ rhs + } + } + + impl Swap64 for $vec { + #[inline(always)] + fn swap1(self) -> Self { + qmap(self, |x| { + ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1) + }) + } + #[inline(always)] + fn swap2(self) -> Self { + qmap(self, |x| { + ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2) + }) + } + #[inline(always)] + fn swap4(self) -> Self { + qmap(self, |x| { + ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4) + }) + } + #[inline(always)] + fn swap8(self) -> Self { + qmap(self, |x| { + ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8) + }) + } + #[inline(always)] + fn swap16(self) -> Self { + dmap(self, |x| 
x.rotate_left(16)) + } + #[inline(always)] + fn swap32(self) -> Self { + qmap(self, |x| x.rotate_left(32)) + } + #[inline(always)] + fn swap64(self) -> Self { + omap(self, |x| (x << 64) | (x >> 64)) + } + } + }; +} +impl_bitops!(u32x4_generic); +impl_bitops!(u64x2_generic); +impl_bitops!(u128x1_generic); + +impl RotateEachWord32 for u32x4_generic { + #[inline(always)] + fn rotate_each_word_right7(self) -> Self { + dmap(self, |x| x.rotate_right(7)) + } + #[inline(always)] + fn rotate_each_word_right8(self) -> Self { + dmap(self, |x| x.rotate_right(8)) + } + #[inline(always)] + fn rotate_each_word_right11(self) -> Self { + dmap(self, |x| x.rotate_right(11)) + } + #[inline(always)] + fn rotate_each_word_right12(self) -> Self { + dmap(self, |x| x.rotate_right(12)) + } + #[inline(always)] + fn rotate_each_word_right16(self) -> Self { + dmap(self, |x| x.rotate_right(16)) + } + #[inline(always)] + fn rotate_each_word_right20(self) -> Self { + dmap(self, |x| x.rotate_right(20)) + } + #[inline(always)] + fn rotate_each_word_right24(self) -> Self { + dmap(self, |x| x.rotate_right(24)) + } + #[inline(always)] + fn rotate_each_word_right25(self) -> Self { + dmap(self, |x| x.rotate_right(25)) + } +} + +impl RotateEachWord32 for u64x2_generic { + #[inline(always)] + fn rotate_each_word_right7(self) -> Self { + qmap(self, |x| x.rotate_right(7)) + } + #[inline(always)] + fn rotate_each_word_right8(self) -> Self { + qmap(self, |x| x.rotate_right(8)) + } + #[inline(always)] + fn rotate_each_word_right11(self) -> Self { + qmap(self, |x| x.rotate_right(11)) + } + #[inline(always)] + fn rotate_each_word_right12(self) -> Self { + qmap(self, |x| x.rotate_right(12)) + } + #[inline(always)] + fn rotate_each_word_right16(self) -> Self { + qmap(self, |x| x.rotate_right(16)) + } + #[inline(always)] + fn rotate_each_word_right20(self) -> Self { + qmap(self, |x| x.rotate_right(20)) + } + #[inline(always)] + fn rotate_each_word_right24(self) -> Self { + qmap(self, |x| x.rotate_right(24)) + } + #[inline(always)] + fn rotate_each_word_right25(self) -> Self { + qmap(self, |x| x.rotate_right(25)) + } +} +impl RotateEachWord64 for u64x2_generic { + #[inline(always)] + fn rotate_each_word_right32(self) -> Self { + qmap(self, |x| x.rotate_right(32)) + } +} + +// workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web) +#[inline(always)] +fn rotate_u128_right(x: u128, i: u32) -> u128 { + (x >> i) | (x << (128 - i)) +} +#[test] +fn test_rotate_u128() { + const X: u128 = 0x0001_0203_0405_0607_0809_0a0b_0c0d_0e0f; + assert_eq!(rotate_u128_right(X, 17), X.rotate_right(17)); +} + +impl RotateEachWord32 for u128x1_generic { + #[inline(always)] + fn rotate_each_word_right7(self) -> Self { + Self([rotate_u128_right(self.0[0], 7)]) + } + #[inline(always)] + fn rotate_each_word_right8(self) -> Self { + Self([rotate_u128_right(self.0[0], 8)]) + } + #[inline(always)] + fn rotate_each_word_right11(self) -> Self { + Self([rotate_u128_right(self.0[0], 11)]) + } + #[inline(always)] + fn rotate_each_word_right12(self) -> Self { + Self([rotate_u128_right(self.0[0], 12)]) + } + #[inline(always)] + fn rotate_each_word_right16(self) -> Self { + Self([rotate_u128_right(self.0[0], 16)]) + } + #[inline(always)] + fn rotate_each_word_right20(self) -> Self { + Self([rotate_u128_right(self.0[0], 20)]) + } + #[inline(always)] + fn rotate_each_word_right24(self) -> Self { + Self([rotate_u128_right(self.0[0], 24)]) + } + #[inline(always)] + fn rotate_each_word_right25(self) -> Self { + Self([rotate_u128_right(self.0[0], 25)]) + } +} +impl 
RotateEachWord64 for u128x1_generic { + #[inline(always)] + fn rotate_each_word_right32(self) -> Self { + Self([rotate_u128_right(self.0[0], 32)]) + } +} + +#[derive(Copy, Clone)] +pub struct GenericMachine; +impl Machine for GenericMachine { + type u32x4 = u32x4_generic; + type u64x2 = u64x2_generic; + type u128x1 = u128x1_generic; + type u32x4x2 = u32x4x2_generic; + type u64x2x2 = u64x2x2_generic; + type u64x4 = u64x4_generic; + type u128x2 = u128x2_generic; + type u32x4x4 = u32x4x4_generic; + type u64x2x4 = u64x2x4_generic; + type u128x4 = u128x4_generic; + #[inline(always)] + unsafe fn instance() -> Self { + Self + } +} + +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct u32x4_generic([u32; 4]); +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct u64x2_generic([u64; 2]); +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct u128x1_generic([u128; 1]); + +impl From<u32x4_generic> for vec128_storage { + #[inline(always)] + fn from(d: u32x4_generic) -> Self { + Self { d: d.0 } + } +} +impl From<u64x2_generic> for vec128_storage { + #[inline(always)] + fn from(q: u64x2_generic) -> Self { + Self { q: q.0 } + } +} +impl From<u128x1_generic> for vec128_storage { + #[inline(always)] + fn from(o: u128x1_generic) -> Self { + Self { q: q_of_o(o.0[0]) } + } +} + +impl Store<vec128_storage> for u32x4_generic { + #[inline(always)] + unsafe fn unpack(s: vec128_storage) -> Self { + Self(s.d) + } +} +impl Store<vec128_storage> for u64x2_generic { + #[inline(always)] + unsafe fn unpack(s: vec128_storage) -> Self { + Self(s.q) + } +} +impl Store<vec128_storage> for u128x1_generic { + #[inline(always)] + unsafe fn unpack(s: vec128_storage) -> Self { + Self([o_of_q(s.q); 1]) + } +} + +impl ArithOps for u32x4_generic {} +impl ArithOps for u64x2_generic {} +impl ArithOps for u128x1_generic {} + +impl Add for u32x4_generic { + type Output = Self; + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + dmap2(self, rhs, |x, y| x.wrapping_add(y)) + } +} +impl Add for u64x2_generic { + type Output = Self; + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + qmap2(self, rhs, |x, y| x.wrapping_add(y)) + } +} +impl Add for u128x1_generic { + type Output = Self; + #[inline(always)] + fn add(self, rhs: Self) -> Self::Output { + omap2(self, rhs, |x, y| x.wrapping_add(y)) + } +} +impl AddAssign for u32x4_generic { + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs + } +} +impl AddAssign for u64x2_generic { + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs + } +} +impl AddAssign for u128x1_generic { + #[inline(always)] + fn add_assign(&mut self, rhs: Self) { + *self = *self + rhs + } +} +impl BSwap for u32x4_generic { + #[inline(always)] + fn bswap(self) -> Self { + dmap(self, |x| x.swap_bytes()) + } +} +impl BSwap for u64x2_generic { + #[inline(always)] + fn bswap(self) -> Self { + qmap(self, |x| x.swap_bytes()) + } +} +impl BSwap for u128x1_generic { + #[inline(always)] + fn bswap(self) -> Self { + omap(self, |x| x.swap_bytes()) + } +} +impl StoreBytes for u32x4_generic { + #[inline(always)] + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + assert_eq!(input.len(), 16); + let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + dmap(x, |x| x.to_le()) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + assert_eq!(input.len(), 16); + let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + dmap(x, |x| x.to_be()) + } + #[inline(always)] + fn 
write_le(self, out: &mut [u8]) { + assert_eq!(out.len(), 16); + let x = dmap(self, |x| x.to_le()); + unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + } + #[inline(always)] + fn write_be(self, out: &mut [u8]) { + assert_eq!(out.len(), 16); + let x = dmap(self, |x| x.to_be()); + unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + } +} +impl StoreBytes for u64x2_generic { + #[inline(always)] + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + assert_eq!(input.len(), 16); + let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + qmap(x, |x| x.to_le()) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + assert_eq!(input.len(), 16); + let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + qmap(x, |x| x.to_be()) + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { + assert_eq!(out.len(), 16); + let x = qmap(self, |x| x.to_le()); + unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + } + #[inline(always)] + fn write_be(self, out: &mut [u8]) { + assert_eq!(out.len(), 16); + let x = qmap(self, |x| x.to_be()); + unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + } +} + +#[derive(Copy, Clone)] +pub struct G0; +#[derive(Copy, Clone)] +pub struct G1; +pub type u32x4x2_generic = x2<u32x4_generic, G0>; +pub type u64x2x2_generic = x2<u64x2_generic, G0>; +pub type u64x4_generic = x2<u64x2_generic, G1>; +pub type u128x2_generic = x2<u128x1_generic, G0>; +pub type u32x4x4_generic = x4<u32x4_generic>; +pub type u64x2x4_generic = x4<u64x2_generic>; +pub type u128x4_generic = x4<u128x1_generic>; + +impl Vector<[u32; 16]> for u32x4x4_generic { + fn to_scalars(self) -> [u32; 16] { + let [a, b, c, d] = self.0; + let a = a.0; + let b = b.0; + let c = c.0; + let d = d.0; + [ + a[0], a[1], a[2], a[3], // + b[0], b[1], b[2], b[3], // + c[0], c[1], c[2], c[3], // + d[0], d[1], d[2], d[3], // + ] + } +} + +impl MultiLane<[u32; 4]> for u32x4_generic { + #[inline(always)] + fn to_lanes(self) -> [u32; 4] { + self.0 + } + #[inline(always)] + fn from_lanes(xs: [u32; 4]) -> Self { + Self(xs) + } +} +impl MultiLane<[u64; 2]> for u64x2_generic { + #[inline(always)] + fn to_lanes(self) -> [u64; 2] { + self.0 + } + #[inline(always)] + fn from_lanes(xs: [u64; 2]) -> Self { + Self(xs) + } +} +impl MultiLane<[u64; 4]> for u64x4_generic { + #[inline(always)] + fn to_lanes(self) -> [u64; 4] { + let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes()); + [a[0], a[1], b[0], b[1]] + } + #[inline(always)] + fn from_lanes(xs: [u64; 4]) -> Self { + let (a, b) = ( + u64x2_generic::from_lanes([xs[0], xs[1]]), + u64x2_generic::from_lanes([xs[2], xs[3]]), + ); + x2::new([a, b]) + } +} +impl MultiLane<[u128; 1]> for u128x1_generic { + #[inline(always)] + fn to_lanes(self) -> [u128; 1] { + self.0 + } + #[inline(always)] + fn from_lanes(xs: [u128; 1]) -> Self { + Self(xs) + } +} +impl Vec4<u32> for u32x4_generic { + #[inline(always)] + fn extract(self, i: u32) -> u32 { + self.0[i as usize] + } + #[inline(always)] + fn insert(mut self, v: u32, i: u32) -> Self { + self.0[i as usize] = v; + self + } +} +impl Vec4<u64> for u64x4_generic { + #[inline(always)] + fn extract(self, i: u32) -> u64 { + let d: [u64; 4] = self.to_lanes(); + d[i as usize] + } + #[inline(always)] + fn insert(self, v: u64, i: u32) -> Self { + self.0[(i / 2) as usize].insert(v, i % 2); + self + } +} +impl Vec2<u64> for u64x2_generic { + 
#[inline(always)] + fn extract(self, i: u32) -> u64 { + self.0[i as usize] + } + #[inline(always)] + fn insert(mut self, v: u64, i: u32) -> Self { + self.0[i as usize] = v; + self + } +} + +impl Words4 for u32x4_generic { + #[inline(always)] + fn shuffle2301(self) -> Self { + self.swap64() + } + #[inline(always)] + fn shuffle1230(self) -> Self { + let x = self.0; + Self([x[3], x[0], x[1], x[2]]) + } + #[inline(always)] + fn shuffle3012(self) -> Self { + let x = self.0; + Self([x[1], x[2], x[3], x[0]]) + } +} +impl LaneWords4 for u32x4_generic { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + self.shuffle2301() + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + self.shuffle1230() + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + self.shuffle3012() + } +} + +impl Words4 for u64x4_generic { + #[inline(always)] + fn shuffle2301(self) -> Self { + x2::new([self.0[1], self.0[0]]) + } + #[inline(always)] + fn shuffle1230(self) -> Self { + unimplemented!() + } + #[inline(always)] + fn shuffle3012(self) -> Self { + unimplemented!() + } +} + +impl u32x4<GenericMachine> for u32x4_generic {} +impl u64x2<GenericMachine> for u64x2_generic {} +impl u128x1<GenericMachine> for u128x1_generic {} +impl u32x4x2<GenericMachine> for u32x4x2_generic {} +impl u64x2x2<GenericMachine> for u64x2x2_generic {} +impl u64x4<GenericMachine> for u64x4_generic {} +impl u128x2<GenericMachine> for u128x2_generic {} +impl u32x4x4<GenericMachine> for u32x4x4_generic {} +impl u64x2x4<GenericMachine> for u64x2x4_generic {} +impl u128x4<GenericMachine> for u128x4_generic {} + +#[macro_export] +macro_rules! dispatch { + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { + #[inline(always)] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + let $mach = unsafe { $crate::generic::GenericMachine::instance() }; + #[inline(always)] + fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + fn_impl($mach, $($arg),*) + } + }; + ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { + dispatch!($mach, $MTy, { + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body + }); + } +} +#[macro_export] +macro_rules! dispatch_light128 { + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { + #[inline(always)] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + let $mach = unsafe { $crate::generic::GenericMachine::instance() }; + #[inline(always)] + fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + fn_impl($mach, $($arg),*) + } + }; + ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { + dispatch!($mach, $MTy, { + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body + }); + } +} +#[macro_export] +macro_rules! 
dispatch_light256 { + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { + #[inline(always)] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + let $mach = unsafe { $crate::generic::GenericMachine::instance() }; + #[inline(always)] + fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + fn_impl($mach, $($arg),*) + } + }; + ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { + dispatch!($mach, $MTy, { + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body + }); + } +} +#[macro_export] +macro_rules! dispatch_light512 { + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { + #[inline(always)] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + let $mach = unsafe { $crate::generic::GenericMachine::instance() }; + #[inline(always)] + fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + fn_impl($mach, $($arg),*) + } + }; + ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { + dispatch!($mach, $MTy, { + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body + }); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_bswap32() { + let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; + let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; + + let m = unsafe { GenericMachine::instance() }; + + let x: <GenericMachine as Machine>::u32x4 = m.vec(xs); + let x = x.bswap(); + + let y = m.vec(ys); + assert_eq!(x, y); + } +} diff --git a/third_party/rust/ppv-lite86/src/lib.rs b/third_party/rust/ppv-lite86/src/lib.rs new file mode 100644 index 0000000000..638552fc2c --- /dev/null +++ b/third_party/rust/ppv-lite86/src/lib.rs @@ -0,0 +1,22 @@ +#![no_std] + +// Design: +// - safety: safe creation of any machine type is done only by instance methods of a +// Machine (which is a ZST + Copy type), which can only by created unsafely or safely +// through feature detection (e.g. fn AVX2::try_get() -> Option<Machine>). + +mod soft; +mod types; +pub use self::types::*; + +#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))] +pub mod x86_64; +#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(feature = "no_simd"), not(miri)))] +use self::x86_64 as arch; + +#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))] +pub mod generic; +#[cfg(any(feature = "no_simd", miri, not(target_arch = "x86_64"), all(target_arch = "x86_64", not(target_feature = "sse2"))))] +use self::generic as arch; + +pub use self::arch::{vec128_storage, vec256_storage, vec512_storage}; diff --git a/third_party/rust/ppv-lite86/src/soft.rs b/third_party/rust/ppv-lite86/src/soft.rs new file mode 100644 index 0000000000..0ae390c447 --- /dev/null +++ b/third_party/rust/ppv-lite86/src/soft.rs @@ -0,0 +1,472 @@ +//! Implement 256- and 512- bit in terms of 128-bit, for machines without native wide SIMD. 
+ +use crate::types::*; +use crate::{vec128_storage, vec256_storage, vec512_storage}; +use core::marker::PhantomData; +use core::ops::*; + +#[derive(Copy, Clone, Default)] +#[allow(non_camel_case_types)] +pub struct x2<W, G>(pub [W; 2], PhantomData<G>); +impl<W, G> x2<W, G> { + #[inline(always)] + pub fn new(xs: [W; 2]) -> Self { + x2(xs, PhantomData) + } +} +macro_rules! fwd_binop_x2 { + ($trait:ident, $fn:ident) => { + impl<W: $trait + Copy, G> $trait for x2<W, G> { + type Output = x2<W::Output, G>; + #[inline(always)] + fn $fn(self, rhs: Self) -> Self::Output { + x2::new([self.0[0].$fn(rhs.0[0]), self.0[1].$fn(rhs.0[1])]) + } + } + }; +} +macro_rules! fwd_binop_assign_x2 { + ($trait:ident, $fn_assign:ident) => { + impl<W: $trait + Copy, G> $trait for x2<W, G> { + #[inline(always)] + fn $fn_assign(&mut self, rhs: Self) { + (self.0[0]).$fn_assign(rhs.0[0]); + (self.0[1]).$fn_assign(rhs.0[1]); + } + } + }; +} +macro_rules! fwd_unop_x2 { + ($fn:ident) => { + #[inline(always)] + fn $fn(self) -> Self { + x2::new([self.0[0].$fn(), self.0[1].$fn()]) + } + }; +} +impl<W, G> RotateEachWord32 for x2<W, G> +where + W: Copy + RotateEachWord32, +{ + fwd_unop_x2!(rotate_each_word_right7); + fwd_unop_x2!(rotate_each_word_right8); + fwd_unop_x2!(rotate_each_word_right11); + fwd_unop_x2!(rotate_each_word_right12); + fwd_unop_x2!(rotate_each_word_right16); + fwd_unop_x2!(rotate_each_word_right20); + fwd_unop_x2!(rotate_each_word_right24); + fwd_unop_x2!(rotate_each_word_right25); +} +impl<W, G> RotateEachWord64 for x2<W, G> +where + W: Copy + RotateEachWord64, +{ + fwd_unop_x2!(rotate_each_word_right32); +} +impl<W, G> RotateEachWord128 for x2<W, G> where W: RotateEachWord128 {} +impl<W, G> BitOps0 for x2<W, G> +where + W: BitOps0, + G: Copy, +{ +} +impl<W, G> BitOps32 for x2<W, G> +where + W: BitOps32 + BitOps0, + G: Copy, +{ +} +impl<W, G> BitOps64 for x2<W, G> +where + W: BitOps64 + BitOps0, + G: Copy, +{ +} +impl<W, G> BitOps128 for x2<W, G> +where + W: BitOps128 + BitOps0, + G: Copy, +{ +} +fwd_binop_x2!(BitAnd, bitand); +fwd_binop_x2!(BitOr, bitor); +fwd_binop_x2!(BitXor, bitxor); +fwd_binop_x2!(AndNot, andnot); +fwd_binop_assign_x2!(BitAndAssign, bitand_assign); +fwd_binop_assign_x2!(BitOrAssign, bitor_assign); +fwd_binop_assign_x2!(BitXorAssign, bitxor_assign); +impl<W, G> ArithOps for x2<W, G> +where + W: ArithOps, + G: Copy, +{ +} +fwd_binop_x2!(Add, add); +fwd_binop_assign_x2!(AddAssign, add_assign); +impl<W: Not + Copy, G> Not for x2<W, G> { + type Output = x2<W::Output, G>; + #[inline(always)] + fn not(self) -> Self::Output { + x2::new([self.0[0].not(), self.0[1].not()]) + } +} +impl<W, G> UnsafeFrom<[W; 2]> for x2<W, G> { + #[inline(always)] + unsafe fn unsafe_from(xs: [W; 2]) -> Self { + x2::new(xs) + } +} +impl<W: Copy, G> Vec2<W> for x2<W, G> { + #[inline(always)] + fn extract(self, i: u32) -> W { + self.0[i as usize] + } + #[inline(always)] + fn insert(mut self, w: W, i: u32) -> Self { + self.0[i as usize] = w; + self + } +} +impl<W: Copy + Store<vec128_storage>, G> Store<vec256_storage> for x2<W, G> { + #[inline(always)] + unsafe fn unpack(p: vec256_storage) -> Self { + let p = p.split128(); + x2::new([W::unpack(p[0]), W::unpack(p[1])]) + } +} +impl<W, G> From<x2<W, G>> for vec256_storage +where + W: Copy, + vec128_storage: From<W>, +{ + #[inline(always)] + fn from(x: x2<W, G>) -> Self { + vec256_storage::new128([x.0[0].into(), x.0[1].into()]) + } +} +impl<W, G> Swap64 for x2<W, G> +where + W: Swap64 + Copy, +{ + fwd_unop_x2!(swap1); + fwd_unop_x2!(swap2); + fwd_unop_x2!(swap4); + 
fwd_unop_x2!(swap8); + fwd_unop_x2!(swap16); + fwd_unop_x2!(swap32); + fwd_unop_x2!(swap64); +} +impl<W: Copy, G> MultiLane<[W; 2]> for x2<W, G> { + #[inline(always)] + fn to_lanes(self) -> [W; 2] { + self.0 + } + #[inline(always)] + fn from_lanes(lanes: [W; 2]) -> Self { + x2::new(lanes) + } +} +impl<W: BSwap + Copy, G> BSwap for x2<W, G> { + #[inline(always)] + fn bswap(self) -> Self { + x2::new([self.0[0].bswap(), self.0[1].bswap()]) + } +} +impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> { + #[inline(always)] + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + let input = input.split_at(input.len() / 2); + x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)]) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + let input = input.split_at(input.len() / 2); + x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)]) + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { + let out = out.split_at_mut(out.len() / 2); + self.0[0].write_le(out.0); + self.0[1].write_le(out.1); + } + #[inline(always)] + fn write_be(self, out: &mut [u8]) { + let out = out.split_at_mut(out.len() / 2); + self.0[0].write_be(out.0); + self.0[1].write_be(out.1); + } +} +impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words2301(), + self.0[1].shuffle_lane_words2301(), + ]) + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words1230(), + self.0[1].shuffle_lane_words1230(), + ]) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words3012(), + self.0[1].shuffle_lane_words3012(), + ]) + } +} + +#[derive(Copy, Clone, Default)] +#[allow(non_camel_case_types)] +pub struct x4<W>(pub [W; 4]); +impl<W> x4<W> { + #[inline(always)] + pub fn new(xs: [W; 4]) -> Self { + x4(xs) + } +} +macro_rules! fwd_binop_x4 { + ($trait:ident, $fn:ident) => { + impl<W: $trait + Copy> $trait for x4<W> { + type Output = x4<W::Output>; + #[inline(always)] + fn $fn(self, rhs: Self) -> Self::Output { + x4([ + self.0[0].$fn(rhs.0[0]), + self.0[1].$fn(rhs.0[1]), + self.0[2].$fn(rhs.0[2]), + self.0[3].$fn(rhs.0[3]), + ]) + } + } + }; +} +macro_rules! fwd_binop_assign_x4 { + ($trait:ident, $fn_assign:ident) => { + impl<W: $trait + Copy> $trait for x4<W> { + #[inline(always)] + fn $fn_assign(&mut self, rhs: Self) { + self.0[0].$fn_assign(rhs.0[0]); + self.0[1].$fn_assign(rhs.0[1]); + self.0[2].$fn_assign(rhs.0[2]); + self.0[3].$fn_assign(rhs.0[3]); + } + } + }; +} +macro_rules! 
fwd_unop_x4 { + ($fn:ident) => { + #[inline(always)] + fn $fn(self) -> Self { + x4([ + self.0[0].$fn(), + self.0[1].$fn(), + self.0[2].$fn(), + self.0[3].$fn(), + ]) + } + }; +} +impl<W> RotateEachWord32 for x4<W> +where + W: Copy + RotateEachWord32, +{ + fwd_unop_x4!(rotate_each_word_right7); + fwd_unop_x4!(rotate_each_word_right8); + fwd_unop_x4!(rotate_each_word_right11); + fwd_unop_x4!(rotate_each_word_right12); + fwd_unop_x4!(rotate_each_word_right16); + fwd_unop_x4!(rotate_each_word_right20); + fwd_unop_x4!(rotate_each_word_right24); + fwd_unop_x4!(rotate_each_word_right25); +} +impl<W> RotateEachWord64 for x4<W> +where + W: Copy + RotateEachWord64, +{ + fwd_unop_x4!(rotate_each_word_right32); +} +impl<W> RotateEachWord128 for x4<W> where W: RotateEachWord128 {} +impl<W> BitOps0 for x4<W> where W: BitOps0 {} +impl<W> BitOps32 for x4<W> where W: BitOps32 + BitOps0 {} +impl<W> BitOps64 for x4<W> where W: BitOps64 + BitOps0 {} +impl<W> BitOps128 for x4<W> where W: BitOps128 + BitOps0 {} +fwd_binop_x4!(BitAnd, bitand); +fwd_binop_x4!(BitOr, bitor); +fwd_binop_x4!(BitXor, bitxor); +fwd_binop_x4!(AndNot, andnot); +fwd_binop_assign_x4!(BitAndAssign, bitand_assign); +fwd_binop_assign_x4!(BitOrAssign, bitor_assign); +fwd_binop_assign_x4!(BitXorAssign, bitxor_assign); +impl<W> ArithOps for x4<W> where W: ArithOps {} +fwd_binop_x4!(Add, add); +fwd_binop_assign_x4!(AddAssign, add_assign); +impl<W: Not + Copy> Not for x4<W> { + type Output = x4<W::Output>; + #[inline(always)] + fn not(self) -> Self::Output { + x4([ + self.0[0].not(), + self.0[1].not(), + self.0[2].not(), + self.0[3].not(), + ]) + } +} +impl<W> UnsafeFrom<[W; 4]> for x4<W> { + #[inline(always)] + unsafe fn unsafe_from(xs: [W; 4]) -> Self { + x4(xs) + } +} +impl<W: Copy> Vec4<W> for x4<W> { + #[inline(always)] + fn extract(self, i: u32) -> W { + self.0[i as usize] + } + #[inline(always)] + fn insert(mut self, w: W, i: u32) -> Self { + self.0[i as usize] = w; + self + } +} +impl<W: Copy> Vec4Ext<W> for x4<W> { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized, + { + ( + x4([a.0[0], b.0[0], c.0[0], d.0[0]]), + x4([a.0[1], b.0[1], c.0[1], d.0[1]]), + x4([a.0[2], b.0[2], c.0[2], d.0[2]]), + x4([a.0[3], b.0[3], c.0[3], d.0[3]]), + ) + } +} +impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> { + #[inline(always)] + unsafe fn unpack(p: vec512_storage) -> Self { + let p = p.split128(); + x4([ + W::unpack(p[0]), + W::unpack(p[1]), + W::unpack(p[2]), + W::unpack(p[3]), + ]) + } +} +impl<W> From<x4<W>> for vec512_storage +where + W: Copy, + vec128_storage: From<W>, +{ + #[inline(always)] + fn from(x: x4<W>) -> Self { + vec512_storage::new128([x.0[0].into(), x.0[1].into(), x.0[2].into(), x.0[3].into()]) + } +} +impl<W> Swap64 for x4<W> +where + W: Swap64 + Copy, +{ + fwd_unop_x4!(swap1); + fwd_unop_x4!(swap2); + fwd_unop_x4!(swap4); + fwd_unop_x4!(swap8); + fwd_unop_x4!(swap16); + fwd_unop_x4!(swap32); + fwd_unop_x4!(swap64); +} +impl<W: Copy> MultiLane<[W; 4]> for x4<W> { + #[inline(always)] + fn to_lanes(self) -> [W; 4] { + self.0 + } + #[inline(always)] + fn from_lanes(lanes: [W; 4]) -> Self { + x4(lanes) + } +} +impl<W: BSwap + Copy> BSwap for x4<W> { + #[inline(always)] + fn bswap(self) -> Self { + x4([ + self.0[0].bswap(), + self.0[1].bswap(), + self.0[2].bswap(), + self.0[3].bswap(), + ]) + } +} +impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> { + #[inline(always)] + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + let n = input.len() / 
4; + x4([ + W::unsafe_read_le(&input[..n]), + W::unsafe_read_le(&input[n..n * 2]), + W::unsafe_read_le(&input[n * 2..n * 3]), + W::unsafe_read_le(&input[n * 3..]), + ]) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + let n = input.len() / 4; + x4([ + W::unsafe_read_be(&input[..n]), + W::unsafe_read_be(&input[n..n * 2]), + W::unsafe_read_be(&input[n * 2..n * 3]), + W::unsafe_read_be(&input[n * 3..]), + ]) + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { + let n = out.len() / 4; + self.0[0].write_le(&mut out[..n]); + self.0[1].write_le(&mut out[n..n * 2]); + self.0[2].write_le(&mut out[n * 2..n * 3]); + self.0[3].write_le(&mut out[n * 3..]); + } + #[inline(always)] + fn write_be(self, out: &mut [u8]) { + let n = out.len() / 4; + self.0[0].write_be(&mut out[..n]); + self.0[1].write_be(&mut out[n..n * 2]); + self.0[2].write_be(&mut out[n * 2..n * 3]); + self.0[3].write_be(&mut out[n * 3..]); + } +} +impl<W: Copy + LaneWords4> LaneWords4 for x4<W> { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + x4([ + self.0[0].shuffle_lane_words2301(), + self.0[1].shuffle_lane_words2301(), + self.0[2].shuffle_lane_words2301(), + self.0[3].shuffle_lane_words2301(), + ]) + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + x4([ + self.0[0].shuffle_lane_words1230(), + self.0[1].shuffle_lane_words1230(), + self.0[2].shuffle_lane_words1230(), + self.0[3].shuffle_lane_words1230(), + ]) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + x4([ + self.0[0].shuffle_lane_words3012(), + self.0[1].shuffle_lane_words3012(), + self.0[2].shuffle_lane_words3012(), + self.0[3].shuffle_lane_words3012(), + ]) + } +} diff --git a/third_party/rust/ppv-lite86/src/types.rs b/third_party/rust/ppv-lite86/src/types.rs new file mode 100644 index 0000000000..f9f3bf1ce7 --- /dev/null +++ b/third_party/rust/ppv-lite86/src/types.rs @@ -0,0 +1,298 @@ +#![allow(non_camel_case_types)] +use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not}; + +pub trait AndNot { + type Output; + fn andnot(self, rhs: Self) -> Self::Output; +} +pub trait BSwap { + fn bswap(self) -> Self; +} +/// Ops that depend on word size +pub trait ArithOps: Add<Output = Self> + AddAssign + Sized + Copy + Clone + BSwap {} +/// Ops that are independent of word size and endian +pub trait BitOps0: + BitAnd<Output = Self> + + BitOr<Output = Self> + + BitXor<Output = Self> + + BitXorAssign + + Not<Output = Self> + + AndNot<Output = Self> + + Sized + + Copy + + Clone +{ +} + +pub trait BitOps32: BitOps0 + RotateEachWord32 {} +pub trait BitOps64: BitOps32 + RotateEachWord64 {} +pub trait BitOps128: BitOps64 + RotateEachWord128 {} + +pub trait RotateEachWord32 { + fn rotate_each_word_right7(self) -> Self; + fn rotate_each_word_right8(self) -> Self; + fn rotate_each_word_right11(self) -> Self; + fn rotate_each_word_right12(self) -> Self; + fn rotate_each_word_right16(self) -> Self; + fn rotate_each_word_right20(self) -> Self; + fn rotate_each_word_right24(self) -> Self; + fn rotate_each_word_right25(self) -> Self; +} + +pub trait RotateEachWord64 { + fn rotate_each_word_right32(self) -> Self; +} + +pub trait RotateEachWord128 {} + +// Vector type naming scheme: +// uN[xP]xL +// Unsigned; N-bit words * P bits per lane * L lanes +// +// A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of +// wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and +// slow inter-lane operations. 
+ +use crate::arch::{vec128_storage, vec256_storage, vec512_storage}; + +#[allow(clippy::missing_safety_doc)] +pub trait UnsafeFrom<T> { + unsafe fn unsafe_from(t: T) -> Self; +} + +/// A vector composed of two elements, which may be words or themselves vectors. +pub trait Vec2<W> { + fn extract(self, i: u32) -> W; + fn insert(self, w: W, i: u32) -> Self; +} + +/// A vector composed of four elements, which may be words or themselves vectors. +pub trait Vec4<W> { + fn extract(self, i: u32) -> W; + fn insert(self, w: W, i: u32) -> Self; +} +/// Vec4 functions which may not be implemented yet for all Vec4 types. +/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage, +/// import Vec4Ext only together with Vec4, and don't qualify its methods. +pub trait Vec4Ext<W> { + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized; +} +pub trait Vector<T> { + fn to_scalars(self) -> T; +} + +// TODO: multiples of 4 should inherit this +/// A vector composed of four words; depending on their size, operations may cross lanes. +pub trait Words4 { + fn shuffle1230(self) -> Self; + fn shuffle2301(self) -> Self; + fn shuffle3012(self) -> Self; +} + +/// A vector composed one or more lanes each composed of four words. +pub trait LaneWords4 { + fn shuffle_lane_words1230(self) -> Self; + fn shuffle_lane_words2301(self) -> Self; + fn shuffle_lane_words3012(self) -> Self; +} + +// TODO: make this a part of BitOps +/// Exchange neigboring ranges of bits of the specified size +pub trait Swap64 { + fn swap1(self) -> Self; + fn swap2(self) -> Self; + fn swap4(self) -> Self; + fn swap8(self) -> Self; + fn swap16(self) -> Self; + fn swap32(self) -> Self; + fn swap64(self) -> Self; +} + +pub trait u32x4<M: Machine>: + BitOps32 + + Store<vec128_storage> + + ArithOps + + Vec4<u32> + + Words4 + + LaneWords4 + + StoreBytes + + MultiLane<[u32; 4]> + + Into<vec128_storage> +{ +} +pub trait u64x2<M: Machine>: + BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage> +{ +} +pub trait u128x1<M: Machine>: + BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage> +{ +} + +pub trait u32x4x2<M: Machine>: + BitOps32 + + Store<vec256_storage> + + Vec2<M::u32x4> + + MultiLane<[M::u32x4; 2]> + + ArithOps + + Into<vec256_storage> + + StoreBytes +{ +} +pub trait u64x2x2<M: Machine>: + BitOps64 + + Store<vec256_storage> + + Vec2<M::u64x2> + + MultiLane<[M::u64x2; 2]> + + ArithOps + + StoreBytes + + Into<vec256_storage> +{ +} +pub trait u64x4<M: Machine>: + BitOps64 + + Store<vec256_storage> + + Vec4<u64> + + MultiLane<[u64; 4]> + + ArithOps + + Words4 + + StoreBytes + + Into<vec256_storage> +{ +} +pub trait u128x2<M: Machine>: + BitOps128 + + Store<vec256_storage> + + Vec2<M::u128x1> + + MultiLane<[M::u128x1; 2]> + + Swap64 + + Into<vec256_storage> +{ +} + +pub trait u32x4x4<M: Machine>: + BitOps32 + + Store<vec512_storage> + + Vec4<M::u32x4> + + Vec4Ext<M::u32x4> + + Vector<[u32; 16]> + + MultiLane<[M::u32x4; 4]> + + ArithOps + + LaneWords4 + + Into<vec512_storage> + + StoreBytes +{ +} +pub trait u64x2x4<M: Machine>: + BitOps64 + + Store<vec512_storage> + + Vec4<M::u64x2> + + MultiLane<[M::u64x2; 4]> + + ArithOps + + Into<vec512_storage> +{ +} +// TODO: Words4 +pub trait u128x4<M: Machine>: + BitOps128 + + Store<vec512_storage> + + Vec4<M::u128x1> + + MultiLane<[M::u128x1; 4]> + + Swap64 + + Into<vec512_storage> +{ +} + +/// A vector composed of multiple 128-bit lanes. 
+pub trait MultiLane<Lanes> { + /// Split a multi-lane vector into single-lane vectors. + fn to_lanes(self) -> Lanes; + /// Build a multi-lane vector from individual lanes. + fn from_lanes(lanes: Lanes) -> Self; +} + +/// Combine single vectors into a multi-lane vector. +pub trait VZip<V> { + fn vzip(self) -> V; +} + +impl<V, T> VZip<V> for T +where + V: MultiLane<T>, +{ + #[inline(always)] + fn vzip(self) -> V { + V::from_lanes(self) + } +} + +pub trait Machine: Sized + Copy { + type u32x4: u32x4<Self>; + type u64x2: u64x2<Self>; + type u128x1: u128x1<Self>; + + type u32x4x2: u32x4x2<Self>; + type u64x2x2: u64x2x2<Self>; + type u64x4: u64x4<Self>; + type u128x2: u128x2<Self>; + + type u32x4x4: u32x4x4<Self>; + type u64x2x4: u64x2x4<Self>; + type u128x4: u128x4<Self>; + + #[inline(always)] + fn unpack<S, V: Store<S>>(self, s: S) -> V { + unsafe { V::unpack(s) } + } + + #[inline(always)] + fn vec<V, A>(self, a: A) -> V + where + V: MultiLane<A>, + { + V::from_lanes(a) + } + + #[inline(always)] + fn read_le<V>(self, input: &[u8]) -> V + where + V: StoreBytes, + { + unsafe { V::unsafe_read_le(input) } + } + + #[inline(always)] + fn read_be<V>(self, input: &[u8]) -> V + where + V: StoreBytes, + { + unsafe { V::unsafe_read_be(input) } + } + + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. + unsafe fn instance() -> Self; +} + +pub trait Store<S> { + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. + unsafe fn unpack(p: S) -> Self; +} + +pub trait StoreBytes { + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. + unsafe fn unsafe_read_le(input: &[u8]) -> Self; + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. 
+ unsafe fn unsafe_read_be(input: &[u8]) -> Self; + fn write_le(self, out: &mut [u8]); + fn write_be(self, out: &mut [u8]); +} diff --git a/third_party/rust/ppv-lite86/src/x86_64/mod.rs b/third_party/rust/ppv-lite86/src/x86_64/mod.rs new file mode 100644 index 0000000000..937732da3a --- /dev/null +++ b/third_party/rust/ppv-lite86/src/x86_64/mod.rs @@ -0,0 +1,437 @@ +// crate minimums: sse2, x86_64 + +use crate::types::*; +use core::arch::x86_64::{__m128i, __m256i}; + +mod sse2; + +#[derive(Copy, Clone)] +pub struct YesS3; +#[derive(Copy, Clone)] +pub struct NoS3; + +#[derive(Copy, Clone)] +pub struct YesS4; +#[derive(Copy, Clone)] +pub struct NoS4; + +#[derive(Copy, Clone)] +pub struct YesA1; +#[derive(Copy, Clone)] +pub struct NoA1; + +#[derive(Copy, Clone)] +pub struct YesA2; +#[derive(Copy, Clone)] +pub struct NoA2; + +#[derive(Copy, Clone)] +pub struct YesNI; +#[derive(Copy, Clone)] +pub struct NoNI; + +use core::marker::PhantomData; + +#[derive(Copy, Clone)] +pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>); +impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI> +where + sse2::u128x1_sse2<S3, S4, NI>: Swap64, + sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, + sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, + sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4, + sse2::u128x1_sse2<S3, S4, NI>: BSwap, + sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>, + sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>, + sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>, + sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>, + sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>, +{ + type u32x4 = sse2::u32x4_sse2<S3, S4, NI>; + type u64x2 = sse2::u64x2_sse2<S3, S4, NI>; + type u128x1 = sse2::u128x1_sse2<S3, S4, NI>; + + type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>; + type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>; + type u64x4 = sse2::u64x4_sse2<S3, S4, NI>; + type u128x2 = sse2::u128x2_sse2<S3, S4, NI>; + + type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>; + type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>; + type u128x4 = sse2::u128x4_sse2<S3, S4, NI>; + + #[inline(always)] + unsafe fn instance() -> Self { + SseMachine(PhantomData) + } +} + +#[derive(Copy, Clone)] +pub struct Avx2Machine<NI>(PhantomData<NI>); +impl<NI: Copy> Machine for Avx2Machine<NI> +where + sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64, + sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>, + sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>, + sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4, +{ + type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>; + type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; + type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; + + type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>; + type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; + type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; + type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; + + type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>; + type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>; + type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>; + + #[inline(always)] + unsafe fn instance() -> Self { + Avx2Machine(PhantomData) + } +} + +pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>; +pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>; +pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>; +/// AVX but 
not AVX2: only 128-bit integer operations, but use VEX versions of everything +/// to avoid expensive SSE/VEX conflicts. +pub type AVX = SseMachine<YesS3, YesS4, NoNI>; +pub type AVX2 = Avx2Machine<NoNI>; + +/// Generic wrapper for unparameterized storage of any of the possible impls. +/// Converting into and out of this type should be essentially free, although it may be more +/// aligned than a particular impl requires. +#[allow(non_camel_case_types)] +#[derive(Copy, Clone)] +pub union vec128_storage { + u32x4: [u32; 4], + u64x2: [u64; 2], + u128x1: [u128; 1], + sse2: __m128i, +} +impl Store<vec128_storage> for vec128_storage { + #[inline(always)] + unsafe fn unpack(p: vec128_storage) -> Self { + p + } +} +impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { + #[inline(always)] + fn from(x: &'a vec128_storage) -> Self { + unsafe { &x.u32x4 } + } +} +impl From<[u32; 4]> for vec128_storage { + #[inline(always)] + fn from(u32x4: [u32; 4]) -> Self { + vec128_storage { u32x4 } + } +} +impl Default for vec128_storage { + #[inline(always)] + fn default() -> Self { + vec128_storage { u128x1: [0] } + } +} +impl Eq for vec128_storage {} +impl PartialEq for vec128_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.u128x1 == rhs.u128x1 } + } +} + +#[allow(non_camel_case_types)] +#[derive(Copy, Clone)] +pub union vec256_storage { + u32x8: [u32; 8], + u64x4: [u64; 4], + u128x2: [u128; 2], + sse2: [vec128_storage; 2], + avx: __m256i, +} +impl From<[u64; 4]> for vec256_storage { + #[inline(always)] + fn from(u64x4: [u64; 4]) -> Self { + vec256_storage { u64x4 } + } +} +impl Default for vec256_storage { + #[inline(always)] + fn default() -> Self { + vec256_storage { u128x2: [0, 0] } + } +} +impl vec256_storage { + #[inline(always)] + pub fn new128(xs: [vec128_storage; 2]) -> Self { + Self { sse2: xs } + } + #[inline(always)] + pub fn split128(self) -> [vec128_storage; 2] { + unsafe { self.sse2 } + } +} +impl Eq for vec256_storage {} +impl PartialEq for vec256_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.sse2 == rhs.sse2 } + } +} + +#[allow(non_camel_case_types)] +#[derive(Copy, Clone)] +pub union vec512_storage { + u32x16: [u32; 16], + u64x8: [u64; 8], + u128x4: [u128; 4], + sse2: [vec128_storage; 4], + avx: [vec256_storage; 2], +} +impl Default for vec512_storage { + #[inline(always)] + fn default() -> Self { + vec512_storage { + u128x4: [0, 0, 0, 0], + } + } +} +impl vec512_storage { + #[inline(always)] + pub fn new128(xs: [vec128_storage; 4]) -> Self { + Self { sse2: xs } + } + #[inline(always)] + pub fn split128(self) -> [vec128_storage; 4] { + unsafe { self.sse2 } + } +} +impl Eq for vec512_storage {} +impl PartialEq for vec512_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.avx == rhs.avx } + } +} + +macro_rules! 
impl_into { + ($storage:ident, $array:ty, $name:ident) => { + impl From<$storage> for $array { + #[inline(always)] + fn from(vec: $storage) -> Self { + unsafe { vec.$name } + } + } + }; +} +impl_into!(vec128_storage, [u32; 4], u32x4); +impl_into!(vec128_storage, [u64; 2], u64x2); +impl_into!(vec128_storage, [u128; 1], u128x1); +impl_into!(vec256_storage, [u32; 8], u32x8); +impl_into!(vec256_storage, [u64; 4], u64x4); +impl_into!(vec256_storage, [u128; 2], u128x2); +impl_into!(vec512_storage, [u32; 16], u32x16); +impl_into!(vec512_storage, [u64; 8], u64x8); +impl_into!(vec512_storage, [u128; 4], u128x4); + +/// Generate the full set of optimized implementations to take advantage of the most important +/// hardware feature sets. +/// +/// This dispatcher is suitable for maximizing throughput. +#[macro_export] +macro_rules! dispatch { + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { + #[cfg(feature = "std")] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + #[inline(always)] + fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + use std::arch::x86_64::*; + #[target_feature(enable = "avx2")] + unsafe fn impl_avx2($($arg: $argty),*) -> $ret { + let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*); + _mm256_zeroupper(); + ret + } + #[target_feature(enable = "avx")] + #[target_feature(enable = "sse4.1")] + #[target_feature(enable = "ssse3")] + unsafe fn impl_avx($($arg: $argty),*) -> $ret { + let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*); + _mm256_zeroupper(); + ret + } + #[target_feature(enable = "sse4.1")] + #[target_feature(enable = "ssse3")] + unsafe fn impl_sse41($($arg: $argty),*) -> $ret { + fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) + } + #[target_feature(enable = "ssse3")] + unsafe fn impl_ssse3($($arg: $argty),*) -> $ret { + fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) + } + #[target_feature(enable = "sse2")] + unsafe fn impl_sse2($($arg: $argty),*) -> $ret { + fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) + } + unsafe { + if is_x86_feature_detected!("avx2") { + impl_avx2($($arg),*) + } else if is_x86_feature_detected!("avx") { + impl_avx($($arg),*) + } else if is_x86_feature_detected!("sse4.1") { + impl_sse41($($arg),*) + } else if is_x86_feature_detected!("ssse3") { + impl_ssse3($($arg),*) + } else if is_x86_feature_detected!("sse2") { + impl_sse2($($arg),*) + } else { + unimplemented!() + } + } + } + #[cfg(not(feature = "std"))] + #[inline(always)] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + unsafe { + if cfg!(target_feature = "avx2") { + fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) + } else if cfg!(target_feature = "avx") { + fn_impl($crate::x86_64::AVX::instance(), $($arg),*) + } else if cfg!(target_feature = "sse4.1") { + fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) + } else if cfg!(target_feature = "ssse3") { + fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) + } else { + fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) + } + } + } + }; + ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { + dispatch!($mach, $MTy, { + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body + }); + } +} + +/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit 
+/// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX. +/// +/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware +/// features (e.g. because they are done infrequently), so minimizing their contribution to code +/// size is more important. +#[macro_export] +macro_rules! dispatch_light128 { + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { + #[cfg(feature = "std")] + $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + #[inline(always)] + fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + use std::arch::x86_64::*; + #[target_feature(enable = "avx")] + unsafe fn impl_avx($($arg: $argty),*) -> $ret { + fn_impl($crate::x86_64::AVX::instance(), $($arg),*) + } + #[target_feature(enable = "sse2")] + unsafe fn impl_sse2($($arg: $argty),*) -> $ret { + fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) + } + unsafe { + if is_x86_feature_detected!("avx") { + impl_avx($($arg),*) + } else if is_x86_feature_detected!("sse2") { + impl_sse2($($arg),*) + } else { + unimplemented!() + } + } + } + #[cfg(not(feature = "std"))] + #[inline(always)] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + unsafe { + if cfg!(target_feature = "avx2") { + fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) + } else if cfg!(target_feature = "avx") { + fn_impl($crate::x86_64::AVX::instance(), $($arg),*) + } else if cfg!(target_feature = "sse4.1") { + fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) + } else if cfg!(target_feature = "ssse3") { + fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) + } else { + fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) + } + } + } + }; + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { + dispatch_light128!($mach, $MTy, { + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body + }); + } +} + +/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit +/// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2. +/// +/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware +/// features (e.g. because they are done infrequently), so minimizing their contribution to code +/// size is more important. +#[macro_export] +macro_rules! 
dispatch_light256 { + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { + #[cfg(feature = "std")] + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret { + #[inline(always)] + fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + use std::arch::x86_64::*; + #[target_feature(enable = "avx")] + unsafe fn impl_avx($($arg: $argty),*) -> $ret { + fn_impl($crate::x86_64::AVX::instance(), $($arg),*) + } + #[target_feature(enable = "sse2")] + unsafe fn impl_sse2($($arg: $argty),*) -> $ret { + fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) + } + unsafe { + if is_x86_feature_detected!("avx") { + impl_avx($($arg),*) + } else if is_x86_feature_detected!("sse2") { + impl_sse2($($arg),*) + } else { + unimplemented!() + } + } + } + #[cfg(not(feature = "std"))] + #[inline(always)] + $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { + unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body + unsafe { + if cfg!(target_feature = "avx2") { + fn_impl($crate::x86_64::AVX2::instance(), $($arg),*) + } else if cfg!(target_feature = "avx") { + fn_impl($crate::x86_64::AVX::instance(), $($arg),*) + } else if cfg!(target_feature = "sse4.1") { + fn_impl($crate::x86_64::SSE41::instance(), $($arg),*) + } else if cfg!(target_feature = "ssse3") { + fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*) + } else { + fn_impl($crate::x86_64::SSE2::instance(), $($arg),*) + } + } + } + }; + ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => { + dispatch_light256!($mach, $MTy, { + $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body + }); + } +} diff --git a/third_party/rust/ppv-lite86/src/x86_64/sse2.rs b/third_party/rust/ppv-lite86/src/x86_64/sse2.rs new file mode 100644 index 0000000000..97197a436a --- /dev/null +++ b/third_party/rust/ppv-lite86/src/x86_64/sse2.rs @@ -0,0 +1,1703 @@ +use crate::soft::{x2, x4}; +use crate::types::*; +use crate::vec128_storage; +use crate::x86_64::Avx2Machine; +use crate::x86_64::SseMachine as Machine86; +use crate::x86_64::{NoS3, NoS4, YesS3, YesS4}; +use core::arch::x86_64::*; +use core::marker::PhantomData; +use core::ops::{ + Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not, +}; + +macro_rules! impl_binop { + ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => { + impl<S3, S4, NI> $trait for $vec<S3, S4, NI> { + type Output = Self; + #[inline(always)] + fn $fn(self, rhs: Self) -> Self::Output { + Self::new(unsafe { $impl_fn(self.x, rhs.x) }) + } + } + }; +} + +macro_rules! impl_binop_assign { + ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => { + impl<S3, S4, NI> $trait for $vec<S3, S4, NI> + where + $vec<S3, S4, NI>: Copy, + { + #[inline(always)] + fn $fn_assign(&mut self, rhs: Self) { + *self = self.$fn(rhs); + } + } + }; +} + +macro_rules! 
def_vec { + ($vec:ident, $word:ident) => { + #[allow(non_camel_case_types)] + #[derive(Copy, Clone)] + pub struct $vec<S3, S4, NI> { + x: __m128i, + s3: PhantomData<S3>, + s4: PhantomData<S4>, + ni: PhantomData<NI>, + } + + impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> { + #[inline(always)] + unsafe fn unpack(x: vec128_storage) -> Self { + Self::new(x.sse2) + } + } + impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage { + #[inline(always)] + fn from(x: $vec<S3, S4, NI>) -> Self { + vec128_storage { sse2: x.x } + } + } + impl<S3, S4, NI> $vec<S3, S4, NI> { + #[inline(always)] + fn new(x: __m128i) -> Self { + $vec { + x, + s3: PhantomData, + s4: PhantomData, + ni: PhantomData, + } + } + } + + impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI> + where + Self: BSwap, + { + #[inline(always)] + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + assert_eq!(input.len(), 16); + Self::new(_mm_loadu_si128(input.as_ptr() as *const _)) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + assert_eq!(input.len(), 16); + Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap() + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { + assert_eq!(out.len(), 16); + unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) } + } + #[inline(always)] + fn write_be(self, out: &mut [u8]) { + assert_eq!(out.len(), 16); + let x = self.bswap().x; + unsafe { + _mm_storeu_si128(out.as_mut_ptr() as *mut _, x); + } + } + } + + impl<S3, S4, NI> Default for $vec<S3, S4, NI> { + #[inline(always)] + fn default() -> Self { + Self::new(unsafe { _mm_setzero_si128() }) + } + } + + impl<S3, S4, NI> Not for $vec<S3, S4, NI> { + type Output = Self; + #[inline(always)] + fn not(self) -> Self::Output { + unsafe { + let ff = _mm_set1_epi64x(-1i64); + self ^ Self::new(ff) + } + } + } + + impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {} + impl_binop!($vec, BitAnd, bitand, _mm_and_si128); + impl_binop!($vec, BitOr, bitor, _mm_or_si128); + impl_binop!($vec, BitXor, bitxor, _mm_xor_si128); + impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand); + impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor); + impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor); + impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> { + type Output = Self; + #[inline(always)] + fn andnot(self, rhs: Self) -> Self { + Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) }) + } + } + }; +} + +macro_rules! impl_bitops32 { + ($vec:ident) => { + impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where + $vec<S3, S4, NI>: RotateEachWord32 + { + } + }; +} + +macro_rules! impl_bitops64 { + ($vec:ident) => { + impl_bitops32!($vec); + impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where + $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + { + } + }; +} + +macro_rules! impl_bitops128 { + ($vec:ident) => { + impl_bitops64!($vec); + impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where + $vec<S3, S4, NI>: RotateEachWord128 + { + } + }; +} + +macro_rules! rotr_32_s3 { + ($name:ident, $k0:expr, $k1:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) + } + }; +} +macro_rules! 
rotr_32 { + ($name:ident, $i:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_epi32(self.x, $i as i32), + _mm_slli_epi32(self.x, 32 - $i as i32), + ) + }) + } + }; +} +impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { + rotr_32!(rotate_each_word_right7, 7); + rotr_32_s3!( + rotate_each_word_right8, + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 + ); + rotr_32!(rotate_each_word_right11, 11); + rotr_32!(rotate_each_word_right12, 12); + rotr_32_s3!( + rotate_each_word_right16, + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 + ); + rotr_32!(rotate_each_word_right20, 20); + rotr_32_s3!( + rotate_each_word_right24, + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 + ); + rotr_32!(rotate_each_word_right25, 25); +} +impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> { + rotr_32!(rotate_each_word_right7, 7); + rotr_32!(rotate_each_word_right8, 8); + rotr_32!(rotate_each_word_right11, 11); + rotr_32!(rotate_each_word_right12, 12); + #[inline(always)] + fn rotate_each_word_right16(self) -> Self { + Self::new(swap16_s2(self.x)) + } + rotr_32!(rotate_each_word_right20, 20); + rotr_32!(rotate_each_word_right24, 24); + rotr_32!(rotate_each_word_right25, 25); +} + +macro_rules! rotr_64_s3 { + ($name:ident, $k0:expr, $k1:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) + } + }; +} +macro_rules! rotr_64 { + ($name:ident, $i:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_epi64(self.x, $i as i32), + _mm_slli_epi64(self.x, 64 - $i as i32), + ) + }) + } + }; +} +impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> { + rotr_64!(rotate_each_word_right7, 7); + rotr_64_s3!( + rotate_each_word_right8, + 0x080f_0e0d_0c0b_0a09, + 0x0007_0605_0403_0201 + ); + rotr_64!(rotate_each_word_right11, 11); + rotr_64!(rotate_each_word_right12, 12); + rotr_64_s3!( + rotate_each_word_right16, + 0x0908_0f0e_0d0c_0b0a, + 0x0100_0706_0504_0302 + ); + rotr_64!(rotate_each_word_right20, 20); + rotr_64_s3!( + rotate_each_word_right24, + 0x0a09_080f_0e0d_0c0b, + 0x0201_0007_0605_0403 + ); + rotr_64!(rotate_each_word_right25, 25); +} +impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> { + rotr_64!(rotate_each_word_right7, 7); + rotr_64!(rotate_each_word_right8, 8); + rotr_64!(rotate_each_word_right11, 11); + rotr_64!(rotate_each_word_right12, 12); + #[inline(always)] + fn rotate_each_word_right16(self) -> Self { + Self::new(swap16_s2(self.x)) + } + rotr_64!(rotate_each_word_right20, 20); + rotr_64!(rotate_each_word_right24, 24); + rotr_64!(rotate_each_word_right25, 25); +} +impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> { + #[inline(always)] + fn rotate_each_word_right32(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) }) + } +} + +macro_rules! 
rotr_128 { + ($name:ident, $i:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_si128(self.x, $i as i32), + _mm_slli_si128(self.x, 128 - $i as i32), + ) + }) + } + }; +} +// TODO: completely unoptimized +impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> { + rotr_128!(rotate_each_word_right7, 7); + rotr_128!(rotate_each_word_right8, 8); + rotr_128!(rotate_each_word_right11, 11); + rotr_128!(rotate_each_word_right12, 12); + rotr_128!(rotate_each_word_right16, 16); + rotr_128!(rotate_each_word_right20, 20); + rotr_128!(rotate_each_word_right24, 24); + rotr_128!(rotate_each_word_right25, 25); +} +// TODO: completely unoptimized +impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> { + rotr_128!(rotate_each_word_right32, 32); +} +impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {} + +def_vec!(u32x4_sse2, u32); +def_vec!(u64x2_sse2, u64); +def_vec!(u128x1_sse2, u128); + +impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> { + #[inline(always)] + fn to_lanes(self) -> [u32; 4] { + unsafe { + let x = _mm_cvtsi128_si64(self.x) as u64; + let y = _mm_extract_epi64(self.x, 1) as u64; + [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32] + } + } + #[inline(always)] + fn from_lanes(xs: [u32; 4]) -> Self { + unsafe { + let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64); + x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1); + Self::new(x) + } + } +} +impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> { + #[inline(always)] + fn to_lanes(self) -> [u32; 4] { + unsafe { + let x = _mm_cvtsi128_si64(self.x) as u64; + let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64; + [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32] + } + } + #[inline(always)] + fn from_lanes(xs: [u32; 4]) -> Self { + unsafe { + let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64; + let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64; + let x = _mm_cvtsi64_si128(x); + let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8); + Self::new(_mm_or_si128(x, y)) + } + } +} +impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> { + #[inline(always)] + fn to_lanes(self) -> [u64; 2] { + unsafe { + [ + _mm_cvtsi128_si64(self.x) as u64, + _mm_extract_epi64(self.x, 1) as u64, + ] + } + } + #[inline(always)] + fn from_lanes(xs: [u64; 2]) -> Self { + unsafe { + let mut x = _mm_cvtsi64_si128(xs[0] as i64); + x = _mm_insert_epi64(x, xs[1] as i64, 1); + Self::new(x) + } + } +} +impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> { + #[inline(always)] + fn to_lanes(self) -> [u64; 2] { + unsafe { + [ + _mm_cvtsi128_si64(self.x) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64, + ] + } + } + #[inline(always)] + fn from_lanes(xs: [u64; 2]) -> Self { + unsafe { + let x = _mm_cvtsi64_si128(xs[0] as i64); + let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8); + Self::new(_mm_or_si128(x, y)) + } + } +} +impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> { + #[inline(always)] + fn to_lanes(self) -> [u128; 1] { + unimplemented!() + } + #[inline(always)] + fn from_lanes(xs: [u128; 1]) -> Self { + unimplemented!("{:?}", xs) + } +} + +impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI> +where + u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy, +{ + #[inline(always)] + fn to_lanes(self) -> [u64; 4] { + let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes()); + 
[a[0], a[1], b[0], b[1]] + } + #[inline(always)] + fn from_lanes(xs: [u64; 4]) -> Self { + let (a, b) = ( + u64x2_sse2::from_lanes([xs[0], xs[1]]), + u64x2_sse2::from_lanes([xs[2], xs[3]]), + ); + x2::new([a, b]) + } +} + +macro_rules! impl_into { + ($from:ident, $to:ident) => { + impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> { + #[inline(always)] + fn from(x: $from<S3, S4, NI>) -> Self { + $to::new(x.x) + } + } + }; +} + +impl_into!(u128x1_sse2, u32x4_sse2); +impl_into!(u128x1_sse2, u64x2_sse2); + +impl_bitops32!(u32x4_sse2); +impl_bitops64!(u64x2_sse2); +impl_bitops128!(u128x1_sse2); + +impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where + u32x4_sse2<S3, S4, NI>: BSwap +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where + u64x2_sse2<S3, S4, NI>: BSwap +{ +} +impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32); +impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64); +impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add); +impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add); + +impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI> +where + u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, + Machine86<S3, S4, NI>: Machine, +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI> +where + u64x2_sse2<S3, S4, NI>: + RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, + Machine86<S3, S4, NI>: Machine, +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI> +where + u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86<S3, S4, NI>: Machine, + u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>, + u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>, +{ +} + +impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI> +where + u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>, + Machine86<YesS3, YesS4, NI>: Machine, +{ +} +impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI> +where + u64x2_sse2<YesS3, YesS4, NI>: + RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>, + Machine86<YesS3, YesS4, NI>: Machine, +{ +} +impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI> +where + u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86<YesS3, YesS4, NI>: Machine, + u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>, + u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>, +{ +} + +impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> { + #[inline(always)] + unsafe fn unsafe_from(xs: [u32; 4]) -> Self { + Self::new(_mm_set_epi32( + xs[3] as i32, + xs[2] as i32, + xs[1] as i32, + xs[0] as i32, + )) + } +} + +impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI> +where + Self: MultiLane<[u32; 4]>, +{ + #[inline(always)] + fn extract(self, i: u32) -> u32 { + self.to_lanes()[i as usize] + } + #[inline(always)] + fn insert(self, v: u32, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => _mm_insert_epi32(self.x, v as i32, 0), + 1 => _mm_insert_epi32(self.x, v as i32, 1), + 2 => _mm_insert_epi32(self.x, v as i32, 2), + 3 => _mm_insert_epi32(self.x, v as i32, 3), + _ => unreachable!(), + } + }) + } +} +impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, 
NI> +where + Self: MultiLane<[u32; 4]>, +{ + #[inline(always)] + fn extract(self, i: u32) -> u32 { + self.to_lanes()[i as usize] + } + #[inline(always)] + fn insert(self, v: u32, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => { + let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x); + _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)) + } + 1 => { + let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000); + x = _mm_slli_si128(x, 4); + x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); + _mm_shuffle_epi32(x, 0b1110_0001) + } + 2 => { + let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100); + x = _mm_slli_si128(x, 4); + x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); + _mm_shuffle_epi32(x, 0b1100_1001) + } + 3 => { + let mut x = _mm_slli_si128(self.x, 4); + x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32)); + _mm_shuffle_epi32(x, 0b0011_1001) + } + _ => unreachable!(), + } + }) + } +} + +impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + self.shuffle2301() + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + self.shuffle1230() + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + self.shuffle3012() + } +} + +impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> { + #[inline(always)] + fn shuffle2301(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) + } + #[inline(always)] + fn shuffle1230(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) }) + } + #[inline(always)] + fn shuffle3012(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) }) + } +} + +impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> { + #[inline(always)] + fn shuffle2301(self) -> Self { + x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) + } + #[inline(always)] + fn shuffle3012(self) -> Self { + unsafe { + x2::new([ + u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), + u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), + ]) + } + } + #[inline(always)] + fn shuffle1230(self) -> Self { + unsafe { + x2::new([ + u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)), + u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)), + ]) + } + } +} +impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> { + #[inline(always)] + fn shuffle2301(self) -> Self { + x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)]) + } + #[inline(always)] + fn shuffle3012(self) -> Self { + unsafe { + let a = _mm_srli_si128(self.0[0].x, 8); + let b = _mm_slli_si128(self.0[0].x, 8); + let c = _mm_srli_si128(self.0[1].x, 8); + let d = _mm_slli_si128(self.0[1].x, 8); + let da = _mm_or_si128(d, a); + let bc = _mm_or_si128(b, c); + x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)]) + } + } + #[inline(always)] + fn shuffle1230(self) -> Self { + unsafe { + let a = _mm_srli_si128(self.0[0].x, 8); + let b = _mm_slli_si128(self.0[0].x, 8); + let c = _mm_srli_si128(self.0[1].x, 8); + let d = _mm_slli_si128(self.0[1].x, 8); + let da = _mm_or_si128(d, a); + let bc = _mm_or_si128(b, c); + x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)]) + } + } +} + +impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> { + #[inline(always)] + unsafe fn unsafe_from(xs: [u64; 2]) -> Self { + Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64)) + } +} + +impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> { + #[inline(always)] + fn extract(self, i: u32) -> u64 { + unsafe { + match i { + 0 => _mm_cvtsi128_si64(self.x) as u64, + 1 => 
_mm_extract_epi64(self.x, 1) as u64, + _ => unreachable!(), + } + } + } + #[inline(always)] + fn insert(self, x: u64, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => _mm_insert_epi64(self.x, x as i64, 0), + 1 => _mm_insert_epi64(self.x, x as i64, 1), + _ => unreachable!(), + } + }) + } +} +impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> { + #[inline(always)] + fn extract(self, i: u32) -> u64 { + unsafe { + match i { + 0 => _mm_cvtsi128_si64(self.x) as u64, + 1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64, + _ => unreachable!(), + } + } + } + #[inline(always)] + fn insert(self, x: u64, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => _mm_or_si128( + _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x), + _mm_cvtsi64_si128(x as i64), + ), + 1 => _mm_or_si128( + _mm_move_epi64(self.x), + _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8), + ), + _ => unreachable!(), + } + }) + } +} + +impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { + let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); + _mm_shuffle_epi8(self.x, k) + }) + } +} +#[inline(always)] +fn bswap32_s2(x: __m128i) -> __m128i { + unsafe { + let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128()); + y = _mm_shufflehi_epi16(y, 0b0001_1011); + y = _mm_shufflelo_epi16(y, 0b0001_1011); + let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128()); + z = _mm_shufflehi_epi16(z, 0b0001_1011); + z = _mm_shufflelo_epi16(z, 0b0001_1011); + _mm_packus_epi16(y, z) + } +} +impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(bswap32_s2(self.x)) + } +} + +impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { + let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607); + _mm_shuffle_epi8(self.x, k) + }) + } +} +impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) }) + } +} + +impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> { + #[inline(always)] + fn bswap(self) -> Self { + Self::new(unsafe { + let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100); + _mm_shuffle_epi8(self.x, k) + }) + } +} +impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> { + #[inline(always)] + fn bswap(self) -> Self { + unimplemented!() + } +} + +macro_rules! 
swapi { + ($x:expr, $i:expr, $k:expr) => { + unsafe { + const K: u8 = $k; + let k = _mm_set1_epi8(K as i8); + u128x1_sse2::new(_mm_or_si128( + _mm_srli_epi16(_mm_and_si128($x.x, k), $i), + _mm_and_si128(_mm_slli_epi16($x.x, $i), k), + )) + } + }; +} +#[inline(always)] +fn swap16_s2(x: __m128i) -> __m128i { + unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) } +} +impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> { + #[inline(always)] + fn swap1(self) -> Self { + swapi!(self, 1, 0xaa) + } + #[inline(always)] + fn swap2(self) -> Self { + swapi!(self, 2, 0xcc) + } + #[inline(always)] + fn swap4(self) -> Self { + swapi!(self, 4, 0xf0) + } + #[inline(always)] + fn swap8(self) -> Self { + u128x1_sse2::new(unsafe { + let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001); + _mm_shuffle_epi8(self.x, k) + }) + } + #[inline(always)] + fn swap16(self) -> Self { + u128x1_sse2::new(unsafe { + let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302); + _mm_shuffle_epi8(self.x, k) + }) + } + #[inline(always)] + fn swap32(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) + } + #[inline(always)] + fn swap64(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) + } +} +impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> { + #[inline(always)] + fn swap1(self) -> Self { + swapi!(self, 1, 0xaa) + } + #[inline(always)] + fn swap2(self) -> Self { + swapi!(self, 2, 0xcc) + } + #[inline(always)] + fn swap4(self) -> Self { + swapi!(self, 4, 0xf0) + } + #[inline(always)] + fn swap8(self) -> Self { + u128x1_sse2::new(unsafe { + _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8)) + }) + } + #[inline(always)] + fn swap16(self) -> Self { + u128x1_sse2::new(swap16_s2(self.x)) + } + #[inline(always)] + fn swap32(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) }) + } + #[inline(always)] + fn swap64(self) -> Self { + u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) }) + } +} + +#[derive(Copy, Clone)] +pub struct G0; +#[derive(Copy, Clone)] +pub struct G1; + +#[allow(non_camel_case_types)] +pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>; +#[allow(non_camel_case_types)] +pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>; +#[allow(non_camel_case_types)] +pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>; +#[allow(non_camel_case_types)] +pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>; + +#[allow(non_camel_case_types)] +pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>; +#[allow(non_camel_case_types)] +pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; +#[allow(non_camel_case_types)] +pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; + +impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + unsafe { core::mem::transmute(self) } + } +} + +impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> +where + u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, + Machine86<S3, S4, NI>: Machine, + u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>, + u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>, +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI> +where + u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, + 
Machine86<S3, S4, NI>: Machine, + u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>, + u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>, +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI> +where + u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86<S3, S4, NI>: Machine, + u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI> +where + u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, + Machine86<S3, S4, NI>: Machine, + u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>, + u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>, + u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>, + u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>, + u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>, +{ +} + +impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI> +where + u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, + Avx2Machine<NI>: Machine, + u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>, + u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>, +{ +} +impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI> +where + u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, + Avx2Machine<NI>: Machine, + u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>, + u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>, +{ +} +impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI> +where + u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, + Avx2Machine<NI>: Machine, + u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4, +{ +} +impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI> +where + u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, + Avx2Machine<NI>: Machine, + u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>, + u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>, + u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>, + u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>, + u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>, +{ +} + +impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI> +where + u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>, +{ + #[inline(always)] + fn extract(self, i: u32) -> u64 { + match i { + 0 => self.0[0].extract(0), + 1 => self.0[0].extract(1), + 2 => self.0[1].extract(0), + 3 => self.0[1].extract(1), + _ => panic!(), + } + } + #[inline(always)] + fn insert(mut self, w: u64, i: u32) -> Self { + match i { + 0 => self.0[0] = self.0[0].insert(w, 0), + 1 => self.0[0] = self.0[0].insert(w, 1), + 2 => self.0[1] = self.0[1].insert(w, 0), + 3 => self.0[1] = self.0[1].insert(w, 1), + _ => panic!(), + }; + self + } +} + +impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI> +where + u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, + Machine86<S3, S4, NI>: Machine, + u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, + u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, + 
u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, + u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> +where + u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, + Machine86<S3, S4, NI>: Machine, + u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>, + u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>, +{ +} +impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI> +where + u128x1_sse2<S3, S4, NI>: Swap64 + BSwap, + Machine86<S3, S4, NI>: Machine, + u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>, + u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>, + u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>, + u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>, +{ +} + +impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> +where + u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, + Avx2Machine<NI>: Machine, + u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>, + u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>, +{ +} +impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI> +where + u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap, + Avx2Machine<NI>: Machine, + u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>, + u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>, + u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>, + u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>, +{ +} + +macro_rules! 
impl_into_x { + ($from:ident, $to:ident) => { + impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>> + for x2<$to<S3, S4, NI>, Gt> + { + #[inline(always)] + fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self { + x2::new([$to::from(x.0[0]), $to::from(x.0[1])]) + } + } + impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> { + #[inline(always)] + fn from(x: x4<$from<S3, S4, NI>>) -> Self { + x4::new([ + $to::from(x.0[0]), + $to::from(x.0[1]), + $to::from(x.0[2]), + $to::from(x.0[3]), + ]) + } + } + }; +} +impl_into_x!(u128x1_sse2, u64x2_sse2); +impl_into_x!(u128x1_sse2, u32x4_sse2); + +///// Debugging + +use core::fmt::{Debug, Formatter, Result}; + +impl<W: PartialEq, G> PartialEq for x2<W, G> { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1] + } +} + +#[allow(unused)] +#[inline(always)] +unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool { + let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110); + _mm_cvtsi128_si64(q) == -1 +} + +#[inline(always)] +unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool { + let q = _mm_cmpeq_epi32(x, y); + let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8)); + let q = _mm_cvtsi128_si64(q); + (p & q) == -1 +} + +impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { eq128_s2(self.x, rhs.x) } + } +} +impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI> +where + Self: Copy + MultiLane<[u32; 4]>, +{ + #[cold] + fn fmt(&self, fmt: &mut Formatter) -> Result { + fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes())) + } +} + +impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { eq128_s2(self.x, rhs.x) } + } +} +impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI> +where + Self: Copy + MultiLane<[u64; 2]>, +{ + #[cold] + fn fmt(&self, fmt: &mut Formatter) -> Result { + fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes())) + } +} + +impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI> +where + u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>, +{ + #[cold] + fn fmt(&self, fmt: &mut Formatter) -> Result { + let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes()); + fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]])) + } +} + +#[cfg(test)] +#[cfg(target_arch = "x86_64")] +mod test { + use super::*; + use crate::x86_64::{SSE2, SSE41, SSSE3}; + use crate::Machine; + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_bswap32_s2_vs_s3() { + let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; + let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); + x_s2.bswap() + }; + + let x_s3 = { + let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); + x_s3.bswap() + }; + + assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s2, s2.vec(ys)); + } + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_bswap64_s2_vs_s3() { + let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100]; + let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); + x_s2.bswap() + }; + + let x_s3 = { + let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); + x_s3.bswap() + }; + + 
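        // `ys` holds the expected byte-swapped form of `xs`.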
assert_eq!(x_s2, s2.vec(ys)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + } + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_shuffle32_s2_vs_s3() { + let xs = [0x0, 0x1, 0x2, 0x3]; + let ys = [0x2, 0x3, 0x0, 0x1]; + let zs = [0x1, 0x2, 0x3, 0x0]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); + x_s2.shuffle2301() + }; + let x_s3 = { + let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); + x_s3.shuffle2301() + }; + assert_eq!(x_s2, s2.vec(ys)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = { + let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); + x_s2.shuffle3012() + }; + let x_s3 = { + let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); + x_s3.shuffle3012() + }; + assert_eq!(x_s2, s2.vec(zs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = x_s2.shuffle1230(); + let x_s3 = x_s3.shuffle1230(); + assert_eq!(x_s2, s2.vec(xs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + } + + #[test] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] + fn test_shuffle64_s2_vs_s3() { + let xs = [0x0, 0x1, 0x2, 0x3]; + let ys = [0x2, 0x3, 0x0, 0x1]; + let zs = [0x1, 0x2, 0x3, 0x0]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + + let x_s2 = { + let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); + x_s2.shuffle2301() + }; + let x_s3 = { + let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); + x_s3.shuffle2301() + }; + assert_eq!(x_s2, s2.vec(ys)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = { + let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs); + x_s2.shuffle3012() + }; + let x_s3 = { + let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs); + x_s3.shuffle3012() + }; + assert_eq!(x_s2, s2.vec(zs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + + let x_s2 = x_s2.shuffle1230(); + let x_s3 = x_s3.shuffle1230(); + assert_eq!(x_s2, s2.vec(xs)); + assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + } + + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + #[test] + fn test_lanes_u32x4() { + let xs = [0x1, 0x2, 0x3, 0x4]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + let s4 = unsafe { SSE41::instance() }; + + { + let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); + let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs); + assert_eq!(x_s2, y_s2); + assert_eq!(xs, y_s2.to_lanes()); + } + + { + let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs); + let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs); + assert_eq!(x_s3, y_s3); + assert_eq!(xs, y_s3.to_lanes()); + } + + { + let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); + let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs); + assert_eq!(x_s4, y_s4); + assert_eq!(xs, y_s4.to_lanes()); + } + } + + #[test] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + fn test_lanes_u64x2() { + let xs = [0x1, 0x2]; + + let s2 = unsafe { SSE2::instance() }; + let s3 = unsafe { SSSE3::instance() }; + let s4 = unsafe { SSE41::instance() }; + + { + let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); + let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs); + assert_eq!(x_s2, y_s2); + assert_eq!(xs, y_s2.to_lanes()); + } + + { + let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs); + let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs); + assert_eq!(x_s3, y_s3); + assert_eq!(xs, y_s3.to_lanes()); + } + + { + let x_s4: 
<SSE41 as Machine>::u64x2 = s4.vec(xs); + let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs); + assert_eq!(x_s4, y_s4); + assert_eq!(xs, y_s4.to_lanes()); + } + } + + #[test] + fn test_vec4_u32x4_s2() { + let xs = [1, 2, 3, 4]; + let s2 = unsafe { SSE2::instance() }; + let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs); + assert_eq!(x_s2.extract(0), 1); + assert_eq!(x_s2.extract(1), 2); + assert_eq!(x_s2.extract(2), 3); + assert_eq!(x_s2.extract(3), 4); + assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4])); + assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4])); + assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4])); + assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf])); + } + + #[test] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + fn test_vec4_u32x4_s4() { + let xs = [1, 2, 3, 4]; + let s4 = unsafe { SSE41::instance() }; + let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs); + assert_eq!(x_s4.extract(0), 1); + assert_eq!(x_s4.extract(1), 2); + assert_eq!(x_s4.extract(2), 3); + assert_eq!(x_s4.extract(3), 4); + assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4])); + assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4])); + assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4])); + assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf])); + } + + #[test] + fn test_vec2_u64x2_s2() { + let xs = [0x1, 0x2]; + let s2 = unsafe { SSE2::instance() }; + let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs); + assert_eq!(x_s2.extract(0), 1); + assert_eq!(x_s2.extract(1), 2); + assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2])); + assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf])); + } + + #[test] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] + fn test_vec4_u64x2_s4() { + let xs = [0x1, 0x2]; + let s4 = unsafe { SSE41::instance() }; + let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs); + assert_eq!(x_s4.extract(0), 1); + assert_eq!(x_s4.extract(1), 2); + assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2])); + assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf])); + } +} + +pub mod avx2 { + #![allow(non_camel_case_types)] + use crate::soft::{x2, x4}; + use crate::types::*; + use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; + use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; + use core::arch::x86_64::*; + use core::marker::PhantomData; + use core::ops::*; + + #[derive(Copy, Clone)] + pub struct u32x4x2_avx2<NI> { + x: __m256i, + ni: PhantomData<NI>, + } + + impl<NI> u32x4x2_avx2<NI> { + #[inline(always)] + fn new(x: __m256i) -> Self { + Self { x, ni: PhantomData } + } + } + + impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> { + #[inline(always)] + unsafe fn unpack(p: vec256_storage) -> Self { + Self::new(p.avx) + } + } + impl<NI> StoreBytes for u32x4x2_avx2<NI> { + #[inline(always)] + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + assert_eq!(input.len(), 32); + Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + Self::unsafe_read_le(input).bswap() + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { + unsafe { + assert_eq!(out.len(), 32); + _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) + } + } + #[inline(always)] + fn write_be(self, out: &mut [u8]) { + self.bswap().write_le(out) + } + } + impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { + #[inline(always)] + fn to_lanes(self) -> 
[u32x4_sse2<YesS3, YesS4, NI>; 2] { + unsafe { + [ + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), + ] + } + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) + } + } + impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { + #[inline(always)] + fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { + unsafe { + match i { + 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), + _ => panic!(), + } + } + } + #[inline(always)] + fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { + Self::new(unsafe { + match i { + 0 => _mm256_inserti128_si256(self.x, w.x, 0), + 1 => _mm256_inserti128_si256(self.x, w.x, 1), + _ => panic!(), + } + }) + } + } + impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} + macro_rules! shuf_lane_bytes { + ($name:ident, $k0:expr, $k1:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) + }) + } + }; + } + macro_rules! rotr_32 { + ($name:ident, $i:expr) => { + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm256_or_si256( + _mm256_srli_epi32(self.x, $i as i32), + _mm256_slli_epi32(self.x, 32 - $i as i32), + ) + }) + } + }; + } + impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { + rotr_32!(rotate_each_word_right7, 7); + shuf_lane_bytes!( + rotate_each_word_right8, + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 + ); + rotr_32!(rotate_each_word_right11, 11); + rotr_32!(rotate_each_word_right12, 12); + shuf_lane_bytes!( + rotate_each_word_right16, + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 + ); + rotr_32!(rotate_each_word_right20, 20); + shuf_lane_bytes!( + rotate_each_word_right24, + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 + ); + rotr_32!(rotate_each_word_right25, 25); + } + impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { + #[inline(always)] + fn from(x: u32x4x2_avx2<NI>) -> Self { + Self { avx: x.x } + } + } + + macro_rules! impl_assign { + ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => { + impl<NI> $Assign for $vec<NI> + where + NI: Copy, + { + #[inline(always)] + fn $assign_fn(&mut self, rhs: Self) { + *self = self.$bin_fn(rhs); + } + } + }; + } + impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); + impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); + impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); + impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); + + macro_rules! 
impl_bitop { + ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { + impl<NI> $Op for $vec<NI> { + type Output = Self; + #[inline(always)] + fn $op_fn(self, rhs: Self) -> Self::Output { + Self::new(unsafe { $impl_fn(self.x, rhs.x) }) + } + } + }; + } + impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); + impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); + impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); + impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); + impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); + + impl<NI> Not for u32x4x2_avx2<NI> { + type Output = Self; + #[inline(always)] + fn not(self) -> Self::Output { + unsafe { + let f = _mm256_set1_epi8(-0x7f); + Self::new(f) ^ self + } + } + } + + impl<NI> BSwap for u32x4x2_avx2<NI> { + shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); + } + + impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> + where + NI: Copy, + { + #[inline(always)] + fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) + } + } + + impl<NI> LaneWords4 for u32x4x2_avx2<NI> { + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) + } + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) + } + } + + /////////////////////////////////////////////////////////////////////////////////////////// + + pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; + impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} + + impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { + #[inline(always)] + unsafe fn unpack(p: vec512_storage) -> Self { + Self::new([ + u32x4x2_avx2::unpack(p.avx[0]), + u32x4x2_avx2::unpack(p.avx[1]), + ]) + } + } + impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { + let [a, b] = self.0[0].to_lanes(); + let [c, d] = self.0[1].to_lanes(); + [a, b, c, d] + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { + let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); + let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); + Self::new([ab, cd]) + } + } + impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + #[inline(always)] + fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { + match i { + 0 => self.0[0].extract(0), + 1 => self.0[0].extract(1), + 2 => self.0[1].extract(0), + 3 => self.0[1].extract(1), + _ => panic!(), + } + } + #[inline(always)] + fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { + Self::new(match i { + 0 | 1 => [self.0[0].insert(w, i), self.0[1]], + 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], + _ => panic!(), + }) + } + } + impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { + /* + * a00:a01 a10:a11 + * b00:b01 b10:b11 + * c00:c01 c10:c11 + * d00:d01 d10:d11 + * => + * a00:b00 c00:d00 + * a01:b01 c01:d01 + * a10:b10 c10:d10 + * a11:b11 c11:d11 + */ + unsafe { + let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); + let ab01 = 
u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); + let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); + let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); + let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); + let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); + let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); + let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); + ( + Self::new([ab00, cd00]), + Self::new([ab01, cd01]), + Self::new([ab10, cd10]), + Self::new([ab11, cd11]), + ) + } + } + } + impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + unsafe { core::mem::transmute(self) } + } + } + impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { + #[inline(always)] + fn from(x: u32x4x4_avx2<NI>) -> Self { + Self { + avx: [ + vec256_storage { avx: x.0[0].x }, + vec256_storage { avx: x.0[1].x }, + ], + } + } + } + impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { + #[inline(always)] + fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { + Self::new(unsafe { + [ + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), + ] + }) + } + } +} |
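The lane transpose in `Vec4Ext::transpose4` above can be hard to follow from the intrinsics alone. The sketch below is not part of the vendored crate: it is a standalone scalar model (all names such as `Lane`, `Vec256`, `U32x4x4` and `permute2x128_0x20` are hypothetical) that mirrors what `_mm256_permute2x128_si256` with selectors 0x20/0x31 does in that function, so the result can be checked against the layout diagram in the source comment.

// Illustration only: model a __m256i as two 128-bit lanes of four u32 words,
// and a u32x4x4_avx2 as two such registers (self.0[0], self.0[1]).
type Lane = [u32; 4];
type Vec256 = [Lane; 2];
type U32x4x4 = [Vec256; 2];

// Scalar stand-in for _mm256_permute2x128_si256(a, b, 0x20):
// result = [low lane of a, low lane of b].
fn permute2x128_0x20(a: Vec256, b: Vec256) -> Vec256 {
    [a[0], b[0]]
}

// Scalar stand-in for _mm256_permute2x128_si256(a, b, 0x31):
// result = [high lane of a, high lane of b].
fn permute2x128_0x31(a: Vec256, b: Vec256) -> Vec256 {
    [a[1], b[1]]
}

// Scalar equivalent of transpose4: output i collects 128-bit lane i of a, b, c, d.
fn transpose4(a: U32x4x4, b: U32x4x4, c: U32x4x4, d: U32x4x4) -> [U32x4x4; 4] {
    let ab00 = permute2x128_0x20(a[0], b[0]); // a00 : b00
    let ab01 = permute2x128_0x31(a[0], b[0]); // a01 : b01
    let ab10 = permute2x128_0x20(a[1], b[1]); // a10 : b10
    let ab11 = permute2x128_0x31(a[1], b[1]); // a11 : b11
    let cd00 = permute2x128_0x20(c[0], d[0]);
    let cd01 = permute2x128_0x31(c[0], d[0]);
    let cd10 = permute2x128_0x20(c[1], d[1]);
    let cd11 = permute2x128_0x31(c[1], d[1]);
    [
        [ab00, cd00], // a00:b00 c00:d00
        [ab01, cd01], // a01:b01 c01:d01
        [ab10, cd10], // a10:b10 c10:d10
        [ab11, cd11], // a11:b11 c11:d11
    ]
}

fn main() {
    // Word 0 of each lane encodes (input index, lane index) so the shuffle is visible.
    let mk = |v: u32| -> U32x4x4 {
        [
            [[v * 10, 0, 0, 0], [v * 10 + 1, 0, 0, 0]],
            [[v * 10 + 2, 0, 0, 0], [v * 10 + 3, 0, 0, 0]],
        ]
    };
    let (a, b, c, d) = (mk(1), mk(2), mk(3), mk(4));
    let t = transpose4(a, b, c, d);
    // Output i holds lane i of a, b, c, d, in that order, as in the source comment.
    assert_eq!(t[0], [[[10, 0, 0, 0], [20, 0, 0, 0]], [[30, 0, 0, 0], [40, 0, 0, 0]]]);
    assert_eq!(t[3], [[[13, 0, 0, 0], [23, 0, 0, 0]], [[33, 0, 0, 0], [43, 0, 0, 0]]]);
    println!("lane transpose matches the layout sketched in transpose4");
}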