From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 vendor/ppv-lite86/.cargo-checksum.json |    1 +
 vendor/ppv-lite86/Cargo.toml           |   32 +
 vendor/ppv-lite86/LICENSE-APACHE       |  201 ++++
 vendor/ppv-lite86/LICENSE-MIT          |   25 +
 vendor/ppv-lite86/src/generic.rs       |  831 ++++++++++++++++
 vendor/ppv-lite86/src/lib.rs           |   22 +
 vendor/ppv-lite86/src/soft.rs          |  420 ++++++++
 vendor/ppv-lite86/src/types.rs         |  279 ++++++
 vendor/ppv-lite86/src/x86_64/mod.rs    |  433 +++++++++
 vendor/ppv-lite86/src/x86_64/sse2.rs   | 1632 ++++++++++++++++++++++++++++++++
 10 files changed, 3876 insertions(+)
 create mode 100644 vendor/ppv-lite86/.cargo-checksum.json
 create mode 100644 vendor/ppv-lite86/Cargo.toml
 create mode 100644 vendor/ppv-lite86/LICENSE-APACHE
 create mode 100644 vendor/ppv-lite86/LICENSE-MIT
 create mode 100644 vendor/ppv-lite86/src/generic.rs
 create mode 100644 vendor/ppv-lite86/src/lib.rs
 create mode 100644 vendor/ppv-lite86/src/soft.rs
 create mode 100644 vendor/ppv-lite86/src/types.rs
 create mode 100644 vendor/ppv-lite86/src/x86_64/mod.rs
 create mode 100644 vendor/ppv-lite86/src/x86_64/sse2.rs

(limited to 'vendor/ppv-lite86')

diff --git a/vendor/ppv-lite86/.cargo-checksum.json b/vendor/ppv-lite86/.cargo-checksum.json
new file mode 100644
index 000000000..5b321a6fa
--- /dev/null
+++ b/vendor/ppv-lite86/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.toml":"e6b587643c21b13854a8622afd14b380858781d7535160346af790e217349f85","LICENSE-APACHE":"0218327e7a480793ffdd4eb792379a9709e5c135c7ba267f709d6f6d4d70af0a","LICENSE-MIT":"4cada0bd02ea3692eee6f16400d86c6508bbd3bafb2b65fed0419f36d4f83e8f","src/generic.rs":"070d2ba8ea384ce7ac514f643e5109e39ddd2c34e72b9cc2f561bf55ce84fa32","src/lib.rs":"75beb27d89dcc7541c8e81ad1f4bec81908d8d5fa0e3adec47cb1a1f008dfd32","src/soft.rs":"6fb8aa428183ec09d63d45761507d8da6dffc45990f2d1fcfd387c4c856599cc","src/types.rs":"4890069359ed53575a6b9a8168037ccdd4b029c8d61d540e9770fe3c90359345","src/x86_64/mod.rs":"1f210594432c180f8560d2111a206ae633a2bcd44ebdad390346574cb66cb0b2","src/x86_64/sse2.rs":"199022e8918b37f7287274ea886b097027ce12b2522038579d79ccf25dd333e1"},"package":"237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea"}
\ No newline at end of file
diff --git a/vendor/ppv-lite86/Cargo.toml b/vendor/ppv-lite86/Cargo.toml
new file mode 100644
index 000000000..9a82142ac
--- /dev/null
+++ b/vendor/ppv-lite86/Cargo.toml
@@ -0,0 +1,32 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+edition = "2018"
+name = "ppv-lite86"
+version = "0.2.8"
+authors = ["The CryptoCorrosion Contributors"]
+description = "Implementation of the crypto-simd API for x86"
+keywords = ["crypto", "simd", "x86"]
+categories = ["cryptography", "no-std"]
+license = "MIT/Apache-2.0"
+repository = "https://github.com/cryptocorrosion/cryptocorrosion"
+
+[dependencies]
+
+[features]
+default = ["std"]
+no_simd = []
+simd = []
+std = []
+[badges.travis-ci]
+repository = "cryptocorrosion/cryptocorrosion"
diff --git a/vendor/ppv-lite86/LICENSE-APACHE b/vendor/ppv-lite86/LICENSE-APACHE
new file mode 100644
index 000000000..1eb321535
--- /dev/null
+++ b/vendor/ppv-lite86/LICENSE-APACHE
@@ -0,0 +1,201 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+   To apply the Apache License to your work, attach the following
+   boilerplate notice, with the fields enclosed by brackets "[]"
+   replaced with your own identifying information. (Don't include
+   the brackets!)  The text should be enclosed in the appropriate
+   comment syntax for the file format. We also recommend that a
+   file or class name and description of purpose be included on the
+   same "printed page" as the copyright notice for easier
+   identification within third-party archives.
+
+Copyright 2019 The CryptoCorrosion Contributors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/vendor/ppv-lite86/LICENSE-MIT b/vendor/ppv-lite86/LICENSE-MIT
new file mode 100644
index 000000000..d78c961bc
--- /dev/null
+++ b/vendor/ppv-lite86/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2019 The CryptoCorrosion Contributors
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/vendor/ppv-lite86/src/generic.rs b/vendor/ppv-lite86/src/generic.rs
new file mode 100644
index 000000000..4f4113fc3
--- /dev/null
+++ b/vendor/ppv-lite86/src/generic.rs
@@ -0,0 +1,831 @@
+#![allow(non_camel_case_types)]
+
+use core::ops::*;
+use crate::soft::{x2, x4};
+use crate::types::*;
+
+#[derive(Clone, Copy)]
+pub union vec128_storage {
+    d: [u32; 4],
+    q: [u64; 2],
+    o: [u128; 1],
+}
+impl From<[u32; 4]> for vec128_storage {
+    #[inline]
+    fn from(d: [u32; 4]) -> Self {
+        Self { d }
+    }
+}
+impl From<vec128_storage> for [u32; 4] {
+    #[inline]
+    fn from(d: vec128_storage) -> Self {
+        unsafe { d.d }
+    }
+}
+impl From<[u64; 2]> for vec128_storage {
+    #[inline]
+    fn from(q: [u64; 2]) -> Self {
+        Self { q }
+    }
+}
+impl From<vec128_storage> for [u64; 2] {
+    #[inline]
+    fn from(q: vec128_storage) -> Self {
+        unsafe { q.q }
+    }
+}
+impl Default for vec128_storage {
+    #[inline]
+    fn default() -> Self {
+        Self { o: [0] }
+    }
+}
+impl Eq for vec128_storage {}
+impl PartialEq<vec128_storage> for vec128_storage {
+    #[inline]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.o == rhs.o }
+    }
+}
+#[derive(Clone, Copy, PartialEq, Eq, Default)]
+pub struct vec256_storage {
+    v128: [vec128_storage; 2],
+}
+impl vec256_storage {
+    #[inline(always)]
+    pub fn new128(v128: [vec128_storage; 2]) -> Self {
+        Self { v128 }
+    }
+    #[inline(always)]
+    pub fn split128(self) -> [vec128_storage; 2] {
+        self.v128
+    }
+}
+impl From<[u64; 4]> for vec256_storage {
+    #[inline]
+    fn from(q: [u64; 4]) -> Self {
+        Self { v128: [[0, 1].into(), [2, 3].into()] }
+    }
+}
+impl From<vec256_storage> for [u64; 4] {
+    #[inline]
+    fn from(q: vec256_storage) -> Self {
+        let [a, b]: [u64; 2] = q.v128[0].into();
+        let [c, d]: [u64; 2] = q.v128[1].into();
+        [a, b, c, d]
+    }
+}
+#[derive(Clone, Copy, PartialEq, Eq, Default)]
+pub struct vec512_storage {
+    v128: [vec128_storage; 4],
+}
+impl vec512_storage {
+    #[inline(always)]
+    pub fn new128(v128: [vec128_storage; 4]) -> Self {
+        Self { v128 }
+    }
+    #[inline(always)]
+    pub fn split128(self) -> [vec128_storage; 4] {
+        self.v128
+    }
+}
+
+fn dmap<T, F>(t: T, f: F) -> T
+where
+    T: Store<vec128_storage> + Into<vec128_storage>,
+    F: Fn(u32) -> u32,
+{
+    let t: vec128_storage = t.into();
+    let d = unsafe { t.d };
+    let d = vec128_storage {
+        d: [f(d[0]), f(d[1]), f(d[2]), f(d[3])],
+    };
+    unsafe { T::unpack(d) }
+}
+
+fn dmap2<T, F>(a: T, b: T, f: F) -> T
+where
+    T: Store<vec128_storage> + Into<vec128_storage>,
+    F: Fn(u32, u32) -> u32,
+{
+    let a: vec128_storage = a.into();
+    let b: vec128_storage = b.into();
+    let ao = unsafe { a.d };
+    let bo = unsafe { b.d };
+    let d = vec128_storage {
+        d: [
+            f(ao[0], bo[0]),
+            f(ao[1], bo[1]),
+            f(ao[2], bo[2]),
+            f(ao[3], bo[3]),
+        ],
+    };
+    unsafe { T::unpack(d) }
+}
+
+fn qmap<T, F>(t: T, f: F) -> T
+where
+    T: Store<vec128_storage> + Into<vec128_storage>,
+    F: Fn(u64) -> u64,
+{
+    let t: vec128_storage = t.into();
+    let q = unsafe { t.q };
+    let q = vec128_storage {
+        q: [f(q[0]), f(q[1])],
+    };
+    unsafe { T::unpack(q) }
+}
+
+fn qmap2<T, F>(a: T, b: T, f: F) -> T
+where
+    T: Store<vec128_storage> + Into<vec128_storage>,
+    F: Fn(u64, u64) -> u64,
+{
+    let a: vec128_storage = a.into();
+    let b: vec128_storage = b.into();
+    let ao = unsafe { a.q };
+    let bo = unsafe { b.q };
+    let q = vec128_storage {
+        q: [f(ao[0], bo[0]), f(ao[1], bo[1])],
+    };
+    unsafe { T::unpack(q) }
+}
+
+fn omap<T, F>(a: T, f: F) -> T
+where
+    T: Store<vec128_storage> + Into<vec128_storage>,
+    F: Fn(u128) -> u128,
+{
+    let a: vec128_storage = a.into();
+    let ao = unsafe { a.o };
+    let o = vec128_storage { o: [f(ao[0])] };
+    unsafe { T::unpack(o) }
+}
+
+fn omap2<T, F>(a: T, b: T, f: F) -> T
+where
+    T: Store<vec128_storage> + Into<vec128_storage>,
+    F: Fn(u128, u128) -> u128,
+{
+    let a: vec128_storage = a.into();
+    let b: vec128_storage = b.into();
+    let ao = unsafe { a.o };
+    let bo = unsafe { b.o };
+    let o = vec128_storage {
+        o: [f(ao[0], bo[0])],
+    };
+    unsafe { T::unpack(o) }
+}
+
+impl RotateEachWord128 for u128x1_generic {}
+impl BitOps128 for u128x1_generic {}
+impl BitOps64 for u128x1_generic {}
+impl BitOps64 for u64x2_generic {}
+impl BitOps32 for u128x1_generic {}
+impl BitOps32 for u64x2_generic {}
+impl BitOps32 for u32x4_generic {}
+impl BitOps0 for u128x1_generic {}
+impl BitOps0 for u64x2_generic {}
+impl BitOps0 for u32x4_generic {}
+
+macro_rules! impl_bitops {
+    ($vec:ident) => {
+        impl Not for $vec {
+            type Output = Self;
+            #[inline(always)]
+            fn not(self) -> Self::Output {
+                omap(self, |x| !x)
+            }
+        }
+        impl BitAnd for $vec {
+            type Output = Self;
+            #[inline(always)]
+            fn bitand(self, rhs: Self) -> Self::Output {
+                omap2(self, rhs, |x, y| x & y)
+            }
+        }
+        impl BitOr for $vec {
+            type Output = Self;
+            #[inline(always)]
+            fn bitor(self, rhs: Self) -> Self::Output {
+                omap2(self, rhs, |x, y| x | y)
+            }
+        }
+        impl BitXor for $vec {
+            type Output = Self;
+            #[inline(always)]
+            fn bitxor(self, rhs: Self) -> Self::Output {
+                omap2(self, rhs, |x, y| x ^ y)
+            }
+        }
+        impl AndNot for $vec {
+            type Output = Self;
+            #[inline(always)]
+            fn andnot(self, rhs: Self) -> Self::Output {
+                omap2(self, rhs, |x, y| !x & y)
+            }
+        }
+        impl BitAndAssign for $vec {
+            #[inline(always)]
+            fn bitand_assign(&mut self, rhs: Self) {
+                *self = *self & rhs
+            }
+        }
+        impl BitOrAssign for $vec {
+            #[inline(always)]
+            fn bitor_assign(&mut self, rhs: Self) {
+                *self = *self | rhs
+            }
+        }
+        impl BitXorAssign for $vec {
+            #[inline(always)]
+            fn bitxor_assign(&mut self, rhs: Self) {
+                *self = *self ^ rhs
+            }
+        }
+
+        impl Swap64 for $vec {
+            #[inline]
+            fn swap1(self) -> Self {
+                qmap(self, |x| {
+                    ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1)
+                })
+            }
+            #[inline]
+            fn swap2(self) -> Self {
+                qmap(self, |x| {
+                    ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2)
+                })
+            }
+            #[inline]
+            fn swap4(self) -> Self {
+                qmap(self, |x| {
+                    ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4)
+                })
+            }
+            #[inline]
+            fn swap8(self) -> Self {
+                qmap(self, |x| {
+                    ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8)
+                })
+            }
+            #[inline]
+            fn swap16(self) -> Self {
+                dmap(self, |x| x.rotate_left(16))
+            }
+            #[inline]
+            fn swap32(self) -> Self {
+                qmap(self, |x| x.rotate_left(32))
+            }
+            #[inline]
+            fn swap64(self) -> Self {
+                omap(self, |x| (x << 64) | (x >> 64))
+            }
+        }
+    };
+}
+impl_bitops!(u32x4_generic);
+impl_bitops!(u64x2_generic);
+impl_bitops!(u128x1_generic);
+
+impl RotateEachWord32 for u32x4_generic {
+    #[inline]
+    fn rotate_each_word_right7(self) -> Self {
+        dmap(self, |x| x.rotate_right(7))
+    }
+    #[inline]
+    fn rotate_each_word_right8(self) -> Self {
+        dmap(self, |x| x.rotate_right(8))
+    }
+    #[inline]
+    fn rotate_each_word_right11(self) -> Self {
+        dmap(self, |x| x.rotate_right(11))
+    }
+    #[inline]
+    fn rotate_each_word_right12(self) -> Self {
+        dmap(self, |x| x.rotate_right(12))
+    }
+    #[inline]
+    fn rotate_each_word_right16(self) -> Self {
+        dmap(self, |x| x.rotate_right(16))
+    }
+    #[inline]
+    fn rotate_each_word_right20(self) -> Self {
+        dmap(self, |x| x.rotate_right(20))
+    }
+    #[inline]
+    fn rotate_each_word_right24(self) -> Self {
+        dmap(self, |x| x.rotate_right(24))
+    }
+    #[inline]
+    fn rotate_each_word_right25(self) -> Self {
+        dmap(self, |x| x.rotate_right(25))
+    }
+}
+
+impl RotateEachWord32 for u64x2_generic {
+    #[inline]
+    fn rotate_each_word_right7(self) -> Self {
+        qmap(self, |x| x.rotate_right(7))
+    }
+    #[inline]
+    fn rotate_each_word_right8(self) -> Self {
+        qmap(self, |x| x.rotate_right(8))
+    }
+    #[inline]
+    fn rotate_each_word_right11(self) -> Self {
+        qmap(self, |x| x.rotate_right(11))
+    }
+    #[inline]
+    fn rotate_each_word_right12(self) -> Self {
+        qmap(self, |x| x.rotate_right(12))
+    }
+    #[inline]
+    fn rotate_each_word_right16(self) -> Self {
+        qmap(self, |x| x.rotate_right(16))
+    }
+    #[inline]
+    fn rotate_each_word_right20(self) -> Self {
+        qmap(self, |x| x.rotate_right(20))
+    }
+    #[inline]
+    fn rotate_each_word_right24(self) -> Self {
+        qmap(self, |x| x.rotate_right(24))
+    }
+    #[inline]
+    fn rotate_each_word_right25(self) -> Self {
+        qmap(self, |x| x.rotate_right(25))
+    }
+}
+impl RotateEachWord64 for u64x2_generic {
+    #[inline]
+    fn rotate_each_word_right32(self) -> Self {
+        qmap(self, |x| x.rotate_right(32))
+    }
+}
+
+// workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web)
+fn rotate_u128_right(x: u128, i: u32) -> u128 {
+    (x >> i) | (x << (128 - i))
+}
+#[test]
+fn test_rotate_u128() {
+    const X: u128 = 0x0001_0203_0405_0607_0809_0a0b_0c0d_0e0f;
+    assert_eq!(rotate_u128_right(X, 17), X.rotate_right(17));
+}
+
+impl RotateEachWord32 for u128x1_generic {
+    #[inline]
+    fn rotate_each_word_right7(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 7)])
+    }
+    #[inline]
+    fn rotate_each_word_right8(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 8)])
+    }
+    #[inline]
+    fn rotate_each_word_right11(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 11)])
+    }
+    #[inline]
+    fn rotate_each_word_right12(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 12)])
+    }
+    #[inline]
+    fn rotate_each_word_right16(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 16)])
+    }
+    #[inline]
+    fn rotate_each_word_right20(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 20)])
+    }
+    #[inline]
+    fn rotate_each_word_right24(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 24)])
+    }
+    #[inline]
+    fn rotate_each_word_right25(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 25)])
+    }
+}
+impl RotateEachWord64 for u128x1_generic {
+    #[inline]
+    fn rotate_each_word_right32(self) -> Self {
+        Self([rotate_u128_right(self.0[0], 32)])
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct GenericMachine;
+impl Machine for GenericMachine {
+    type u32x4 = u32x4_generic;
+    type u64x2 = u64x2_generic;
+    type u128x1 = u128x1_generic;
+    type u32x4x2 = u32x4x2_generic;
+    type u64x2x2 = u64x2x2_generic;
+    type u64x4 = u64x4_generic;
+    type u128x2 = u128x2_generic;
+    type u32x4x4 = u32x4x4_generic;
+    type u64x2x4 = u64x2x4_generic;
+    type u128x4 = u128x4_generic;
+    #[inline]
+    unsafe fn instance() -> Self {
+        Self
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub struct u32x4_generic([u32; 4]);
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub struct u64x2_generic([u64; 2]);
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub struct u128x1_generic([u128; 1]);
+
+impl From<u32x4_generic> for vec128_storage {
+    #[inline(always)]
+    fn from(d: u32x4_generic) -> Self {
+        Self { d: d.0 }
+    }
+}
+impl From<u64x2_generic> for vec128_storage {
+    #[inline(always)]
+    fn from(q: u64x2_generic) -> Self {
+        Self { q: q.0 }
+    }
+}
+impl From<u128x1_generic> for vec128_storage {
+    #[inline(always)]
+    fn from(o: u128x1_generic) -> Self {
+        Self { o: o.0 }
+    }
+}
+
+impl Store<vec128_storage> for u32x4_generic {
+    #[inline(always)]
+    unsafe fn unpack(s: vec128_storage) -> Self {
+        Self(s.d)
+    }
+}
+impl Store<vec128_storage> for u64x2_generic {
+    #[inline(always)]
+    unsafe fn unpack(s: vec128_storage) -> Self {
+        Self(s.q)
+    }
+}
+impl Store<vec128_storage> for u128x1_generic {
+    #[inline(always)]
+    unsafe fn unpack(s: vec128_storage) -> Self {
+        Self(s.o)
+    }
+}
+
+impl ArithOps for u32x4_generic {}
+impl ArithOps for u64x2_generic {}
+impl ArithOps for u128x1_generic {}
+
+impl Add for u32x4_generic {
+    type Output = Self;
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        dmap2(self, rhs, |x, y| x.wrapping_add(y))
+    }
+}
+impl Add for u64x2_generic {
+    type Output = Self;
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        qmap2(self, rhs, |x, y| x.wrapping_add(y))
+    }
+}
+impl Add for u128x1_generic {
+    type Output = Self;
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        omap2(self, rhs, |x, y| x.wrapping_add(y))
+    }
+}
+impl AddAssign for u32x4_generic {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = *self + rhs
+    }
+}
+impl AddAssign for u64x2_generic {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = *self + rhs
+    }
+}
+impl AddAssign for u128x1_generic {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = *self + rhs
+    }
+}
+impl BSwap for u32x4_generic {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        dmap(self, |x| x.swap_bytes())
+    }
+}
+impl BSwap for u64x2_generic {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        qmap(self, |x| x.swap_bytes())
+    }
+}
+impl BSwap for u128x1_generic {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        omap(self, |x| x.swap_bytes())
+    }
+}
+impl StoreBytes for u32x4_generic {
+    #[inline(always)]
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+        assert_eq!(input.len(), 16);
+        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        dmap(x, |x| x.to_le())
+    }
+    #[inline(always)]
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+        assert_eq!(input.len(), 16);
+        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        dmap(x, |x| x.to_be())
+    }
+    #[inline(always)]
+    fn write_le(self, out: &mut [u8]) {
+        assert_eq!(out.len(), 16);
+        let x = dmap(self, |x| x.to_le());
+        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+    }
+    #[inline(always)]
+    fn write_be(self, out: &mut [u8]) {
+        assert_eq!(out.len(), 16);
+        let x = dmap(self, |x| x.to_be());
+        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+    }
+}
+impl StoreBytes for u64x2_generic {
+    #[inline(always)]
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+        assert_eq!(input.len(), 16);
+        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        qmap(x, |x| x.to_le())
+    }
+    #[inline(always)]
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+        assert_eq!(input.len(), 16);
+        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        qmap(x, |x| x.to_be())
+    }
+    #[inline(always)]
+    fn write_le(self, out: &mut [u8]) {
+        assert_eq!(out.len(), 16);
+        let x = qmap(self, |x| x.to_le());
+        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+    }
+    #[inline(always)]
+    fn write_be(self, out: &mut [u8]) {
+        assert_eq!(out.len(), 16);
+        let x = qmap(self, |x| x.to_be());
+        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct G0;
+#[derive(Copy, Clone)]
+pub struct G1;
+pub type u32x4x2_generic = x2<u32x4_generic, G0>;
+pub type u64x2x2_generic = x2<u64x2_generic, G0>;
+pub type u64x4_generic = x2<u64x2_generic, G1>;
+pub type u128x2_generic = x2<u128x1_generic, G0>;
+pub type u32x4x4_generic = x4<u32x4_generic>;
+pub type u64x2x4_generic = x4<u64x2_generic>;
+pub type u128x4_generic = x4<u128x1_generic>;
+
+impl MultiLane<[u32; 4]> for u32x4_generic {
+    #[inline(always)]
+    fn to_lanes(self) -> [u32; 4] {
+        self.0
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u32; 4]) -> Self {
+        Self(xs)
+    }
+}
+impl MultiLane<[u64; 2]> for u64x2_generic {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 2] {
+        self.0
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 2]) -> Self {
+        Self(xs)
+    }
+}
+impl MultiLane<[u64; 4]> for u64x4_generic {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 4] {
+        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
+        [a[0], a[1], b[0], b[1]]
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 4]) -> Self {
+        let (a, b) = (
+            u64x2_generic::from_lanes([xs[0], xs[1]]),
+            u64x2_generic::from_lanes([xs[2], xs[3]]),
+        );
+        x2::new([a, b])
+    }
+}
+impl MultiLane<[u128; 1]> for u128x1_generic {
+    #[inline(always)]
+    fn to_lanes(self) -> [u128; 1] {
+        self.0
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u128; 1]) -> Self {
+        Self(xs)
+    }
+}
+impl Vec4<u32> for u32x4_generic {
+    #[inline(always)]
+    fn extract(self, i: u32) -> u32 {
+        self.0[i as usize]
+    }
+    #[inline(always)]
+    fn insert(mut self, v: u32, i: u32) -> Self {
+        self.0[i as usize] = v;
+        self
+    }
+}
+impl Vec4<u64> for u64x4_generic {
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        let d: [u64; 4] = self.to_lanes();
+        d[i as usize]
+    }
+    #[inline(always)]
+    fn insert(self, v: u64, i: u32) -> Self {
+        self.0[(i / 2) as usize].insert(v, i % 2);
+        self
+    }
+}
+impl Vec2<u64> for u64x2_generic {
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        self.0[i as usize]
+    }
+    #[inline(always)]
+    fn insert(mut self, v: u64, i: u32) -> Self {
+        self.0[i as usize] = v;
+        self
+    }
+}
+
+impl Words4 for u32x4_generic {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        self.swap64()
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        let x = self.0;
+        Self([x[3], x[0], x[1], x[2]])
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        let x = self.0;
+        Self([x[1], x[2], x[3], x[0]])
+    }
+}
+impl LaneWords4 for u32x4_generic {
+    #[inline(always)]
+    fn shuffle_lane_words2301(self) -> Self {
+        self.shuffle2301()
+    }
+    #[inline(always)]
+    fn shuffle_lane_words1230(self) -> Self {
+        self.shuffle1230()
+    }
+    #[inline(always)]
+    fn shuffle_lane_words3012(self) -> Self {
+        self.shuffle3012()
+    }
+}
+
+impl Words4 for u64x4_generic {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        x2::new([self.0[1], self.0[0]])
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        unimplemented!()
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        unimplemented!()
+    }
+}
+
+impl u32x4<GenericMachine> for u32x4_generic {}
+impl u64x2<GenericMachine> for u64x2_generic {}
+impl u128x1<GenericMachine> for u128x1_generic {}
+impl u32x4x2<GenericMachine> for u32x4x2_generic {}
+impl u64x2x2<GenericMachine> for u64x2x2_generic {}
+impl u64x4<GenericMachine> for u64x4_generic {}
+impl u128x2<GenericMachine> for u128x2_generic {}
+impl u32x4x4<GenericMachine> for u32x4x4_generic {}
+impl u64x2x4<GenericMachine> for u64x2x4_generic {}
+impl u128x4<GenericMachine> for u128x4_generic {}
+
+#[macro_export]
+macro_rules! dispatch {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[inline]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            fn_impl($mach, $($arg),*)
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+#[macro_export]
+macro_rules! dispatch_light128 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[inline]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            fn_impl($mach, $($arg),*)
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+#[macro_export]
+macro_rules! dispatch_light256 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[inline]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            fn_impl($mach, $($arg),*)
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+#[macro_export]
+macro_rules! dispatch_light512 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[inline]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            let $mach = unsafe { $crate::generic::GenericMachine::instance() };
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            fn_impl($mach, $($arg),*)
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_bswap32() {
+        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
+        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];
+
+        let m = unsafe { GenericMachine::instance() };
+
+        let x: <GenericMachine as Machine>::u32x4 = m.vec(xs);
+        let x = x.bswap();
+
+        let y = m.vec(ys);
+        assert_eq!(x, y);
+    }
+}
diff --git a/vendor/ppv-lite86/src/lib.rs b/vendor/ppv-lite86/src/lib.rs
new file mode 100644
index 000000000..43dc5d869
--- /dev/null
+++ b/vendor/ppv-lite86/src/lib.rs
@@ -0,0 +1,22 @@
+#![no_std]
+
+// Design:
+// - safety: safe creation of any machine type is done only by instance methods of a
+//   Machine (which is a ZST + Copy type), which can only by created unsafely or safely
+//   through feature detection (e.g. fn AVX2::try_get() -> Option<Machine>).
+
+mod soft;
+mod types;
+pub use self::types::*;
+
+#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))]
+pub mod x86_64;
+#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))]
+use self::x86_64 as arch;
+
+#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))]
+pub mod generic;
+#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))]
+use self::generic as arch;
+
+pub use self::arch::{vec128_storage, vec256_storage, vec512_storage};
diff --git a/vendor/ppv-lite86/src/soft.rs b/vendor/ppv-lite86/src/soft.rs
new file mode 100644
index 000000000..d12dac528
--- /dev/null
+++ b/vendor/ppv-lite86/src/soft.rs
@@ -0,0 +1,420 @@
+//! Implement 256- and 512- bit in terms of 128-bit, for machines without native wide SIMD.
+
+use core::marker::PhantomData;
+use core::ops::*;
+use crate::types::*;
+use crate::{vec128_storage, vec256_storage, vec512_storage};
+
+#[derive(Copy, Clone, Default)]
+#[allow(non_camel_case_types)]
+pub struct x2<W, G>(pub [W; 2], PhantomData<G>);
+impl<W, G> x2<W, G> {
+    #[inline(always)]
+    pub fn new(xs: [W; 2]) -> Self {
+        x2(xs, PhantomData)
+    }
+}
+macro_rules! fwd_binop_x2 {
+    ($trait:ident, $fn:ident) => {
+        impl<W: $trait + Copy, G> $trait for x2<W, G> {
+            type Output = x2<W::Output, G>;
+            #[inline(always)]
+            fn $fn(self, rhs: Self) -> Self::Output {
+                x2::new([self.0[0].$fn(rhs.0[0]), self.0[1].$fn(rhs.0[1])])
+            }
+        }
+    };
+}
+macro_rules! fwd_binop_assign_x2 {
+    ($trait:ident, $fn_assign:ident) => {
+        impl<W: $trait + Copy, G> $trait for x2<W, G> {
+            #[inline(always)]
+            fn $fn_assign(&mut self, rhs: Self) {
+                (self.0[0]).$fn_assign(rhs.0[0]);
+                (self.0[1]).$fn_assign(rhs.0[1]);
+            }
+        }
+    };
+}
+macro_rules! fwd_unop_x2 {
+    ($fn:ident) => {
+        #[inline(always)]
+        fn $fn(self) -> Self {
+            x2::new([self.0[0].$fn(), self.0[1].$fn()])
+        }
+    };
+}
+impl<W, G> RotateEachWord32 for x2<W, G>
+where
+    W: Copy + RotateEachWord32,
+{
+    fwd_unop_x2!(rotate_each_word_right7);
+    fwd_unop_x2!(rotate_each_word_right8);
+    fwd_unop_x2!(rotate_each_word_right11);
+    fwd_unop_x2!(rotate_each_word_right12);
+    fwd_unop_x2!(rotate_each_word_right16);
+    fwd_unop_x2!(rotate_each_word_right20);
+    fwd_unop_x2!(rotate_each_word_right24);
+    fwd_unop_x2!(rotate_each_word_right25);
+}
+impl<W, G> RotateEachWord64 for x2<W, G>
+where
+    W: Copy + RotateEachWord64,
+{
+    fwd_unop_x2!(rotate_each_word_right32);
+}
+impl<W, G> RotateEachWord128 for x2<W, G> where W: RotateEachWord128 {}
+impl<W, G> BitOps0 for x2<W, G>
+where
+    W: BitOps0,
+    G: Copy,
+{
+}
+impl<W, G> BitOps32 for x2<W, G>
+where
+    W: BitOps32 + BitOps0,
+    G: Copy,
+{
+}
+impl<W, G> BitOps64 for x2<W, G>
+where
+    W: BitOps64 + BitOps0,
+    G: Copy,
+{
+}
+impl<W, G> BitOps128 for x2<W, G>
+where
+    W: BitOps128 + BitOps0,
+    G: Copy,
+{
+}
+fwd_binop_x2!(BitAnd, bitand);
+fwd_binop_x2!(BitOr, bitor);
+fwd_binop_x2!(BitXor, bitxor);
+fwd_binop_x2!(AndNot, andnot);
+fwd_binop_assign_x2!(BitAndAssign, bitand_assign);
+fwd_binop_assign_x2!(BitOrAssign, bitor_assign);
+fwd_binop_assign_x2!(BitXorAssign, bitxor_assign);
+impl<W, G> ArithOps for x2<W, G>
+where
+    W: ArithOps,
+    G: Copy,
+{
+}
+fwd_binop_x2!(Add, add);
+fwd_binop_assign_x2!(AddAssign, add_assign);
+impl<W: Not + Copy, G> Not for x2<W, G> {
+    type Output = x2<W::Output, G>;
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        x2::new([self.0[0].not(), self.0[1].not()])
+    }
+}
+impl<W, G> UnsafeFrom<[W; 2]> for x2<W, G> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [W; 2]) -> Self {
+        x2::new(xs)
+    }
+}
+impl<W: Copy, G> Vec2<W> for x2<W, G> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> W {
+        self.0[i as usize]
+    }
+    #[inline(always)]
+    fn insert(mut self, w: W, i: u32) -> Self {
+        self.0[i as usize] = w;
+        self
+    }
+}
+impl<W: Copy + Store<vec128_storage>, G> Store<vec256_storage> for x2<W, G> {
+    #[inline(always)]
+    unsafe fn unpack(p: vec256_storage) -> Self {
+        let p = p.split128();
+        x2::new([W::unpack(p[0]), W::unpack(p[1])])
+    }
+}
+impl<W, G> From<x2<W, G>> for vec256_storage
+where
+    W: Copy,
+    vec128_storage: From<W>,
+{
+    #[inline(always)]
+    fn from(x: x2<W, G>) -> Self {
+        vec256_storage::new128([x.0[0].into(), x.0[1].into()])
+    }
+}
+impl<W, G> Swap64 for x2<W, G>
+where
+    W: Swap64 + Copy,
+{
+    fwd_unop_x2!(swap1);
+    fwd_unop_x2!(swap2);
+    fwd_unop_x2!(swap4);
+    fwd_unop_x2!(swap8);
+    fwd_unop_x2!(swap16);
+    fwd_unop_x2!(swap32);
+    fwd_unop_x2!(swap64);
+}
+impl<W: Copy, G> MultiLane<[W; 2]> for x2<W, G> {
+    #[inline(always)]
+    fn to_lanes(self) -> [W; 2] {
+        self.0
+    }
+    #[inline(always)]
+    fn from_lanes(lanes: [W; 2]) -> Self {
+        x2::new(lanes)
+    }
+}
+impl<W: BSwap + Copy, G> BSwap for x2<W, G> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        x2::new([self.0[0].bswap(), self.0[1].bswap()])
+    }
+}
+impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> {
+    #[inline(always)]
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+        let input = input.split_at(16);
+        x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)])
+    }
+    #[inline(always)]
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+        x2::unsafe_read_le(input).bswap()
+    }
+    #[inline(always)]
+    fn write_le(self, out: &mut [u8]) {
+        let out = out.split_at_mut(16);
+        self.0[0].write_le(out.0);
+        self.0[1].write_le(out.1);
+    }
+    #[inline(always)]
+    fn write_be(self, out: &mut [u8]) {
+        let out = out.split_at_mut(16);
+        self.0[0].write_be(out.0);
+        self.0[1].write_be(out.1);
+    }
+}
+
+#[derive(Copy, Clone, Default)]
+#[allow(non_camel_case_types)]
+pub struct x4<W>(pub [W; 4]);
+impl<W> x4<W> {
+    #[inline(always)]
+    pub fn new(xs: [W; 4]) -> Self {
+        x4(xs)
+    }
+}
+macro_rules! fwd_binop_x4 {
+    ($trait:ident, $fn:ident) => {
+        impl<W: $trait + Copy> $trait for x4<W> {
+            type Output = x4<W::Output>;
+            #[inline(always)]
+            fn $fn(self, rhs: Self) -> Self::Output {
+                x4([
+                    self.0[0].$fn(rhs.0[0]),
+                    self.0[1].$fn(rhs.0[1]),
+                    self.0[2].$fn(rhs.0[2]),
+                    self.0[3].$fn(rhs.0[3]),
+                ])
+            }
+        }
+    };
+}
+macro_rules! fwd_binop_assign_x4 {
+    ($trait:ident, $fn_assign:ident) => {
+        impl<W: $trait + Copy> $trait for x4<W> {
+            #[inline(always)]
+            fn $fn_assign(&mut self, rhs: Self) {
+                self.0[0].$fn_assign(rhs.0[0]);
+                self.0[1].$fn_assign(rhs.0[1]);
+                self.0[2].$fn_assign(rhs.0[2]);
+                self.0[3].$fn_assign(rhs.0[3]);
+            }
+        }
+    };
+}
+macro_rules! fwd_unop_x4 {
+    ($fn:ident) => {
+        #[inline(always)]
+        fn $fn(self) -> Self {
+            x4([self.0[0].$fn(), self.0[1].$fn(), self.0[2].$fn(), self.0[3].$fn()])
+        }
+    };
+}
+impl<W> RotateEachWord32 for x4<W>
+where
+    W: Copy + RotateEachWord32,
+{
+    fwd_unop_x4!(rotate_each_word_right7);
+    fwd_unop_x4!(rotate_each_word_right8);
+    fwd_unop_x4!(rotate_each_word_right11);
+    fwd_unop_x4!(rotate_each_word_right12);
+    fwd_unop_x4!(rotate_each_word_right16);
+    fwd_unop_x4!(rotate_each_word_right20);
+    fwd_unop_x4!(rotate_each_word_right24);
+    fwd_unop_x4!(rotate_each_word_right25);
+}
+impl<W> RotateEachWord64 for x4<W>
+where
+    W: Copy + RotateEachWord64,
+{
+    fwd_unop_x4!(rotate_each_word_right32);
+}
+impl<W> RotateEachWord128 for x4<W> where W: RotateEachWord128 {}
+impl<W> BitOps0 for x4<W> where W: BitOps0 {}
+impl<W> BitOps32 for x4<W> where W: BitOps32 + BitOps0 {}
+impl<W> BitOps64 for x4<W> where W: BitOps64 + BitOps0 {}
+impl<W> BitOps128 for x4<W> where W: BitOps128 + BitOps0 {}
+fwd_binop_x4!(BitAnd, bitand);
+fwd_binop_x4!(BitOr, bitor);
+fwd_binop_x4!(BitXor, bitxor);
+fwd_binop_x4!(AndNot, andnot);
+fwd_binop_assign_x4!(BitAndAssign, bitand_assign);
+fwd_binop_assign_x4!(BitOrAssign, bitor_assign);
+fwd_binop_assign_x4!(BitXorAssign, bitxor_assign);
+impl<W> ArithOps for x4<W> where W: ArithOps {}
+fwd_binop_x4!(Add, add);
+fwd_binop_assign_x4!(AddAssign, add_assign);
+impl<W: Not + Copy> Not for x4<W> {
+    type Output = x4<W::Output>;
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        x4([
+            self.0[0].not(),
+            self.0[1].not(),
+            self.0[2].not(),
+            self.0[3].not(),
+        ])
+    }
+}
+impl<W> UnsafeFrom<[W; 4]> for x4<W> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [W; 4]) -> Self {
+        x4(xs)
+    }
+}
+impl<W: Copy> Vec4<W> for x4<W> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> W {
+        self.0[i as usize]
+    }
+    #[inline(always)]
+    fn insert(mut self, w: W, i: u32) -> Self {
+        self.0[i as usize] = w;
+        self
+    }
+}
+impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> {
+    #[inline(always)]
+    unsafe fn unpack(p: vec512_storage) -> Self {
+        let p = p.split128();
+        x4([
+            W::unpack(p[0]),
+            W::unpack(p[1]),
+            W::unpack(p[2]),
+            W::unpack(p[3]),
+        ])
+    }
+}
+impl<W> From<x4<W>> for vec512_storage
+where
+    W: Copy,
+    vec128_storage: From<W>,
+{
+    #[inline(always)]
+    fn from(x: x4<W>) -> Self {
+        vec512_storage::new128([x.0[0].into(), x.0[1].into(), x.0[2].into(), x.0[3].into()])
+    }
+}
+impl<W> Swap64 for x4<W>
+where
+    W: Swap64 + Copy,
+{
+    fwd_unop_x4!(swap1);
+    fwd_unop_x4!(swap2);
+    fwd_unop_x4!(swap4);
+    fwd_unop_x4!(swap8);
+    fwd_unop_x4!(swap16);
+    fwd_unop_x4!(swap32);
+    fwd_unop_x4!(swap64);
+}
+impl<W: Copy> MultiLane<[W; 4]> for x4<W> {
+    #[inline(always)]
+    fn to_lanes(self) -> [W; 4] {
+        self.0
+    }
+    #[inline(always)]
+    fn from_lanes(lanes: [W; 4]) -> Self {
+        x4(lanes)
+    }
+}
+impl<W: BSwap + Copy> BSwap for x4<W> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        x4([
+            self.0[0].bswap(),
+            self.0[1].bswap(),
+            self.0[2].bswap(),
+            self.0[3].bswap(),
+        ])
+    }
+}
+impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> {
+    #[inline(always)]
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+        x4([
+            W::unsafe_read_le(&input[0..16]),
+            W::unsafe_read_le(&input[16..32]),
+            W::unsafe_read_le(&input[32..48]),
+            W::unsafe_read_le(&input[48..64]),
+        ])
+    }
+    #[inline(always)]
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+        x4::unsafe_read_le(input).bswap()
+    }
+    #[inline(always)]
+    fn write_le(self, out: &mut [u8]) {
+        self.0[0].write_le(&mut out[0..16]);
+        self.0[1].write_le(&mut out[16..32]);
+        self.0[2].write_le(&mut out[32..48]);
+        self.0[3].write_le(&mut out[48..64]);
+    }
+    #[inline(always)]
+    fn write_be(self, out: &mut [u8]) {
+        self.0[0].write_be(&mut out[0..16]);
+        self.0[1].write_be(&mut out[16..32]);
+        self.0[2].write_be(&mut out[32..48]);
+        self.0[3].write_be(&mut out[48..64]);
+    }
+}
+impl<W: Copy + LaneWords4> LaneWords4 for x4<W> {
+    #[inline(always)]
+    fn shuffle_lane_words2301(self) -> Self {
+        x4([
+            self.0[0].shuffle_lane_words2301(),
+            self.0[1].shuffle_lane_words2301(),
+            self.0[2].shuffle_lane_words2301(),
+            self.0[3].shuffle_lane_words2301(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words1230(self) -> Self {
+        x4([
+            self.0[0].shuffle_lane_words1230(),
+            self.0[1].shuffle_lane_words1230(),
+            self.0[2].shuffle_lane_words1230(),
+            self.0[3].shuffle_lane_words1230(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words3012(self) -> Self {
+        x4([
+            self.0[0].shuffle_lane_words3012(),
+            self.0[1].shuffle_lane_words3012(),
+            self.0[2].shuffle_lane_words3012(),
+            self.0[3].shuffle_lane_words3012(),
+        ])
+    }
+}
diff --git a/vendor/ppv-lite86/src/types.rs b/vendor/ppv-lite86/src/types.rs
new file mode 100644
index 000000000..119b6bb8d
--- /dev/null
+++ b/vendor/ppv-lite86/src/types.rs
@@ -0,0 +1,279 @@
+use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not};
+
+pub trait AndNot {
+    type Output;
+    fn andnot(self, rhs: Self) -> Self::Output;
+}
+pub trait BSwap {
+    fn bswap(self) -> Self;
+}
+/// Ops that depend on word size
+pub trait ArithOps: Add<Output = Self> + AddAssign + Sized + Copy + Clone + BSwap {}
+/// Ops that are independent of word size and endian
+pub trait BitOps0:
+    BitAnd<Output = Self>
+    + BitOr<Output = Self>
+    + BitXor<Output = Self>
+    + BitXorAssign
+    + Not<Output = Self>
+    + AndNot<Output = Self>
+    + Sized
+    + Copy
+    + Clone
+{
+}
+
+pub trait BitOps32: BitOps0 + RotateEachWord32 {}
+pub trait BitOps64: BitOps32 + RotateEachWord64 {}
+pub trait BitOps128: BitOps64 + RotateEachWord128 {}
+
+pub trait RotateEachWord32 {
+    fn rotate_each_word_right7(self) -> Self;
+    fn rotate_each_word_right8(self) -> Self;
+    fn rotate_each_word_right11(self) -> Self;
+    fn rotate_each_word_right12(self) -> Self;
+    fn rotate_each_word_right16(self) -> Self;
+    fn rotate_each_word_right20(self) -> Self;
+    fn rotate_each_word_right24(self) -> Self;
+    fn rotate_each_word_right25(self) -> Self;
+}
+
+pub trait RotateEachWord64 {
+    fn rotate_each_word_right32(self) -> Self;
+}
+
+pub trait RotateEachWord128 {}
+
+#[allow(non_camel_case_types)]
+mod types {
+    //! Vector type naming scheme:
+    //! uN[xP]xL
+    //! Unsigned; N-bit words * P bits per lane * L lanes
+    //!
+    //! A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of
+    //! wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and
+    //! slow inter-lane operations.
+
+    use crate::arch::{vec128_storage, vec256_storage, vec512_storage};
+    use crate::{ArithOps, BitOps128, BitOps32, BitOps64, Machine, Store, StoreBytes};
+
+    pub trait UnsafeFrom<T> {
+        unsafe fn unsafe_from(t: T) -> Self;
+    }
+
+    /// A vector composed of two elements, which may be words or themselves vectors.
+    pub trait Vec2<W> {
+        fn extract(self, i: u32) -> W;
+        fn insert(self, w: W, i: u32) -> Self;
+    }
+
+    /// A vector composed of four elements, which may be words or themselves vectors.
+    pub trait Vec4<W> {
+        fn extract(self, i: u32) -> W;
+        fn insert(self, w: W, i: u32) -> Self;
+    }
+
+    // TODO: multiples of 4 should inherit this
+    /// A vector composed of four words; depending on their size, operations may cross lanes.
+    pub trait Words4 {
+        fn shuffle1230(self) -> Self;
+        fn shuffle2301(self) -> Self;
+        fn shuffle3012(self) -> Self;
+    }
+
+    /// A vector composed one or more lanes each composed of four words.
+    pub trait LaneWords4 {
+        fn shuffle_lane_words1230(self) -> Self;
+        fn shuffle_lane_words2301(self) -> Self;
+        fn shuffle_lane_words3012(self) -> Self;
+    }
+
+    // TODO: make this a part of BitOps
+    /// Exchange neigboring ranges of bits of the specified size
+    pub trait Swap64 {
+        fn swap1(self) -> Self;
+        fn swap2(self) -> Self;
+        fn swap4(self) -> Self;
+        fn swap8(self) -> Self;
+        fn swap16(self) -> Self;
+        fn swap32(self) -> Self;
+        fn swap64(self) -> Self;
+    }
+
+    pub trait u32x4<M: Machine>:
+        BitOps32
+        + Store<vec128_storage>
+        + ArithOps
+        + Vec4<u32>
+        + Words4
+        + LaneWords4
+        + StoreBytes
+        + MultiLane<[u32; 4]>
+        + Into<vec128_storage>
+    {
+}
+    pub trait u64x2<M: Machine>:
+        BitOps64
+        + Store<vec128_storage>
+        + ArithOps
+        + Vec2<u64>
+        + MultiLane<[u64; 2]>
+        + Into<vec128_storage>
+    {
+}
+    pub trait u128x1<M: Machine>:
+        BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage>
+    {
+}
+
+    pub trait u32x4x2<M: Machine>:
+        BitOps32
+        + Store<vec256_storage>
+        + Vec2<M::u32x4>
+        + MultiLane<[M::u32x4; 2]>
+        + ArithOps
+        + Into<vec256_storage>
+    {
+}
+    pub trait u64x2x2<M: Machine>:
+        BitOps64
+        + Store<vec256_storage>
+        + Vec2<M::u64x2>
+        + MultiLane<[M::u64x2; 2]>
+        + ArithOps
+        + StoreBytes
+        + Into<vec256_storage>
+    {
+}
+    pub trait u64x4<M: Machine>:
+        BitOps64
+        + Store<vec256_storage>
+        + Vec4<u64>
+        + MultiLane<[u64; 4]>
+        + ArithOps
+        + Words4
+        + StoreBytes
+        + Into<vec256_storage>
+    {
+}
+    pub trait u128x2<M: Machine>:
+        BitOps128
+        + Store<vec256_storage>
+        + Vec2<M::u128x1>
+        + MultiLane<[M::u128x1; 2]>
+        + Swap64
+        + Into<vec256_storage>
+    {
+}
+
+    pub trait u32x4x4<M: Machine>:
+        BitOps32
+        + Store<vec512_storage>
+        + Vec4<M::u32x4>
+        + MultiLane<[M::u32x4; 4]>
+        + ArithOps
+        + LaneWords4
+        + Into<vec512_storage>
+    {
+}
+    pub trait u64x2x4<M: Machine>:
+        BitOps64
+        + Store<vec512_storage>
+        + Vec4<M::u64x2>
+        + MultiLane<[M::u64x2; 4]>
+        + ArithOps
+        + Into<vec512_storage>
+    {
+}
+    // TODO: Words4
+    pub trait u128x4<M: Machine>:
+        BitOps128
+        + Store<vec512_storage>
+        + Vec4<M::u128x1>
+        + MultiLane<[M::u128x1; 4]>
+        + Swap64
+        + Into<vec512_storage>
+    {
+}
+
+    /// A vector composed of multiple 128-bit lanes.
+    pub trait MultiLane<Lanes> {
+        /// Split a multi-lane vector into single-lane vectors.
+        fn to_lanes(self) -> Lanes;
+        /// Build a multi-lane vector from individual lanes.
+        fn from_lanes(lanes: Lanes) -> Self;
+    }
+
+    /// Combine single vectors into a multi-lane vector.
+    pub trait VZip<V> {
+        fn vzip(self) -> V;
+    }
+
+    impl<V, T> VZip<V> for T
+    where
+        V: MultiLane<T>,
+    {
+        #[inline(always)]
+        fn vzip(self) -> V {
+            V::from_lanes(self)
+        }
+    }
+}
+pub use self::types::*;
+
+pub trait Machine: Sized + Copy {
+    type u32x4: u32x4<Self>;
+    type u64x2: u64x2<Self>;
+    type u128x1: u128x1<Self>;
+
+    type u32x4x2: u32x4x2<Self>;
+    type u64x2x2: u64x2x2<Self>;
+    type u64x4: u64x4<Self>;
+    type u128x2: u128x2<Self>;
+
+    type u32x4x4: u32x4x4<Self>;
+    type u64x2x4: u64x2x4<Self>;
+    type u128x4: u128x4<Self>;
+
+    #[inline(always)]
+    fn unpack<S, V: Store<S>>(self, s: S) -> V {
+        unsafe { V::unpack(s) }
+    }
+
+    #[inline(always)]
+    fn vec<V, A>(self, a: A) -> V
+    where
+        V: MultiLane<A>,
+    {
+        V::from_lanes(a)
+    }
+
+    #[inline(always)]
+    fn read_le<V>(self, input: &[u8]) -> V
+    where
+        V: StoreBytes,
+    {
+        unsafe { V::unsafe_read_le(input) }
+    }
+
+    #[inline(always)]
+    fn read_be<V>(self, input: &[u8]) -> V
+    where
+        V: StoreBytes,
+    {
+        unsafe { V::unsafe_read_be(input) }
+    }
+
+    unsafe fn instance() -> Self;
+}
+
+pub trait Store<S> {
+    unsafe fn unpack(p: S) -> Self;
+}
+
+pub trait StoreBytes {
+    unsafe fn unsafe_read_le(input: &[u8]) -> Self;
+    unsafe fn unsafe_read_be(input: &[u8]) -> Self;
+    fn write_le(self, out: &mut [u8]);
+    fn write_be(self, out: &mut [u8]);
+}
diff --git a/vendor/ppv-lite86/src/x86_64/mod.rs b/vendor/ppv-lite86/src/x86_64/mod.rs
new file mode 100644
index 000000000..ecf184f36
--- /dev/null
+++ b/vendor/ppv-lite86/src/x86_64/mod.rs
@@ -0,0 +1,433 @@
+// crate minimums: sse2, x86_64
+
+use core::arch::x86_64::{__m128i, __m256i};
+use crate::types::*;
+
+mod sse2;
+
+#[derive(Copy, Clone)]
+pub struct YesS3;
+#[derive(Copy, Clone)]
+pub struct NoS3;
+
+#[derive(Copy, Clone)]
+pub struct YesS4;
+#[derive(Copy, Clone)]
+pub struct NoS4;
+
+#[derive(Copy, Clone)]
+pub struct YesA1;
+#[derive(Copy, Clone)]
+pub struct NoA1;
+
+#[derive(Copy, Clone)]
+pub struct YesA2;
+#[derive(Copy, Clone)]
+pub struct NoA2;
+
+#[derive(Copy, Clone)]
+pub struct YesNI;
+#[derive(Copy, Clone)]
+pub struct NoNI;
+
+use core::marker::PhantomData;
+
+#[derive(Copy, Clone)]
+pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
+impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
+where
+    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
+    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
+    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
+    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
+    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
+    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
+{
+    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
+    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
+    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
+
+    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
+    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
+    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
+    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
+
+    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
+    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
+    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
+
+    #[inline(always)]
+    unsafe fn instance() -> Self {
+        SseMachine(PhantomData)
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct Avx2Machine<NI>(PhantomData<NI>);
+impl<NI: Copy> Machine for Avx2Machine<NI>
+where
+    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
+    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
+    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
+    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
+{
+    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
+    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
+    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
+
+    type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
+    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
+    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
+    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
+
+    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
+    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
+    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
+
+    #[inline(always)]
+    unsafe fn instance() -> Self {
+        Avx2Machine(PhantomData)
+    }
+}
+
+pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
+pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
+pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
+/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
+/// to avoid expensive SSE/VEX conflicts.
+pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
+pub type AVX2 = Avx2Machine<NoNI>;
+
+/// Generic wrapper for unparameterized storage of any of the possible impls.
+/// Converting into and out of this type should be essentially free, although it may be more
+/// aligned than a particular impl requires.
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec128_storage {
+    u32x4: [u32; 4],
+    u64x2: [u64; 2],
+    u128x1: [u128; 1],
+    sse2: __m128i,
+}
+impl Store<vec128_storage> for vec128_storage {
+    #[inline(always)]
+    unsafe fn unpack(p: vec128_storage) -> Self {
+        p
+    }
+}
+impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
+    #[inline(always)]
+    fn into(self) -> &'a [u32; 4] {
+        unsafe { &self.u32x4 }
+    }
+}
+impl Into<vec128_storage> for [u32; 4] {
+    #[inline(always)]
+    fn into(self) -> vec128_storage {
+        vec128_storage { u32x4: self }
+    }
+}
+impl Default for vec128_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec128_storage { u128x1: [0] }
+    }
+}
+impl Eq for vec128_storage {}
+impl PartialEq for vec128_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.u128x1 == rhs.u128x1 }
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec256_storage {
+    u32x8: [u32; 8],
+    u64x4: [u64; 4],
+    u128x2: [u128; 2],
+    sse2: [vec128_storage; 2],
+    avx: __m256i,
+}
+impl Into<vec256_storage> for [u64; 4] {
+    #[inline(always)]
+    fn into(self) -> vec256_storage {
+        vec256_storage { u64x4: self }
+    }
+}
+impl Default for vec256_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec256_storage { u128x2: [0, 0] }
+    }
+}
+impl vec256_storage {
+    pub fn new128(xs: [vec128_storage; 2]) -> Self {
+        Self { sse2: xs }
+    }
+    pub fn split128(self) -> [vec128_storage; 2] {
+        unsafe { self.sse2 }
+    }
+}
+impl Eq for vec256_storage {}
+impl PartialEq for vec256_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.sse2 == rhs.sse2 }
+    }
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Copy, Clone)]
+pub union vec512_storage {
+    u32x16: [u32; 16],
+    u64x8: [u64; 8],
+    u128x4: [u128; 4],
+    sse2: [vec128_storage; 4],
+    avx: [vec256_storage; 2],
+}
+impl Default for vec512_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        vec512_storage {
+            u128x4: [0, 0, 0, 0],
+        }
+    }
+}
+impl vec512_storage {
+    pub fn new128(xs: [vec128_storage; 4]) -> Self {
+        Self { sse2: xs }
+    }
+    pub fn split128(self) -> [vec128_storage; 4] {
+        unsafe { self.sse2 }
+    }
+}
+impl Eq for vec512_storage {}
+impl PartialEq for vec512_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.avx == rhs.avx }
+    }
+}
+
+macro_rules! impl_into {
+    ($storage:ident, $array:ty, $name:ident) => {
+        impl Into<$array> for $storage {
+            #[inline(always)]
+            fn into(self) -> $array {
+                unsafe { self.$name }
+            }
+        }
+    };
+}
+impl_into!(vec128_storage, [u32; 4], u32x4);
+impl_into!(vec128_storage, [u64; 2], u64x2);
+impl_into!(vec128_storage, [u128; 1], u128x1);
+impl_into!(vec256_storage, [u32; 8], u32x8);
+impl_into!(vec256_storage, [u64; 4], u64x4);
+impl_into!(vec256_storage, [u128; 2], u128x2);
+impl_into!(vec512_storage, [u32; 16], u32x16);
+impl_into!(vec512_storage, [u64; 8], u64x8);
+impl_into!(vec512_storage, [u128; 4], u128x4);
+
+/// Generate the full set of optimized implementations to take advantage of the most important
+/// hardware feature sets.
+///
+/// This dispatcher is suitable for maximizing throughput.
+#[macro_export]
+macro_rules! dispatch {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx2")]
+            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
+                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
+                _mm256_zeroupper();
+                ret
+            }
+            #[target_feature(enable = "avx")]
+            #[target_feature(enable = "sse4.1")]
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
+                _mm256_zeroupper();
+                ret
+            }
+            #[target_feature(enable = "sse4.1")]
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "ssse3")]
+            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx2") {
+                    impl_avx2($($arg),*)
+                } else if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse4.1") {
+                    impl_sse41($($arg),*)
+                } else if is_x86_feature_detected!("ssse3") {
+                    impl_ssse3($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
+/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
+/// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
+#[macro_export]
+macro_rules! dispatch_light128 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch_light128!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
+
+/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
+/// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2.
+///
+/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
+/// features (e.g. because they are done infrequently), so minimizing their contribution to code
+/// size is more important.
+#[macro_export]
+macro_rules! dispatch_light256 {
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
+        #[cfg(feature = "std")]
+        $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret {
+            #[inline(always)]
+            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            use std::arch::x86_64::*;
+            #[target_feature(enable = "avx")]
+            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+            }
+            #[target_feature(enable = "sse2")]
+            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
+                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+            }
+            unsafe {
+                if is_x86_feature_detected!("avx") {
+                    impl_avx($($arg),*)
+                } else if is_x86_feature_detected!("sse2") {
+                    impl_sse2($($arg),*)
+                } else {
+                    unimplemented!()
+                }
+            }
+        }
+        #[cfg(not(feature = "std"))]
+        #[inline(always)]
+        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
+            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
+            unsafe {
+                if cfg!(target_feature = "avx2") {
+                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
+                } else if cfg!(target_feature = "avx") {
+                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
+                } else if cfg!(target_feature = "sse4.1") {
+                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
+                } else if cfg!(target_feature = "ssse3") {
+                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
+                } else {
+                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
+                }
+            }
+        }
+    };
+    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
+        dispatch_light256!($mach, $MTy, {
+            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
+        });
+    }
+}
diff --git a/vendor/ppv-lite86/src/x86_64/sse2.rs b/vendor/ppv-lite86/src/x86_64/sse2.rs
new file mode 100644
index 000000000..60e7681c3
--- /dev/null
+++ b/vendor/ppv-lite86/src/x86_64/sse2.rs
@@ -0,0 +1,1632 @@
+use crate::soft::{x2, x4};
+use crate::types::*;
+use crate::vec128_storage;
+use crate::x86_64::Avx2Machine;
+use crate::x86_64::SseMachine as Machine86;
+use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
+use core::arch::x86_64::*;
+use core::marker::PhantomData;
+use core::ops::{
+    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
+};
+
+macro_rules! impl_binop {
+    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
+        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn $fn(self, rhs: Self) -> Self::Output {
+                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
+            }
+        }
+    };
+}
+
+macro_rules! impl_binop_assign {
+    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
+        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
+        where
+            $vec<S3, S4, NI>: Copy,
+        {
+            #[inline(always)]
+            fn $fn_assign(&mut self, rhs: Self) {
+                *self = self.$fn(rhs);
+            }
+        }
+    };
+}
+
+macro_rules! def_vec {
+    ($vec:ident, $word:ident) => {
+        #[allow(non_camel_case_types)]
+        #[derive(Copy, Clone)]
+        pub struct $vec<S3, S4, NI> {
+            x: __m128i,
+            s3: PhantomData<S3>,
+            s4: PhantomData<S4>,
+            ni: PhantomData<NI>,
+        }
+
+        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
+            #[inline(always)]
+            unsafe fn unpack(x: vec128_storage) -> Self {
+                Self::new(x.sse2)
+            }
+        }
+        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
+            #[inline(always)]
+            fn from(x: $vec<S3, S4, NI>) -> Self {
+                vec128_storage { sse2: x.x }
+            }
+        }
+        impl<S3, S4, NI> $vec<S3, S4, NI> {
+            #[inline(always)]
+            fn new(x: __m128i) -> Self {
+                $vec {
+                    x,
+                    s3: PhantomData,
+                    s4: PhantomData,
+                    ni: PhantomData,
+                }
+            }
+        }
+
+        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
+        where
+            Self: BSwap,
+        {
+            #[inline(always)]
+            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+                assert_eq!(input.len(), 16);
+                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
+            }
+            #[inline(always)]
+            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+                assert_eq!(input.len(), 16);
+                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
+            }
+            #[inline(always)]
+            fn write_le(self, out: &mut [u8]) {
+                assert_eq!(out.len(), 16);
+                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
+            }
+            #[inline(always)]
+            fn write_be(self, out: &mut [u8]) {
+                assert_eq!(out.len(), 16);
+                let x = self.bswap().x;
+                unsafe {
+                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
+                }
+            }
+        }
+
+        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
+            #[inline(always)]
+            fn default() -> Self {
+                Self::new(unsafe { _mm_setzero_si128() })
+            }
+        }
+
+        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn not(self) -> Self::Output {
+                unsafe {
+                    let ff = _mm_set1_epi64x(-1i64);
+                    self ^ Self::new(ff)
+                }
+            }
+        }
+
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
+        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
+        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
+        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
+        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
+        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
+        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
+        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
+            type Output = Self;
+            #[inline(always)]
+            fn andnot(self, rhs: Self) -> Self {
+                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
+            }
+        }
+    };
+}
+
+macro_rules! impl_bitops32 {
+    ($vec:ident) => {
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord32
+        {
+        }
+    };
+}
+
+macro_rules! impl_bitops64 {
+    ($vec:ident) => {
+        impl_bitops32!($vec);
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
+        {
+        }
+    };
+}
+
+macro_rules! impl_bitops128 {
+    ($vec:ident) => {
+        impl_bitops64!($vec);
+        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
+            $vec<S3, S4, NI>: RotateEachWord128
+        {
+        }
+    };
+}
+
+macro_rules! rotr_32_s3 {
+    ($name:ident, $k0:expr, $k1:expr) => {
+    #[inline(always)]
+    fn $name(self) -> Self {
+        Self::new(unsafe {
+                _mm_shuffle_epi8(
+                    self.x,
+                    _mm_set_epi64x($k0, $k1),
+                )
+            })
+        }
+    };
+}
+macro_rules! rotr_32 {
+    ($name:ident, $i:expr) => {
+    #[inline(always)]
+    fn $name(self) -> Self {
+        Self::new(unsafe {
+            _mm_or_si128(
+                _mm_srli_epi32(self.x, $i as i32),
+                _mm_slli_epi32(self.x, 32 - $i as i32),
+            )
+        })
+    }
+    };
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
+    rotr_32!(rotate_each_word_right7, 7);
+    rotr_32_s3!(
+        rotate_each_word_right8,
+        0x0c0f0e0d_080b0a09,
+        0x04070605_00030201
+    );
+    rotr_32!(rotate_each_word_right11, 11);
+    rotr_32!(rotate_each_word_right12, 12);
+    rotr_32_s3!(
+        rotate_each_word_right16,
+        0x0d0c0f0e_09080b0a,
+        0x05040706_01000302
+    );
+    rotr_32!(rotate_each_word_right20, 20);
+    rotr_32_s3!(
+        rotate_each_word_right24,
+        0x0e0d0c0f_0a09080b,
+        0x06050407_02010003
+    );
+    rotr_32!(rotate_each_word_right25, 25);
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
+    rotr_32!(rotate_each_word_right7, 7);
+    rotr_32!(rotate_each_word_right8, 8);
+    rotr_32!(rotate_each_word_right11, 11);
+    rotr_32!(rotate_each_word_right12, 12);
+    #[inline(always)]
+    fn rotate_each_word_right16(self) -> Self {
+        Self::new(swap16_s2(self.x))
+    }
+    rotr_32!(rotate_each_word_right20, 20);
+    rotr_32!(rotate_each_word_right24, 24);
+    rotr_32!(rotate_each_word_right25, 25);
+}
+
+macro_rules! rotr_64_s3 {
+    ($name:ident, $k0:expr, $k1:expr) => {
+    #[inline(always)]
+    fn $name(self) -> Self {
+        Self::new(unsafe {
+                _mm_shuffle_epi8(
+                    self.x,
+                    _mm_set_epi64x($k0, $k1),
+                )
+            })
+        }
+    };
+}
+macro_rules! rotr_64 {
+    ($name:ident, $i:expr) => {
+    #[inline(always)]
+    fn $name(self) -> Self {
+        Self::new(unsafe {
+            _mm_or_si128(
+                _mm_srli_epi64(self.x, $i as i32),
+                _mm_slli_epi64(self.x, 64 - $i as i32),
+            )
+        })
+    }
+    };
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
+    rotr_64!(rotate_each_word_right7, 7);
+    rotr_64_s3!(
+        rotate_each_word_right8,
+        0x080f_0e0d_0c0b_0a09,
+        0x0007_0605_0403_0201
+    );
+    rotr_64!(rotate_each_word_right11, 11);
+    rotr_64!(rotate_each_word_right12, 12);
+    rotr_64_s3!(
+        rotate_each_word_right16,
+        0x0908_0f0e_0d0c_0b0a,
+        0x0100_0706_0504_0302
+    );
+    rotr_64!(rotate_each_word_right20, 20);
+    rotr_64_s3!(
+        rotate_each_word_right24,
+        0x0a09_080f_0e0d_0c0b,
+        0x0201_0007_0605_0403
+    );
+    rotr_64!(rotate_each_word_right25, 25);
+}
+impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
+    rotr_64!(rotate_each_word_right7, 7);
+    rotr_64!(rotate_each_word_right8, 8);
+    rotr_64!(rotate_each_word_right11, 11);
+    rotr_64!(rotate_each_word_right12, 12);
+    #[inline(always)]
+    fn rotate_each_word_right16(self) -> Self {
+        Self::new(swap16_s2(self.x))
+    }
+    rotr_64!(rotate_each_word_right20, 20);
+    rotr_64!(rotate_each_word_right24, 24);
+    rotr_64!(rotate_each_word_right25, 25);
+}
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn rotate_each_word_right32(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
+    }
+}
+
+macro_rules! rotr_128 {
+    ($name:ident, $i:expr) => {
+    #[inline(always)]
+    fn $name(self) -> Self {
+        Self::new(unsafe {
+            _mm_or_si128(
+                _mm_srli_si128(self.x, $i as i32),
+                _mm_slli_si128(self.x, 128 - $i as i32),
+            )
+        })
+    }
+    };
+}
+// TODO: completely unoptimized
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
+    rotr_128!(rotate_each_word_right7, 7);
+    rotr_128!(rotate_each_word_right8, 8);
+    rotr_128!(rotate_each_word_right11, 11);
+    rotr_128!(rotate_each_word_right12, 12);
+    rotr_128!(rotate_each_word_right16, 16);
+    rotr_128!(rotate_each_word_right20, 20);
+    rotr_128!(rotate_each_word_right24, 24);
+    rotr_128!(rotate_each_word_right25, 25);
+}
+// TODO: completely unoptimized
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
+    rotr_128!(rotate_each_word_right32, 32);
+}
+impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
+
+def_vec!(u32x4_sse2, u32);
+def_vec!(u64x2_sse2, u64);
+def_vec!(u128x1_sse2, u128);
+
+impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u32; 4] {
+        unsafe {
+            let x = _mm_cvtsi128_si64(self.x) as u64;
+            let y = _mm_extract_epi64(self.x, 1) as u64;
+            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u32; 4]) -> Self {
+        unsafe {
+            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
+            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
+            Self::new(x)
+        }
+    }
+}
+impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u32; 4] {
+        unsafe {
+            let x = _mm_cvtsi128_si64(self.x) as u64;
+            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
+            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u32; 4]) -> Self {
+        unsafe {
+            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
+            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
+            let x = _mm_cvtsi64_si128(x);
+            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
+            Self::new(_mm_or_si128(x, y))
+        }
+    }
+}
+impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 2] {
+        unsafe {
+            [
+                _mm_cvtsi128_si64(self.x) as u64,
+                _mm_extract_epi64(self.x, 1) as u64,
+            ]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 2]) -> Self {
+        unsafe {
+            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
+            x = _mm_insert_epi64(x, xs[1] as i64, 1);
+            Self::new(x)
+        }
+    }
+}
+impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 2] {
+        unsafe {
+            [
+                _mm_cvtsi128_si64(self.x) as u64,
+                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
+            ]
+        }
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 2]) -> Self {
+        unsafe {
+            let x = _mm_cvtsi64_si128(xs[0] as i64);
+            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
+            Self::new(_mm_or_si128(x, y))
+        }
+    }
+}
+impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn to_lanes(self) -> [u128; 1] {
+        unimplemented!()
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u128; 1]) -> Self {
+        unimplemented!()
+    }
+}
+
+impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
+{
+    #[inline(always)]
+    fn to_lanes(self) -> [u64; 4] {
+        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
+        [a[0], a[1], b[0], b[1]]
+    }
+    #[inline(always)]
+    fn from_lanes(xs: [u64; 4]) -> Self {
+        let (a, b) = (
+            u64x2_sse2::from_lanes([xs[0], xs[1]]),
+            u64x2_sse2::from_lanes([xs[2], xs[3]]),
+        );
+        x2::new([a, b])
+    }
+}
+
+macro_rules! impl_into {
+    ($from:ident, $to:ident) => {
+        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
+            #[inline(always)]
+            fn from(x: $from<S3, S4, NI>) -> Self {
+                $to::new(x.x)
+            }
+        }
+    };
+}
+
+impl_into!(u128x1_sse2, u32x4_sse2);
+impl_into!(u128x1_sse2, u64x2_sse2);
+
+impl_bitops32!(u32x4_sse2);
+impl_bitops64!(u64x2_sse2);
+impl_bitops128!(u128x1_sse2);
+
+impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
+    u32x4_sse2<S3, S4, NI>: BSwap
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
+    u64x2_sse2<S3, S4, NI>: BSwap
+{
+}
+impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
+impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
+impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
+impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
+
+impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
+where
+    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
+    Machine86<S3, S4, NI>: Machine,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>:
+        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
+    Machine86<S3, S4, NI>: Machine,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
+where
+    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
+{
+}
+
+impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
+where
+    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
+    Machine86<YesS3, YesS4, NI>: Machine,
+{
+}
+impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>:
+        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
+    Machine86<YesS3, YesS4, NI>: Machine,
+{
+}
+impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
+where
+    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<YesS3, YesS4, NI>: Machine,
+    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
+    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
+{
+}
+
+impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
+        Self::new(_mm_set_epi32(
+            xs[3] as i32,
+            xs[2] as i32,
+            xs[1] as i32,
+            xs[0] as i32,
+        ))
+    }
+}
+
+impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
+where
+    Self: MultiLane<[u32; 4]>,
+{
+    #[inline(always)]
+    fn extract(self, i: u32) -> u32 {
+        self.to_lanes()[i as usize]
+    }
+    #[inline(always)]
+    fn insert(self, v: u32, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => _mm_insert_epi32(self.x, v as i32, 0),
+                1 => _mm_insert_epi32(self.x, v as i32, 1),
+                2 => _mm_insert_epi32(self.x, v as i32, 2),
+                3 => _mm_insert_epi32(self.x, v as i32, 3),
+                _ => unreachable!(),
+            }
+        })
+    }
+}
+impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
+where
+    Self: MultiLane<[u32; 4]>,
+{
+    #[inline(always)]
+    fn extract(self, i: u32) -> u32 {
+        self.to_lanes()[i as usize]
+    }
+    #[inline(always)]
+    fn insert(self, v: u32, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => {
+                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
+                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
+                }
+                1 => {
+                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
+                    x = _mm_slli_si128(x, 4);
+                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
+                    _mm_shuffle_epi32(x, 0b1110_0001)
+                }
+                2 => {
+                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
+                    x = _mm_slli_si128(x, 4);
+                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
+                    _mm_shuffle_epi32(x, 0b1100_1001)
+                }
+                3 => {
+                    let mut x = _mm_slli_si128(self.x, 4);
+                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
+                    _mm_shuffle_epi32(x, 0b0011_1001)
+                }
+                _ => unreachable!(),
+            }
+        })
+    }
+}
+
+impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn shuffle_lane_words2301(self) -> Self {
+        self.shuffle2301()
+    }
+    #[inline(always)]
+    fn shuffle_lane_words1230(self) -> Self {
+        self.shuffle1230()
+    }
+    #[inline(always)]
+    fn shuffle_lane_words3012(self) -> Self {
+        self.shuffle3012()
+    }
+}
+
+impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
+    }
+}
+
+impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        unsafe {
+            x2::new([
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
+            ])
+        }
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        unsafe {
+            x2::new([
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
+                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
+            ])
+        }
+    }
+}
+impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn shuffle2301(self) -> Self {
+        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
+    }
+    #[inline(always)]
+    fn shuffle3012(self) -> Self {
+        unsafe {
+            let a = _mm_srli_si128(self.0[0].x, 8);
+            let b = _mm_slli_si128(self.0[0].x, 8);
+            let c = _mm_srli_si128(self.0[1].x, 8);
+            let d = _mm_slli_si128(self.0[1].x, 8);
+            let da = _mm_or_si128(d, a);
+            let bc = _mm_or_si128(b, c);
+            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
+        }
+    }
+    #[inline(always)]
+    fn shuffle1230(self) -> Self {
+        unsafe {
+            let a = _mm_srli_si128(self.0[0].x, 8);
+            let b = _mm_slli_si128(self.0[0].x, 8);
+            let c = _mm_srli_si128(self.0[1].x, 8);
+            let d = _mm_slli_si128(self.0[1].x, 8);
+            let da = _mm_or_si128(d, a);
+            let bc = _mm_or_si128(b, c);
+            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
+        }
+    }
+}
+
+impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
+    #[inline(always)]
+    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
+        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
+    }
+}
+
+impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        unsafe {
+            match i {
+                0 => _mm_cvtsi128_si64(self.x) as u64,
+                1 => _mm_extract_epi64(self.x, 1) as u64,
+                _ => unreachable!(),
+            }
+        }
+    }
+    #[inline(always)]
+    fn insert(self, x: u64, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => _mm_insert_epi64(self.x, x as i64, 0),
+                1 => _mm_insert_epi64(self.x, x as i64, 1),
+                _ => unreachable!(),
+            }
+        })
+    }
+}
+impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        unsafe {
+            match i {
+                0 => _mm_cvtsi128_si64(self.x) as u64,
+                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
+                _ => unreachable!(),
+            }
+        }
+    }
+    #[inline(always)]
+    fn insert(self, x: u64, i: u32) -> Self {
+        Self::new(unsafe {
+            match i {
+                0 => _mm_or_si128(
+                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
+                    _mm_cvtsi64_si128(x as i64),
+                ),
+                1 => _mm_or_si128(
+                    _mm_move_epi64(self.x),
+                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
+                ),
+                _ => unreachable!(),
+            }
+        })
+    }
+}
+
+impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe {
+            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+}
+#[inline(always)]
+fn bswap32_s2(x: __m128i) -> __m128i {
+    unsafe {
+        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
+        y = _mm_shufflehi_epi16(y, 0b0001_1011);
+        y = _mm_shufflelo_epi16(y, 0b0001_1011);
+        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
+        z = _mm_shufflehi_epi16(z, 0b0001_1011);
+        z = _mm_shufflelo_epi16(z, 0b0001_1011);
+        _mm_packus_epi16(y, z)
+    }
+}
+impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(bswap32_s2(self.x))
+    }
+}
+
+impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe {
+            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+}
+impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
+    }
+}
+
+impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe {
+            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+}
+impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn bswap(self) -> Self {
+        Self::new(unsafe { unimplemented!() })
+    }
+}
+
+macro_rules! swapi {
+    ($x:expr, $i:expr, $k:expr) => {
+        unsafe {
+            const K: u8 = $k;
+            let k = _mm_set1_epi8(K as i8);
+            u128x1_sse2::new(_mm_or_si128(
+                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
+                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
+            ))
+        }
+    };
+}
+#[inline(always)]
+fn swap16_s2(x: __m128i) -> __m128i {
+    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
+}
+impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
+    #[inline(always)]
+    fn swap1(self) -> Self {
+        swapi!(self, 1, 0xaa)
+    }
+    #[inline(always)]
+    fn swap2(self) -> Self {
+        swapi!(self, 2, 0xcc)
+    }
+    #[inline(always)]
+    fn swap4(self) -> Self {
+        swapi!(self, 4, 0xf0)
+    }
+    #[inline(always)]
+    fn swap8(self) -> Self {
+        u128x1_sse2::new(unsafe {
+            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+    #[inline(always)]
+    fn swap16(self) -> Self {
+        u128x1_sse2::new(unsafe {
+            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
+            _mm_shuffle_epi8(self.x, k)
+        })
+    }
+    #[inline(always)]
+    fn swap32(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
+    }
+    #[inline(always)]
+    fn swap64(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
+    }
+}
+impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
+    #[inline(always)]
+    fn swap1(self) -> Self {
+        swapi!(self, 1, 0xaa)
+    }
+    #[inline(always)]
+    fn swap2(self) -> Self {
+        swapi!(self, 2, 0xcc)
+    }
+    #[inline(always)]
+    fn swap4(self) -> Self {
+        swapi!(self, 4, 0xf0)
+    }
+    #[inline(always)]
+    fn swap8(self) -> Self {
+        u128x1_sse2::new(unsafe {
+            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
+        })
+    }
+    #[inline(always)]
+    fn swap16(self) -> Self {
+        u128x1_sse2::new(swap16_s2(self.x))
+    }
+    #[inline(always)]
+    fn swap32(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
+    }
+    #[inline(always)]
+    fn swap64(self) -> Self {
+        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct G0;
+#[derive(Copy, Clone)]
+pub struct G1;
+
+#[allow(non_camel_case_types)]
+pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
+#[allow(non_camel_case_types)]
+pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
+#[allow(non_camel_case_types)]
+pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
+#[allow(non_camel_case_types)]
+pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;
+
+#[allow(non_camel_case_types)]
+pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
+#[allow(non_camel_case_types)]
+pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
+#[allow(non_camel_case_types)]
+pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
+
+impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
+where
+    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
+    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
+    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
+where
+    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
+    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
+    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
+    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
+    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
+{
+}
+
+impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
+where
+    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
+    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
+{
+}
+impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
+    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
+{
+}
+impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
+{
+}
+impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
+where
+    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
+    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
+    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
+    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
+    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
+{
+}
+
+impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
+{
+    #[inline(always)]
+    fn extract(self, i: u32) -> u64 {
+        match i {
+            0 => self.0[0].extract(0),
+            1 => self.0[0].extract(1),
+            2 => self.0[1].extract(0),
+            3 => self.0[1].extract(1),
+            _ => panic!(),
+        }
+    }
+    #[inline(always)]
+    fn insert(mut self, w: u64, i: u32) -> Self {
+        match i {
+            0 => self.0[0] = self.0[0].insert(w, 0),
+            1 => self.0[0] = self.0[0].insert(w, 1),
+            2 => self.0[1] = self.0[1].insert(w, 0),
+            3 => self.0[1] = self.0[1].insert(w, 1),
+            _ => panic!(),
+        };
+        self
+    }
+}
+
+impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
+where
+    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
+    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
+    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
+{
+}
+impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
+where
+    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
+    Machine86<S3, S4, NI>: Machine,
+    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
+    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
+    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
+    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
+{
+}
+
+impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
+where
+    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
+    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
+{
+}
+impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
+where
+    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
+    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
+{
+}
+impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
+where
+    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
+    Avx2Machine<NI>: Machine,
+    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
+    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
+    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
+    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
+{
+}
+
+macro_rules! impl_into_x {
+    ($from:ident, $to:ident) => {
+        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
+            for x2<$to<S3, S4, NI>, Gt>
+        {
+            #[inline(always)]
+            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
+                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
+            }
+        }
+        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
+            #[inline(always)]
+            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
+                x4::new([
+                    $to::from(x.0[0]),
+                    $to::from(x.0[1]),
+                    $to::from(x.0[2]),
+                    $to::from(x.0[3]),
+                ])
+            }
+        }
+    };
+}
+impl_into_x!(u128x1_sse2, u64x2_sse2);
+impl_into_x!(u128x1_sse2, u32x4_sse2);
+
+///// Debugging
+
+use core::fmt::{Debug, Formatter, Result};
+
+impl<W: PartialEq, G> PartialEq for x2<W, G> {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
+    }
+}
+
+#[inline(always)]
+unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
+    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
+    _mm_cvtsi128_si64(q) == -1
+}
+
+#[inline(always)]
+unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
+    let q = _mm_cmpeq_epi32(x, y);
+    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
+    let q = _mm_cvtsi128_si64(q);
+    (p & q) == -1
+}
+
+impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { eq128_s2(self.x, rhs.x) }
+    }
+}
+impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
+where
+    Self: Copy + MultiLane<[u32; 4]>,
+{
+    #[cold]
+    fn fmt(&self, fmt: &mut Formatter) -> Result {
+        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
+    }
+}
+
+impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { eq128_s2(self.x, rhs.x) }
+    }
+}
+impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
+where
+    Self: Copy + MultiLane<[u64; 2]>,
+{
+    #[cold]
+    fn fmt(&self, fmt: &mut Formatter) -> Result {
+        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
+    }
+}
+
+impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
+where
+    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
+{
+    #[cold]
+    fn fmt(&self, fmt: &mut Formatter) -> Result {
+        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
+        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
+    }
+}
+
+#[cfg(test)]
+#[cfg(target_arch = "x86_64")]
+mod test {
+    use super::*;
+    use crate::x86_64::{SSE2, SSE41, SSSE3};
+    use crate::Machine;
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_bswap32_s2_vs_s3() {
+        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
+        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            x_s2.bswap()
+        };
+
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            x_s3.bswap()
+        };
+
+        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s2, s2.vec(ys));
+    }
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_bswap64_s2_vs_s3() {
+        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
+        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
+            x_s2.bswap()
+        };
+
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
+            x_s3.bswap()
+        };
+
+        assert_eq!(x_s2, s2.vec(ys));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+    }
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_shuffle32_s2_vs_s3() {
+        let xs = [0x0, 0x1, 0x2, 0x3];
+        let ys = [0x2, 0x3, 0x0, 0x1];
+        let zs = [0x1, 0x2, 0x3, 0x0];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            x_s2.shuffle2301()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            x_s3.shuffle2301()
+        };
+        assert_eq!(x_s2, s2.vec(ys));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            x_s2.shuffle3012()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            x_s3.shuffle3012()
+        };
+        assert_eq!(x_s2, s2.vec(zs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = x_s2.shuffle1230();
+        let x_s3 = x_s3.shuffle1230();
+        assert_eq!(x_s2, s2.vec(xs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+    }
+
+    #[test]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
+    fn test_shuffle64_s2_vs_s3() {
+        let xs = [0x0, 0x1, 0x2, 0x3];
+        let ys = [0x2, 0x3, 0x0, 0x1];
+        let zs = [0x1, 0x2, 0x3, 0x0];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
+            x_s2.shuffle2301()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
+            x_s3.shuffle2301()
+        };
+        assert_eq!(x_s2, s2.vec(ys));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = {
+            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
+            x_s2.shuffle3012()
+        };
+        let x_s3 = {
+            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
+            x_s3.shuffle3012()
+        };
+        assert_eq!(x_s2, s2.vec(zs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+
+        let x_s2 = x_s2.shuffle1230();
+        let x_s3 = x_s3.shuffle1230();
+        assert_eq!(x_s2, s2.vec(xs));
+        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+    }
+
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    #[test]
+    fn test_lanes_u32x4() {
+        let xs = [0x1, 0x2, 0x3, 0x4];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+        let s4 = unsafe { SSE41::instance() };
+
+        {
+            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
+            assert_eq!(x_s2, y_s2);
+            assert_eq!(xs, y_s2.to_lanes());
+        }
+
+        {
+            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
+            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
+            assert_eq!(x_s3, y_s3);
+            assert_eq!(xs, y_s3.to_lanes());
+        }
+
+        {
+            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
+            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
+            assert_eq!(x_s4, y_s4);
+            assert_eq!(xs, y_s4.to_lanes());
+        }
+    }
+
+    #[test]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    fn test_lanes_u64x2() {
+        let xs = [0x1, 0x2];
+
+        let s2 = unsafe { SSE2::instance() };
+        let s3 = unsafe { SSSE3::instance() };
+        let s4 = unsafe { SSE41::instance() };
+
+        {
+            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
+            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
+            assert_eq!(x_s2, y_s2);
+            assert_eq!(xs, y_s2.to_lanes());
+        }
+
+        {
+            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
+            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
+            assert_eq!(x_s3, y_s3);
+            assert_eq!(xs, y_s3.to_lanes());
+        }
+
+        {
+            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
+            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
+            assert_eq!(x_s4, y_s4);
+            assert_eq!(xs, y_s4.to_lanes());
+        }
+    }
+
+    #[test]
+    fn test_vec4_u32x4_s2() {
+        let xs = [1, 2, 3, 4];
+        let s2 = unsafe { SSE2::instance() };
+        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
+        assert_eq!(x_s2.extract(0), 1);
+        assert_eq!(x_s2.extract(1), 2);
+        assert_eq!(x_s2.extract(2), 3);
+        assert_eq!(x_s2.extract(3), 4);
+        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
+        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
+        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
+        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
+    }
+
+    #[test]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    fn test_vec4_u32x4_s4() {
+        let xs = [1, 2, 3, 4];
+        let s4 = unsafe { SSE41::instance() };
+        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
+        assert_eq!(x_s4.extract(0), 1);
+        assert_eq!(x_s4.extract(1), 2);
+        assert_eq!(x_s4.extract(2), 3);
+        assert_eq!(x_s4.extract(3), 4);
+        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
+        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
+        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
+        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
+    }
+
+    #[test]
+    fn test_vec2_u64x2_s2() {
+        let xs = [0x1, 0x2];
+        let s2 = unsafe { SSE2::instance() };
+        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
+        assert_eq!(x_s2.extract(0), 1);
+        assert_eq!(x_s2.extract(1), 2);
+        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
+        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
+    }
+
+    #[test]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
+    fn test_vec4_u64x2_s4() {
+        let xs = [0x1, 0x2];
+        let s4 = unsafe { SSE41::instance() };
+        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
+        assert_eq!(x_s4.extract(0), 1);
+        assert_eq!(x_s4.extract(1), 2);
+        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
+        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
+    }
+}
+
+pub mod avx2 {
+    #![allow(non_camel_case_types)]
+    use crate::soft::x4;
+    use crate::types::*;
+    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
+    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
+    use core::arch::x86_64::*;
+    use core::marker::PhantomData;
+    use core::ops::*;
+
+    #[derive(Copy, Clone)]
+    pub struct u32x4x4_avx2<NI> {
+        x: [__m256i; 2],
+        ni: PhantomData<NI>,
+    }
+
+    impl<NI> u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn new(x: [__m256i; 2]) -> Self {
+            Self { x, ni: PhantomData }
+        }
+    }
+
+    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        unsafe fn unpack(p: vec512_storage) -> Self {
+            Self::new([p.avx[0].avx, p.avx[1].avx])
+        }
+    }
+    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+            unsafe {
+                [
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
+                ]
+            }
+        }
+        #[inline(always)]
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_setr_m128i(x[0].x, x[1].x),
+                    _mm256_setr_m128i(x[2].x, x[3].x),
+                ]
+            })
+        }
+    }
+    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
+            unsafe {
+                match i {
+                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
+                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
+                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
+                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
+                    _ => panic!(),
+                }
+            }
+        }
+        #[inline(always)]
+        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
+            Self::new(unsafe {
+                match i {
+                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
+                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
+                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
+                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
+                    _ => panic!(),
+                }
+            })
+        }
+    }
+    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn shuffle_lane_words1230(self) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
+                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
+                ]
+            })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words2301(self) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
+                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
+                ]
+            })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words3012(self) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
+                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
+                ]
+            })
+        }
+    }
+    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
+    macro_rules! shuf_lane_bytes {
+        ($name:ident, $k0:expr, $k1:expr) => {
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_shuffle_epi8(
+                        self.x[0],
+                        _mm256_set_epi64x($k0, $k1, $k0, $k1),
+                    ),
+                    _mm256_shuffle_epi8(
+                        self.x[1],
+                        _mm256_set_epi64x($k0, $k1, $k0, $k1),
+                    )
+                ]
+                })
+            }
+        };
+    }
+    macro_rules! rotr_32 {
+        ($name:ident, $i:expr) => {
+            #[inline(always)]
+            fn $name(self) -> Self {
+                Self::new(unsafe {
+                    [
+                        _mm256_or_si256(
+                            _mm256_srli_epi32(self.x[0], $i as i32),
+                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
+                        ),
+                        _mm256_or_si256(
+                            _mm256_srli_epi32(self.x[1], $i as i32),
+                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
+                        )
+                    ]
+                })
+            }
+        };
+    }
+    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
+        rotr_32!(rotate_each_word_right7, 7);
+        shuf_lane_bytes!(
+            rotate_each_word_right8,
+            0x0c0f0e0d_080b0a09,
+            0x04070605_00030201
+        );
+        rotr_32!(rotate_each_word_right11, 11);
+        rotr_32!(rotate_each_word_right12, 12);
+        shuf_lane_bytes!(
+            rotate_each_word_right16,
+            0x0d0c0f0e_09080b0a,
+            0x05040706_01000302
+        );
+        rotr_32!(rotate_each_word_right20, 20);
+        shuf_lane_bytes!(
+            rotate_each_word_right24,
+            0x0e0d0c0f_0a09080b,
+            0x06050407_02010003
+        );
+        rotr_32!(rotate_each_word_right25, 25);
+    }
+    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
+        #[inline(always)]
+        fn from(x: u32x4x4_avx2<NI>) -> Self {
+            Self {
+                avx: [
+                    vec256_storage { avx: x.x[0] },
+                    vec256_storage { avx: x.x[1] },
+                ],
+            }
+        }
+    }
+
+    macro_rules! impl_assign {
+        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
+            impl<NI> $Assign for $vec<NI>
+            where
+                NI: Copy,
+            {
+                #[inline(always)]
+                fn $assign_fn(&mut self, rhs: Self) {
+                    *self = self.$bin_fn(rhs);
+                }
+            }
+        };
+    }
+    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
+    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
+    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
+    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
+
+    macro_rules! impl_bitop_x2 {
+        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
+            impl<NI> $Op for $vec<NI> {
+                type Output = Self;
+                #[inline(always)]
+                fn $op_fn(self, rhs: Self) -> Self::Output {
+                    Self::new(unsafe {
+                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
+                    })
+                }
+            }
+        };
+    }
+    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
+    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
+    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
+    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
+    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
+
+    impl<NI> Not for u32x4x4_avx2<NI> {
+        type Output = Self;
+        #[inline(always)]
+        fn not(self) -> Self::Output {
+            unsafe {
+                let f = _mm256_set1_epi8(-0x7f);
+                Self::new([f, f]) ^ self
+            }
+        }
+    }
+
+    impl<NI> BSwap for u32x4x4_avx2<NI> {
+        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
+    }
+
+    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
+    where
+        NI: Copy,
+    {
+        #[inline(always)]
+        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
+            Self::new(unsafe {
+                [
+                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
+                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
+                ]
+            })
+        }
+    }
+}
-- 
cgit v1.2.3