From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/packed_simd_2/src/codegen/bit_manip.rs | 354 +++++++++++++ vendor/packed_simd_2/src/codegen/llvm.rs | 107 ++++ vendor/packed_simd_2/src/codegen/math.rs | 3 + vendor/packed_simd_2/src/codegen/math/float.rs | 19 + vendor/packed_simd_2/src/codegen/math/float/abs.rs | 103 ++++ vendor/packed_simd_2/src/codegen/math/float/cos.rs | 103 ++++ .../packed_simd_2/src/codegen/math/float/cos_pi.rs | 87 ++++ vendor/packed_simd_2/src/codegen/math/float/exp.rs | 112 +++++ vendor/packed_simd_2/src/codegen/math/float/ln.rs | 112 +++++ .../packed_simd_2/src/codegen/math/float/macros.rs | 559 +++++++++++++++++++++ .../src/codegen/math/float/mul_add.rs | 109 ++++ .../src/codegen/math/float/mul_adde.rs | 66 +++ .../packed_simd_2/src/codegen/math/float/powf.rs | 112 +++++ vendor/packed_simd_2/src/codegen/math/float/sin.rs | 103 ++++ .../src/codegen/math/float/sin_cos_pi.rs | 195 +++++++ .../packed_simd_2/src/codegen/math/float/sin_pi.rs | 87 ++++ .../packed_simd_2/src/codegen/math/float/sqrt.rs | 103 ++++ .../packed_simd_2/src/codegen/math/float/sqrte.rs | 67 +++ .../packed_simd_2/src/codegen/math/float/tanh.rs | 117 +++++ .../packed_simd_2/src/codegen/pointer_sized_int.rs | 28 ++ vendor/packed_simd_2/src/codegen/reductions.rs | 1 + .../packed_simd_2/src/codegen/reductions/mask.rs | 69 +++ .../src/codegen/reductions/mask/aarch64.rs | 71 +++ .../src/codegen/reductions/mask/arm.rs | 54 ++ .../src/codegen/reductions/mask/fallback.rs | 6 + .../src/codegen/reductions/mask/fallback_impl.rs | 237 +++++++++ .../src/codegen/reductions/mask/x86.rs | 188 +++++++ .../src/codegen/reductions/mask/x86/avx.rs | 101 ++++ .../src/codegen/reductions/mask/x86/avx2.rs | 35 ++ .../src/codegen/reductions/mask/x86/sse.rs | 36 ++ .../src/codegen/reductions/mask/x86/sse2.rs | 70 +++ vendor/packed_simd_2/src/codegen/shuffle.rs | 150 ++++++ vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs | 411 +++++++++++++++ vendor/packed_simd_2/src/codegen/swap_bytes.rs | 189 +++++++ vendor/packed_simd_2/src/codegen/v128.rs | 46 ++ vendor/packed_simd_2/src/codegen/v16.rs | 7 + vendor/packed_simd_2/src/codegen/v256.rs | 78 +++ vendor/packed_simd_2/src/codegen/v32.rs | 11 + vendor/packed_simd_2/src/codegen/v512.rs | 145 ++++++ vendor/packed_simd_2/src/codegen/v64.rs | 21 + vendor/packed_simd_2/src/codegen/vPtr.rs | 35 ++ vendor/packed_simd_2/src/codegen/vSize.rs | 43 ++ 42 files changed, 4550 insertions(+) create mode 100644 vendor/packed_simd_2/src/codegen/bit_manip.rs create mode 100644 vendor/packed_simd_2/src/codegen/llvm.rs create mode 100644 vendor/packed_simd_2/src/codegen/math.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/abs.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/cos.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/exp.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/ln.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/macros.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/mul_add.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/powf.rs create mode 100644 
vendor/packed_simd_2/src/codegen/math/float/sin.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sqrt.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/sqrte.rs create mode 100644 vendor/packed_simd_2/src/codegen/math/float/tanh.rs create mode 100644 vendor/packed_simd_2/src/codegen/pointer_sized_int.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs create mode 100644 vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs create mode 100644 vendor/packed_simd_2/src/codegen/shuffle.rs create mode 100644 vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs create mode 100644 vendor/packed_simd_2/src/codegen/swap_bytes.rs create mode 100644 vendor/packed_simd_2/src/codegen/v128.rs create mode 100644 vendor/packed_simd_2/src/codegen/v16.rs create mode 100644 vendor/packed_simd_2/src/codegen/v256.rs create mode 100644 vendor/packed_simd_2/src/codegen/v32.rs create mode 100644 vendor/packed_simd_2/src/codegen/v512.rs create mode 100644 vendor/packed_simd_2/src/codegen/v64.rs create mode 100644 vendor/packed_simd_2/src/codegen/vPtr.rs create mode 100644 vendor/packed_simd_2/src/codegen/vSize.rs (limited to 'vendor/packed_simd_2/src/codegen') diff --git a/vendor/packed_simd_2/src/codegen/bit_manip.rs b/vendor/packed_simd_2/src/codegen/bit_manip.rs new file mode 100644 index 000000000..83c7d1987 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/bit_manip.rs @@ -0,0 +1,354 @@ +//! LLVM bit manipulation intrinsics. 
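+//!
+//! A minimal, hedged sketch of the lane-wise semantics these intrinsics back
+//! (it assumes the crate's public `count_ones`/`leading_zeros`/`trailing_zeros`
+//! wrappers, which forward to the `BitManip` trait defined below; the import
+//! path depends on how the crate is renamed in the consumer's `Cargo.toml`):
+//!
+//! ```ignore
+//! use packed_simd_2::u8x4;
+//!
+//! let v = u8x4::new(0b0000_0001, 0b0000_0100, 0, 0xff);
+//! // ctpop / ctlz / cttz, one result per lane:
+//! assert_eq!(v.count_ones(), u8x4::new(1, 1, 0, 8));
+//! assert_eq!(v.leading_zeros(), u8x4::new(7, 5, 8, 0));
+//! assert_eq!(v.trailing_zeros(), u8x4::new(0, 2, 8, 0));
+//! ```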
+#[rustfmt::skip] + +use crate::*; + +#[allow(improper_ctypes, dead_code)] +extern "C" { + #[link_name = "llvm.ctlz.v2i8"] + fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; + #[link_name = "llvm.ctlz.v4i8"] + fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; + #[link_name = "llvm.ctlz.v8i8"] + fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; + #[link_name = "llvm.ctlz.v16i8"] + fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; + #[link_name = "llvm.ctlz.v32i8"] + fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; + #[link_name = "llvm.ctlz.v64i8"] + fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; + + #[link_name = "llvm.ctlz.v2i16"] + fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; + #[link_name = "llvm.ctlz.v4i16"] + fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; + #[link_name = "llvm.ctlz.v8i16"] + fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; + #[link_name = "llvm.ctlz.v16i16"] + fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; + #[link_name = "llvm.ctlz.v32i16"] + fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; + + #[link_name = "llvm.ctlz.v2i32"] + fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; + #[link_name = "llvm.ctlz.v4i32"] + fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; + #[link_name = "llvm.ctlz.v8i32"] + fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; + #[link_name = "llvm.ctlz.v16i32"] + fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; + + #[link_name = "llvm.ctlz.v2i64"] + fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; + #[link_name = "llvm.ctlz.v4i64"] + fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; + #[link_name = "llvm.ctlz.v8i64"] + fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; + + #[link_name = "llvm.ctlz.v1i128"] + fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; + #[link_name = "llvm.ctlz.v2i128"] + fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; + #[link_name = "llvm.ctlz.v4i128"] + fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; + + #[link_name = "llvm.cttz.v2i8"] + fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; + #[link_name = "llvm.cttz.v4i8"] + fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; + #[link_name = "llvm.cttz.v8i8"] + fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; + #[link_name = "llvm.cttz.v16i8"] + fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; + #[link_name = "llvm.cttz.v32i8"] + fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; + #[link_name = "llvm.cttz.v64i8"] + fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; + + #[link_name = "llvm.cttz.v2i16"] + fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; + #[link_name = "llvm.cttz.v4i16"] + fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; + #[link_name = "llvm.cttz.v8i16"] + fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; + #[link_name = "llvm.cttz.v16i16"] + fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; + #[link_name = "llvm.cttz.v32i16"] + fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; + + #[link_name = "llvm.cttz.v2i32"] + fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; + #[link_name = "llvm.cttz.v4i32"] + fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; + #[link_name = "llvm.cttz.v8i32"] + fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; + #[link_name = "llvm.cttz.v16i32"] + fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; + + #[link_name = "llvm.cttz.v2i64"] + fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; + #[link_name = 
"llvm.cttz.v4i64"] + fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; + #[link_name = "llvm.cttz.v8i64"] + fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; + + #[link_name = "llvm.cttz.v1i128"] + fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; + #[link_name = "llvm.cttz.v2i128"] + fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; + #[link_name = "llvm.cttz.v4i128"] + fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; + + #[link_name = "llvm.ctpop.v2i8"] + fn ctpop_u8x2(x: u8x2) -> u8x2; + #[link_name = "llvm.ctpop.v4i8"] + fn ctpop_u8x4(x: u8x4) -> u8x4; + #[link_name = "llvm.ctpop.v8i8"] + fn ctpop_u8x8(x: u8x8) -> u8x8; + #[link_name = "llvm.ctpop.v16i8"] + fn ctpop_u8x16(x: u8x16) -> u8x16; + #[link_name = "llvm.ctpop.v32i8"] + fn ctpop_u8x32(x: u8x32) -> u8x32; + #[link_name = "llvm.ctpop.v64i8"] + fn ctpop_u8x64(x: u8x64) -> u8x64; + + #[link_name = "llvm.ctpop.v2i16"] + fn ctpop_u16x2(x: u16x2) -> u16x2; + #[link_name = "llvm.ctpop.v4i16"] + fn ctpop_u16x4(x: u16x4) -> u16x4; + #[link_name = "llvm.ctpop.v8i16"] + fn ctpop_u16x8(x: u16x8) -> u16x8; + #[link_name = "llvm.ctpop.v16i16"] + fn ctpop_u16x16(x: u16x16) -> u16x16; + #[link_name = "llvm.ctpop.v32i16"] + fn ctpop_u16x32(x: u16x32) -> u16x32; + + #[link_name = "llvm.ctpop.v2i32"] + fn ctpop_u32x2(x: u32x2) -> u32x2; + #[link_name = "llvm.ctpop.v4i32"] + fn ctpop_u32x4(x: u32x4) -> u32x4; + #[link_name = "llvm.ctpop.v8i32"] + fn ctpop_u32x8(x: u32x8) -> u32x8; + #[link_name = "llvm.ctpop.v16i32"] + fn ctpop_u32x16(x: u32x16) -> u32x16; + + #[link_name = "llvm.ctpop.v2i64"] + fn ctpop_u64x2(x: u64x2) -> u64x2; + #[link_name = "llvm.ctpop.v4i64"] + fn ctpop_u64x4(x: u64x4) -> u64x4; + #[link_name = "llvm.ctpop.v8i64"] + fn ctpop_u64x8(x: u64x8) -> u64x8; + + #[link_name = "llvm.ctpop.v1i128"] + fn ctpop_u128x1(x: u128x1) -> u128x1; + #[link_name = "llvm.ctpop.v2i128"] + fn ctpop_u128x2(x: u128x2) -> u128x2; + #[link_name = "llvm.ctpop.v4i128"] + fn ctpop_u128x4(x: u128x4) -> u128x4; +} + +crate trait BitManip { + fn ctpop(self) -> Self; + fn ctlz(self) -> Self; + fn cttz(self) -> Self; +} + +macro_rules! impl_bit_manip { + (inner: $ty:ident, $scalar:ty, $uty:ident, + $ctpop:ident, $ctlz:ident, $cttz:ident) => { + // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192 + #[cfg(target_arch = "s390x")] + impl_bit_manip! { scalar: $ty, $scalar } + #[cfg(not(target_arch = "s390x"))] + impl BitManip for $ty { + #[inline] + fn ctpop(self) -> Self { + let y: $uty = self.cast(); + unsafe { $ctpop(y).cast() } + } + + #[inline] + fn ctlz(self) -> Self { + let y: $uty = self.cast(); + // the ctxx intrinsics need compile-time constant + // `is_zero_undef` + unsafe { $ctlz(y, false).cast() } + } + + #[inline] + fn cttz(self) -> Self { + let y: $uty = self.cast(); + unsafe { $cttz(y, false).cast() } + } + } + }; + (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => { + #[cfg(target_arch = "s390x")] + impl_bit_manip! 
{ scalar: $ty, $scalar } + #[cfg(not(target_arch = "s390x"))] + impl BitManip for $ty { + #[inline] + fn ctpop(self) -> Self { + let y: $uty = self.cast(); + $uty::ctpop(y).cast() + } + + #[inline] + fn ctlz(self) -> Self { + let y: $uty = self.cast(); + $uty::ctlz(y).cast() + } + + #[inline] + fn cttz(self) -> Self { + let y: $uty = self.cast(); + $uty::cttz(y).cast() + } + } + }; + (scalar: $ty:ident, $scalar:ty) => { + impl BitManip for $ty { + #[inline] + fn ctpop(self) -> Self { + let mut ones = self; + for i in 0..Self::lanes() { + ones = ones + .replace(i, self.extract(i).count_ones() as $scalar); + } + ones + } + + #[inline] + fn ctlz(self) -> Self { + let mut lz = self; + for i in 0..Self::lanes() { + lz = lz.replace( + i, + self.extract(i).leading_zeros() as $scalar, + ); + } + lz + } + + #[inline] + fn cttz(self) -> Self { + let mut tz = self; + for i in 0..Self::lanes() { + tz = tz.replace( + i, + self.extract(i).trailing_zeros() as $scalar, + ); + } + tz + } + } + }; + ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty, + $ctpop:ident, $ctlz:ident, $cttz:ident) => { + impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz } + impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz } + }; + (sized: $usize:ident, $uscalar:ty, $isize:ident, + $iscalar:ty, $ty:ident) => { + impl_bit_manip! { sized_inner: $usize, $uscalar, $ty } + impl_bit_manip! { sized_inner: $isize, $iscalar, $ty } + }; +} + +impl_bit_manip! { u8x2 , u8, i8x2, i8, ctpop_u8x2, ctlz_u8x2, cttz_u8x2 } +impl_bit_manip! { u8x4 , u8, i8x4, i8, ctpop_u8x4, ctlz_u8x4, cttz_u8x4 } +#[cfg(not(target_arch = "aarch64"))] // see below +impl_bit_manip! { u8x8 , u8, i8x8, i8, ctpop_u8x8, ctlz_u8x8, cttz_u8x8 } +impl_bit_manip! { u8x16 , u8, i8x16, i8, ctpop_u8x16, ctlz_u8x16, cttz_u8x16 } +impl_bit_manip! { u8x32 , u8, i8x32, i8, ctpop_u8x32, ctlz_u8x32, cttz_u8x32 } +impl_bit_manip! { u8x64 , u8, i8x64, i8, ctpop_u8x64, ctlz_u8x64, cttz_u8x64 } +impl_bit_manip! { u16x2 , u16, i16x2, i16, ctpop_u16x2, ctlz_u16x2, cttz_u16x2 } +impl_bit_manip! { u16x4 , u16, i16x4, i16, ctpop_u16x4, ctlz_u16x4, cttz_u16x4 } +impl_bit_manip! { u16x8 , u16, i16x8, i16, ctpop_u16x8, ctlz_u16x8, cttz_u16x8 } +impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 } +impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 } +impl_bit_manip! { u32x2 , u32, i32x2, i32, ctpop_u32x2, ctlz_u32x2, cttz_u32x2 } +impl_bit_manip! { u32x4 , u32, i32x4, i32, ctpop_u32x4, ctlz_u32x4, cttz_u32x4 } +impl_bit_manip! { u32x8 , u32, i32x8, i32, ctpop_u32x8, ctlz_u32x8, cttz_u32x8 } +impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 } +impl_bit_manip! { u64x2 , u64, i64x2, i64, ctpop_u64x2, ctlz_u64x2, cttz_u64x2 } +impl_bit_manip! { u64x4 , u64, i64x4, i64, ctpop_u64x4, ctlz_u64x4, cttz_u64x4 } +impl_bit_manip! { u64x8 , u64, i64x8, i64, ctpop_u64x8, ctlz_u64x8, cttz_u64x8 } +impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 } +impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 } +impl_bit_manip! 
{ u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 } + +#[cfg(target_arch = "aarch64")] +impl BitManip for u8x8 { + #[inline] + fn ctpop(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctpop_u8x8(y).cast() } + } + + #[inline] + fn ctlz(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctlz_u8x8(y, false).cast() } + } + + #[inline] + fn cttz(self) -> Self { + // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 + // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 + // intrinsics + let mut tz = self; + for i in 0..Self::lanes() { + tz = tz.replace(i, self.extract(i).trailing_zeros() as u8); + } + tz + } +} +#[cfg(target_arch = "aarch64")] +impl BitManip for i8x8 { + #[inline] + fn ctpop(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctpop_u8x8(y).cast() } + } + + #[inline] + fn ctlz(self) -> Self { + let y: u8x8 = self.cast(); + unsafe { ctlz_u8x8(y, false).cast() } + } + + #[inline] + fn cttz(self) -> Self { + // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 + // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 + // intrinsics + let mut tz = self; + for i in 0..Self::lanes() { + tz = tz.replace(i, self.extract(i).trailing_zeros() as i8); + } + tz + } +} + +cfg_if! { + if #[cfg(target_pointer_width = "8")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 } + } else if #[cfg(target_pointer_width = "16")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u16x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 } + } else if #[cfg(target_pointer_width = "32")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 } + } else if #[cfg(target_pointer_width = "64")] { + impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 } + impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 } + impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 } + } else { + compile_error!("unsupported target_pointer_width"); + } +} diff --git a/vendor/packed_simd_2/src/codegen/llvm.rs b/vendor/packed_simd_2/src/codegen/llvm.rs new file mode 100644 index 000000000..93c6ce6b7 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/llvm.rs @@ -0,0 +1,107 @@ +//! LLVM's platform intrinsics +#![allow(dead_code)] + +use crate::sealed::Shuffle; +#[allow(unused_imports)] // FIXME: spurious warning? +use crate::sealed::Simd; + +// Shuffle intrinsics: expanded in users' crates, therefore public. +extern "platform-intrinsic" { + // FIXME: Passing this intrinsics an `idx` array with an index that is + // out-of-bounds will produce a monomorphization-time error. 
+ // https://github.com/rust-lang-nursery/packed_simd/issues/21 + #[rustc_args_required_const(2)] + pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 2], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 4], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 8], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 16], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 32], Output = U>; + + #[rustc_args_required_const(2)] + pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U + where + T: Simd, + ::Element: Shuffle<[u32; 64], Output = U>; +} + +pub use self::simd_shuffle16 as __shuffle_vector16; +pub use self::simd_shuffle2 as __shuffle_vector2; +pub use self::simd_shuffle32 as __shuffle_vector32; +pub use self::simd_shuffle4 as __shuffle_vector4; +pub use self::simd_shuffle64 as __shuffle_vector64; +pub use self::simd_shuffle8 as __shuffle_vector8; + +extern "platform-intrinsic" { + crate fn simd_eq(x: T, y: T) -> U; + crate fn simd_ne(x: T, y: T) -> U; + crate fn simd_lt(x: T, y: T) -> U; + crate fn simd_le(x: T, y: T) -> U; + crate fn simd_gt(x: T, y: T) -> U; + crate fn simd_ge(x: T, y: T) -> U; + + crate fn simd_insert(x: T, idx: u32, val: U) -> T; + crate fn simd_extract(x: T, idx: u32) -> U; + + crate fn simd_cast(x: T) -> U; + + crate fn simd_add(x: T, y: T) -> T; + crate fn simd_sub(x: T, y: T) -> T; + crate fn simd_mul(x: T, y: T) -> T; + crate fn simd_div(x: T, y: T) -> T; + crate fn simd_rem(x: T, y: T) -> T; + crate fn simd_shl(x: T, y: T) -> T; + crate fn simd_shr(x: T, y: T) -> T; + crate fn simd_and(x: T, y: T) -> T; + crate fn simd_or(x: T, y: T) -> T; + crate fn simd_xor(x: T, y: T) -> T; + + crate fn simd_reduce_add_unordered(x: T) -> U; + crate fn simd_reduce_mul_unordered(x: T) -> U; + crate fn simd_reduce_add_ordered(x: T, acc: U) -> U; + crate fn simd_reduce_mul_ordered(x: T, acc: U) -> U; + crate fn simd_reduce_min(x: T) -> U; + crate fn simd_reduce_max(x: T) -> U; + crate fn simd_reduce_min_nanless(x: T) -> U; + crate fn simd_reduce_max_nanless(x: T) -> U; + crate fn simd_reduce_and(x: T) -> U; + crate fn simd_reduce_or(x: T) -> U; + crate fn simd_reduce_xor(x: T) -> U; + crate fn simd_reduce_all(x: T) -> bool; + crate fn simd_reduce_any(x: T) -> bool; + + crate fn simd_select(m: M, a: T, b: T) -> T; + + crate fn simd_fmin(a: T, b: T) -> T; + crate fn simd_fmax(a: T, b: T) -> T; + + crate fn simd_fsqrt(a: T) -> T; + crate fn simd_fma(a: T, b: T, c: T) -> T; + + crate fn simd_gather(value: T, pointers: P, mask: M) -> T; + crate fn simd_scatter(value: T, pointers: P, mask: M); + + crate fn simd_bitmask(value: T) -> U; +} diff --git a/vendor/packed_simd_2/src/codegen/math.rs b/vendor/packed_simd_2/src/codegen/math.rs new file mode 100644 index 000000000..f3997c7f1 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math.rs @@ -0,0 +1,3 @@ +//! 
Vertical math operations + +crate mod float; diff --git a/vendor/packed_simd_2/src/codegen/math/float.rs b/vendor/packed_simd_2/src/codegen/math/float.rs new file mode 100644 index 000000000..3743b4990 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float.rs @@ -0,0 +1,19 @@ +//! Vertical floating-point math operations. +#![allow(clippy::useless_transmute)] + +#[macro_use] +crate mod macros; +crate mod abs; +crate mod cos; +crate mod cos_pi; +crate mod exp; +crate mod ln; +crate mod mul_add; +crate mod mul_adde; +crate mod powf; +crate mod sin; +crate mod sin_cos_pi; +crate mod sin_pi; +crate mod sqrt; +crate mod sqrte; +crate mod tanh; diff --git a/vendor/packed_simd_2/src/codegen/math/float/abs.rs b/vendor/packed_simd_2/src/codegen/math/float/abs.rs new file mode 100644 index 000000000..bc4421f61 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/abs.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `fabs` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors fabs + +use crate::*; + +crate trait Abs { + fn abs(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.fabs.v2f32"] + fn fabs_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.fabs.v4f32"] + fn fabs_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.fabs.v8f32"] + fn fabs_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.fabs.v16f32"] + fn fabs_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit fabsgle elem vectors + #[link_name = "llvm.fabs.v1f64"] + fn fabs_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.fabs.v2f64"] + fn fabs_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.fabs.v4f64"] + fn fabs_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.fabs.v8f64"] + fn fabs_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.fabs.f32"] + fn fabs_f32(x: f32) -> f32; + #[link_name = "llvm.fabs.f64"] + fn fabs_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Abs, abs); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: fabs_f32); + impl_unary!(f32x4[f32; 4]: fabs_f32); + impl_unary!(f32x8[f32; 8]: fabs_f32); + impl_unary!(f32x16[f32; 16]: fabs_f32); + + impl_unary!(f64x2[f64; 2]: fabs_f64); + impl_unary!(f64x4[f64; 4]: fabs_f64); + impl_unary!(f64x8[f64; 8]: fabs_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx2); + + impl_unary!(f32x4: Sleef_fabsf4_avx2128); + impl_unary!(f32x8: Sleef_fabsf8_avx2); + impl_unary!(f64x2: Sleef_fabsd2_avx2128); + impl_unary!(f64x4: Sleef_fabsd4_avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx); + impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx); + + impl_unary!(f32x4: Sleef_fabsf4_sse4); + impl_unary!(f32x8: Sleef_fabsf8_avx); + impl_unary!(f64x2: Sleef_fabsd2_sse4); + impl_unary!(f64x4: Sleef_fabsd4_avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_fabsd2_sse4); + + impl_unary!(f32x4: Sleef_fabsf4_sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_fabsf4_sse4); + impl_unary!(f64x2: Sleef_fabsd2_sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_fabsd2_sse4); + } else { + impl_unary!(f32x2[f32; 2]: fabs_f32); + impl_unary!(f32x16: fabs_v16f32); + impl_unary!(f64x8: fabs_v8f64); + + impl_unary!(f32x4: fabs_v4f32); + impl_unary!(f32x8: fabs_v8f32); + impl_unary!(f64x2: fabs_v2f64); + impl_unary!(f64x4: fabs_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: fabs_f32); + impl_unary!(f32x4: fabs_v4f32); + impl_unary!(f32x8: fabs_v8f32); + impl_unary!(f32x16: fabs_v16f32); + + impl_unary!(f64x2: fabs_v2f64); + impl_unary!(f64x4: fabs_v4f64); + impl_unary!(f64x8: fabs_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/cos.rs b/vendor/packed_simd_2/src/codegen/math/float/cos.rs new file mode 100644 index 000000000..50f6c16da --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/cos.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `cos` +#![allow(unused)] + +// FIXME 64-bit 1 elem vector cos + +use crate::*; + +crate trait Cos { + fn cos(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.cos.v2f32"] + fn cos_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.cos.v4f32"] + fn cos_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.cos.v8f32"] + fn cos_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.cos.v16f32"] + fn cos_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit cosgle elem vectors + #[link_name = "llvm.cos.v1f64"] + fn cos_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.cos.v2f64"] + fn cos_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.cos.v4f64"] + fn cos_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.cos.v8f64"] + fn cos_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.cos.f32"] + fn cos_f32(x: f32) -> f32; + #[link_name = "llvm.cos.f64"] + fn cos_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Cos, cos); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: cos_f32); + impl_unary!(f32x4[f32; 4]: cos_f32); + impl_unary!(f32x8[f32; 8]: cos_f32); + impl_unary!(f32x16[f32; 16]: cos_f32); + + impl_unary!(f64x2[f64; 2]: cos_f64); + impl_unary!(f64x4[f64; 4]: cos_f64); + impl_unary!(f64x8[f64; 8]: cos_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx2); + + impl_unary!(f32x4: Sleef_cosf4_u10avx2128); + impl_unary!(f32x8: Sleef_cosf8_u10avx2); + impl_unary!(f64x2: Sleef_cosd2_u10avx2128); + impl_unary!(f64x4: Sleef_cosd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx); + + impl_unary!(f32x4: Sleef_cosf4_u10sse4); + impl_unary!(f32x8: Sleef_cosf8_u10avx); + impl_unary!(f64x2: Sleef_cosd2_u10sse4); + impl_unary!(f64x4: Sleef_cosd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_cosd2_u10sse4); + + impl_unary!(f32x4: Sleef_cosf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_cosf4_u10sse4); + impl_unary!(f64x2: Sleef_cosd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_cosd2_u10sse4); + } else { + impl_unary!(f32x2[f32; 2]: cos_f32); + impl_unary!(f32x16: cos_v16f32); + impl_unary!(f64x8: cos_v8f64); + + impl_unary!(f32x4: cos_v4f32); + impl_unary!(f32x8: cos_v8f32); + impl_unary!(f64x2: cos_v2f64); + impl_unary!(f64x4: cos_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: cos_f32); + impl_unary!(f32x4: cos_v4f32); + impl_unary!(f32x8: cos_v8f32); + impl_unary!(f32x16: cos_v16f32); + + impl_unary!(f64x2: cos_v2f64); + impl_unary!(f64x4: cos_v4f64); + impl_unary!(f64x8: cos_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs b/vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs new file mode 100644 index 000000000..ebff5fd1c --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/cos_pi.rs @@ -0,0 +1,87 @@ +//! Vertical floating-point `cos` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors cos_pi + +use crate::*; + +crate trait CosPi { + fn cos_pi(self) -> Self; +} + +gen_unary_impl_table!(CosPi, cos_pi); + +macro_rules! impl_def { + ($vid:ident, $PI:path) => { + impl CosPi for $vid { + #[inline] + fn cos_pi(self) -> Self { + (self * Self::splat($PI)).cos() + } + } + }; +} +macro_rules! impl_def32 { + ($vid:ident) => { + impl_def!($vid, crate::f32::consts::PI); + }; +} +macro_rules! impl_def64 { + ($vid:ident) => { + impl_def!($vid, crate::f64::consts::PI); + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx2); + + impl_unary!(f32x4: Sleef_cospif4_u05avx2128); + impl_unary!(f32x8: Sleef_cospif8_u05avx2); + impl_unary!(f64x2: Sleef_cospid2_u05avx2128); + impl_unary!(f64x4: Sleef_cospid4_u05avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx); + impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx); + + impl_unary!(f32x4: Sleef_cospif4_u05sse4); + impl_unary!(f32x8: Sleef_cospif8_u05avx); + impl_unary!(f64x2: Sleef_cospid2_u05sse4); + impl_unary!(f64x4: Sleef_cospid4_u05avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_cospid2_u05sse4); + + impl_unary!(f32x4: Sleef_cospif4_u05sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_cospif4_u05sse4); + impl_unary!(f64x2: Sleef_cospid2_u05sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_cospid2_u05sse4); + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } + } + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/exp.rs b/vendor/packed_simd_2/src/codegen/math/float/exp.rs new file mode 100644 index 000000000..00d10e9fa --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/exp.rs @@ -0,0 +1,112 @@ +//! Vertical floating-point `exp` +#![allow(unused)] + +// FIXME 64-bit expgle elem vectors misexpg + +use crate::*; + +crate trait Exp { + fn exp(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.exp.v2f32"] + fn exp_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.exp.v4f32"] + fn exp_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.exp.v8f32"] + fn exp_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.exp.v16f32"] + fn exp_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit expgle elem vectors + #[link_name = "llvm.exp.v1f64"] + fn exp_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.exp.v2f64"] + fn exp_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.exp.v4f64"] + fn exp_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.exp.v8f64"] + fn exp_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.exp.f32"] + fn exp_f32(x: f32) -> f32; + #[link_name = "llvm.exp.f64"] + fn exp_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Exp, exp); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: exp_f32); + impl_unary!(f32x4[f32; 4]: exp_f32); + impl_unary!(f32x8[f32; 8]: exp_f32); + impl_unary!(f32x16[f32; 16]: exp_f32); + + impl_unary!(f64x2[f64; 2]: exp_f64); + impl_unary!(f64x4[f64; 4]: exp_f64); + impl_unary!(f64x8[f64; 8]: exp_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx2); + + impl_unary!(f32x4: Sleef_expf4_u10avx2128); + impl_unary!(f32x8: Sleef_expf8_u10avx2); + impl_unary!(f64x2: Sleef_expd2_u10avx2128); + impl_unary!(f64x4: Sleef_expd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx); + + impl_unary!(f32x4: Sleef_expf4_u10sse4); + impl_unary!(f32x8: Sleef_expf8_u10avx); + impl_unary!(f64x2: Sleef_expd2_u10sse4); + impl_unary!(f64x4: Sleef_expd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse4); + + impl_unary!(f32x4: Sleef_expf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse4); + impl_unary!(f64x2: Sleef_expd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse4); + } else if #[cfg(target_feature = "sse2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse2); + impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse2); + impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse2); + + impl_unary!(f32x4: Sleef_expf4_u10sse2); + impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse2); + impl_unary!(f64x2: Sleef_expd2_u10sse2); + impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse2); + } else { + impl_unary!(f32x2[f32; 2]: exp_f32); + impl_unary!(f32x16: exp_v16f32); + impl_unary!(f64x8: exp_v8f64); + + impl_unary!(f32x4: exp_v4f32); + impl_unary!(f32x8: exp_v8f32); + impl_unary!(f64x2: exp_v2f64); + impl_unary!(f64x4: exp_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: exp_f32); + impl_unary!(f32x4: exp_v4f32); + impl_unary!(f32x8: exp_v8f32); + impl_unary!(f32x16: exp_v16f32); + + impl_unary!(f64x2: exp_v2f64); + impl_unary!(f64x4: exp_v4f64); + impl_unary!(f64x8: exp_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/ln.rs b/vendor/packed_simd_2/src/codegen/math/float/ln.rs new file mode 100644 index 000000000..88a5a6c6c --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/ln.rs @@ -0,0 +1,112 @@ +//! Vertical floating-point `ln` +#![allow(unused)] + +// FIXME 64-bit lngle elem vectors mislng + +use crate::*; + +crate trait Ln { + fn ln(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.log.v2f32"] + fn ln_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.log.v4f32"] + fn ln_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.log.v8f32"] + fn ln_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.log.v16f32"] + fn ln_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit lngle elem vectors + #[link_name = "llvm.log.v1f64"] + fn ln_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.log.v2f64"] + fn ln_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.log.v4f64"] + fn ln_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.log.v8f64"] + fn ln_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.log.f32"] + fn ln_f32(x: f32) -> f32; + #[link_name = "llvm.log.f64"] + fn ln_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Ln, ln); + +cfg_if! 
{ + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: ln_f32); + impl_unary!(f32x4[f32; 4]: ln_f32); + impl_unary!(f32x8[f32; 8]: ln_f32); + impl_unary!(f32x16[f32; 16]: ln_f32); + + impl_unary!(f64x2[f64; 2]: ln_f64); + impl_unary!(f64x4[f64; 4]: ln_f64); + impl_unary!(f64x8[f64; 8]: ln_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! { + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx2); + + impl_unary!(f32x4: Sleef_logf4_u10avx2128); + impl_unary!(f32x8: Sleef_logf8_u10avx2); + impl_unary!(f64x2: Sleef_logd2_u10avx2128); + impl_unary!(f64x4: Sleef_logd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx); + + impl_unary!(f32x4: Sleef_logf4_u10sse4); + impl_unary!(f32x8: Sleef_logf8_u10avx); + impl_unary!(f64x2: Sleef_logd2_u10sse4); + impl_unary!(f64x4: Sleef_logd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse4); + + impl_unary!(f32x4: Sleef_logf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse4); + impl_unary!(f64x2: Sleef_logd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse4); + } else if #[cfg(target_feature = "sse2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse2); + impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse2); + impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse2); + + impl_unary!(f32x4: Sleef_logf4_u10sse2); + impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse2); + impl_unary!(f64x2: Sleef_logd2_u10sse2); + impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse2); + } else { + impl_unary!(f32x2[f32; 2]: ln_f32); + impl_unary!(f32x16: ln_v16f32); + impl_unary!(f64x8: ln_v8f64); + + impl_unary!(f32x4: ln_v4f32); + impl_unary!(f32x8: ln_v8f32); + impl_unary!(f64x2: ln_v2f64); + impl_unary!(f64x4: ln_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: ln_f32); + impl_unary!(f32x4: ln_v4f32); + impl_unary!(f32x8: ln_v8f32); + impl_unary!(f32x16: ln_v16f32); + + impl_unary!(f64x2: ln_v2f64); + impl_unary!(f64x4: ln_v4f64); + impl_unary!(f64x8: ln_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/macros.rs b/vendor/packed_simd_2/src/codegen/math/float/macros.rs new file mode 100644 index 000000000..02d0ca3f5 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/macros.rs @@ -0,0 +1,559 @@ +//! Utility macros +#![allow(unused)] + + +macro_rules! impl_unary_ { + // implementation mapping 1:1 + (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun(transmute(self))) + } + } + } + }; + // implementation mapping 1:1 for when `$fun` is a generic function + // like some of the fp math rustc intrinsics (e.g. `fn fun(x: T) -> T`). 
+ (gen | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun(self.0)) + } + } + } + }; + (scalar | $trait_id:ident, $trait_method:ident, + $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + union U { + vec: $vec_id, + scalars: [$sid; $scount], + } + let mut scalars = U { vec: self }.scalars; + for i in &mut scalars { + *i = $fun(*i); + } + U { scalars }.vec + } + } + } + }; + // implementation calling fun twice on each of the vector halves: + (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vech_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + halves: [$vech_id; 2], + } + + let mut halves = U { vec: self }.halves; + + *halves.get_unchecked_mut(0) = + transmute($fun(transmute(*halves.get_unchecked(0)))); + *halves.get_unchecked_mut(1) = + transmute($fun(transmute(*halves.get_unchecked(1)))); + + U { halves }.vec + } + } + } + }; + // implementation calling fun four times on each of the vector quarters: + (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vecq_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + quarters: [$vecq_id; 4], + } + + let mut quarters = U { vec: self }.quarters; + + *quarters.get_unchecked_mut(0) = + transmute($fun(transmute(*quarters.get_unchecked(0)))); + *quarters.get_unchecked_mut(1) = + transmute($fun(transmute(*quarters.get_unchecked(1)))); + *quarters.get_unchecked_mut(2) = + transmute($fun(transmute(*quarters.get_unchecked(2)))); + *quarters.get_unchecked_mut(3) = + transmute($fun(transmute(*quarters.get_unchecked(3)))); + + U { quarters }.vec + } + } + } + }; + // implementation calling fun once on a vector twice as large: + (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vect_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self) -> Self { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vec_id; 2], + twice: $vect_id, + } + + let twice = U { vec: [self, uninitialized()] }.twice; + let twice = transmute($fun(transmute(twice))); + + *(U { twice }.vec.get_unchecked(0)) + } + } + } + }; +} + +macro_rules! gen_unary_impl_table { + ($trait_id:ident, $trait_method:ident) => { + macro_rules! 
impl_unary { + ($vid:ident: $fun:ident) => { + impl_unary_!(vec | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[g]: $fun:ident) => { + impl_unary_!(gen | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { + impl_unary_!( + scalar | $trait_id, + $trait_method, + $vid, + [$sid; $sc], + $fun + ); + }; + ($vid:ident[s]: $fun:ident) => { + impl_unary_!(scalar | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl_unary_!( + halves | $trait_id, + $trait_method, + $vid, + $vid_h, + $fun + ); + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl_unary_!( + quarter | $trait_id, + $trait_method, + $vid, + $vid_q, + $fun + ); + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl_unary_!( + twice | $trait_id, + $trait_method, + $vid, + $vid_t, + $fun + ); + }; + } + }; +} + +macro_rules! impl_tertiary_ { + // implementation mapping 1:1 + (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun( + transmute(self), + transmute(y), + transmute(z), + )) + } + } + } + }; + (scalar | $trait_id:ident, $trait_method:ident, + $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + union U { + vec: $vec_id, + scalars: [$sid; $scount], + } + let mut x = U { vec: self }.scalars; + let y = U { vec: y }.scalars; + let z = U { vec: z }.scalars; + for (x, (y, z)) in (&mut scalars).zip(&y).zip(&z) { + *i = $fun(*i, *y, *z); + } + U { vec: x }.vec + } + } + } + }; + // implementation calling fun twice on each of the vector halves: + (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vech_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + halves: [$vech_id; 2], + } + + let mut x_halves = U { vec: self }.halves; + let y_halves = U { vec: y }.halves; + let z_halves = U { vec: z }.halves; + + *x_halves.get_unchecked_mut(0) = transmute($fun( + transmute(*x_halves.get_unchecked(0)), + transmute(*y_halves.get_unchecked(0)), + transmute(*z_halves.get_unchecked(0)), + )); + *x_halves.get_unchecked_mut(1) = transmute($fun( + transmute(*x_halves.get_unchecked(1)), + transmute(*y_halves.get_unchecked(1)), + transmute(*z_halves.get_unchecked(1)), + )); + + U { halves: x_halves }.vec + } + } + } + }; + // implementation calling fun four times on each of the vector quarters: + (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vecq_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + quarters: [$vecq_id; 4], + } + + let mut x_quarters = U { vec: self }.quarters; + let y_quarters = U { vec: y }.quarters; + let z_quarters = U { vec: z }.quarters; + + *x_quarters.get_unchecked_mut(0) = transmute($fun( + transmute(*x_quarters.get_unchecked(0)), + transmute(*y_quarters.get_unchecked(0)), + transmute(*z_quarters.get_unchecked(0)), + )); + + *x_quarters.get_unchecked_mut(1) = transmute($fun( + transmute(*x_quarters.get_unchecked(1)), + transmute(*y_quarters.get_unchecked(1)), + transmute(*z_quarters.get_unchecked(1)), + )); + + 
*x_quarters.get_unchecked_mut(2) = transmute($fun( + transmute(*x_quarters.get_unchecked(2)), + transmute(*y_quarters.get_unchecked(2)), + transmute(*z_quarters.get_unchecked(2)), + )); + + *x_quarters.get_unchecked_mut(3) = transmute($fun( + transmute(*x_quarters.get_unchecked(3)), + transmute(*y_quarters.get_unchecked(3)), + transmute(*z_quarters.get_unchecked(3)), + )); + + U { quarters: x_quarters }.vec + } + } + } + }; + // implementation calling fun once on a vector twice as large: + (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vect_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self, z: Self) -> Self { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vec_id; 2], + twice: $vect_id, + } + + let x_twice = U { vec: [self, uninitialized()] }.twice; + let y_twice = U { vec: [y, uninitialized()] }.twice; + let z_twice = U { vec: [z, uninitialized()] }.twice; + let twice: $vect_id = transmute($fun( + transmute(x_twice), + transmute(y_twice), + transmute(z_twice), + )); + + *(U { twice }.vec.get_unchecked(0)) + } + } + } + }; +} + +macro_rules! gen_tertiary_impl_table { + ($trait_id:ident, $trait_method:ident) => { + macro_rules! impl_tertiary { + ($vid:ident: $fun:ident) => { + impl_tertiary_!(vec | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { + impl_tertiary_!( + scalar | $trait_id, + $trait_method, + $vid, + [$sid; $sc], + $fun + ); + }; + ($vid:ident[s]: $fun:ident) => { + impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl_tertiary_!( + halves | $trait_id, + $trait_method, + $vid, + $vid_h, + $fun + ); + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl_tertiary_!( + quarter | $trait_id, + $trait_method, + $vid, + $vid_q, + $fun + ); + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl_tertiary_!( + twice | $trait_id, + $trait_method, + $vid, + $vid_t, + $fun + ); + }; + } + }; +} + +macro_rules! 
impl_binary_ { + // implementation mapping 1:1 + (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::transmute; + transmute($fun(transmute(self), transmute(y))) + } + } + } + }; + (scalar | $trait_id:ident, $trait_method:ident, + $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + union U { + vec: $vec_id, + scalars: [$sid; $scount], + } + let mut x = U { vec: self }.scalars; + let y = U { vec: y }.scalars; + for (x, y) in x.iter_mut().zip(&y) { + *x = $fun(*x, *y); + } + U { scalars: x }.vec + } + } + } + }; + // implementation calling fun twice on each of the vector halves: + (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vech_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + halves: [$vech_id; 2], + } + + let mut x_halves = U { vec: self }.halves; + let y_halves = U { vec: y }.halves; + + *x_halves.get_unchecked_mut(0) = transmute($fun( + transmute(*x_halves.get_unchecked(0)), + transmute(*y_halves.get_unchecked(0)), + )); + *x_halves.get_unchecked_mut(1) = transmute($fun( + transmute(*x_halves.get_unchecked(1)), + transmute(*y_halves.get_unchecked(1)), + )); + + U { halves: x_halves }.vec + } + } + } + }; + // implementation calling fun four times on each of the vector quarters: + (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vecq_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::transmute; + union U { + vec: $vec_id, + quarters: [$vecq_id; 4], + } + + let mut x_quarters = U { vec: self }.quarters; + let y_quarters = U { vec: y }.quarters; + + *x_quarters.get_unchecked_mut(0) = transmute($fun( + transmute(*x_quarters.get_unchecked(0)), + transmute(*y_quarters.get_unchecked(0)), + )); + + *x_quarters.get_unchecked_mut(1) = transmute($fun( + transmute(*x_quarters.get_unchecked(1)), + transmute(*y_quarters.get_unchecked(1)), + )); + + *x_quarters.get_unchecked_mut(2) = transmute($fun( + transmute(*x_quarters.get_unchecked(2)), + transmute(*y_quarters.get_unchecked(2)), + )); + + *x_quarters.get_unchecked_mut(3) = transmute($fun( + transmute(*x_quarters.get_unchecked(3)), + transmute(*y_quarters.get_unchecked(3)), + )); + + U { quarters: x_quarters }.vec + } + } + } + }; + // implementation calling fun once on a vector twice as large: + (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, + $vect_id:ident, $fun:ident) => { + impl $trait_id for $vec_id { + #[inline] + fn $trait_method(self, y: Self) -> Self { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vec_id; 2], + twice: $vect_id, + } + + let x_twice = U { vec: [self, uninitialized()] }.twice; + let y_twice = U { vec: [y, uninitialized()] }.twice; + let twice: $vect_id = transmute($fun( + transmute(x_twice), + transmute(y_twice), + )); + + *(U { twice }.vec.get_unchecked(0)) + } + } + } + }; +} + +macro_rules! gen_binary_impl_table { + ($trait_id:ident, $trait_method:ident) => { + macro_rules! 
impl_binary { + ($vid:ident: $fun:ident) => { + impl_binary_!(vec | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { + impl_binary_!( + scalar | $trait_id, + $trait_method, + $vid, + [$sid; $sc], + $fun + ); + }; + ($vid:ident[s]: $fun:ident) => { + impl_binary_!(scalar | $trait_id, $trait_method, $vid, $fun); + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl_binary_!( + halves | $trait_id, + $trait_method, + $vid, + $vid_h, + $fun + ); + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl_binary_!( + quarter | $trait_id, + $trait_method, + $vid, + $vid_q, + $fun + ); + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl_binary_!( + twice | $trait_id, + $trait_method, + $vid, + $vid_t, + $fun + ); + }; + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/mul_add.rs b/vendor/packed_simd_2/src/codegen/math/float/mul_add.rs new file mode 100644 index 000000000..f48a57dc4 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/mul_add.rs @@ -0,0 +1,109 @@ +//! Vertical floating-point `mul_add` +#![allow(unused)] +use crate::*; + +// FIXME: 64-bit 1 element mul_add + +crate trait MulAdd { + fn mul_add(self, y: Self, z: Self) -> Self; +} + +#[cfg(not(target_arch = "s390x"))] +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.fma.v2f32"] + fn fma_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; + #[link_name = "llvm.fma.v4f32"] + fn fma_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; + #[link_name = "llvm.fma.v8f32"] + fn fma_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; + #[link_name = "llvm.fma.v16f32"] + fn fma_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; + /* FIXME 64-bit single elem vectors + #[link_name = "llvm.fma.v1f64"] + fn fma_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; + */ + #[link_name = "llvm.fma.v2f64"] + fn fma_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; + #[link_name = "llvm.fma.v4f64"] + fn fma_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; + #[link_name = "llvm.fma.v8f64"] + fn fma_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; +} + +gen_tertiary_impl_table!(MulAdd, mul_add); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + macro_rules! impl_broken { + ($id:ident) => { + impl MulAdd for $id { + #[inline] + fn mul_add(self, y: Self, z: Self) -> Self { + self * y + z + } + } + }; + } + + impl_broken!(f32x2); + impl_broken!(f32x4); + impl_broken!(f32x8); + impl_broken!(f32x16); + + impl_broken!(f64x2); + impl_broken!(f64x4); + impl_broken!(f64x8); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128); + impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2); + impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx2); + + impl_tertiary!(f32x4: Sleef_fmaf4_avx2128); + impl_tertiary!(f32x8: Sleef_fmaf8_avx2); + impl_tertiary!(f64x2: Sleef_fmad2_avx2128); + impl_tertiary!(f64x4: Sleef_fmad4_avx2); + } else if #[cfg(target_feature = "avx")] { + impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx); + impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx); + + impl_tertiary!(f32x4: Sleef_fmaf4_sse4); + impl_tertiary!(f32x8: Sleef_fmaf8_avx); + impl_tertiary!(f64x2: Sleef_fmad2_sse4); + impl_tertiary!(f64x4: Sleef_fmad4_avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f64x8[q => f64x2]: Sleef_fmad2_sse4); + + impl_tertiary!(f32x4: Sleef_fmaf4_sse4); + impl_tertiary!(f32x8[h => f32x4]: Sleef_fmaf4_sse4); + impl_tertiary!(f64x2: Sleef_fmad2_sse4); + impl_tertiary!(f64x4[h => f64x2]: Sleef_fmad2_sse4); + } else { + impl_tertiary!(f32x2: fma_v2f32); + impl_tertiary!(f32x16: fma_v16f32); + impl_tertiary!(f64x8: fma_v8f64); + + impl_tertiary!(f32x4: fma_v4f32); + impl_tertiary!(f32x8: fma_v8f32); + impl_tertiary!(f64x2: fma_v2f64); + impl_tertiary!(f64x4: fma_v4f64); + } + } + } else { + impl_tertiary!(f32x2: fma_v2f32); + impl_tertiary!(f32x4: fma_v4f32); + impl_tertiary!(f32x8: fma_v8f32); + impl_tertiary!(f32x16: fma_v16f32); + // impl_tertiary!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors + impl_tertiary!(f64x2: fma_v2f64); + impl_tertiary!(f64x4: fma_v4f64); + impl_tertiary!(f64x8: fma_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs b/vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs new file mode 100644 index 000000000..8c41fb131 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/mul_adde.rs @@ -0,0 +1,66 @@ +//! Approximation for floating-point `mul_add` +use crate::*; + +// FIXME: 64-bit 1 element mul_adde + +crate trait MulAddE { + fn mul_adde(self, y: Self, z: Self) -> Self; +} + +#[cfg(not(target_arch = "s390x"))] +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.fmuladd.v2f32"] + fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; + #[link_name = "llvm.fmuladd.v4f32"] + fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; + #[link_name = "llvm.fmuladd.v8f32"] + fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; + #[link_name = "llvm.fmuladd.v16f32"] + fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; + /* FIXME 64-bit single elem vectors + #[link_name = "llvm.fmuladd.v1f64"] + fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; + */ + #[link_name = "llvm.fmuladd.v2f64"] + fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; + #[link_name = "llvm.fmuladd.v4f64"] + fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; + #[link_name = "llvm.fmuladd.v8f64"] + fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; +} + +macro_rules! 
impl_mul_adde { + ($id:ident : $fn:ident) => { + impl MulAddE for $id { + #[inline] + fn mul_adde(self, y: Self, z: Self) -> Self { + #[cfg(not(target_arch = "s390x"))] + { + use crate::mem::transmute; + unsafe { + transmute($fn( + transmute(self), + transmute(y), + transmute(z), + )) + } + } + #[cfg(target_arch = "s390x")] + { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + self * y + z + } + } + } + }; +} + +impl_mul_adde!(f32x2: fmuladd_v2f32); +impl_mul_adde!(f32x4: fmuladd_v4f32); +impl_mul_adde!(f32x8: fmuladd_v8f32); +impl_mul_adde!(f32x16: fmuladd_v16f32); +// impl_mul_adde!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors +impl_mul_adde!(f64x2: fmuladd_v2f64); +impl_mul_adde!(f64x4: fmuladd_v4f64); +impl_mul_adde!(f64x8: fmuladd_v8f64); diff --git a/vendor/packed_simd_2/src/codegen/math/float/powf.rs b/vendor/packed_simd_2/src/codegen/math/float/powf.rs new file mode 100644 index 000000000..bc15067d7 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/powf.rs @@ -0,0 +1,112 @@ +//! Vertical floating-point `powf` +#![allow(unused)] + +// FIXME 64-bit powfgle elem vectors mispowfg + +use crate::*; + +crate trait Powf { + fn powf(self, x: Self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.pow.v2f32"] + fn powf_v2f32(x: f32x2, y: f32x2) -> f32x2; + #[link_name = "llvm.pow.v4f32"] + fn powf_v4f32(x: f32x4, y: f32x4) -> f32x4; + #[link_name = "llvm.pow.v8f32"] + fn powf_v8f32(x: f32x8, y: f32x8) -> f32x8; + #[link_name = "llvm.pow.v16f32"] + fn powf_v16f32(x: f32x16, y: f32x16) -> f32x16; + /* FIXME 64-bit powfgle elem vectors + #[link_name = "llvm.pow.v1f64"] + fn powf_v1f64(x: f64x1, y: f64x1) -> f64x1; + */ + #[link_name = "llvm.pow.v2f64"] + fn powf_v2f64(x: f64x2, y: f64x2) -> f64x2; + #[link_name = "llvm.pow.v4f64"] + fn powf_v4f64(x: f64x4, y: f64x4) -> f64x4; + #[link_name = "llvm.pow.v8f64"] + fn powf_v8f64(x: f64x8, y: f64x8) -> f64x8; + + #[link_name = "llvm.pow.f32"] + fn powf_f32(x: f32, y: f32) -> f32; + #[link_name = "llvm.pow.f64"] + fn powf_f64(x: f64, y: f64) -> f64; +} + +gen_binary_impl_table!(Powf, powf); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_binary!(f32x2[f32; 2]: powf_f32); + impl_binary!(f32x4[f32; 4]: powf_f32); + impl_binary!(f32x8[f32; 8]: powf_f32); + impl_binary!(f32x16[f32; 16]: powf_f32); + + impl_binary!(f64x2[f64; 2]: powf_f64); + impl_binary!(f64x4[f64; 4]: powf_f64); + impl_binary!(f64x8[f64; 8]: powf_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
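+ // Note on SLEEF naming: the `_u10` suffix marks a maximum error bound of
+ // 1.0 ULP, and the trailing part names the instruction set the entry point
+ // is built for (`sse2`, `sse4`, `avx`, `avx2`, or `avx2128` for 128-bit
+ // operations encoded with AVX2).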
{ + if #[cfg(target_feature = "avx2")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10avx2128); + impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx2); + impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx2); + + impl_binary!(f32x4: Sleef_powf4_u10avx2128); + impl_binary!(f32x8: Sleef_powf8_u10avx2); + impl_binary!(f64x2: Sleef_powd2_u10avx2128); + impl_binary!(f64x4: Sleef_powd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx); + impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx); + + impl_binary!(f32x4: Sleef_powf4_u10sse4); + impl_binary!(f32x8: Sleef_powf8_u10avx); + impl_binary!(f64x2: Sleef_powd2_u10sse4); + impl_binary!(f64x4: Sleef_powd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse4); + + impl_binary!(f32x4: Sleef_powf4_u10sse4); + impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse4); + impl_binary!(f64x2: Sleef_powd2_u10sse4); + impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse4); + } else if #[cfg(target_feature = "sse2")] { + impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse2); + impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse2); + impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse2); + + impl_binary!(f32x4: Sleef_powf4_u10sse2); + impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse2); + impl_binary!(f64x2: Sleef_powd2_u10sse2); + impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse2); + } else { + impl_binary!(f32x2[f32; 2]: powf_f32); + impl_binary!(f32x4: powf_v4f32); + impl_binary!(f32x8: powf_v8f32); + impl_binary!(f32x16: powf_v16f32); + + impl_binary!(f64x2: powf_v2f64); + impl_binary!(f64x4: powf_v4f64); + impl_binary!(f64x8: powf_v8f64); + } + } + } else { + impl_binary!(f32x2[f32; 2]: powf_f32); + impl_binary!(f32x4: powf_v4f32); + impl_binary!(f32x8: powf_v8f32); + impl_binary!(f32x16: powf_v16f32); + + impl_binary!(f64x2: powf_v2f64); + impl_binary!(f64x4: powf_v4f64); + impl_binary!(f64x8: powf_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sin.rs b/vendor/packed_simd_2/src/codegen/math/float/sin.rs new file mode 100644 index 000000000..7b014d07d --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sin.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `sin` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sin + +use crate::*; + +crate trait Sin { + fn sin(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.sin.v2f32"] + fn sin_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.sin.v4f32"] + fn sin_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.sin.v8f32"] + fn sin_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.sin.v16f32"] + fn sin_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit single elem vectors + #[link_name = "llvm.sin.v1f64"] + fn sin_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.sin.v2f64"] + fn sin_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.sin.v4f64"] + fn sin_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.sin.v8f64"] + fn sin_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.sin.f32"] + fn sin_f32(x: f32) -> f32; + #[link_name = "llvm.sin.f64"] + fn sin_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Sin, sin); + +cfg_if! 
{ + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: sin_f32); + impl_unary!(f32x4[f32; 4]: sin_f32); + impl_unary!(f32x8[f32; 8]: sin_f32); + impl_unary!(f32x16[f32; 16]: sin_f32); + + impl_unary!(f64x2[f64; 2]: sin_f64); + impl_unary!(f64x4[f64; 4]: sin_f64); + impl_unary!(f64x8[f64; 8]: sin_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! { + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx2); + + impl_unary!(f32x4: Sleef_sinf4_u10avx2128); + impl_unary!(f32x8: Sleef_sinf8_u10avx2); + impl_unary!(f64x2: Sleef_sind2_u10avx2128); + impl_unary!(f64x4: Sleef_sind4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx); + + impl_unary!(f32x4: Sleef_sinf4_u10sse4); + impl_unary!(f32x8: Sleef_sinf8_u10avx); + impl_unary!(f64x2: Sleef_sind2_u10sse4); + impl_unary!(f64x4: Sleef_sind4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sind2_u10sse4); + + impl_unary!(f32x4: Sleef_sinf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sinf4_u10sse4); + impl_unary!(f64x2: Sleef_sind2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sind2_u10sse4); + } else { + impl_unary!(f32x2[f32; 2]: sin_f32); + impl_unary!(f32x16: sin_v16f32); + impl_unary!(f64x8: sin_v8f64); + + impl_unary!(f32x4: sin_v4f32); + impl_unary!(f32x8: sin_v8f32); + impl_unary!(f64x2: sin_v2f64); + impl_unary!(f64x4: sin_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: sin_f32); + impl_unary!(f32x4: sin_v4f32); + impl_unary!(f32x8: sin_v8f32); + impl_unary!(f32x16: sin_v16f32); + + impl_unary!(f64x2: sin_v2f64); + impl_unary!(f64x4: sin_v4f64); + impl_unary!(f64x8: sin_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs b/vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs new file mode 100644 index 000000000..0f1249ec8 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sin_cos_pi.rs @@ -0,0 +1,195 @@ +//! Vertical floating-point `sin_cos` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sin_cos + +use crate::*; + +crate trait SinCosPi: Sized { + type Output; + fn sin_cos_pi(self) -> Self::Output; +} + +macro_rules! impl_def { + ($vid:ident, $PI:path) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + #[inline] + fn sin_cos_pi(self) -> Self::Output { + let v = self * Self::splat($PI); + (v.sin(), v.cos()) + } + } + }; +} + +macro_rules! impl_def32 { + ($vid:ident) => { + impl_def!($vid, crate::f32::consts::PI); + }; +} +macro_rules! impl_def64 { + ($vid:ident) => { + impl_def!($vid, crate::f64::consts::PI); + }; +} + +macro_rules! 
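+// Note: `impl_unary_t` adapts SLEEF's `sincospi` entry points, which return a
+// (sin, cos) pair, to vectors of a different width: the `[t => ...]` arm pads
+// the input into a wider call and keeps the low lanes of both results, while
+// the `[h => ...]` and `[q => ...]` arms run the call per half or quarter and
+// stitch the partial results back together through unions.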
impl_unary_t { + ($vid:ident: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::transmute; + transmute($fun(transmute(self))) + } + } + } + }; + ($vid:ident[t => $vid_t:ident]: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::{transmute, uninitialized}; + + union U { + vec: [$vid; 2], + twice: $vid_t, + } + + let twice = U { vec: [self, uninitialized()] }.twice; + let twice = transmute($fun(transmute(twice))); + + union R { + twice: ($vid_t, $vid_t), + vecs: ([$vid; 2], [$vid; 2]), + } + let r = R { twice }.vecs; + (*r.0.get_unchecked(0), *r.0.get_unchecked(1)) + } + } + } + }; + ($vid:ident[h => $vid_h:ident]: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::transmute; + + union U { + vec: $vid, + halves: [$vid_h; 2], + } + + let halves = U { vec: self }.halves; + + let res_0: ($vid_h, $vid_h) = + transmute($fun(transmute(*halves.get_unchecked(0)))); + let res_1: ($vid_h, $vid_h) = + transmute($fun(transmute(*halves.get_unchecked(1)))); + + union R { + result: ($vid, $vid), + halves: ([$vid_h; 2], [$vid_h; 2]), + } + R { halves: ([res_0.0, res_1.0], [res_0.1, res_1.1]) } + .result + } + } + } + }; + ($vid:ident[q => $vid_q:ident]: $fun:ident) => { + impl SinCosPi for $vid { + type Output = (Self, Self); + fn sin_cos_pi(self) -> Self::Output { + unsafe { + use crate::mem::transmute; + + union U { + vec: $vid, + quarters: [$vid_q; 4], + } + + let quarters = U { vec: self }.quarters; + + let res_0: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(0)))); + let res_1: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(1)))); + let res_2: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(2)))); + let res_3: ($vid_q, $vid_q) = + transmute($fun(transmute(*quarters.get_unchecked(3)))); + + union R { + result: ($vid, $vid), + quarters: ([$vid_q; 4], [$vid_q; 4]), + } + R { + quarters: ( + [res_0.0, res_1.0, res_2.0, res_3.0], + [res_0.1, res_1.1, res_2.1, res_3.1], + ), + } + .result + } + } + } + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05avx2128); + impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx2); + impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx2); + + impl_unary_t!(f32x4: Sleef_sincospif4_u05avx2128); + impl_unary_t!(f32x8: Sleef_sincospif8_u05avx2); + impl_unary_t!(f64x2: Sleef_sincospid2_u05avx2128); + impl_unary_t!(f64x4: Sleef_sincospid4_u05avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx); + impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx); + + impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x8: Sleef_sincospif8_u05avx); + impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); + impl_unary_t!(f64x4: Sleef_sincospid4_u05avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x16[q => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f64x8[q => f64x2]: Sleef_sincospid2_u05sse4); + + impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); + impl_unary_t!(f32x8[h => f32x4]: Sleef_sincospif4_u05sse4); + impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); + impl_unary_t!(f64x4[h => f64x2]: Sleef_sincospid2_u05sse4); + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } + } + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs b/vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs new file mode 100644 index 000000000..72df98c93 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sin_pi.rs @@ -0,0 +1,87 @@ +//! Vertical floating-point `sin_pi` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sin_pi + +use crate::*; + +crate trait SinPi { + fn sin_pi(self) -> Self; +} + +gen_unary_impl_table!(SinPi, sin_pi); + +macro_rules! impl_def { + ($vid:ident, $PI:path) => { + impl SinPi for $vid { + #[inline] + fn sin_pi(self) -> Self { + (self * Self::splat($PI)).sin() + } + } + }; +} +macro_rules! impl_def32 { + ($vid:ident) => { + impl_def!($vid, crate::f32::consts::PI); + }; +} +macro_rules! impl_def64 { + ($vid:ident) => { + impl_def!($vid, crate::f64::consts::PI); + }; +} + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx2); + + impl_unary!(f32x4: Sleef_sinpif4_u05avx2128); + impl_unary!(f32x8: Sleef_sinpif8_u05avx2); + impl_unary!(f64x2: Sleef_sinpid2_u05avx2128); + impl_unary!(f64x4: Sleef_sinpid4_u05avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx); + + impl_unary!(f32x4: Sleef_sinpif4_u05sse4); + impl_unary!(f32x8: Sleef_sinpif8_u05avx); + impl_unary!(f64x2: Sleef_sinpid2_u05sse4); + impl_unary!(f64x4: Sleef_sinpid4_u05avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sinpid2_u05sse4); + + impl_unary!(f32x4: Sleef_sinpif4_u05sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sinpif4_u05sse4); + impl_unary!(f64x2: Sleef_sinpid2_u05sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sinpid2_u05sse4); + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } + } + } else { + impl_def32!(f32x2); + impl_def32!(f32x4); + impl_def32!(f32x8); + impl_def32!(f32x16); + + impl_def64!(f64x2); + impl_def64!(f64x4); + impl_def64!(f64x8); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sqrt.rs b/vendor/packed_simd_2/src/codegen/math/float/sqrt.rs new file mode 100644 index 000000000..7ce31df62 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sqrt.rs @@ -0,0 +1,103 @@ +//! Vertical floating-point `sqrt` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sqrt + +use crate::*; + +crate trait Sqrt { + fn sqrt(self) -> Self; +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.sqrt.v2f32"] + fn sqrt_v2f32(x: f32x2) -> f32x2; + #[link_name = "llvm.sqrt.v4f32"] + fn sqrt_v4f32(x: f32x4) -> f32x4; + #[link_name = "llvm.sqrt.v8f32"] + fn sqrt_v8f32(x: f32x8) -> f32x8; + #[link_name = "llvm.sqrt.v16f32"] + fn sqrt_v16f32(x: f32x16) -> f32x16; + /* FIXME 64-bit sqrtgle elem vectors + #[link_name = "llvm.sqrt.v1f64"] + fn sqrt_v1f64(x: f64x1) -> f64x1; + */ + #[link_name = "llvm.sqrt.v2f64"] + fn sqrt_v2f64(x: f64x2) -> f64x2; + #[link_name = "llvm.sqrt.v4f64"] + fn sqrt_v4f64(x: f64x4) -> f64x4; + #[link_name = "llvm.sqrt.v8f64"] + fn sqrt_v8f64(x: f64x8) -> f64x8; + + #[link_name = "llvm.sqrt.f32"] + fn sqrt_f32(x: f32) -> f32; + #[link_name = "llvm.sqrt.f64"] + fn sqrt_f64(x: f64) -> f64; +} + +gen_unary_impl_table!(Sqrt, sqrt); + +cfg_if! { + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: sqrt_f32); + impl_unary!(f32x4[f32; 4]: sqrt_f32); + impl_unary!(f32x8[f32; 8]: sqrt_f32); + impl_unary!(f32x16[f32; 16]: sqrt_f32); + + impl_unary!(f64x2[f64; 2]: sqrt_f64); + impl_unary!(f64x4[f64; 4]: sqrt_f64); + impl_unary!(f64x8[f64; 8]: sqrt_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx2); + + impl_unary!(f32x4: Sleef_sqrtf4_avx2128); + impl_unary!(f32x8: Sleef_sqrtf8_avx2); + impl_unary!(f64x2: Sleef_sqrtd2_avx2128); + impl_unary!(f64x4: Sleef_sqrtd4_avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx); + + impl_unary!(f32x4: Sleef_sqrtf4_sse4); + impl_unary!(f32x8: Sleef_sqrtf8_avx); + impl_unary!(f64x2: Sleef_sqrtd2_sse4); + impl_unary!(f64x4: Sleef_sqrtd4_avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_sse4); + + impl_unary!(f32x4: Sleef_sqrtf4_sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_sse4); + impl_unary!(f64x2: Sleef_sqrtd2_sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_sse4); + } else { + impl_unary!(f32x2[f32; 2]: sqrt_f32); + impl_unary!(f32x16: sqrt_v16f32); + impl_unary!(f64x8: sqrt_v8f64); + + impl_unary!(f32x4: sqrt_v4f32); + impl_unary!(f32x8: sqrt_v8f32); + impl_unary!(f64x2: sqrt_v2f64); + impl_unary!(f64x4: sqrt_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: sqrt_f32); + impl_unary!(f32x4: sqrt_v4f32); + impl_unary!(f32x8: sqrt_v8f32); + impl_unary!(f32x16: sqrt_v16f32); + + impl_unary!(f64x2: sqrt_v2f64); + impl_unary!(f64x4: sqrt_v4f64); + impl_unary!(f64x8: sqrt_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/sqrte.rs b/vendor/packed_simd_2/src/codegen/math/float/sqrte.rs new file mode 100644 index 000000000..c1e379c34 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/sqrte.rs @@ -0,0 +1,67 @@ +//! Vertical floating-point `sqrt` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors sqrte + +use crate::llvm::simd_fsqrt; +use crate::*; + +crate trait Sqrte { + fn sqrte(self) -> Self; +} + +gen_unary_impl_table!(Sqrte, sqrte); + +cfg_if! { + if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! 
{ + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx2); + + impl_unary!(f32x4: Sleef_sqrtf4_u35avx2128); + impl_unary!(f32x8: Sleef_sqrtf8_u35avx2); + impl_unary!(f64x2: Sleef_sqrtd2_u35avx2128); + impl_unary!(f64x4: Sleef_sqrtd4_u35avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx); + impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx); + + impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x8: Sleef_sqrtf8_u35avx); + impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); + impl_unary!(f64x4: Sleef_sqrtd4_u35avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_u35sse4); + + impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_u35sse4); + impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_u35sse4); + } else { + impl_unary!(f32x2[g]: simd_fsqrt); + impl_unary!(f32x16[g]: simd_fsqrt); + impl_unary!(f64x8[g]: simd_fsqrt); + + impl_unary!(f32x4[g]: simd_fsqrt); + impl_unary!(f32x8[g]: simd_fsqrt); + impl_unary!(f64x2[g]: simd_fsqrt); + impl_unary!(f64x4[g]: simd_fsqrt); + } + } + } else { + impl_unary!(f32x2[g]: simd_fsqrt); + impl_unary!(f32x4[g]: simd_fsqrt); + impl_unary!(f32x8[g]: simd_fsqrt); + impl_unary!(f32x16[g]: simd_fsqrt); + + impl_unary!(f64x2[g]: simd_fsqrt); + impl_unary!(f64x4[g]: simd_fsqrt); + impl_unary!(f64x8[g]: simd_fsqrt); + } +} diff --git a/vendor/packed_simd_2/src/codegen/math/float/tanh.rs b/vendor/packed_simd_2/src/codegen/math/float/tanh.rs new file mode 100644 index 000000000..5220c7d10 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/math/float/tanh.rs @@ -0,0 +1,117 @@ +//! Vertical floating-point `tanh` +#![allow(unused)] + +// FIXME 64-bit 1 elem vectors tanh + +use crate::*; + +crate trait Tanh { + fn tanh(self) -> Self; +} + +macro_rules! define_tanh { + + ($name:ident, $basetype:ty, $simdtype:ty, $lanes:expr, $trait:path) => { + fn $name(x: $simdtype) -> $simdtype { + use core::intrinsics::transmute; + let mut buf: [$basetype; $lanes] = unsafe { transmute(x) }; + for elem in &mut buf { + *elem = <$basetype as $trait>::tanh(*elem); + } + unsafe { transmute(buf) } + } + }; + + (f32 => $name:ident, $type:ty, $lanes:expr) => { + define_tanh!($name, f32, $type, $lanes, libm::F32Ext); + }; + + (f64 => $name:ident, $type:ty, $lanes:expr) => { + define_tanh!($name, f64, $type, $lanes, libm::F64Ext); + }; +} + +// llvm does not seem to expose the hyperbolic versions of trigonometric functions; +// we thus call the classical rust versions on all of them (which stem from cmath). +define_tanh!(f32 => tanh_v2f32, f32x2, 2); +define_tanh!(f32 => tanh_v4f32, f32x4, 4); +define_tanh!(f32 => tanh_v8f32, f32x8, 8); +define_tanh!(f32 => tanh_v16f32, f32x16, 16); + +define_tanh!(f64 => tanh_v2f64, f64x2, 2); +define_tanh!(f64 => tanh_v4f64, f64x4, 4); +define_tanh!(f64 => tanh_v8f64, f64x8, 8); + +fn tanh_f32(x: f32) -> f32 { + libm::F32Ext::tanh(x) +} + +fn tanh_f64(x: f64) -> f64 { + libm::F64Ext::tanh(x) +} + +gen_unary_impl_table!(Tanh, tanh); + +cfg_if! 
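+// Note: `define_tanh!(f32 => tanh_v2f32, f32x2, 2)` above expands to roughly
+// the following per-lane loop; the transmutes are sound because the vector and
+// the corresponding `[element; lanes]` array have the same size:
+//
+//     fn tanh_v2f32(x: f32x2) -> f32x2 {
+//         let mut buf: [f32; 2] = unsafe { transmute(x) };
+//         for elem in &mut buf { *elem = libm::F32Ext::tanh(*elem); }
+//         unsafe { transmute(buf) }
+//     }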
{ + if #[cfg(target_arch = "s390x")] { + // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 + impl_unary!(f32x2[f32; 2]: tanh_f32); + impl_unary!(f32x4[f32; 4]: tanh_f32); + impl_unary!(f32x8[f32; 8]: tanh_f32); + impl_unary!(f32x16[f32; 16]: tanh_f32); + + impl_unary!(f64x2[f64; 2]: tanh_f64); + impl_unary!(f64x4[f64; 4]: tanh_f64); + impl_unary!(f64x8[f64; 8]: tanh_f64); + } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { + use sleef_sys::*; + cfg_if! { + if #[cfg(target_feature = "avx2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10avx2128); + impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx2); + impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx2); + + impl_unary!(f32x4: Sleef_tanhf4_u10avx2128); + impl_unary!(f32x8: Sleef_tanhf8_u10avx2); + impl_unary!(f64x2: Sleef_tanhd2_u10avx2128); + impl_unary!(f64x4: Sleef_tanhd4_u10avx2); + } else if #[cfg(target_feature = "avx")] { + impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx); + impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx); + + impl_unary!(f32x4: Sleef_tanhf4_u10sse4); + impl_unary!(f32x8: Sleef_tanhf8_u10avx); + impl_unary!(f64x2: Sleef_tanhd2_u10sse4); + impl_unary!(f64x4: Sleef_tanhd4_u10avx); + } else if #[cfg(target_feature = "sse4.2")] { + impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f32x16[q => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f64x8[q => f64x2]: Sleef_tanhd2_u10sse4); + + impl_unary!(f32x4: Sleef_tanhf4_u10sse4); + impl_unary!(f32x8[h => f32x4]: Sleef_tanhf4_u10sse4); + impl_unary!(f64x2: Sleef_tanhd2_u10sse4); + impl_unary!(f64x4[h => f64x2]: Sleef_tanhd2_u10sse4); + } else { + impl_unary!(f32x2[f32; 2]: tanh_f32); + impl_unary!(f32x16: tanh_v16f32); + impl_unary!(f64x8: tanh_v8f64); + + impl_unary!(f32x4: tanh_v4f32); + impl_unary!(f32x8: tanh_v8f32); + impl_unary!(f64x2: tanh_v2f64); + impl_unary!(f64x4: tanh_v4f64); + } + } + } else { + impl_unary!(f32x2[f32; 2]: tanh_f32); + impl_unary!(f32x4: tanh_v4f32); + impl_unary!(f32x8: tanh_v8f32); + impl_unary!(f32x16: tanh_v16f32); + + impl_unary!(f64x2: tanh_v2f64); + impl_unary!(f64x4: tanh_v4f64); + impl_unary!(f64x8: tanh_v8f64); + } +} diff --git a/vendor/packed_simd_2/src/codegen/pointer_sized_int.rs b/vendor/packed_simd_2/src/codegen/pointer_sized_int.rs new file mode 100644 index 000000000..39f493d3b --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/pointer_sized_int.rs @@ -0,0 +1,28 @@ +//! Provides `isize` and `usize` + +use cfg_if::cfg_if; + +cfg_if! 
{ + if #[cfg(target_pointer_width = "8")] { + crate type isize_ = i8; + crate type usize_ = u8; + } else if #[cfg(target_pointer_width = "16")] { + crate type isize_ = i16; + crate type usize_ = u16; + } else if #[cfg(target_pointer_width = "32")] { + crate type isize_ = i32; + crate type usize_ = u32; + + } else if #[cfg(target_pointer_width = "64")] { + crate type isize_ = i64; + crate type usize_ = u64; + } else if #[cfg(target_pointer_width = "64")] { + crate type isize_ = i64; + crate type usize_ = u64; + } else if #[cfg(target_pointer_width = "128")] { + crate type isize_ = i128; + crate type usize_ = u128; + } else { + compile_error!("unsupported target_pointer_width"); + } +} diff --git a/vendor/packed_simd_2/src/codegen/reductions.rs b/vendor/packed_simd_2/src/codegen/reductions.rs new file mode 100644 index 000000000..7be4f5fab --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions.rs @@ -0,0 +1 @@ +crate mod mask; diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask.rs b/vendor/packed_simd_2/src/codegen/reductions/mask.rs new file mode 100644 index 000000000..97260c6d4 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask.rs @@ -0,0 +1,69 @@ +//! Code generation workaround for `all()` mask horizontal reduction. +//! +//! Works arround [LLVM bug 36702]. +//! +//! [LLVM bug 36702]: https://bugs.llvm.org/show_bug.cgi?id=36702 +#![allow(unused_macros)] + +use crate::*; + +crate trait All: crate::marker::Sized { + unsafe fn all(self) -> bool; +} + +crate trait Any: crate::marker::Sized { + unsafe fn any(self) -> bool; +} + +#[macro_use] +mod fallback_impl; + +cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[macro_use] + mod x86; + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", + target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + #[macro_use] + mod arm; + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + #[macro_use] + mod aarch64; + } else { + #[macro_use] + mod fallback; + } +} + +impl_mask_reductions!(m8x2); +impl_mask_reductions!(m8x4); +impl_mask_reductions!(m8x8); +impl_mask_reductions!(m8x16); +impl_mask_reductions!(m8x32); +impl_mask_reductions!(m8x64); + +impl_mask_reductions!(m16x2); +impl_mask_reductions!(m16x4); +impl_mask_reductions!(m16x8); +impl_mask_reductions!(m16x16); +impl_mask_reductions!(m16x32); + +impl_mask_reductions!(m32x2); +impl_mask_reductions!(m32x4); +impl_mask_reductions!(m32x8); +impl_mask_reductions!(m32x16); + +// FIXME: 64-bit single element vector +// impl_mask_reductions!(m64x1); +impl_mask_reductions!(m64x2); +impl_mask_reductions!(m64x4); +impl_mask_reductions!(m64x8); + +impl_mask_reductions!(m128x1); +impl_mask_reductions!(m128x2); +impl_mask_reductions!(m128x4); + +impl_mask_reductions!(msizex2); +impl_mask_reductions!(msizex4); +impl_mask_reductions!(msizex8); diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs new file mode 100644 index 000000000..e9586eace --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/aarch64.rs @@ -0,0 +1,71 @@ +//! Mask reductions implementation for `aarch64` targets + +/// 128-bit wide vectors +macro_rules! 
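+// Note: every mask lane is either all-zeros or all-ones, so a horizontal
+// unsigned minimum (`vminvq_*`) is non-zero exactly when all lanes are set,
+// and a horizontal maximum (`vmaxvq_*`) is non-zero exactly when at least one
+// lane is set; that is what makes the reductions below correct.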
aarch64_128_neon_impl { + ($id:ident, $vmin:ident, $vmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + use crate::arch::aarch64::$vmin; + $vmin(crate::mem::transmute(self)) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + use crate::arch::aarch64::$vmax; + $vmax(crate::mem::transmute(self)) != 0 + } + } + } +} + +/// 64-bit wide vectors +macro_rules! aarch64_64_neon_impl { + ($id:ident, $vec128:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + // Duplicates the 64-bit vector into a 128-bit one and + // calls all on that. + union U { + halves: ($id, $id), + vec: $vec128, + } + U { + halves: (self, self), + }.vec.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + union U { + halves: ($id, $id), + vec: $vec128, + } + U { + halves: (self, self), + }.vec.any() + } + } + }; +} + +/// Mask reduction implementation for `aarch64` targets +macro_rules! impl_mask_reductions { + // 64-bit wide masks + (m8x8) => { aarch64_64_neon_impl!(m8x8, m8x16); }; + (m16x4) => { aarch64_64_neon_impl!(m16x4, m16x8); }; + (m32x2) => { aarch64_64_neon_impl!(m32x2, m32x4); }; + // 128-bit wide masks + (m8x16) => { aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8); }; + (m16x8) => { aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16); }; + (m32x4) => { aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32); }; + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs new file mode 100644 index 000000000..1987af7a9 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/arm.rs @@ -0,0 +1,54 @@ +//! Mask reductions implementation for `arm` targets + +/// Implementation for ARM + v7 + NEON for 64-bit or 128-bit wide vectors with +/// more than two elements. +macro_rules! arm_128_v7_neon_impl { + ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn all(self) -> bool { + use crate::arch::arm::$vpmin; + use crate::mem::transmute; + union U { + halves: ($half, $half), + vec: $id, + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmin( + transmute(halves.0), + transmute(halves.1), + )); + h.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn any(self) -> bool { + use crate::arch::arm::$vpmax; + use crate::mem::transmute; + union U { + halves: ($half, $half), + vec: $id, + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmax( + transmute(halves.0), + transmute(halves.1), + )); + h.any() + } + } + }; +} + +/// Mask reduction implementation for `arm` targets +macro_rules! 
impl_mask_reductions { + // 128-bit wide masks + (m8x16) => { arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8); }; + (m16x8) => { arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16); }; + (m32x4) => { arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32); }; + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs new file mode 100644 index 000000000..25e5c813a --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback.rs @@ -0,0 +1,6 @@ +//! Default mask reduction implementations. + +/// Default mask reduction implementation +macro_rules! impl_mask_reductions { + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs new file mode 100644 index 000000000..0d246e2fd --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/fallback_impl.rs @@ -0,0 +1,237 @@ +//! Default implementation of a mask reduction for any target. + +macro_rules! fallback_to_other_impl { + ($id:ident, $other:ident) => { + impl All for $id { + #[inline] + unsafe fn all(self) -> bool { + let m: $other = crate::mem::transmute(self); + m.all() + } + } + impl Any for $id { + #[inline] + unsafe fn any(self) -> bool { + let m: $other = crate::mem::transmute(self); + m.any() + } + } + }; +} + +/// Fallback implementation. +macro_rules! fallback_impl { + // 16-bit wide masks: + (m8x2) => { + impl All for m8x2 { + #[inline] + unsafe fn all(self) -> bool { + let i: u16 = crate::mem::transmute(self); + i == u16::max_value() + } + } + impl Any for m8x2 { + #[inline] + unsafe fn any(self) -> bool { + let i: u16 = crate::mem::transmute(self); + i != 0 + } + } + }; + // 32-bit wide masks + (m8x4) => { + impl All for m8x4 { + #[inline] + unsafe fn all(self) -> bool { + let i: u32 = crate::mem::transmute(self); + i == u32::max_value() + } + } + impl Any for m8x4 { + #[inline] + unsafe fn any(self) -> bool { + let i: u32 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x2) => { + fallback_to_other_impl!(m16x2, m8x4); + }; + // 64-bit wide masks: + (m8x8) => { + impl All for m8x8 { + #[inline] + unsafe fn all(self) -> bool { + let i: u64 = crate::mem::transmute(self); + i == u64::max_value() + } + } + impl Any for m8x8 { + #[inline] + unsafe fn any(self) -> bool { + let i: u64 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x4) => { + fallback_to_other_impl!(m16x4, m8x8); + }; + (m32x2) => { + fallback_to_other_impl!(m32x2, m16x4); + }; + // FIXME: 64x1 maxk + // 128-bit wide masks: + (m8x16) => { + impl All for m8x16 { + #[inline] + unsafe fn all(self) -> bool { + let i: u128 = crate::mem::transmute(self); + i == u128::max_value() + } + } + impl Any for m8x16 { + #[inline] + unsafe fn any(self) -> bool { + let i: u128 = crate::mem::transmute(self); + i != 0 + } + } + }; + (m16x8) => { + fallback_to_other_impl!(m16x8, m8x16); + }; + (m32x4) => { + fallback_to_other_impl!(m32x4, m16x8); + }; + (m64x2) => { + fallback_to_other_impl!(m64x2, m32x4); + }; + (m128x1) => { + fallback_to_other_impl!(m128x1, m64x2); + }; + // 256-bit wide masks + (m8x32) => { + impl All for m8x32 { + #[inline] + unsafe fn all(self) -> bool { + let i: [u128; 2] = crate::mem::transmute(self); + let o: [u128; 2] = [u128::max_value(); 2]; + i == o + } + } + impl Any for m8x32 { + #[inline] + 
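+ // As in `all` above, the 256-bit mask is viewed as `[u128; 2]`: it is
+ // all-true exactly when its bit pattern is all ones, and "any" holds
+ // exactly when the pattern is non-zero, so a plain array comparison
+ // suffices.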
unsafe fn any(self) -> bool { + let i: [u128; 2] = crate::mem::transmute(self); + let o: [u128; 2] = [0; 2]; + i != o + } + } + }; + (m16x16) => { + fallback_to_other_impl!(m16x16, m8x32); + }; + (m32x8) => { + fallback_to_other_impl!(m32x8, m16x16); + }; + (m64x4) => { + fallback_to_other_impl!(m64x4, m32x8); + }; + (m128x2) => { + fallback_to_other_impl!(m128x2, m64x4); + }; + // 512-bit wide masks + (m8x64) => { + impl All for m8x64 { + #[inline] + unsafe fn all(self) -> bool { + let i: [u128; 4] = crate::mem::transmute(self); + let o: [u128; 4] = [u128::max_value(); 4]; + i == o + } + } + impl Any for m8x64 { + #[inline] + unsafe fn any(self) -> bool { + let i: [u128; 4] = crate::mem::transmute(self); + let o: [u128; 4] = [0; 4]; + i != o + } + } + }; + (m16x32) => { + fallback_to_other_impl!(m16x32, m8x64); + }; + (m32x16) => { + fallback_to_other_impl!(m32x16, m16x32); + }; + (m64x8) => { + fallback_to_other_impl!(m64x8, m32x16); + }; + (m128x4) => { + fallback_to_other_impl!(m128x4, m64x8); + }; + // Masks with pointer-sized elements64 + (msizex2) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex2, m64x2); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex2, m32x2); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex4) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex4, m64x4); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex4, m32x4); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex8) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex8, m64x8); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex8, m32x8); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; +} + +macro_rules! recurse_half { + ($vid:ident, $vid_h:ident) => { + impl All for $vid { + #[inline] + unsafe fn all(self) -> bool { + union U { + halves: ($vid_h, $vid_h), + vec: $vid, + } + let halves = U { vec: self }.halves; + halves.0.all() && halves.1.all() + } + } + impl Any for $vid { + #[inline] + unsafe fn any(self) -> bool { + union U { + halves: ($vid_h, $vid_h), + vec: $vid, + } + let halves = U { vec: self }.halves; + halves.0.any() || halves.1.any() + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs new file mode 100644 index 000000000..bcfb1a6e1 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86.rs @@ -0,0 +1,188 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets + +#[cfg(target_feature = "sse")] +#[macro_use] +mod sse; + +#[cfg(target_feature = "sse2")] +#[macro_use] +mod sse2; + +#[cfg(target_feature = "avx")] +#[macro_use] +mod avx; + +#[cfg(target_feature = "avx2")] +#[macro_use] +mod avx2; + +/// x86 64-bit m8x8 implementation +macro_rules! x86_m8x8_impl { + ($id:ident) => { + fallback_impl!($id); + }; +} + +/// x86 128-bit m8x16 implementation +macro_rules! x86_m8x16_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse2")] { + x86_m8x16_sse2_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 128-bit m32x4 implementation +macro_rules! x86_m32x4_impl { + ($id:ident) => { + cfg_if! 
{ + if #[cfg(target_feature = "sse")] { + x86_m32x4_sse_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 128-bit m64x2 implementation +macro_rules! x86_m64x2_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse2")] { + x86_m64x2_sse2_impl!($id); + } else if #[cfg(target_feature = "sse")] { + x86_m32x4_sse_impl!($id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m8x32 implementation +macro_rules! x86_m8x32_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx2")] { + x86_m8x32_avx2_impl!($id); + } else if #[cfg(target_feature = "avx")] { + x86_m8x32_avx_impl!($id); + } else if #[cfg(target_feature = "sse2")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m32x8 implementation +macro_rules! x86_m32x8_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx")] { + x86_m32x8_avx_impl!($id); + } else if #[cfg(target_feature = "sse")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// x86 256-bit m64x4 implementation +macro_rules! x86_m64x4_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx")] { + x86_m64x4_avx_impl!($id); + } else if #[cfg(target_feature = "sse")] { + recurse_half!($id, $half_id); + } else { + fallback_impl!($id); + } + } + }; +} + +/// Fallback implementation. +macro_rules! x86_intr_impl { + ($id:ident) => { + impl All for $id { + #[inline] + unsafe fn all(self) -> bool { + use crate::llvm::simd_reduce_all; + simd_reduce_all(self.0) + } + } + impl Any for $id { + #[inline] + unsafe fn any(self) -> bool { + use crate::llvm::simd_reduce_any; + simd_reduce_any(self.0) + } + } + }; +} + +/// Mask reduction implementation for `x86` and `x86_64` targets +macro_rules! impl_mask_reductions { + // 64-bit wide masks + (m8x8) => { x86_m8x8_impl!(m8x8); }; + (m16x4) => { x86_m8x8_impl!(m16x4); }; + (m32x2) => { x86_m8x8_impl!(m32x2); }; + // 128-bit wide masks + (m8x16) => { x86_m8x16_impl!(m8x16); }; + (m16x8) => { x86_m8x16_impl!(m16x8); }; + (m32x4) => { x86_m32x4_impl!(m32x4); }; + (m64x2) => { x86_m64x2_impl!(m64x2); }; + (m128x1) => { x86_intr_impl!(m128x1); }; + // 256-bit wide masks: + (m8x32) => { x86_m8x32_impl!(m8x32, m8x16); }; + (m16x16) => { x86_m8x32_impl!(m16x16, m16x8); }; + (m32x8) => { x86_m32x8_impl!(m32x8, m32x4); }; + (m64x4) => { x86_m64x4_impl!(m64x4, m64x2); }; + (m128x2) => { x86_intr_impl!(m128x2); }; + (msizex2) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex2, m64x2); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex2, m32x2); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex4) => { + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex4, m64x4); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex4, m32x4); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + (msizex8) => { + cfg_if! 
{ + if #[cfg(target_pointer_width = "64")] { + fallback_to_other_impl!(msizex8, m64x8); + } else if #[cfg(target_pointer_width = "32")] { + fallback_to_other_impl!(msizex8, m32x8); + } else { + compile_error!("unsupported target_pointer_width"); + } + } + }; + + // Fallback to LLVM's default code-generation: + ($id:ident) => { fallback_impl!($id); }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs new file mode 100644 index 000000000..d18736fb0 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx.rs @@ -0,0 +1,101 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX` + +/// `x86`/`x86_64` 256-bit `AVX` implementation +/// FIXME: it might be faster here to do two `_mm_movmask_epi8` +#[cfg(target_feature = "avx")] +macro_rules! x86_m8x32_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_testc_si256; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_testc_si256; + _mm256_testc_si256( + crate::mem::transmute(self), + crate::mem::transmute($id::splat(true)), + ) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_testz_si256; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_testz_si256; + _mm256_testz_si256( + crate::mem::transmute(self), + crate::mem::transmute(self), + ) == 0 + } + } + }; +} + +/// `x86`/`x86_64` 256-bit m32x8 `AVX` implementation +macro_rules! x86_m32x8_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_ps; + // _mm256_movemask_ps(a) creates a 8bit mask containing the + // most significant bit of each lane of `a`. If all bits are + // set, then all 8 lanes of the mask are true. + _mm256_movemask_ps(crate::mem::transmute(self)) == 0b_1111_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_ps; + + _mm256_movemask_ps(crate::mem::transmute(self)) != 0 + } + } + }; +} + +/// `x86`/`x86_64` 256-bit m64x4 `AVX` implementation +macro_rules! x86_m64x4_avx_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_pd; + // _mm256_movemask_pd(a) creates a 4bit mask containing the + // most significant bit of each lane of `a`. If all bits are + // set, then all 4 lanes of the mask are true. 
+ _mm256_movemask_pd(crate::mem::transmute(self)) == 0b_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_pd; + + _mm256_movemask_pd(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs new file mode 100644 index 000000000..d37d02342 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/avx2.rs @@ -0,0 +1,35 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX2`. +#![allow(unused)] + +/// x86/x86_64 256-bit m8x32 AVX2 implementation +macro_rules! x86_m8x32_avx2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_epi8; + // _mm256_movemask_epi8(a) creates a 32bit mask containing the + // most significant bit of each byte of `a`. If all + // bits are set, then all 32 lanes of the mask are + // true. + _mm256_movemask_epi8(crate::mem::transmute(self)) == -1_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm256_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm256_movemask_epi8; + + _mm256_movemask_epi8(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs new file mode 100644 index 000000000..eb1ef7fac --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse.rs @@ -0,0 +1,36 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE`. +#![allow(unused)] + +/// `x86`/`x86_64` 128-bit `m32x4` `SSE` implementation +macro_rules! x86_m32x4_sse_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_ps; + // _mm_movemask_ps(a) creates a 4bit mask containing the + // most significant bit of each lane of `a`. If all + // bits are set, then all 4 lanes of the mask are + // true. + _mm_movemask_ps(crate::mem::transmute(self)) + == 0b_1111_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_ps; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_ps; + + _mm_movemask_ps(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs new file mode 100644 index 000000000..a99c606f5 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/reductions/mask/x86/sse2.rs @@ -0,0 +1,70 @@ +//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE2`. +#![allow(unused)] + +/// `x86`/`x86_64` 128-bit m64x2 `SSE2` implementation +macro_rules! 
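+// Note: both SSE2 implementations below rely on `movemask`, which packs the
+// most significant bit of every lane into the low bits of a scalar; `all`
+// then compares that scalar against the fully-set pattern and `any` against
+// zero.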
x86_m64x2_sse2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_pd; + // _mm_movemask_pd(a) creates a 2bit mask containing the + // most significant bit of each lane of `a`. If all + // bits are set, then all 2 lanes of the mask are + // true. + _mm_movemask_pd(crate::mem::transmute(self)) + == 0b_11_i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_pd; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_pd; + + _mm_movemask_pd(crate::mem::transmute(self)) != 0 + } + } + }; +} + +/// `x86`/`x86_64` 128-bit m8x16 `SSE2` implementation +macro_rules! x86_m8x16_sse2_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_epi8; + // _mm_movemask_epi8(a) creates a 16bit mask containing the + // most significant bit of each byte of `a`. If all + // bits are set, then all 16 lanes of the mask are + // true. + _mm_movemask_epi8(crate::mem::transmute(self)) + == i32::from(u16::max_value()) + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_movemask_epi8; + + _mm_movemask_epi8(crate::mem::transmute(self)) != 0 + } + } + }; +} diff --git a/vendor/packed_simd_2/src/codegen/shuffle.rs b/vendor/packed_simd_2/src/codegen/shuffle.rs new file mode 100644 index 000000000..d92c9ee22 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/shuffle.rs @@ -0,0 +1,150 @@ +//! Implementations of the `ShuffleResult` trait for the different numbers of +//! lanes and vector element types. + +use crate::masks::*; +use crate::sealed::{Shuffle, Seal}; + +macro_rules! impl_shuffle { + ($array:ty, $base:ty, $out:ty) => { + impl Seal<$array> for $base {} + impl Shuffle<$array> for $base { + type Output = $out; + } + } +} + +impl_shuffle! { [u32; 2], i8, crate::codegen::i8x2 } +impl_shuffle! { [u32; 4], i8, crate::codegen::i8x4 } +impl_shuffle! { [u32; 8], i8, crate::codegen::i8x8 } +impl_shuffle! { [u32; 16], i8, crate::codegen::i8x16 } +impl_shuffle! { [u32; 32], i8, crate::codegen::i8x32 } +impl_shuffle! { [u32; 64], i8, crate::codegen::i8x64 } + +impl_shuffle! { [u32; 2], u8, crate::codegen::u8x2 } +impl_shuffle! { [u32; 4], u8, crate::codegen::u8x4 } +impl_shuffle! { [u32; 8], u8, crate::codegen::u8x8 } +impl_shuffle! { [u32; 16], u8, crate::codegen::u8x16 } +impl_shuffle! { [u32; 32], u8, crate::codegen::u8x32 } +impl_shuffle! { [u32; 64], u8, crate::codegen::u8x64 } + +impl_shuffle! { [u32; 2], m8, crate::codegen::m8x2 } +impl_shuffle! { [u32; 4], m8, crate::codegen::m8x4 } +impl_shuffle! { [u32; 8], m8, crate::codegen::m8x8 } +impl_shuffle! { [u32; 16], m8, crate::codegen::m8x16 } +impl_shuffle! { [u32; 32], m8, crate::codegen::m8x32 } +impl_shuffle! { [u32; 64], m8, crate::codegen::m8x64 } + +impl_shuffle! { [u32; 2], i16, crate::codegen::i16x2 } +impl_shuffle! { [u32; 4], i16, crate::codegen::i16x4 } +impl_shuffle! 
{ [u32; 8], i16, crate::codegen::i16x8 } +impl_shuffle! { [u32; 16], i16, crate::codegen::i16x16 } +impl_shuffle! { [u32; 32], i16, crate::codegen::i16x32 } + +impl_shuffle! { [u32; 2], u16, crate::codegen::u16x2 } +impl_shuffle! { [u32; 4], u16, crate::codegen::u16x4 } +impl_shuffle! { [u32; 8], u16, crate::codegen::u16x8 } +impl_shuffle! { [u32; 16], u16, crate::codegen::u16x16 } +impl_shuffle! { [u32; 32], u16, crate::codegen::u16x32 } + +impl_shuffle! { [u32; 2], m16, crate::codegen::m16x2 } +impl_shuffle! { [u32; 4], m16, crate::codegen::m16x4 } +impl_shuffle! { [u32; 8], m16, crate::codegen::m16x8 } +impl_shuffle! { [u32; 16], m16, crate::codegen::m16x16 } + +impl_shuffle! { [u32; 2], i32, crate::codegen::i32x2 } +impl_shuffle! { [u32; 4], i32, crate::codegen::i32x4 } +impl_shuffle! { [u32; 8], i32, crate::codegen::i32x8 } +impl_shuffle! { [u32; 16], i32, crate::codegen::i32x16 } + +impl_shuffle! { [u32; 2], u32, crate::codegen::u32x2 } +impl_shuffle! { [u32; 4], u32, crate::codegen::u32x4 } +impl_shuffle! { [u32; 8], u32, crate::codegen::u32x8 } +impl_shuffle! { [u32; 16], u32, crate::codegen::u32x16 } + +impl_shuffle! { [u32; 2], f32, crate::codegen::f32x2 } +impl_shuffle! { [u32; 4], f32, crate::codegen::f32x4 } +impl_shuffle! { [u32; 8], f32, crate::codegen::f32x8 } +impl_shuffle! { [u32; 16], f32, crate::codegen::f32x16 } + +impl_shuffle! { [u32; 2], m32, crate::codegen::m32x2 } +impl_shuffle! { [u32; 4], m32, crate::codegen::m32x4 } +impl_shuffle! { [u32; 8], m32, crate::codegen::m32x8 } +impl_shuffle! { [u32; 16], m32, crate::codegen::m32x16 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], i64, crate::codegen::i64x2 } +impl_shuffle! { [u32; 4], i64, crate::codegen::i64x4 } +impl_shuffle! { [u32; 8], i64, crate::codegen::i64x8 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], u64, crate::codegen::u64x2 } +impl_shuffle! { [u32; 4], u64, crate::codegen::u64x4 } +impl_shuffle! { [u32; 8], u64, crate::codegen::u64x8 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], f64, crate::codegen::f64x2 } +impl_shuffle! { [u32; 4], f64, crate::codegen::f64x4 } +impl_shuffle! { [u32; 8], f64, crate::codegen::f64x8 } + +/* FIXME: 64-bit single element vector +impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } +*/ +impl_shuffle! { [u32; 2], m64, crate::codegen::m64x2 } +impl_shuffle! { [u32; 4], m64, crate::codegen::m64x4 } +impl_shuffle! { [u32; 8], m64, crate::codegen::m64x8 } + +impl_shuffle! { [u32; 2], isize, crate::codegen::isizex2 } +impl_shuffle! { [u32; 4], isize, crate::codegen::isizex4 } +impl_shuffle! { [u32; 8], isize, crate::codegen::isizex8 } + +impl_shuffle! { [u32; 2], usize, crate::codegen::usizex2 } +impl_shuffle! { [u32; 4], usize, crate::codegen::usizex4 } +impl_shuffle! { [u32; 8], usize, crate::codegen::usizex8 } + +impl_shuffle! { [u32; 2], msize, crate::codegen::msizex2 } +impl_shuffle! { [u32; 4], msize, crate::codegen::msizex4 } +impl_shuffle! 
{ [u32; 8], msize, crate::codegen::msizex8 } + +impl Seal<[u32; 2]> for *const T {} +impl Shuffle<[u32; 2]> for *const T { + type Output = crate::codegen::cptrx2; +} +impl Seal<[u32; 4]> for *const T {} +impl Shuffle<[u32; 4]> for *const T { + type Output = crate::codegen::cptrx4; +} +impl Seal<[u32; 8]> for *const T {} +impl Shuffle<[u32; 8]> for *const T { + type Output = crate::codegen::cptrx8; +} + +impl Seal<[u32; 2]> for *mut T {} +impl Shuffle<[u32; 2]> for *mut T { + type Output = crate::codegen::mptrx2; +} +impl Seal<[u32; 4]> for *mut T {} +impl Shuffle<[u32; 4]> for *mut T { + type Output = crate::codegen::mptrx4; +} +impl Seal<[u32; 8]> for *mut T {} +impl Shuffle<[u32; 8]> for *mut T { + type Output = crate::codegen::mptrx8; +} + +impl_shuffle! { [u32; 1], i128, crate::codegen::i128x1 } +impl_shuffle! { [u32; 2], i128, crate::codegen::i128x2 } +impl_shuffle! { [u32; 4], i128, crate::codegen::i128x4 } + +impl_shuffle! { [u32; 1], u128, crate::codegen::u128x1 } +impl_shuffle! { [u32; 2], u128, crate::codegen::u128x2 } +impl_shuffle! { [u32; 4], u128, crate::codegen::u128x4 } + +impl_shuffle! { [u32; 1], m128, crate::codegen::m128x1 } +impl_shuffle! { [u32; 2], m128, crate::codegen::m128x2 } +impl_shuffle! { [u32; 4], m128, crate::codegen::m128x4 } diff --git a/vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs b/vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs new file mode 100644 index 000000000..8d9577b26 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/shuffle1_dyn.rs @@ -0,0 +1,411 @@ +//! Shuffle vector lanes with run-time indices. + +use crate::*; + +pub trait Shuffle1Dyn { + type Indices; + fn shuffle1_dyn(self, _: Self::Indices) -> Self; +} + +// Fallback implementation +macro_rules! impl_fallback { + ($id:ident) => { + impl Shuffle1Dyn for $id { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let mut result = Self::splat(0); + for i in 0..$id::lanes() { + result = result + .replace(i, self.extract(indices.extract(i) as usize)); + } + result + } + } + }; +} + +macro_rules! impl_shuffle1_dyn { + (u8x8) => { + cfg_if! { + if #[cfg(all( + any( + all(target_arch = "aarch64", target_feature = "neon"), + all(target_arch = "arm", target_feature = "v7", + target_feature = "neon") + ), + any(feature = "core_arch", libcore_neon) + ) + )] { + impl Shuffle1Dyn for u8x8 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "aarch64")] + use crate::arch::aarch64::vtbl1_u8; + #[cfg(target_arch = "arm")] + use crate::arch::arm::vtbl1_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + Simd(mem::transmute( + vtbl1_u8(mem::transmute(self.0), + crate::mem::transmute(indices.0)) + )) + } + } + } + } else { + impl_fallback!(u8x8); + } + } + }; + (u8x16) => { + cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "ssse3"))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::_mm_shuffle_epi8; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::_mm_shuffle_epi8; + // This is safe because the binary is compiled with + // ssse3 enabled at compile-time and can therefore only + // run on CPUs that have it enabled. 
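+ // `_mm_shuffle_epi8` picks, for each output byte, the source byte
+ // indexed by the low four bits of the corresponding control byte
+ // (or zero if the control byte's most significant bit is set),
+ // which matches `shuffle1_dyn` for in-range indices.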
+ unsafe { + Simd(mem::transmute( + _mm_shuffle_epi8(mem::transmute(self.0), + crate::mem::transmute(indices)) + )) + } + } + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + use crate::arch::aarch64::vqtbl1q_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + Simd(mem::transmute( + vqtbl1q_u8(mem::transmute(self.0), + crate::mem::transmute(indices.0)) + )) + } + } + } + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", + target_feature = "neon", + any(feature = "core_arch", libcore_neon)))] { + impl Shuffle1Dyn for u8x16 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + use crate::arch::arm::vtbl2_u8; + + // This is safe because the binary is compiled with + // neon enabled at compile-time and can therefore only + // run on CPUs that have it enabled. + unsafe { + union U { + j: u8x16, + s: (u8x8, u8x8), + } + + let (i0, i1) = U { j: indices }.s; + + let r0 = vtbl2_u8( + mem::transmute(self), + crate::mem::transmute(i0) + ); + let r1 = vtbl2_u8( + mem::transmute(self), + crate::mem::transmute(i1) + ); + + let r = U { s: (r0, r1) }.j; + + Simd(mem::transmute(r)) + } + } + } + } else { + impl_fallback!(u8x16); + } + } + }; + (u16x8) => { + impl Shuffle1Dyn for u16x8 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x8 = (indices * 2).cast(); + let indices: u8x16 = shuffle!( + indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7] + ); + let v = u8x16::new( + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + ); + let indices = indices + v; + unsafe { + let s: u8x16 = crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + }; + (u32x4) => { + cfg_if! { + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx"))] { + impl Shuffle1Dyn for u32x4 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::{_mm_permutevar_ps}; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::{_mm_permutevar_ps}; + + unsafe { + crate::mem::transmute( + _mm_permutevar_ps( + crate::mem::transmute(self.0), + crate::mem::transmute(indices.0) + ) + ) + } + } + } + } else { + impl Shuffle1Dyn for u32x4 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x4 = (indices * 4).cast(); + let indices: u8x16 = shuffle!( + indices, + [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] + ); + let v = u8x16::new( + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 + ); + let indices = indices + v; + unsafe { + let s: u8x16 = crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + } + } + }; + (u64x2) => { + cfg_if!
{ + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx"))] { + impl Shuffle1Dyn for u64x2 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + #[cfg(target_arch = "x86")] + use crate::arch::x86::{_mm_permutevar_pd}; + #[cfg(target_arch = "x86_64")] + use crate::arch::x86_64::{_mm_permutevar_pd}; + // _mm_permutevar_pd uses the _second_ bit of each + // element to perform the selection, that is: 0b00 => 0, + // 0b10 => 1: + let indices = indices << 1; + unsafe { + crate::mem::transmute( + _mm_permutevar_pd( + crate::mem::transmute(self), + crate::mem::transmute(indices) + ) + ) + } + } + } + } else { + impl Shuffle1Dyn for u64x2 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + let indices: u8x2 = (indices * 8).cast(); + let indices: u8x16 = shuffle!( + indices, + [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + ); + let v = u8x16::new( + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 + ); + let indices = indices + v; + unsafe { + let s: u8x16 =crate::mem::transmute(self); + crate::mem::transmute(s.shuffle1_dyn(indices)) + } + } + } + } + } + }; + (u128x1) => { + impl Shuffle1Dyn for u128x1 { + type Indices = Self; + #[inline] + fn shuffle1_dyn(self, _indices: Self::Indices) -> Self { + self + } + } + }; + ($id:ident) => { impl_fallback!($id); } +} + +impl_shuffle1_dyn!(u8x2); +impl_shuffle1_dyn!(u8x4); +impl_shuffle1_dyn!(u8x8); +impl_shuffle1_dyn!(u8x16); +impl_shuffle1_dyn!(u8x32); +impl_shuffle1_dyn!(u8x64); + +impl_shuffle1_dyn!(u16x2); +impl_shuffle1_dyn!(u16x4); +impl_shuffle1_dyn!(u16x8); +impl_shuffle1_dyn!(u16x16); +impl_shuffle1_dyn!(u16x32); + +impl_shuffle1_dyn!(u32x2); +impl_shuffle1_dyn!(u32x4); +impl_shuffle1_dyn!(u32x8); +impl_shuffle1_dyn!(u32x16); + +impl_shuffle1_dyn!(u64x2); +impl_shuffle1_dyn!(u64x4); +impl_shuffle1_dyn!(u64x8); + +impl_shuffle1_dyn!(usizex2); +impl_shuffle1_dyn!(usizex4); +impl_shuffle1_dyn!(usizex8); + +impl_shuffle1_dyn!(u128x1); +impl_shuffle1_dyn!(u128x2); +impl_shuffle1_dyn!(u128x4); + +// Implementation for non-unsigned vector types +macro_rules! 
impl_shuffle1_dyn_non_u { + ($id:ident, $uid:ident) => { + impl Shuffle1Dyn for $id { + type Indices = $uid; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + unsafe { + let u: $uid = crate::mem::transmute(self); + crate::mem::transmute(u.shuffle1_dyn(indices)) + } + } + } + }; +} + +impl_shuffle1_dyn_non_u!(i8x2, u8x2); +impl_shuffle1_dyn_non_u!(i8x4, u8x4); +impl_shuffle1_dyn_non_u!(i8x8, u8x8); +impl_shuffle1_dyn_non_u!(i8x16, u8x16); +impl_shuffle1_dyn_non_u!(i8x32, u8x32); +impl_shuffle1_dyn_non_u!(i8x64, u8x64); + +impl_shuffle1_dyn_non_u!(i16x2, u16x2); +impl_shuffle1_dyn_non_u!(i16x4, u16x4); +impl_shuffle1_dyn_non_u!(i16x8, u16x8); +impl_shuffle1_dyn_non_u!(i16x16, u16x16); +impl_shuffle1_dyn_non_u!(i16x32, u16x32); + +impl_shuffle1_dyn_non_u!(i32x2, u32x2); +impl_shuffle1_dyn_non_u!(i32x4, u32x4); +impl_shuffle1_dyn_non_u!(i32x8, u32x8); +impl_shuffle1_dyn_non_u!(i32x16, u32x16); + +impl_shuffle1_dyn_non_u!(i64x2, u64x2); +impl_shuffle1_dyn_non_u!(i64x4, u64x4); +impl_shuffle1_dyn_non_u!(i64x8, u64x8); + +impl_shuffle1_dyn_non_u!(isizex2, usizex2); +impl_shuffle1_dyn_non_u!(isizex4, usizex4); +impl_shuffle1_dyn_non_u!(isizex8, usizex8); + +impl_shuffle1_dyn_non_u!(i128x1, u128x1); +impl_shuffle1_dyn_non_u!(i128x2, u128x2); +impl_shuffle1_dyn_non_u!(i128x4, u128x4); + +impl_shuffle1_dyn_non_u!(m8x2, u8x2); +impl_shuffle1_dyn_non_u!(m8x4, u8x4); +impl_shuffle1_dyn_non_u!(m8x8, u8x8); +impl_shuffle1_dyn_non_u!(m8x16, u8x16); +impl_shuffle1_dyn_non_u!(m8x32, u8x32); +impl_shuffle1_dyn_non_u!(m8x64, u8x64); + +impl_shuffle1_dyn_non_u!(m16x2, u16x2); +impl_shuffle1_dyn_non_u!(m16x4, u16x4); +impl_shuffle1_dyn_non_u!(m16x8, u16x8); +impl_shuffle1_dyn_non_u!(m16x16, u16x16); +impl_shuffle1_dyn_non_u!(m16x32, u16x32); + +impl_shuffle1_dyn_non_u!(m32x2, u32x2); +impl_shuffle1_dyn_non_u!(m32x4, u32x4); +impl_shuffle1_dyn_non_u!(m32x8, u32x8); +impl_shuffle1_dyn_non_u!(m32x16, u32x16); + +impl_shuffle1_dyn_non_u!(m64x2, u64x2); +impl_shuffle1_dyn_non_u!(m64x4, u64x4); +impl_shuffle1_dyn_non_u!(m64x8, u64x8); + +impl_shuffle1_dyn_non_u!(msizex2, usizex2); +impl_shuffle1_dyn_non_u!(msizex4, usizex4); +impl_shuffle1_dyn_non_u!(msizex8, usizex8); + +impl_shuffle1_dyn_non_u!(m128x1, u128x1); +impl_shuffle1_dyn_non_u!(m128x2, u128x2); +impl_shuffle1_dyn_non_u!(m128x4, u128x4); + +impl_shuffle1_dyn_non_u!(f32x2, u32x2); +impl_shuffle1_dyn_non_u!(f32x4, u32x4); +impl_shuffle1_dyn_non_u!(f32x8, u32x8); +impl_shuffle1_dyn_non_u!(f32x16, u32x16); + +impl_shuffle1_dyn_non_u!(f64x2, u64x2); +impl_shuffle1_dyn_non_u!(f64x4, u64x4); +impl_shuffle1_dyn_non_u!(f64x8, u64x8); + +// Implementation for pointer vector types +macro_rules! impl_shuffle1_dyn_ptr { + ($id:ident, $uid:ident) => { + impl Shuffle1Dyn for $id { + type Indices = $uid; + #[inline] + fn shuffle1_dyn(self, indices: Self::Indices) -> Self { + unsafe { + let u: $uid = crate::mem::transmute(self); + crate::mem::transmute(u.shuffle1_dyn(indices)) + } + } + } + }; +} + +impl_shuffle1_dyn_ptr!(cptrx2, usizex2); +impl_shuffle1_dyn_ptr!(cptrx4, usizex4); +impl_shuffle1_dyn_ptr!(cptrx8, usizex8); + +impl_shuffle1_dyn_ptr!(mptrx2, usizex2); +impl_shuffle1_dyn_ptr!(mptrx4, usizex4); +impl_shuffle1_dyn_ptr!(mptrx8, usizex8); diff --git a/vendor/packed_simd_2/src/codegen/swap_bytes.rs b/vendor/packed_simd_2/src/codegen/swap_bytes.rs new file mode 100644 index 000000000..b435fb5da --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/swap_bytes.rs @@ -0,0 +1,189 @@ +//! Horizontal swap bytes reductions.
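+//!
+//! A minimal illustrative sketch (assuming the vendored crate is imported as
+//! `packed_simd` and re-exports `swap_bytes` on integer vectors): the
+//! `SwapBytes` trait below reverses the bytes of each lane by viewing the
+//! vector as bytes and shuffling them, so it is expected to match the scalar
+//! `u32::swap_bytes` applied lane-wise:
+//!
+//! ```ignore
+//! use packed_simd::u32x4;
+//! let x = u32x4::new(0x0102_0304, 0, 0, 0);
+//! assert_eq!(x.swap_bytes().extract(0), 0x0403_0201);
+//! ```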
+ +// FIXME: investigate using `llvm.bswap` +// https://github.com/rust-lang-nursery/packed_simd/issues/19 + +use crate::*; + +crate trait SwapBytes { + fn swap_bytes(self) -> Self; +} + +macro_rules! impl_swap_bytes { + (v16: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + fn swap_bytes(self) -> Self { + unsafe { shuffle!(self, [1, 0]) } + } + } + )+ + }; + (v32: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x4 = crate::mem::transmute(self); + let result: u8x4 = shuffle!(bytes, [3, 2, 1, 0]); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v64: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x8 = crate::mem::transmute(self); + let result: u8x8 = shuffle!( + bytes, [7, 6, 5, 4, 3, 2, 1, 0] + ); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v128: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x16 = crate::mem::transmute(self); + let result: u8x16 = shuffle!(bytes, [ + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 + ]); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v256: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x32 = crate::mem::transmute(self); + let result: u8x32 = shuffle!(bytes, [ + 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 + ]); + crate::mem::transmute(result) + } + } + } + )+ + }; + (v512: $($id:ident,)+) => { + $( + impl SwapBytes for $id { + #[inline] + #[allow(clippy::useless_transmute)] + fn swap_bytes(self) -> Self { + unsafe { + let bytes: u8x64 = crate::mem::transmute(self); + let result: u8x64 = shuffle!(bytes, [ + 63, 62, 61, 60, 59, 58, 57, 56, + 55, 54, 53, 52, 51, 50, 49, 48, + 47, 46, 45, 44, 43, 42, 41, 40, + 39, 38, 37, 36, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 + ]); + crate::mem::transmute(result) + } + } + } + )+ + }; +} + +impl_swap_bytes!(v16: u8x2, i8x2,); +impl_swap_bytes!(v32: u8x4, i8x4, u16x2, i16x2,); +// FIXME: 64-bit single element vector +impl_swap_bytes!( + v64: u8x8, + i8x8, + u16x4, + i16x4, + u32x2, + i32x2, /* u64x1, i64x1, */ +); + +impl_swap_bytes!( + v128: u8x16, + i8x16, + u16x8, + i16x8, + u32x4, + i32x4, + u64x2, + i64x2, + u128x1, + i128x1, +); +impl_swap_bytes!( + v256: u8x32, + i8x32, + u16x16, + i16x16, + u32x8, + i32x8, + u64x4, + i64x4, + u128x2, + i128x2, +); + +impl_swap_bytes!( + v512: u8x64, + i8x64, + u16x32, + i16x32, + u32x16, + i32x16, + u64x8, + i64x8, + u128x4, + i128x4, +); + +cfg_if! 
{ + if #[cfg(target_pointer_width = "8")] { + impl_swap_bytes!(v16: isizex2, usizex2,); + impl_swap_bytes!(v32: isizex4, usizex4,); + impl_swap_bytes!(v64: isizex8, usizex8,); + } else if #[cfg(target_pointer_width = "16")] { + impl_swap_bytes!(v32: isizex2, usizex2,); + impl_swap_bytes!(v64: isizex4, usizex4,); + impl_swap_bytes!(v128: isizex8, usizex8,); + } else if #[cfg(target_pointer_width = "32")] { + impl_swap_bytes!(v64: isizex2, usizex2,); + impl_swap_bytes!(v128: isizex4, usizex4,); + impl_swap_bytes!(v256: isizex8, usizex8,); + } else if #[cfg(target_pointer_width = "64")] { + impl_swap_bytes!(v128: isizex2, usizex2,); + impl_swap_bytes!(v256: isizex4, usizex4,); + impl_swap_bytes!(v512: isizex8, usizex8,); + } else { + compile_error!("unsupported target_pointer_width"); + } +} diff --git a/vendor/packed_simd_2/src/codegen/v128.rs b/vendor/packed_simd_2/src/codegen/v128.rs new file mode 100644 index 000000000..9506424fa --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v128.rs @@ -0,0 +1,46 @@ +//! Internal 128-bit wide vector types + +use crate::masks::*; + +#[rustfmt::skip] +impl_simd_array!( + [i8; 16]: i8x16 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [u8; 16]: u8x16 | + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8 +); +#[rustfmt::skip] +impl_simd_array!( + [m8; 16]: m8x16 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); + +impl_simd_array!([i16; 8]: i16x8 | i16, i16, i16, i16, i16, i16, i16, i16); +impl_simd_array!([u16; 8]: u16x8 | u16, u16, u16, u16, u16, u16, u16, u16); +impl_simd_array!([m16; 8]: m16x8 | i16, i16, i16, i16, i16, i16, i16, i16); + +impl_simd_array!([i32; 4]: i32x4 | i32, i32, i32, i32); +impl_simd_array!([u32; 4]: u32x4 | u32, u32, u32, u32); +impl_simd_array!([f32; 4]: f32x4 | f32, f32, f32, f32); +impl_simd_array!([m32; 4]: m32x4 | i32, i32, i32, i32); + +impl_simd_array!([i64; 2]: i64x2 | i64, i64); +impl_simd_array!([u64; 2]: u64x2 | u64, u64); +impl_simd_array!([f64; 2]: f64x2 | f64, f64); +impl_simd_array!([m64; 2]: m64x2 | i64, i64); + +impl_simd_array!([i128; 1]: i128x1 | i128); +impl_simd_array!([u128; 1]: u128x1 | u128); +impl_simd_array!([m128; 1]: m128x1 | i128); diff --git a/vendor/packed_simd_2/src/codegen/v16.rs b/vendor/packed_simd_2/src/codegen/v16.rs new file mode 100644 index 000000000..4d55a6d89 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v16.rs @@ -0,0 +1,7 @@ +//! Internal 16-bit wide vector types + +use crate::masks::*; + +impl_simd_array!([i8; 2]: i8x2 | i8, i8); +impl_simd_array!([u8; 2]: u8x2 | u8, u8); +impl_simd_array!([m8; 2]: m8x2 | i8, i8); diff --git a/vendor/packed_simd_2/src/codegen/v256.rs b/vendor/packed_simd_2/src/codegen/v256.rs new file mode 100644 index 000000000..5ca4759f0 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v256.rs @@ -0,0 +1,78 @@ +//! 
Internal 256-bit wide vector types + +use crate::masks::*; + +#[rustfmt::skip] +impl_simd_array!( + [i8; 32]: i8x32 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [u8; 32]: u8x32 | + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8 +); +#[rustfmt::skip] +impl_simd_array!( + [m8; 32]: m8x32 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [i16; 16]: i16x16 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); +#[rustfmt::skip] +impl_simd_array!( + [u16; 16]: u16x16 | + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16 +); +#[rustfmt::skip] +impl_simd_array!( + [m16; 16]: m16x16 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); + +impl_simd_array!([i32; 8]: i32x8 | i32, i32, i32, i32, i32, i32, i32, i32); +impl_simd_array!([u32; 8]: u32x8 | u32, u32, u32, u32, u32, u32, u32, u32); +impl_simd_array!([f32; 8]: f32x8 | f32, f32, f32, f32, f32, f32, f32, f32); +impl_simd_array!([m32; 8]: m32x8 | i32, i32, i32, i32, i32, i32, i32, i32); + +impl_simd_array!([i64; 4]: i64x4 | i64, i64, i64, i64); +impl_simd_array!([u64; 4]: u64x4 | u64, u64, u64, u64); +impl_simd_array!([f64; 4]: f64x4 | f64, f64, f64, f64); +impl_simd_array!([m64; 4]: m64x4 | i64, i64, i64, i64); + +impl_simd_array!([i128; 2]: i128x2 | i128, i128); +impl_simd_array!([u128; 2]: u128x2 | u128, u128); +impl_simd_array!([m128; 2]: m128x2 | i128, i128); diff --git a/vendor/packed_simd_2/src/codegen/v32.rs b/vendor/packed_simd_2/src/codegen/v32.rs new file mode 100644 index 000000000..ae1dabd00 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v32.rs @@ -0,0 +1,11 @@ +//! Internal 32-bit wide vector types + +use crate::masks::*; + +impl_simd_array!([i8; 4]: i8x4 | i8, i8, i8, i8); +impl_simd_array!([u8; 4]: u8x4 | u8, u8, u8, u8); +impl_simd_array!([m8; 4]: m8x4 | i8, i8, i8, i8); + +impl_simd_array!([i16; 2]: i16x2 | i16, i16); +impl_simd_array!([u16; 2]: u16x2 | u16, u16); +impl_simd_array!([m16; 2]: m16x2 | i16, i16); diff --git a/vendor/packed_simd_2/src/codegen/v512.rs b/vendor/packed_simd_2/src/codegen/v512.rs new file mode 100644 index 000000000..bf9511034 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v512.rs @@ -0,0 +1,145 @@ +//! 
Internal 512-bit wide vector types + +use crate::masks::*; + +#[rustfmt::skip] +impl_simd_array!( + [i8; 64]: i8x64 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [u8; 64]: u8x64 | + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8, + u8, u8, u8, u8 +); +#[rustfmt::skip] +impl_simd_array!( + [m8; 64]: m8x64 | + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8, + i8, i8, i8, i8 +); +#[rustfmt::skip] +impl_simd_array!( + [i16; 32]: i16x32 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); +#[rustfmt::skip] +impl_simd_array!( + [u16; 32]: u16x32 | + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16, + u16, u16, u16, u16 +); +#[rustfmt::skip] +impl_simd_array!( + [m16; 32]: m16x32 | + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16, + i16, i16, i16, i16 +); + +#[rustfmt::skip] +impl_simd_array!( + [i32; 16]: i32x16 | + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32 +); +#[rustfmt::skip] +impl_simd_array!( + [u32; 16]: u32x16 | + u32, u32, u32, u32, + u32, u32, u32, u32, + u32, u32, u32, u32, + u32, u32, u32, u32 +); +#[rustfmt::skip] +impl_simd_array!( + [f32; 16]: f32x16 | + f32, f32, f32, f32, + f32, f32, f32, f32, + f32, f32, f32, f32, + f32, f32, f32, f32 +); +#[rustfmt::skip] +impl_simd_array!( + [m32; 16]: m32x16 | + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32, + i32, i32, i32, i32 +); + +impl_simd_array!([i64; 8]: i64x8 | i64, i64, i64, i64, i64, i64, i64, i64); +impl_simd_array!([u64; 8]: u64x8 | u64, u64, u64, u64, u64, u64, u64, u64); +impl_simd_array!([f64; 8]: f64x8 | f64, f64, f64, f64, f64, f64, f64, f64); +impl_simd_array!([m64; 8]: m64x8 | i64, i64, i64, i64, i64, i64, i64, i64); + +impl_simd_array!([i128; 4]: i128x4 | i128, i128, i128, i128); +impl_simd_array!([u128; 4]: u128x4 | u128, u128, u128, u128); +impl_simd_array!([m128; 4]: m128x4 | i128, i128, i128, i128); diff --git a/vendor/packed_simd_2/src/codegen/v64.rs b/vendor/packed_simd_2/src/codegen/v64.rs new file mode 100644 index 000000000..3cfb67c1a --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/v64.rs @@ -0,0 +1,21 @@ +//! 
Internal 64-bit wide vector types + +use crate::masks::*; + +impl_simd_array!([i8; 8]: i8x8 | i8, i8, i8, i8, i8, i8, i8, i8); +impl_simd_array!([u8; 8]: u8x8 | u8, u8, u8, u8, u8, u8, u8, u8); +impl_simd_array!([m8; 8]: m8x8 | i8, i8, i8, i8, i8, i8, i8, i8); + +impl_simd_array!([i16; 4]: i16x4 | i16, i16, i16, i16); +impl_simd_array!([u16; 4]: u16x4 | u16, u16, u16, u16); +impl_simd_array!([m16; 4]: m16x4 | i16, i16, i16, i16); + +impl_simd_array!([i32; 2]: i32x2 | i32, i32); +impl_simd_array!([u32; 2]: u32x2 | u32, u32); +impl_simd_array!([f32; 2]: f32x2 | f32, f32); +impl_simd_array!([m32; 2]: m32x2 | i32, i32); + +impl_simd_array!([i64; 1]: i64x1 | i64); +impl_simd_array!([u64; 1]: u64x1 | u64); +impl_simd_array!([f64; 1]: f64x1 | f64); +impl_simd_array!([m64; 1]: m64x1 | i64); diff --git a/vendor/packed_simd_2/src/codegen/vPtr.rs b/vendor/packed_simd_2/src/codegen/vPtr.rs new file mode 100644 index 000000000..cf4765538 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/vPtr.rs @@ -0,0 +1,35 @@ +//! Pointer vector types + +macro_rules! impl_simd_ptr { + ([$ptr_ty:ty; $elem_count:expr]: $tuple_id:ident | $ty:ident + | $($tys:ty),*) => { + #[derive(Copy, Clone)] + #[repr(simd)] + pub struct $tuple_id<$ty>($(crate $tys),*); + //^^^^^^^ leaked through SimdArray + + impl<$ty> crate::sealed::Seal for [$ptr_ty; $elem_count] {} + impl<$ty> crate::sealed::SimdArray for [$ptr_ty; $elem_count] { + type Tuple = $tuple_id<$ptr_ty>; + type T = $ptr_ty; + const N: usize = $elem_count; + type NT = [u32; $elem_count]; + } + + impl<$ty> crate::sealed::Seal for $tuple_id<$ptr_ty> {} + impl<$ty> crate::sealed::Simd for $tuple_id<$ptr_ty> { + type Element = $ptr_ty; + const LANES: usize = $elem_count; + type LanesType = [u32; $elem_count]; + } + + } +} + +impl_simd_ptr!([*const T; 2]: cptrx2 | T | T, T); +impl_simd_ptr!([*const T; 4]: cptrx4 | T | T, T, T, T); +impl_simd_ptr!([*const T; 8]: cptrx8 | T | T, T, T, T, T, T, T, T); + +impl_simd_ptr!([*mut T; 2]: mptrx2 | T | T, T); +impl_simd_ptr!([*mut T; 4]: mptrx4 | T | T, T, T, T); +impl_simd_ptr!([*mut T; 8]: mptrx8 | T | T, T, T, T, T, T, T, T); diff --git a/vendor/packed_simd_2/src/codegen/vSize.rs b/vendor/packed_simd_2/src/codegen/vSize.rs new file mode 100644 index 000000000..3911b2134 --- /dev/null +++ b/vendor/packed_simd_2/src/codegen/vSize.rs @@ -0,0 +1,43 @@ +//! Vector types with pointer-sized elements + +use crate::codegen::pointer_sized_int::{isize_, usize_}; +use crate::masks::*; + +impl_simd_array!([isize; 2]: isizex2 | isize_, isize_); +impl_simd_array!([usize; 2]: usizex2 | usize_, usize_); +impl_simd_array!([msize; 2]: msizex2 | isize_, isize_); + +impl_simd_array!([isize; 4]: isizex4 | isize_, isize_, isize_, isize_); +impl_simd_array!([usize; 4]: usizex4 | usize_, usize_, usize_, usize_); +impl_simd_array!([msize; 4]: msizex4 | isize_, isize_, isize_, isize_); + +impl_simd_array!( + [isize; 8]: isizex8 | isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_ +); +impl_simd_array!( + [usize; 8]: usizex8 | usize_, + usize_, + usize_, + usize_, + usize_, + usize_, + usize_, + usize_ +); +impl_simd_array!( + [msize; 8]: msizex8 | isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_, + isize_ +); -- cgit v1.2.3
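A short usage sketch of the dynamic shuffle machinery added by this patch, driven through the public API (assuming the vendored crate is imported as `packed_simd` and exposes `shuffle1_dyn` on `u8x16` with `u8x16` indices, as the `Shuffle1Dyn` impls in shuffle1_dyn.rs suggest; illustration only, not part of the patch):

    use packed_simd::u8x16;

    fn main() {
        // Reverse the lanes of a byte vector with run-time indices. On x86 with
        // SSSE3 this dispatches to the `_mm_shuffle_epi8` path above; elsewhere
        // it falls back to the per-lane extract/replace loop.
        let v = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let idx = u8x16::new(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = v.shuffle1_dyn(idx);
        assert_eq!(r.extract(0), 15);
    }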