Diffstat (limited to 'vendor/ppv-lite86/src/x86_64')
-rw-r--r--  vendor/ppv-lite86/src/x86_64/mod.rs  |  32
-rw-r--r--  vendor/ppv-lite86/src/x86_64/sse2.rs | 431
2 files changed, 269 insertions, 194 deletions
diff --git a/vendor/ppv-lite86/src/x86_64/mod.rs b/vendor/ppv-lite86/src/x86_64/mod.rs
index ecf184f36..937732da3 100644
--- a/vendor/ppv-lite86/src/x86_64/mod.rs
+++ b/vendor/ppv-lite86/src/x86_64/mod.rs
@@ -1,7 +1,7 @@
 // crate minimums: sse2, x86_64
 
-use core::arch::x86_64::{__m128i, __m256i};
 use crate::types::*;
+use core::arch::x86_64::{__m128i, __m256i};
 
 mod sse2;
 
@@ -79,7 +79,7 @@ where
     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
 
-    type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
+    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
@@ -119,16 +119,16 @@ impl Store<vec128_storage> for vec128_storage {
         p
     }
 }
-impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
+impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
     #[inline(always)]
-    fn into(self) -> &'a [u32; 4] {
-        unsafe { &self.u32x4 }
+    fn from(x: &'a vec128_storage) -> Self {
+        unsafe { &x.u32x4 }
     }
 }
-impl Into<vec128_storage> for [u32; 4] {
+impl From<[u32; 4]> for vec128_storage {
     #[inline(always)]
-    fn into(self) -> vec128_storage {
-        vec128_storage { u32x4: self }
+    fn from(u32x4: [u32; 4]) -> Self {
+        vec128_storage { u32x4 }
     }
 }
 impl Default for vec128_storage {
@@ -154,10 +154,10 @@ pub union vec256_storage {
     sse2: [vec128_storage; 2],
     avx: __m256i,
 }
-impl Into<vec256_storage> for [u64; 4] {
+impl From<[u64; 4]> for vec256_storage {
     #[inline(always)]
-    fn into(self) -> vec256_storage {
-        vec256_storage { u64x4: self }
+    fn from(u64x4: [u64; 4]) -> Self {
+        vec256_storage { u64x4 }
     }
 }
 impl Default for vec256_storage {
@@ -167,9 +167,11 @@ impl Default for vec256_storage {
     }
 }
 impl vec256_storage {
+    #[inline(always)]
     pub fn new128(xs: [vec128_storage; 2]) -> Self {
         Self { sse2: xs }
     }
+    #[inline(always)]
     pub fn split128(self) -> [vec128_storage; 2] {
         unsafe { self.sse2 }
     }
@@ -200,9 +202,11 @@ impl Default for vec512_storage {
     }
 }
 impl vec512_storage {
+    #[inline(always)]
     pub fn new128(xs: [vec128_storage; 4]) -> Self {
         Self { sse2: xs }
     }
+    #[inline(always)]
     pub fn split128(self) -> [vec128_storage; 4] {
         unsafe { self.sse2 }
     }
@@ -217,10 +221,10 @@ impl PartialEq for vec512_storage {
 
 macro_rules! impl_into {
     ($storage:ident, $array:ty, $name:ident) => {
-        impl Into<$array> for $storage {
+        impl From<$storage> for $array {
             #[inline(always)]
-            fn into(self) -> $array {
-                unsafe { self.$name }
+            fn from(vec: $storage) -> Self {
+                unsafe { vec.$name }
             }
         }
     };
diff --git a/vendor/ppv-lite86/src/x86_64/sse2.rs b/vendor/ppv-lite86/src/x86_64/sse2.rs
index 60e7681c3..97197a436 100644
--- a/vendor/ppv-lite86/src/x86_64/sse2.rs
+++ b/vendor/ppv-lite86/src/x86_64/sse2.rs
@@ -166,49 +166,44 @@ macro_rules! impl_bitops128 {
 
 macro_rules! rotr_32_s3 {
     ($name:ident, $k0:expr, $k1:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_shuffle_epi8(
-                self.x,
-                _mm_set_epi64x($k0, $k1),
-            )
-        })
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
         }
     };
 }
 macro_rules! rotr_32 {
     ($name:ident, $i:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_or_si128(
-                _mm_srli_epi32(self.x, $i as i32),
-                _mm_slli_epi32(self.x, 32 - $i as i32),
-            )
-        })
-    }
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi32(self.x, $i as i32),
+                    _mm_slli_epi32(self.x, 32 - $i as i32),
+                )
+            })
+        }
     };
 }
 impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
     rotr_32!(rotate_each_word_right7, 7);
     rotr_32_s3!(
         rotate_each_word_right8,
-        0x0c0f0e0d_080b0a09,
-        0x04070605_00030201
+        0x0c0f_0e0d_080b_0a09,
+        0x0407_0605_0003_0201
     );
     rotr_32!(rotate_each_word_right11, 11);
     rotr_32!(rotate_each_word_right12, 12);
     rotr_32_s3!(
         rotate_each_word_right16,
-        0x0d0c0f0e_09080b0a,
-        0x05040706_01000302
+        0x0d0c_0f0e_0908_0b0a,
+        0x0504_0706_0100_0302
     );
     rotr_32!(rotate_each_word_right20, 20);
     rotr_32_s3!(
         rotate_each_word_right24,
-        0x0e0d0c0f_0a09080b,
-        0x06050407_02010003
+        0x0e0d_0c0f_0a09_080b,
+        0x0605_0407_0201_0003
     );
     rotr_32!(rotate_each_word_right25, 25);
 }
@@ -228,28 +223,23 @@ impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
 
 macro_rules! rotr_64_s3 {
     ($name:ident, $k0:expr, $k1:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_shuffle_epi8(
-                self.x,
-                _mm_set_epi64x($k0, $k1),
-            )
-        })
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
         }
     };
 }
 macro_rules! rotr_64 {
     ($name:ident, $i:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_or_si128(
-                _mm_srli_epi64(self.x, $i as i32),
-                _mm_slli_epi64(self.x, 64 - $i as i32),
-            )
-        })
-    }
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi64(self.x, $i as i32),
+                    _mm_slli_epi64(self.x, 64 - $i as i32),
+                )
+            })
+        }
     };
 }
 impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
@@ -296,15 +286,15 @@ impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
 
 macro_rules! rotr_128 {
     ($name:ident, $i:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_or_si128(
-                _mm_srli_si128(self.x, $i as i32),
-                _mm_slli_si128(self.x, 128 - $i as i32),
-            )
-        })
-    }
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_si128(self.x, $i as i32),
+                    _mm_slli_si128(self.x, 128 - $i as i32),
+                )
+            })
+        }
     };
 }
 // TODO: completely unoptimized
@@ -411,7 +401,7 @@ impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
     }
     #[inline(always)]
     fn from_lanes(xs: [u128; 1]) -> Self {
-        unimplemented!()
+        unimplemented!("{:?}", xs)
     }
 }
 
@@ -780,7 +770,7 @@ impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
 impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
     #[inline(always)]
     fn bswap(self) -> Self {
-        Self::new(unsafe { unimplemented!() })
+        unimplemented!()
     }
 }
@@ -890,6 +880,13 @@ pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
 #[allow(non_camel_case_types)]
 pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
 
+impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn to_scalars(self) -> [u32; 16] {
+        unsafe { core::mem::transmute(self) }
+    }
+}
+
 impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
 where
     u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
@@ -993,6 +990,8 @@ where
     Machine86<S3, S4, NI>: Machine,
     u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
     u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
 {
 }
 impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
@@ -1014,14 +1013,6 @@ where
 {
 }
 
-impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
-where
-    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
-    Avx2Machine<NI>: Machine,
-    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
-    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
-{
-}
 impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
 where
     u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
@@ -1078,6 +1069,7 @@ impl<W: PartialEq, G> PartialEq for x2<W, G> {
     }
 }
 
+#[allow(unused)]
 #[inline(always)]
 unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
     let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
@@ -1383,65 +1375,78 @@ mod test {
 
 pub mod avx2 {
     #![allow(non_camel_case_types)]
-    use crate::soft::x4;
+    use crate::soft::{x2, x4};
     use crate::types::*;
-    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
+    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
     use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
     use core::arch::x86_64::*;
     use core::marker::PhantomData;
     use core::ops::*;
 
     #[derive(Copy, Clone)]
-    pub struct u32x4x4_avx2<NI> {
-        x: [__m256i; 2],
+    pub struct u32x4x2_avx2<NI> {
+        x: __m256i,
         ni: PhantomData<NI>,
     }
 
-    impl<NI> u32x4x4_avx2<NI> {
+    impl<NI> u32x4x2_avx2<NI> {
         #[inline(always)]
-        fn new(x: [__m256i; 2]) -> Self {
+        fn new(x: __m256i) -> Self {
             Self { x, ni: PhantomData }
         }
     }
 
-    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
-    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
+    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
         #[inline(always)]
-        unsafe fn unpack(p: vec512_storage) -> Self {
-            Self::new([p.avx[0].avx, p.avx[1].avx])
+        unsafe fn unpack(p: vec256_storage) -> Self {
+            Self::new(p.avx)
         }
     }
-    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
         #[inline(always)]
-        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+            assert_eq!(input.len(), 32);
+            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
+        }
+        #[inline(always)]
+        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+            Self::unsafe_read_le(input).bswap()
+        }
+        #[inline(always)]
+        fn write_le(self, out: &mut [u8]) {
             unsafe {
-                [
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
-                ]
+                assert_eq!(out.len(), 32);
+                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
             }
         }
         #[inline(always)]
-        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
-            Self::new(unsafe {
+        fn write_be(self, out: &mut [u8]) {
+            self.bswap().write_le(out)
+        }
+    }
+    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
+            unsafe {
                 [
-                    _mm256_setr_m128i(x[0].x, x[1].x),
-                    _mm256_setr_m128i(x[2].x, x[3].x),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                 ]
-            })
+            }
+        }
+        #[inline(always)]
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
+            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
         }
     }
-    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
         #[inline(always)]
         fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
             unsafe {
                 match i {
-                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
-                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
-                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
-                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
+                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                     _ => panic!(),
                 }
             }
@@ -1450,61 +1455,21 @@ pub mod avx2 {
         fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
             Self::new(unsafe {
                 match i {
-                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
-                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
-                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
-                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
+                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
+                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
                     _ => panic!(),
                 }
             })
         }
     }
-    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
-        #[inline(always)]
-        fn shuffle_lane_words1230(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
-                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
-                ]
-            })
-        }
-        #[inline(always)]
-        fn shuffle_lane_words2301(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
-                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
-                ]
-            })
-        }
-        #[inline(always)]
-        fn shuffle_lane_words3012(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
-                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
-                ]
-            })
-        }
-    }
-    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
-    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
 
     macro_rules! shuf_lane_bytes {
         ($name:ident, $k0:expr, $k1:expr) => {
-        #[inline(always)]
-        fn $name(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi8(
-                        self.x[0],
-                        _mm256_set_epi64x($k0, $k1, $k0, $k1),
-                    ),
-                    _mm256_shuffle_epi8(
-                        self.x[1],
-                        _mm256_set_epi64x($k0, $k1, $k0, $k1),
-                    )
-                ]
+            #[inline(always)]
+            fn $name(self) -> Self {
+                Self::new(unsafe {
+                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
                 })
             }
         };
@@ -1514,52 +1479,41 @@ pub mod avx2 {
             #[inline(always)]
             fn $name(self) -> Self {
                 Self::new(unsafe {
-                    [
-                        _mm256_or_si256(
-                            _mm256_srli_epi32(self.x[0], $i as i32),
-                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
-                        ),
-                        _mm256_or_si256(
-                            _mm256_srli_epi32(self.x[1], $i as i32),
-                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
-                        )
-                    ]
+                    _mm256_or_si256(
+                        _mm256_srli_epi32(self.x, $i as i32),
+                        _mm256_slli_epi32(self.x, 32 - $i as i32),
+                    )
                 })
             }
         };
     }
-    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
+    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
         rotr_32!(rotate_each_word_right7, 7);
         shuf_lane_bytes!(
             rotate_each_word_right8,
-            0x0c0f0e0d_080b0a09,
-            0x04070605_00030201
+            0x0c0f_0e0d_080b_0a09,
+            0x0407_0605_0003_0201
        );
         rotr_32!(rotate_each_word_right11, 11);
         rotr_32!(rotate_each_word_right12, 12);
         shuf_lane_bytes!(
             rotate_each_word_right16,
-            0x0d0c0f0e_09080b0a,
-            0x05040706_01000302
+            0x0d0c_0f0e_0908_0b0a,
+            0x0504_0706_0100_0302
         );
         rotr_32!(rotate_each_word_right20, 20);
         shuf_lane_bytes!(
             rotate_each_word_right24,
-            0x0e0d0c0f_0a09080b,
-            0x06050407_02010003
+            0x0e0d_0c0f_0a09_080b,
+            0x0605_0407_0201_0003
         );
         rotr_32!(rotate_each_word_right25, 25);
     }
-    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
-    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
+    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
         #[inline(always)]
-        fn from(x: u32x4x4_avx2<NI>) -> Self {
-            Self {
-                avx: [
-                    vec256_storage { avx: x.x[0] },
-                    vec256_storage { avx: x.x[1] },
-                ],
-            }
+        fn from(x: u32x4x2_avx2<NI>) -> Self {
+            Self { avx: x.x }
         }
     }
@@ -1576,55 +1530,172 @@ pub mod avx2 {
             }
         };
     }
-    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
-    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
-    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
-    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
+    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
+    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
+    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
+    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
 
-    macro_rules! impl_bitop_x2 {
+    macro_rules! impl_bitop {
         ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
             impl<NI> $Op for $vec<NI> {
                 type Output = Self;
                 #[inline(always)]
                 fn $op_fn(self, rhs: Self) -> Self::Output {
-                    Self::new(unsafe {
-                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
-                    })
+                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
                 }
             }
         };
     }
-    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
-    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
-    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
-    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
-    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
+    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
+    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
+    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
+    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
+    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
 
-    impl<NI> Not for u32x4x4_avx2<NI> {
+    impl<NI> Not for u32x4x2_avx2<NI> {
         type Output = Self;
         #[inline(always)]
         fn not(self) -> Self::Output {
             unsafe {
                 let f = _mm256_set1_epi8(-0x7f);
-                Self::new([f, f]) ^ self
+                Self::new(f) ^ self
             }
         }
     }
 
-    impl<NI> BSwap for u32x4x4_avx2<NI> {
+    impl<NI> BSwap for u32x4x2_avx2<NI> {
         shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
     }
 
-    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
+    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
     where
         NI: Copy,
     {
         #[inline(always)]
+        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
+            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
+        }
+    }
+
+    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn shuffle_lane_words1230(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words2301(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words3012(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
+        }
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////////////////
+
+    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
+    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
+
+    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        unsafe fn unpack(p: vec512_storage) -> Self {
+            Self::new([
+                u32x4x2_avx2::unpack(p.avx[0]),
+                u32x4x2_avx2::unpack(p.avx[1]),
+            ])
+        }
+    }
+    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+            let [a, b] = self.0[0].to_lanes();
+            let [c, d] = self.0[1].to_lanes();
+            [a, b, c, d]
+        }
+        #[inline(always)]
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
+            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
+            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
+            Self::new([ab, cd])
+        }
+    }
+    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
+            match i {
+                0 => self.0[0].extract(0),
+                1 => self.0[0].extract(1),
+                2 => self.0[1].extract(0),
+                3 => self.0[1].extract(1),
+                _ => panic!(),
+            }
+        }
+        #[inline(always)]
+        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
+            Self::new(match i {
+                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
+                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
+                _ => panic!(),
+            })
+        }
+    }
+    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
+            /*
+             * a00:a01 a10:a11
+             * b00:b01 b10:b11
+             * c00:c01 c10:c11
+             * d00:d01 d10:d11
+             *     =>
+             * a00:b00 c00:d00
+             * a01:b01 c01:d01
+             * a10:b10 c10:d10
+             * a11:b11 c11:d11
+             */
+            unsafe {
+                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
+                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
+                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
+                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
+                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
+                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
+                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
+                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
+                (
+                    Self::new([ab00, cd00]),
+                    Self::new([ab01, cd01]),
+                    Self::new([ab10, cd10]),
+                    Self::new([ab11, cd11]),
+                )
+            }
+        }
+    }
+    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn to_scalars(self) -> [u32; 16] {
+            unsafe { core::mem::transmute(self) }
+        }
+    }
+    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
+        #[inline(always)]
+        fn from(x: u32x4x4_avx2<NI>) -> Self {
+            Self {
+                avx: [
+                    vec256_storage { avx: x.0[0].x },
+                    vec256_storage { avx: x.0[1].x },
+                ],
+            }
+        }
+    }
+    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
         fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
             Self::new(unsafe {
                 [
-                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
-                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
+                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
+                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
                 ]
             })
         }
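Three notes on techniques appearing in this update follow, each with a small self-contained Rust sketch; demo names and values are mine, not the crate's.

The mod.rs hunks convert the storage types' `Into` impls into `From` impls. Nothing changes for callers: the standard library's blanket `impl<T, U> Into<U> for T where U: From<T>` supplies the `Into` direction automatically, so implementing `From` is the more general spelling (and the one clippy's `from_over_into` lint asks for). A minimal sketch of the pattern, using a hypothetical `Storage` stand-in rather than the crate's `vec128_storage`:

// `Storage` is a stand-in type for illustration only.
struct Storage([u32; 4]);

// One From impl...
impl From<[u32; 4]> for Storage {
    #[inline(always)]
    fn from(u32x4: [u32; 4]) -> Self {
        Storage(u32x4)
    }
}

fn main() {
    // ...provides both spellings of the conversion via the blanket Into impl.
    let a = Storage::from([1, 2, 3, 4]);
    let b: Storage = [5, 6, 7, 8].into();
    assert_eq!(a.0[3] + b.0[3], 12);
}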
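The rotation hunks are largely reformatting (rustfmt indentation and `_`-grouped hex literals), but the constants are worth decoding: on SSSE3 (`rotr_32_s3!`) and AVX2 (`shuf_lane_bytes!`), rotates by a whole number of bytes are done with a single byte shuffle instead of the shift-shift-OR fallback in `rotr_32!`. The mask sends output byte j of each 32-bit lane to input byte (j + 1) % 4, which is exactly a rotate right by 8. A standalone check of the mask used by `rotate_each_word_right8` (the demo function is hypothetical; the constants are taken from the diff):

use core::arch::x86_64::*;

#[target_feature(enable = "ssse3")]
unsafe fn rotr8_via_pshufb(words: [u32; 4]) -> [u32; 4] {
    let x = _mm_loadu_si128(words.as_ptr() as *const __m128i);
    // Byte-shuffle mask from the diff: per-lane source bytes (1, 2, 3, 0),
    // (5, 6, 7, 4), (9, 10, 11, 8), (13, 14, 15, 12).
    let mask = _mm_set_epi64x(0x0c0f_0e0d_080b_0a09, 0x0407_0605_0003_0201);
    let y = _mm_shuffle_epi8(x, mask);
    let mut out = [0u32; 4];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, y);
    out
}

fn main() {
    if is_x86_feature_detected!("ssse3") {
        let words = [0x1122_3344, 0x5566_7788, 0x99aa_bbcc, 0xddee_ff00];
        let rotated = unsafe { rotr8_via_pshufb(words) };
        for (r, w) in rotated.iter().zip(words.iter()) {
            // Matches the plain scalar rotate computed by the two-shift fallback.
            assert_eq!(*r, w.rotate_right(8));
        }
    }
}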
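Finally, the new `Vec4Ext::transpose4` builds its outputs entirely from `_mm256_permute2x128_si256`, which assembles each 128-bit half of its result from either half of either operand: selector `0x20` yields (low of a, low of b) and `0x31` yields (high of a, high of b), so eight permutes transpose four registers at 128-bit granularity. A sketch of just that pairing, written against current `core::arch` where the selector is a const generic (the vendored code passes it as a third argument, the older intrinsic signature):

use core::arch::x86_64::*;

#[target_feature(enable = "avx2")]
unsafe fn pair_halves() {
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); // halves: 0..=3 | 4..=7
    let b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15); // halves: 8..=11 | 12..=15
    let lo = _mm256_permute2x128_si256::<0x20>(a, b); // (low a, low b)
    let hi = _mm256_permute2x128_si256::<0x31>(a, b); // (high a, high b)
    let mut out = [0i32; 8];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, lo);
    assert_eq!(out, [0, 1, 2, 3, 8, 9, 10, 11]);
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, hi);
    assert_eq!(out, [4, 5, 6, 7, 12, 13, 14, 15]);
}

fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { pair_halves() };
    }
}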