// Copyright 2016 Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use packed_simd::u16x8; use packed_simd::u8x16; use packed_simd::FromBits; // TODO: Migrate unaligned access to stdlib code if/when the RFC // https://github.com/rust-lang/rfcs/pull/1725 is implemented. #[inline(always)] pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 { let mut simd = ::std::mem::uninitialized(); ::std::ptr::copy_nonoverlapping(ptr, &mut simd as *mut u8x16 as *mut u8, 16); simd } #[allow(dead_code)] #[inline(always)] pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 { *(ptr as *const u8x16) } #[inline(always)] pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) { ::std::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16); } #[allow(dead_code)] #[inline(always)] pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) { *(ptr as *mut u8x16) = s; } #[inline(always)] pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 { let mut simd = ::std::mem::uninitialized(); ::std::ptr::copy_nonoverlapping(ptr as *const u8, &mut simd as *mut u16x8 as *mut u8, 16); simd } #[allow(dead_code)] #[inline(always)] pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 { *(ptr as *const u16x8) } #[inline(always)] pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) { ::std::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16); } #[allow(dead_code)] #[inline(always)] pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) { *(ptr as *mut u16x8) = s; } cfg_if! { if #[cfg(all(target_feature = "sse2", target_arch = "x86_64"))] { use std::arch::x86_64::__m128i; use std::arch::x86_64::_mm_movemask_epi8; use std::arch::x86_64::_mm_packus_epi16; } else if #[cfg(all(target_feature = "sse2", target_arch = "x86"))] { use std::arch::x86::__m128i; use std::arch::x86::_mm_movemask_epi8; use std::arch::x86::_mm_packus_epi16; } else if #[cfg(target_arch = "aarch64")]{ use std::arch::aarch64::uint8x16_t; use std::arch::aarch64::uint16x8_t; use std::arch::aarch64::vmaxvq_u8; use std::arch::aarch64::vmaxvq_u16; } else { } } // #[inline(always)] // fn simd_byte_swap_u8(s: u8x16) -> u8x16 { // unsafe { // shuffle!(s, s, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) // } // } // #[inline(always)] // pub fn simd_byte_swap(s: u16x8) -> u16x8 { // to_u16_lanes(simd_byte_swap_u8(to_u8_lanes(s))) // } #[inline(always)] pub fn simd_byte_swap(s: u16x8) -> u16x8 { let left = s << 8; let right = s >> 8; left | right } #[inline(always)] pub fn to_u16_lanes(s: u8x16) -> u16x8 { u16x8::from_bits(s) } cfg_if! { if #[cfg(target_feature = "sse2")] { // Expose low-level mask instead of higher-level conclusion, // because the non-ASCII case would perform less well otherwise. #[inline(always)] pub fn mask_ascii(s: u8x16) -> i32 { unsafe { _mm_movemask_epi8(__m128i::from_bits(s)) } } } else { } } cfg_if! { if #[cfg(target_feature = "sse2")] { #[inline(always)] pub fn simd_is_ascii(s: u8x16) -> bool { unsafe { _mm_movemask_epi8(__m128i::from_bits(s)) == 0 } } } else if #[cfg(target_arch = "aarch64")]{ #[inline(always)] pub fn simd_is_ascii(s: u8x16) -> bool { unsafe { vmaxvq_u8(uint8x16_t::from_bits(s)) < 0x80 } } } else { #[inline(always)] pub fn simd_is_ascii(s: u8x16) -> bool { // This optimizes better on ARM than // the lt formulation. let highest_ascii = u8x16::splat(0x7F); !s.gt(highest_ascii).any() } } } cfg_if! { if #[cfg(target_feature = "sse2")] { #[inline(always)] pub fn simd_is_str_latin1(s: u8x16) -> bool { if simd_is_ascii(s) { return true; } let above_str_latin1 = u8x16::splat(0xC4); s.lt(above_str_latin1).all() } } else if #[cfg(target_arch = "aarch64")]{ #[inline(always)] pub fn simd_is_str_latin1(s: u8x16) -> bool { unsafe { vmaxvq_u8(uint8x16_t::from_bits(s)) < 0xC4 } } } else { #[inline(always)] pub fn simd_is_str_latin1(s: u8x16) -> bool { let above_str_latin1 = u8x16::splat(0xC4); s.lt(above_str_latin1).all() } } } cfg_if! { if #[cfg(target_arch = "aarch64")]{ #[inline(always)] pub fn simd_is_basic_latin(s: u16x8) -> bool { unsafe { vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x80 } } #[inline(always)] pub fn simd_is_latin1(s: u16x8) -> bool { unsafe { vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x100 } } } else { #[inline(always)] pub fn simd_is_basic_latin(s: u16x8) -> bool { let above_ascii = u16x8::splat(0x80); s.lt(above_ascii).all() } #[inline(always)] pub fn simd_is_latin1(s: u16x8) -> bool { // For some reason, on SSE2 this formulation // seems faster in this case while the above // function is better the other way round... let highest_latin1 = u16x8::splat(0xFF); !s.gt(highest_latin1).any() } } } #[inline(always)] pub fn contains_surrogates(s: u16x8) -> bool { let mask = u16x8::splat(0xF800); let surrogate_bits = u16x8::splat(0xD800); (s & mask).eq(surrogate_bits).any() } cfg_if! { if #[cfg(target_arch = "aarch64")]{ macro_rules! aarch64_return_false_if_below_hebrew { ($s:ident) => ({ unsafe { if vmaxvq_u16(uint16x8_t::from_bits($s)) < 0x0590 { return false; } } }) } macro_rules! non_aarch64_return_false_if_all { ($s:ident) => () } } else { macro_rules! aarch64_return_false_if_below_hebrew { ($s:ident) => () } macro_rules! non_aarch64_return_false_if_all { ($s:ident) => ({ if $s.all() { return false; } }) } } } macro_rules! in_range16x8 { ($s:ident, $start:expr, $end:expr) => {{ // SIMD sub is wrapping ($s - u16x8::splat($start)).lt(u16x8::splat($end - $start)) }}; } #[inline(always)] pub fn is_u16x8_bidi(s: u16x8) -> bool { // We try to first quickly refute the RTLness of the vector. If that // fails, we do the real RTL check, so in that case we end up wasting // the work for the up-front quick checks. Even the quick-check is // two-fold in order to return `false` ASAP if everything is below // Hebrew. aarch64_return_false_if_below_hebrew!(s); let below_hebrew = s.lt(u16x8::splat(0x0590)); non_aarch64_return_false_if_all!(below_hebrew); if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() { return false; } // Quick refutation failed. Let's do the full check. (in_range16x8!(s, 0x0590, 0x0900) | in_range16x8!(s, 0xFB1D, 0xFE00) | in_range16x8!(s, 0xFE70, 0xFEFF) | in_range16x8!(s, 0xD802, 0xD804) | in_range16x8!(s, 0xD83A, 0xD83C) | s.eq(u16x8::splat(0x200F)) | s.eq(u16x8::splat(0x202B)) | s.eq(u16x8::splat(0x202E)) | s.eq(u16x8::splat(0x2067))) .any() } #[inline(always)] pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) { unsafe { let first: u8x16 = shuffle!( s, u8x16::splat(0), [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] ); let second: u8x16 = shuffle!( s, u8x16::splat(0), [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] ); (u16x8::from_bits(first), u16x8::from_bits(second)) } } cfg_if! { if #[cfg(target_feature = "sse2")] { #[inline(always)] pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 { unsafe { u8x16::from_bits(_mm_packus_epi16(__m128i::from_bits(a), __m128i::from_bits(b))) } } } else { #[inline(always)] pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 { unsafe { let first = u8x16::from_bits(a); let second = u8x16::from_bits(b); shuffle!( first, second, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] ) } } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_unpack() { let ascii: [u8; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let basic_latin: [u16; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let simd = unsafe { load16_unaligned(ascii.as_ptr()) }; let mut vec = Vec::with_capacity(16); vec.resize(16, 0u16); let (first, second) = simd_unpack(simd); let ptr = vec.as_mut_ptr(); unsafe { store8_unaligned(ptr, first); store8_unaligned(ptr.add(8), second); } assert_eq!(&vec[..], &basic_latin[..]); } #[test] fn test_simd_is_basic_latin_success() { let ascii: [u8; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let basic_latin: [u16; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let first = unsafe { load8_unaligned(basic_latin.as_ptr()) }; let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) }; let mut vec = Vec::with_capacity(16); vec.resize(16, 0u8); let ptr = vec.as_mut_ptr(); assert!(simd_is_basic_latin(first | second)); unsafe { store16_unaligned(ptr, simd_pack(first, second)); } assert_eq!(&vec[..], &ascii[..]); } #[test] fn test_simd_is_basic_latin_c0() { let input: [u16; 16] = [ 0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let first = unsafe { load8_unaligned(input.as_ptr()) }; let second = unsafe { load8_unaligned(input.as_ptr().add(8)) }; assert!(!simd_is_basic_latin(first | second)); } #[test] fn test_simd_is_basic_latin_0fff() { let input: [u16; 16] = [ 0x61, 0x62, 0x63, 0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let first = unsafe { load8_unaligned(input.as_ptr()) }; let second = unsafe { load8_unaligned(input.as_ptr().add(8)) }; assert!(!simd_is_basic_latin(first | second)); } #[test] fn test_simd_is_basic_latin_ffff() { let input: [u16; 16] = [ 0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let first = unsafe { load8_unaligned(input.as_ptr()) }; let second = unsafe { load8_unaligned(input.as_ptr().add(8)) }; assert!(!simd_is_basic_latin(first | second)); } #[test] fn test_simd_is_ascii_success() { let ascii: [u8; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let simd = unsafe { load16_unaligned(ascii.as_ptr()) }; assert!(simd_is_ascii(simd)); } #[test] fn test_simd_is_ascii_failure() { let input: [u8; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let simd = unsafe { load16_unaligned(input.as_ptr()) }; assert!(!simd_is_ascii(simd)); } #[cfg(target_feature = "sse2")] #[test] fn test_check_ascii() { let input: [u8; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let simd = unsafe { load16_unaligned(input.as_ptr()) }; let mask = mask_ascii(simd); assert_ne!(mask, 0); assert_eq!(mask.trailing_zeros(), 4); } #[test] fn test_alu() { let input: [u8; 16] = [ 0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, ]; let mut alu = 0u64; unsafe { ::std::ptr::copy_nonoverlapping(input.as_ptr(), &mut alu as *mut u64 as *mut u8, 8); } let masked = alu & 0x8080808080808080; assert_eq!(masked.trailing_zeros(), 39); } }