From ef24de24a82fe681581cc130f342363c47c0969a Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Fri, 7 Jun 2024 07:48:48 +0200
Subject: Merging upstream version 1.75.0+dfsg1.

Signed-off-by: Daniel Baumann
---
 .../src/intrinsics/cpuid.rs                        |  74 ---
 .../rustc_codegen_cranelift/src/intrinsics/llvm.rs |  19 +-
 .../src/intrinsics/llvm_aarch64.rs                 | 112 +++-
 .../src/intrinsics/llvm_x86.rs                     | 606 ++++++++++++++++++---
 .../rustc_codegen_cranelift/src/intrinsics/mod.rs  |  66 ++-
 .../rustc_codegen_cranelift/src/intrinsics/simd.rs |  39 +-
 6 files changed, 753 insertions(+), 163 deletions(-)
 delete mode 100644 compiler/rustc_codegen_cranelift/src/intrinsics/cpuid.rs

diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/cpuid.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/cpuid.rs
deleted file mode 100644
index 5120b89c4..000000000
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/cpuid.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-//! Emulation of a subset of the cpuid x86 instruction.
-
-use crate::prelude::*;
-
-/// Emulates a subset of the cpuid x86 instruction.
-///
-/// This emulates an intel cpu with sse and sse2 support, but which doesn't support anything else.
-pub(crate) fn codegen_cpuid_call<'tcx>(
-    fx: &mut FunctionCx<'_, '_, 'tcx>,
-    leaf: Value,
-    _sub_leaf: Value,
-) -> (Value, Value, Value, Value) {
-    let leaf_0 = fx.bcx.create_block();
-    let leaf_1 = fx.bcx.create_block();
-    let leaf_7 = fx.bcx.create_block();
-    let leaf_8000_0000 = fx.bcx.create_block();
-    let leaf_8000_0001 = fx.bcx.create_block();
-    let unsupported_leaf = fx.bcx.create_block();
-
-    let dest = fx.bcx.create_block();
-    let eax = fx.bcx.append_block_param(dest, types::I32);
-    let ebx = fx.bcx.append_block_param(dest, types::I32);
-    let ecx = fx.bcx.append_block_param(dest, types::I32);
-    let edx = fx.bcx.append_block_param(dest, types::I32);
-
-    let mut switch = cranelift_frontend::Switch::new();
-    switch.set_entry(0, leaf_0);
-    switch.set_entry(1, leaf_1);
-    switch.set_entry(7, leaf_7);
-    switch.set_entry(0x8000_0000, leaf_8000_0000);
-    switch.set_entry(0x8000_0001, leaf_8000_0001);
-    switch.emit(&mut fx.bcx, leaf, unsupported_leaf);
-
-    fx.bcx.switch_to_block(leaf_0);
-    let max_basic_leaf = fx.bcx.ins().iconst(types::I32, 1);
-    let vend0 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"Genu")));
-    let vend2 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"ineI")));
-    let vend1 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"ntel")));
-    fx.bcx.ins().jump(dest, &[max_basic_leaf, vend0, vend1, vend2]);
-
-    fx.bcx.switch_to_block(leaf_1);
-    let cpu_signature = fx.bcx.ins().iconst(types::I32, 0);
-    let additional_information = fx.bcx.ins().iconst(types::I32, 0);
-    let ecx_features = fx.bcx.ins().iconst(types::I32, 0);
-    let edx_features = fx.bcx.ins().iconst(types::I32, 1 << 25 /* sse */ | 1 << 26 /* sse2 */);
-    fx.bcx.ins().jump(dest, &[cpu_signature, additional_information, ecx_features, edx_features]);
-
-    fx.bcx.switch_to_block(leaf_7);
-    // This leaf technically has subleaves, but we just return zero for all subleaves.
-    let zero = fx.bcx.ins().iconst(types::I32, 0);
-    fx.bcx.ins().jump(dest, &[zero, zero, zero, zero]);
-
-    fx.bcx.switch_to_block(leaf_8000_0000);
-    let extended_max_basic_leaf = fx.bcx.ins().iconst(types::I32, 0);
-    let zero = fx.bcx.ins().iconst(types::I32, 0);
-    fx.bcx.ins().jump(dest, &[extended_max_basic_leaf, zero, zero, zero]);
-
-    fx.bcx.switch_to_block(leaf_8000_0001);
-    let zero = fx.bcx.ins().iconst(types::I32, 0);
-    let proc_info_ecx = fx.bcx.ins().iconst(types::I32, 0);
-    let proc_info_edx = fx.bcx.ins().iconst(types::I32, 0);
-    fx.bcx.ins().jump(dest, &[zero, zero, proc_info_ecx, proc_info_edx]);
-
-    fx.bcx.switch_to_block(unsupported_leaf);
-    crate::trap::trap_unimplemented(
-        fx,
-        "__cpuid_count arch intrinsic doesn't yet support specified leaf",
-    );
-
-    fx.bcx.switch_to_block(dest);
-    fx.bcx.ins().nop();
-
-    (eax, ebx, ecx, edx)
-}
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
index 63b5402f2..e9b7daf14 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
@@ -1,10 +1,10 @@
 //! Emulate LLVM intrinsics
 
+use rustc_middle::ty::GenericArgsRef;
+
 use crate::intrinsics::*;
 use crate::prelude::*;
 
-use rustc_middle::ty::GenericArgsRef;
-
 pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     intrinsic: &str,
@@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
             });
         }
 
+        _ if intrinsic.starts_with("llvm.fma.v") => {
+            intrinsic_args!(fx, args => (x,y,z); intrinsic);
+
+            simd_trio_for_each_lane(
+                fx,
+                x,
+                y,
+                z,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| {
+                    fx.bcx.ins().fma(lane_x, lane_y, lane_z)
+                },
+            );
+        }
+
         _ => {
             fx.tcx
                 .sess
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
index c20a99159..ee098be1f 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
@@ -1,10 +1,10 @@
 //! 
Emulate AArch64 LLVM intrinsics +use rustc_middle::ty::GenericArgsRef; + use crate::intrinsics::*; use crate::prelude::*; -use rustc_middle::ty::GenericArgsRef; - pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( fx: &mut FunctionCx<'_, '_, 'tcx>, intrinsic: &str, @@ -44,7 +44,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -52,7 +54,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -156,6 +160,106 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane), + ); + } + + // FIXME generalize vector types + "llvm.aarch64.neon.tbl1.v16i8" => { + intrinsic_args!(fx, args => (t, idx); intrinsic); + + let zero = fx.bcx.ins().iconst(types::I8, 0); + for i in 0..16 { + let idx_lane = idx.value_lane(fx, i).load_scalar(fx); + let is_zero = + fx.bcx.ins().icmp_imm(IntCC::UnsignedGreaterThanOrEqual, idx_lane, 16); + let t_idx = fx.bcx.ins().uextend(fx.pointer_type, idx_lane); + let t_lane = t.value_lane_dyn(fx, t_idx).load_scalar(fx); + let res = fx.bcx.ins().select(is_zero, zero, t_lane); + ret.place_lane(fx, 
i).to_ptr().store(fx, res, MemFlags::trusted()); + } + } + /* _ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v") || intrinsic.starts_with("llvm.aarch64.neon.sqshl.v") diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs index e62de6b61..4c5360486 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs @@ -1,10 +1,10 @@ //! Emulate x86 LLVM intrinsics +use rustc_middle::ty::GenericArgsRef; + use crate::intrinsics::*; use crate::prelude::*; -use rustc_middle::ty::GenericArgsRef; - pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( fx: &mut FunctionCx<'_, '_, 'tcx>, intrinsic: &str, @@ -20,53 +20,23 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( // Used by is_x86_feature_detected!(); "llvm.x86.xgetbv" => { - // FIXME use the actual xgetbv instruction - intrinsic_args!(fx, args => (v); intrinsic); - - let v = v.load_scalar(fx); + intrinsic_args!(fx, args => (xcr_no); intrinsic); - // As of writing on XCR0 exists - fx.bcx.ins().trapnz(v, TrapCode::UnreachableCodeReached); + let xcr_no = xcr_no.load_scalar(fx); - let res = fx.bcx.ins().iconst(types::I64, 1 /* bit 0 must be set */); - ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64))); + crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret); } - // Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8` - "llvm.x86.sse2.pmovmskb.128" - | "llvm.x86.avx2.pmovmskb" - | "llvm.x86.sse.movmsk.ps" - | "llvm.x86.sse2.movmsk.pd" => { - intrinsic_args!(fx, args => (a); intrinsic); - - let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx); - let lane_ty = fx.clif_type(lane_ty).unwrap(); - assert!(lane_count <= 32); - - let mut res = fx.bcx.ins().iconst(types::I32, 0); - - for lane in (0..lane_count).rev() { - let a_lane = a.value_lane(fx, lane).load_scalar(fx); + "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128&ig_expand=4009 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256&ig_expand=4010 + intrinsic_args!(fx, args => (ptr); intrinsic); - // cast float to int - let a_lane = match lane_ty { - types::F32 => codegen_bitcast(fx, types::I32, a_lane), - types::F64 => codegen_bitcast(fx, types::I64, a_lane), - _ => a_lane, - }; - - // extract sign bit of an int - let a_lane_sign = fx.bcx.ins().ushr_imm(a_lane, i64::from(lane_ty.bits() - 1)); - - // shift sign bit into result - let a_lane_sign = clif_intcast(fx, a_lane_sign, types::I32, false); - res = fx.bcx.ins().ishl_imm(res, 1); - res = fx.bcx.ins().bor(res, a_lane_sign); - } - - let res = CValue::by_val(res, fx.layout_of(fx.tcx.types.i32)); - ret.write_cvalue(fx, res); + // FIXME correctly handle unalignedness + let val = CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), ret.layout()); + ret.write_cvalue(fx, val); } + "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => { let (x, y, kind) = match args { [x, y, kind] => (x, y, kind), @@ -74,8 +44,10 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( }; let x = codegen_operand(fx, x); let y = codegen_operand(fx, y); - let kind = crate::constant::mir_operand_get_const_val(fx, kind) - .expect("llvm.x86.sse2.cmp.* kind not const"); + let kind = match kind { + Operand::Constant(const_) => crate::constant::eval_mir_constant(fx, const_).0, + Operand::Copy(_) | Operand::Move(_) => 
unreachable!("{kind:?}"), + }; let flt_cc = match kind .try_to_bits(Size::from_bytes(1)) @@ -210,8 +182,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } } } - "llvm.x86.avx2.vperm2i128" => { + "llvm.x86.avx2.vperm2i128" + | "llvm.x86.avx.vperm2f128.ps.256" + | "llvm.x86.avx.vperm2f128.pd.256" => { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd let (a, b, imm8) = match args { [a, b, imm8] => (a, b, imm8), _ => bug!("wrong number of args for intrinsic {intrinsic}"), @@ -220,19 +196,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let b = codegen_operand(fx, b); let imm8 = codegen_operand(fx, imm8).load_scalar(fx); - let a_0 = a.value_lane(fx, 0).load_scalar(fx); - let a_1 = a.value_lane(fx, 1).load_scalar(fx); - let a_low = fx.bcx.ins().iconcat(a_0, a_1); - let a_2 = a.value_lane(fx, 2).load_scalar(fx); - let a_3 = a.value_lane(fx, 3).load_scalar(fx); - let a_high = fx.bcx.ins().iconcat(a_2, a_3); + let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); - let b_0 = b.value_lane(fx, 0).load_scalar(fx); - let b_1 = b.value_lane(fx, 1).load_scalar(fx); - let b_low = fx.bcx.ins().iconcat(b_0, b_1); - let b_2 = b.value_lane(fx, 2).load_scalar(fx); - let b_3 = b.value_lane(fx, 3).load_scalar(fx); - let b_high = fx.bcx.ins().iconcat(b_2, b_3); + let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); fn select4( fx: &mut FunctionCx<'_, '_, '_>, @@ -257,16 +225,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let control0 = imm8; let res_low = select4(fx, a_high, a_low, b_high, b_low, control0); - let (res_0, res_1) = fx.bcx.ins().isplit(res_low); let control1 = fx.bcx.ins().ushr_imm(imm8, 4); let res_high = select4(fx, a_high, a_low, b_high, b_low, control1); - let (res_2, res_3) = fx.bcx.ins().isplit(res_high); - ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted()); - ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted()); - ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted()); - ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted()); + ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store( + fx, + res_low, + MemFlags::trusted(), + ); + ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store( + fx, + res_high, + MemFlags::trusted(), + ); } "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => { let a = match args { @@ -308,6 +280,512 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let val = CValue::by_val_pair(cb_out, c, layout); ret.write_cvalue(fx, val); } + "llvm.x86.sse2.pavg.b" | "llvm.x86.sse2.pavg.w" => { + intrinsic_args!(fx, args => (a, b); intrinsic); + + // FIXME use vector instructions when possible + simd_pair_for_each_lane( + fx, + a, + b, + ret, + &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| { + // (a + b + 1) >> 1 + let lane_ty = fx.bcx.func.dfg.value_type(a_lane); + let a_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), a_lane); + let b_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), b_lane); + let sum = fx.bcx.ins().iadd(a_lane, b_lane); + let 
num_plus_one = fx.bcx.ins().iadd_imm(sum, 1);
+                    let res = fx.bcx.ins().ushr_imm(num_plus_one, 1);
+                    fx.bcx.ins().ireduce(lane_ty, res)
+                },
+            );
+        }
+        "llvm.x86.sse2.psra.w" => {
+            intrinsic_args!(fx, args => (a, count); intrinsic);
+
+            let count_lane = count.force_stack(fx).0.load(fx, types::I64, MemFlags::trusted());
+            let lane_ty = fx.clif_type(a.layout().ty.simd_size_and_type(fx.tcx).1).unwrap();
+            let max_count = fx.bcx.ins().iconst(types::I64, i64::from(lane_ty.bits() - 1));
+            let saturated_count = fx.bcx.ins().umin(count_lane, max_count);
+
+            // FIXME use vector instructions when possible
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, a_lane| {
+                fx.bcx.ins().sshr(a_lane, saturated_count)
+            });
+        }
+        "llvm.x86.sse2.psad.bw" | "llvm.x86.avx2.psad.bw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8&ig_expand=5770
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8&ig_expand=5771
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.u8);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u64);
+            assert_eq!(lane_count, ret_lane_count * 8);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u64);
+            for out_lane_idx in 0..lane_count / 8 {
+                let mut lane_diff_acc = fx.bcx.ins().iconst(types::I64, 0);
+
+                for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 8 {
+                    let a_lane = a.value_lane(fx, lane_idx).load_scalar(fx);
+                    let a_lane = fx.bcx.ins().uextend(types::I16, a_lane);
+                    let b_lane = b.value_lane(fx, lane_idx).load_scalar(fx);
+                    let b_lane = fx.bcx.ins().uextend(types::I16, b_lane);
+
+                    let lane_diff = fx.bcx.ins().isub(a_lane, b_lane);
+                    let abs_lane_diff = fx.bcx.ins().iabs(lane_diff);
+                    let abs_lane_diff = fx.bcx.ins().uextend(types::I64, abs_lane_diff);
+                    lane_diff_acc = fx.bcx.ins().iadd(lane_diff_acc, abs_lane_diff);
+                }
+
+                let res_lane = CValue::by_val(lane_diff_acc, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+        "llvm.x86.ssse3.pmadd.ub.sw.128" | "llvm.x86.avx2.pmadd.ub.sw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16&ig_expand=4267
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&ig_expand=4270
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.u8);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count, ret_lane_count * 2);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+            for out_lane_idx in 0..lane_count / 2 {
+                let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let a_lane0 = fx.bcx.ins().uextend(types::I16, a_lane0);
+                let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let b_lane0 = fx.bcx.ins().sextend(types::I16, b_lane0);
+
+                let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let a_lane1 = fx.bcx.ins().uextend(types::I16, a_lane1);
+                let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let b_lane1 = fx.bcx.ins().sextend(types::I16, b_lane1);
+
+                let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0);
+                let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1);
+
+                let (val, has_overflow) = 
fx.bcx.ins().sadd_overflow(mul0, mul1); + + let rhs_ge_zero = fx.bcx.ins().icmp_imm(IntCC::SignedGreaterThanOrEqual, mul1, 0); + + let min = fx.bcx.ins().iconst(types::I16, i64::from(i16::MIN as u16)); + let max = fx.bcx.ins().iconst(types::I16, i64::from(i16::MAX as u16)); + + let sat_val = fx.bcx.ins().select(rhs_ge_zero, max, min); + let res_lane = fx.bcx.ins().select(has_overflow, sat_val, val); + + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); + } + } + "llvm.x86.sse2.pmadd.wd" | "llvm.x86.avx2.pmadd.wd" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16&ig_expand=4231 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16&ig_expand=4234 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.i32); + assert_eq!(lane_count, ret_lane_count * 2); + + let ret_lane_layout = fx.layout_of(fx.tcx.types.i32); + for out_lane_idx in 0..lane_count / 2 { + let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx); + let a_lane0 = fx.bcx.ins().uextend(types::I32, a_lane0); + let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx); + let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0); + + let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx); + let a_lane1 = fx.bcx.ins().uextend(types::I32, a_lane1); + let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx); + let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1); + + let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0); + let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1); + + let res_lane = fx.bcx.ins().iadd(mul0, mul1); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.ssse3.pmul.hr.sw.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count, ret_lane_count); + + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + for out_lane_idx in 0..lane_count { + let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx); + let a_lane = fx.bcx.ins().sextend(types::I32, a_lane); + let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx); + let b_lane = fx.bcx.ins().sextend(types::I32, b_lane); + + let mul: Value = fx.bcx.ins().imul(a_lane, b_lane); + let shifted = fx.bcx.ins().ushr_imm(mul, 14); + let incremented = fx.bcx.ins().iadd_imm(shifted, 1); + let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1); + + let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse2.packuswb.128" => { + // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.avx2.packuswb" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse2.packssdw.128" => {
+            // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16)); + let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse41.packusdw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.u16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN)); + let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u16); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().umax(lane, min_u16); + let sat = fx.bcx.ins().umin(sat, max_u16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().umax(lane, min_u16); + let sat = fx.bcx.ins().umin(sat, max_u16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.avx2.packssdw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16)); + let max_i16 = fx.bcx.ins().iconst(types::I32, 
i64::from(i16::MAX as u16));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, lane_count / 2 + idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.pclmulqdq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
+            intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i64);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i64);
+            assert_eq!(lane_count, 2);
+            assert_eq!(ret_lane_count, 2);
+
+            let imm8 = imm8.load_scalar(fx);
+
+            let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
+            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
+            let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
+
+            let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
+            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
+            let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
+
+            fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
+                let tmp = fx.bcx.ins().ushr_imm(val, bit);
+                fx.bcx.ins().band_imm(tmp, 1)
+            }
+
+            let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 0..=63 {
+                let x = extract_bit(fx, temp1, 0);
+                let y = extract_bit(fx, temp2, i);
+                let mut temp = fx.bcx.ins().band(x, y);
+                for j in 1..=i {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    temp = fx.bcx.ins().bxor(temp, z);
+                }
+                let temp = fx.bcx.ins().ishl_imm(temp, i);
+                res1 = fx.bcx.ins().bor(res1, temp);
+            }
+            ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
+
+            let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 64..=127 {
+                let mut temp = fx.bcx.ins().iconst(types::I64, 0);
+                for j in i - 63..=63 {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    
temp = fx.bcx.ins().bxor(temp, z); + } + let temp = fx.bcx.ins().ishl_imm(temp, i); + res2 = fx.bcx.ins().bor(res2, temp); + } + ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted()); + } + + "llvm.x86.avx.ptestz.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i64); + assert_eq!(ret.layout().ty, fx.tcx.types.i32); + assert_eq!(lane_count, 4); + + let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); + let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); + let a_lane2 = a.value_lane(fx, 2).load_scalar(fx); + let a_lane3 = a.value_lane(fx, 3).load_scalar(fx); + let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); + let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); + let b_lane2 = b.value_lane(fx, 2).load_scalar(fx); + let b_lane3 = b.value_lane(fx, 3).load_scalar(fx); + + let zero0 = fx.bcx.ins().band(a_lane0, b_lane0); + let zero1 = fx.bcx.ins().band(a_lane1, b_lane1); + let zero2 = fx.bcx.ins().band(a_lane2, b_lane2); + let zero3 = fx.bcx.ins().band(a_lane3, b_lane3); + + let all_zero0 = fx.bcx.ins().bor(zero0, zero1); + let all_zero1 = fx.bcx.ins().bor(zero2, zero3); + let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1); + + let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0); + let res = CValue::by_val( + fx.bcx.ins().uextend(types::I32, res), + fx.layout_of(fx.tcx.types.i32), + ); + ret.write_cvalue(fx, res); + } + _ => { fx.tcx .sess diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index 36e9ba9c7..bfeeb117f 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -12,23 +12,20 @@ macro_rules! intrinsic_args { } } -mod cpuid; mod llvm; mod llvm_aarch64; mod llvm_x86; mod simd; -pub(crate) use cpuid::codegen_cpuid_call; -pub(crate) use llvm::codegen_llvm_intrinsic_call; - +use cranelift_codegen::ir::AtomicRmwOp; use rustc_middle::ty; use rustc_middle::ty::layout::{HasParamEnv, ValidityRequirement}; use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths}; use rustc_middle::ty::GenericArgsRef; use rustc_span::symbol::{kw, sym, Symbol}; +pub(crate) use self::llvm::codegen_llvm_intrinsic_call; use crate::prelude::*; -use cranelift_codegen::ir::AtomicRmwOp; fn bug_on_incorrect_arg_count(intrinsic: impl std::fmt::Display) -> ! 
{
     bug!("wrong number of args for intrinsic {}", intrinsic);
@@ -135,6 +132,65 @@ fn simd_pair_for_each_lane<'tcx>(
     }
 }
 
+fn simd_horizontal_pair_for_each_lane<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    x: CValue<'tcx>,
+    y: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value,
+) {
+    assert_eq!(x.layout(), y.layout());
+    let layout = x.layout();
+
+    let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let lane_layout = fx.layout_of(lane_ty);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    let ret_lane_layout = fx.layout_of(ret_lane_ty);
+    assert_eq!(lane_count, ret_lane_count);
+
+    for lane_idx in 0..lane_count {
+        let src = if lane_idx < (lane_count / 2) { x } else { y };
+        let src_idx = lane_idx % (lane_count / 2);
+
+        let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx);
+        let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx);
+
+        let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane);
+        let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+        ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+    }
+}
+
+fn simd_trio_for_each_lane<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    x: CValue<'tcx>,
+    y: CValue<'tcx>,
+    z: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value,
+) {
+    assert_eq!(x.layout(), y.layout());
+    let layout = x.layout();
+
+    let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let lane_layout = fx.layout_of(lane_ty);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    let ret_lane_layout = fx.layout_of(ret_lane_ty);
+    assert_eq!(lane_count, ret_lane_count);
+
+    for lane_idx in 0..lane_count {
+        let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx);
+        let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx);
+        let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx);
+
+        let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane);
+        let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+        ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+    }
+}
+
 fn simd_reduce<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     val: CValue<'tcx>,
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
index 6efbe1498..ea137c4ca 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
@@ -148,7 +148,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
         let total_len = lane_count * 2;
 
         let indexes =
-            idx.iter().map(|idx| idx.unwrap_leaf().try_to_u16().unwrap()).collect::<Vec<u16>>();
+            idx.iter().map(|idx| idx.unwrap_leaf().try_to_u32().unwrap()).collect::<Vec<u32>>();
 
         for &idx in &indexes {
             assert!(u64::from(idx) < total_len, "idx {} out of range 0..{}", idx, total_len);
@@ -216,8 +216,10 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
         let indexes = {
             use rustc_middle::mir::interpret::*;
 
-            let idx_const = crate::constant::mir_operand_get_const_val(fx, idx)
-                .expect("simd_shuffle idx not const");
+            let idx_const = match idx {
+                Operand::Constant(const_) => crate::constant::eval_mir_constant(fx, const_).0,
+                Operand::Copy(_) | Operand::Move(_) => unreachable!("{idx:?}"),
+            };
 
             let idx_bytes = match idx_const {
                 ConstValue::Indirect { alloc_id, offset } => {
@@ -343,7 +345,11 @@
ret.write_cvalue(fx, ret_lane); } - sym::simd_neg => { + sym::simd_neg + | sym::simd_bswap + | sym::simd_bitreverse + | sym::simd_ctlz + | sym::simd_cttz => { intrinsic_args!(fx, args => (a); intrinsic); if !a.layout().ty.is_simd() { @@ -351,16 +357,21 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( return; } - simd_for_each_lane( - fx, - a, - ret, - &|fx, lane_ty, _ret_lane_ty, lane| match lane_ty.kind() { - ty::Int(_) => fx.bcx.ins().ineg(lane), - ty::Float(_) => fx.bcx.ins().fneg(lane), - _ => unreachable!(), - }, - ); + simd_for_each_lane(fx, a, ret, &|fx, lane_ty, _ret_lane_ty, lane| match ( + lane_ty.kind(), + intrinsic, + ) { + (ty::Int(_), sym::simd_neg) => fx.bcx.ins().ineg(lane), + (ty::Float(_), sym::simd_neg) => fx.bcx.ins().fneg(lane), + + (ty::Uint(ty::UintTy::U8) | ty::Int(ty::IntTy::I8), sym::simd_bswap) => lane, + (ty::Uint(_) | ty::Int(_), sym::simd_bswap) => fx.bcx.ins().bswap(lane), + (ty::Uint(_) | ty::Int(_), sym::simd_bitreverse) => fx.bcx.ins().bitrev(lane), + (ty::Uint(_) | ty::Int(_), sym::simd_ctlz) => fx.bcx.ins().clz(lane), + (ty::Uint(_) | ty::Int(_), sym::simd_cttz) => fx.bcx.ins().ctz(lane), + + _ => unreachable!(), + }); } sym::simd_add -- cgit v1.2.3
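
The lane mapping of the new simd_horizontal_pair_for_each_lane helper in mod.rs, as the
llvm.aarch64.neon.addp.v* arm uses it, can be modeled in plain Rust. The sketch below is an
editorial illustration, not part of the commit; the function name horizontal_pair_add is
made up, and wrapping addition stands in for the Cranelift iadd lane operation.

// Scalar model of simd_horizontal_pair_for_each_lane specialized to
// integer addition (the addp case). Result lane i reads the adjacent
// pair (2*i', 2*i' + 1), with i' = i mod (n/2), from x for the low half
// of the output and from y for the high half, mirroring the codegen loop.
fn horizontal_pair_add(x: &[i32], y: &[i32]) -> Vec<i32> {
    assert_eq!(x.len(), y.len());
    let lane_count = x.len();
    (0..lane_count)
        .map(|lane_idx| {
            let src = if lane_idx < lane_count / 2 { x } else { y };
            let src_idx = lane_idx % (lane_count / 2);
            src[src_idx * 2].wrapping_add(src[src_idx * 2 + 1])
        })
        .collect()
}

fn main() {
    // addp on 4 lanes: [x0+x1, x2+x3, y0+y1, y2+y3]
    assert_eq!(horizontal_pair_add(&[1, 2, 3, 4], &[10, 20, 30, 40]), [3, 7, 30, 70]);
}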