From dc0db358abe19481e475e10c32149b53370f1a1c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 30 May 2024 05:57:31 +0200 Subject: Merging upstream version 1.72.1+dfsg1. Signed-off-by: Daniel Baumann --- .../src/intrinsics/llvm_x86.rs | 200 ++++++++++++++++++++- .../rustc_codegen_cranelift/src/intrinsics/mod.rs | 23 +-- .../rustc_codegen_cranelift/src/intrinsics/simd.rs | 138 ++++++++++++-- 3 files changed, 327 insertions(+), 34 deletions(-) (limited to 'compiler/rustc_codegen_cranelift/src/intrinsics') diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs index 56d8f13ce..24ad0083a 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs @@ -19,7 +19,10 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } // Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8` - "llvm.x86.sse2.pmovmskb.128" | "llvm.x86.avx2.pmovmskb" | "llvm.x86.sse2.movmsk.pd" => { + "llvm.x86.sse2.pmovmskb.128" + | "llvm.x86.avx2.pmovmskb" + | "llvm.x86.sse.movmsk.ps" + | "llvm.x86.sse2.movmsk.pd" => { intrinsic_args!(fx, args => (a); intrinsic); let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx); @@ -101,6 +104,23 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( }); } "llvm.x86.sse2.pslli.d" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.sse2.pslli.d imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.sse2.psrli.w" => { let (a, imm8) = match args { [a, imm8] => (a, imm8), _ => bug!("wrong number of args for intrinsic {intrinsic}"), @@ -109,6 +129,57 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) .expect("llvm.x86.sse2.psrli.d imm8 not const"); + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.sse2.pslli.w" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.sse2.pslli.d imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.avx.psrli.d" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.psrli.d imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.avx.pslli.d" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.pslli.d imm8 not const"); + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 .try_to_bits(Size::from_bytes(4)) .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) @@ -117,6 +188,131 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( _ => fx.bcx.ins().iconst(types::I32, 0), }); } + "llvm.x86.avx2.psrli.w" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.psrli.w imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.avx2.pslli.w" => { + let (a, imm8) = match args { + [a, imm8] => (a, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8) + .expect("llvm.x86.avx.pslli.w imm8 not const"); + + simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8 + .try_to_bits(Size::from_bytes(4)) + .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8)) + { + imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)), + _ => fx.bcx.ins().iconst(types::I32, 0), + }); + } + "llvm.x86.ssse3.pshuf.b.128" | "llvm.x86.avx2.pshuf.b" => { + let (a, b) = match args { + [a, b] => (a, b), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let b = codegen_operand(fx, b); + + // Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332 + let zero = fx.bcx.ins().iconst(types::I8, 0); + for i in 0..16 { + let b_lane = b.value_lane(fx, i).load_scalar(fx); + let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80); + let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf); + let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx); + let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx); + let res = fx.bcx.ins().select(is_zero, zero, a_lane); + ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); + } + + if intrinsic == "llvm.x86.avx2.pshuf.b" { + for i in 16..32 { + let b_lane = b.value_lane(fx, i).load_scalar(fx); + let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80); + let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf); + let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16); + let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx); + let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx); + let res = fx.bcx.ins().select(is_zero, zero, a_lane); + ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); + } + } + } + "llvm.x86.avx2.vperm2i128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256 + let (a, b, imm8) = match args { + [a, b, imm8] => (a, b, imm8), + _ => bug!("wrong number of args for intrinsic {intrinsic}"), + }; + let a = codegen_operand(fx, a); + let b = codegen_operand(fx, b); + let imm8 = codegen_operand(fx, imm8).load_scalar(fx); + + let a_0 = a.value_lane(fx, 0).load_scalar(fx); + let a_1 = a.value_lane(fx, 1).load_scalar(fx); + let a_low = fx.bcx.ins().iconcat(a_0, a_1); + let a_2 = a.value_lane(fx, 2).load_scalar(fx); + let a_3 = a.value_lane(fx, 3).load_scalar(fx); + let a_high = fx.bcx.ins().iconcat(a_2, a_3); + + let b_0 = b.value_lane(fx, 0).load_scalar(fx); + let b_1 = b.value_lane(fx, 1).load_scalar(fx); + let b_low = fx.bcx.ins().iconcat(b_0, b_1); + let b_2 = b.value_lane(fx, 2).load_scalar(fx); + let b_3 = b.value_lane(fx, 3).load_scalar(fx); + let b_high = fx.bcx.ins().iconcat(b_2, b_3); + + fn select4( + fx: &mut FunctionCx<'_, '_, '_>, + a_high: Value, + a_low: Value, + b_high: Value, + b_low: Value, + control: Value, + ) -> Value { + let a_or_b = fx.bcx.ins().band_imm(control, 0b0010); + let high_or_low = fx.bcx.ins().band_imm(control, 0b0001); + let is_zero = fx.bcx.ins().band_imm(control, 0b1000); + + let zero = fx.bcx.ins().iconst(types::I64, 0); + let zero = fx.bcx.ins().iconcat(zero, zero); + + let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low); + let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low); + let res = fx.bcx.ins().select(a_or_b, res_b, res_a); + fx.bcx.ins().select(is_zero, zero, res) + } + + let control0 = imm8; + let res_low = select4(fx, a_high, a_low, b_high, b_low, control0); + let (res_0, res_1) = fx.bcx.ins().isplit(res_low); + + let control1 = fx.bcx.ins().ushr_imm(imm8, 4); + let res_high = select4(fx, a_high, a_low, b_high, b_low, control1); + let (res_2, res_3) = fx.bcx.ins().isplit(res_high); + + ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted()); + ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted()); + ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted()); + ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted()); + } "llvm.x86.sse2.storeu.dq" => { intrinsic_args!(fx, args => (mem_addr, a); intrinsic); let mem_addr = mem_addr.load_scalar(fx); @@ -190,7 +386,7 @@ fn llvm_add_sub<'tcx>( // carry0 | carry1 -> carry or borrow respectively let cb_out = fx.bcx.ins().bor(cb0, cb1); - let layout = fx.layout_of(fx.tcx.mk_tup(&[fx.tcx.types.u8, fx.tcx.types.u64])); + let layout = fx.layout_of(Ty::new_tup(fx.tcx, &[fx.tcx.types.u8, fx.tcx.types.u64])); let val = CValue::by_val_pair(cb_out, c, layout); ret.write_cvalue(fx, val); } diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index 0a513b08b..5862f1829 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -472,28 +472,11 @@ fn codegen_regular_intrinsic_call<'tcx>( ret.write_cvalue(fx, CValue::by_val(align, usize_layout)); } - sym::unchecked_add - | sym::unchecked_sub - | sym::unchecked_mul - | sym::unchecked_div - | sym::exact_div - | sym::unchecked_rem - | sym::unchecked_shl - | sym::unchecked_shr => { + sym::exact_div => { intrinsic_args!(fx, args => (x, y); intrinsic); - // FIXME trap on overflow - let bin_op = match intrinsic { - sym::unchecked_add => BinOp::Add, - sym::unchecked_sub => BinOp::Sub, - sym::unchecked_mul => BinOp::Mul, - sym::unchecked_div | sym::exact_div => BinOp::Div, - sym::unchecked_rem => BinOp::Rem, - sym::unchecked_shl => BinOp::Shl, - sym::unchecked_shr => BinOp::Shr, - _ => unreachable!(), - }; - let res = crate::num::codegen_int_binop(fx, bin_op, x, y); + // FIXME trap on inexact + let res = crate::num::codegen_int_binop(fx, BinOp::Div, x, y); ret.write_cvalue(fx, res); } sym::saturating_add | sym::saturating_sub => { diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs index 5a038bfca..6741362e8 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs @@ -434,8 +434,36 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( }); } - sym::simd_round => { - intrinsic_args!(fx, args => (a); intrinsic); + sym::simd_fpow => { + intrinsic_args!(fx, args => (a, b); intrinsic); + + if !a.layout().ty.is_simd() { + report_simd_type_validation_error(fx, intrinsic, span, a.layout().ty); + return; + } + + simd_pair_for_each_lane(fx, a, b, ret, &|fx, lane_ty, _ret_lane_ty, a_lane, b_lane| { + match lane_ty.kind() { + ty::Float(FloatTy::F32) => fx.lib_call( + "powf", + vec![AbiParam::new(types::F32), AbiParam::new(types::F32)], + vec![AbiParam::new(types::F32)], + &[a_lane, b_lane], + )[0], + ty::Float(FloatTy::F64) => fx.lib_call( + "pow", + vec![AbiParam::new(types::F64), AbiParam::new(types::F64)], + vec![AbiParam::new(types::F64)], + &[a_lane, b_lane], + )[0], + _ => unreachable!("{:?}", lane_ty), + } + }); + } + + sym::simd_fpowi => { + intrinsic_args!(fx, args => (a, exp); intrinsic); + let exp = exp.load_scalar(fx); if !a.layout().ty.is_simd() { report_simd_type_validation_error(fx, intrinsic, span, a.layout().ty); @@ -448,22 +476,71 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( ret, &|fx, lane_ty, _ret_lane_ty, lane| match lane_ty.kind() { ty::Float(FloatTy::F32) => fx.lib_call( - "roundf", + "__powisf2", // compiler-builtins + vec![AbiParam::new(types::F32), AbiParam::new(types::I32)], vec![AbiParam::new(types::F32)], - vec![AbiParam::new(types::F32)], - &[lane], + &[lane, exp], )[0], ty::Float(FloatTy::F64) => fx.lib_call( - "round", - vec![AbiParam::new(types::F64)], + "__powidf2", // compiler-builtins + vec![AbiParam::new(types::F64), AbiParam::new(types::I32)], vec![AbiParam::new(types::F64)], - &[lane], + &[lane, exp], )[0], _ => unreachable!("{:?}", lane_ty), }, ); } + sym::simd_fsin + | sym::simd_fcos + | sym::simd_fexp + | sym::simd_fexp2 + | sym::simd_flog + | sym::simd_flog10 + | sym::simd_flog2 + | sym::simd_round => { + intrinsic_args!(fx, args => (a); intrinsic); + + if !a.layout().ty.is_simd() { + report_simd_type_validation_error(fx, intrinsic, span, a.layout().ty); + return; + } + + simd_for_each_lane(fx, a, ret, &|fx, lane_ty, _ret_lane_ty, lane| { + let lane_ty = match lane_ty.kind() { + ty::Float(FloatTy::F32) => types::F32, + ty::Float(FloatTy::F64) => types::F64, + _ => unreachable!("{:?}", lane_ty), + }; + let name = match (intrinsic, lane_ty) { + (sym::simd_fsin, types::F32) => "sinf", + (sym::simd_fsin, types::F64) => "sin", + (sym::simd_fcos, types::F32) => "cosf", + (sym::simd_fcos, types::F64) => "cos", + (sym::simd_fexp, types::F32) => "expf", + (sym::simd_fexp, types::F64) => "exp", + (sym::simd_fexp2, types::F32) => "exp2f", + (sym::simd_fexp2, types::F64) => "exp2", + (sym::simd_flog, types::F32) => "logf", + (sym::simd_flog, types::F64) => "log", + (sym::simd_flog10, types::F32) => "log10f", + (sym::simd_flog10, types::F64) => "log10", + (sym::simd_flog2, types::F32) => "log2f", + (sym::simd_flog2, types::F64) => "log2", + (sym::simd_round, types::F32) => "roundf", + (sym::simd_round, types::F64) => "round", + _ => unreachable!("{:?}", intrinsic), + }; + fx.lib_call( + name, + vec![AbiParam::new(lane_ty)], + vec![AbiParam::new(lane_ty)], + &[lane], + )[0] + }); + } + sym::simd_fabs | sym::simd_fsqrt | sym::simd_ceil | sym::simd_floor | sym::simd_trunc => { intrinsic_args!(fx, args => (a); intrinsic); @@ -488,7 +565,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( }); } - sym::simd_reduce_add_ordered | sym::simd_reduce_add_unordered => { + sym::simd_reduce_add_ordered => { intrinsic_args!(fx, args => (v, acc); intrinsic); let acc = acc.load_scalar(fx); @@ -507,7 +584,25 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( }); } - sym::simd_reduce_mul_ordered | sym::simd_reduce_mul_unordered => { + sym::simd_reduce_add_unordered => { + intrinsic_args!(fx, args => (v); intrinsic); + + // FIXME there must be no acc param for integer vectors + if !v.layout().ty.is_simd() { + report_simd_type_validation_error(fx, intrinsic, span, v.layout().ty); + return; + } + + simd_reduce(fx, v, None, ret, &|fx, lane_ty, a, b| { + if lane_ty.is_floating_point() { + fx.bcx.ins().fadd(a, b) + } else { + fx.bcx.ins().iadd(a, b) + } + }); + } + + sym::simd_reduce_mul_ordered => { intrinsic_args!(fx, args => (v, acc); intrinsic); let acc = acc.load_scalar(fx); @@ -526,6 +621,24 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( }); } + sym::simd_reduce_mul_unordered => { + intrinsic_args!(fx, args => (v); intrinsic); + + // FIXME there must be no acc param for integer vectors + if !v.layout().ty.is_simd() { + report_simd_type_validation_error(fx, intrinsic, span, v.layout().ty); + return; + } + + simd_reduce(fx, v, None, ret, &|fx, lane_ty, a, b| { + if lane_ty.is_floating_point() { + fx.bcx.ins().fmul(a, b) + } else { + fx.bcx.ins().imul(a, b) + } + }); + } + sym::simd_reduce_all => { intrinsic_args!(fx, args => (v); intrinsic); @@ -581,7 +694,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( simd_reduce(fx, v, None, ret, &|fx, _ty, a, b| fx.bcx.ins().bxor(a, b)); } - sym::simd_reduce_min => { + sym::simd_reduce_min | sym::simd_reduce_min_nanless => { intrinsic_args!(fx, args => (v); intrinsic); if !v.layout().ty.is_simd() { @@ -600,7 +713,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( }); } - sym::simd_reduce_max => { + sym::simd_reduce_max | sym::simd_reduce_max_nanless => { intrinsic_args!(fx, args => (v); intrinsic); if !v.layout().ty.is_simd() { @@ -878,6 +991,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( fx.tcx.sess.span_err(span, format!("Unknown SIMD intrinsic {}", intrinsic)); // Prevent verifier error fx.bcx.ins().trap(TrapCode::UnreachableCodeReached); + return; } } let ret_block = fx.get_block(target); -- cgit v1.2.3