From 9918693037dce8aa4bb6f08741b6812923486c18 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Wed, 19 Jun 2024 11:26:03 +0200
Subject: Merging upstream version 1.76.0+dfsg1.

Signed-off-by: Daniel Baumann
---
 .../rustc_codegen_cranelift/src/abi/comments.rs | 1 -
 compiler/rustc_codegen_cranelift/src/abi/mod.rs | 3 +-
 compiler/rustc_codegen_cranelift/src/analyze.rs | 1 -
 compiler/rustc_codegen_cranelift/src/base.rs | 22 +-
 compiler/rustc_codegen_cranelift/src/common.rs | 14 +-
 .../src/concurrency_limiter.rs | 4 +-
 compiler/rustc_codegen_cranelift/src/constant.rs | 52 +-
 compiler/rustc_codegen_cranelift/src/driver/aot.rs | 2 +-
 compiler/rustc_codegen_cranelift/src/inline_asm.rs | 167 ++---
 .../rustc_codegen_cranelift/src/intrinsics/llvm.rs | 4 +-
 .../src/intrinsics/llvm_aarch64.rs | 2 -
 .../src/intrinsics/llvm_x86.rs | 715 +++++++++++++++++++--
 .../rustc_codegen_cranelift/src/intrinsics/mod.rs | 20 +-
 .../rustc_codegen_cranelift/src/intrinsics/simd.rs | 68 +-
 .../rustc_codegen_cranelift/src/pretty_clif.rs | 21 +-
 compiler/rustc_codegen_cranelift/src/unsize.rs | 114 +++-
 .../rustc_codegen_cranelift/src/value_and_place.rs | 93 ++-
 17 files changed, 995 insertions(+), 308 deletions(-)

diff --git a/compiler/rustc_codegen_cranelift/src/abi/comments.rs b/compiler/rustc_codegen_cranelift/src/abi/comments.rs
index ade6968de..a318cae17 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/comments.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/comments.rs
@@ -3,7 +3,6 @@
 use std::borrow::Cow;
 
-use rustc_middle::mir;
 use rustc_target::abi::call::PassMode;
 
 use crate::prelude::*;
 
diff --git a/compiler/rustc_codegen_cranelift/src/abi/mod.rs b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
index c4572e035..2c194f6d6 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/mod.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
@@ -6,7 +6,7 @@ mod returning;
 
 use std::borrow::Cow;
 
-use cranelift_codegen::ir::{AbiParam, SigRef};
+use cranelift_codegen::ir::SigRef;
 use cranelift_module::ModuleError;
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrFlags;
 use rustc_middle::ty::layout::FnAbiOf;
@@ -383,6 +383,7 @@ pub(crate) fn codegen_terminator_call<'tcx>(
                 args,
                 ret_place,
                 target,
+                source_info.span,
             );
             return;
         }
 
diff --git a/compiler/rustc_codegen_cranelift/src/analyze.rs b/compiler/rustc_codegen_cranelift/src/analyze.rs
index 321612238..c5762638a 100644
--- a/compiler/rustc_codegen_cranelift/src/analyze.rs
+++ b/compiler/rustc_codegen_cranelift/src/analyze.rs
@@ -2,7 +2,6 @@
 
 use rustc_index::IndexVec;
 use rustc_middle::mir::StatementKind::*;
-use rustc_middle::ty::Ty;
 
 use crate::prelude::*;
 
diff --git a/compiler/rustc_codegen_cranelift/src/base.rs b/compiler/rustc_codegen_cranelift/src/base.rs
index 91b1547cb..df40a5eb4 100644
--- a/compiler/rustc_codegen_cranelift/src/base.rs
+++ b/compiler/rustc_codegen_cranelift/src/base.rs
@@ -176,10 +176,10 @@ pub(crate) fn compile_fn(
     match module.define_function(codegened_func.func_id, context) {
         Ok(()) => {}
         Err(ModuleError::Compilation(CodegenError::ImplLimitExceeded)) => {
-            let handler = rustc_session::EarlyErrorHandler::new(
+            let early_dcx = rustc_session::EarlyDiagCtxt::new(
                 rustc_session::config::ErrorOutputType::default(),
             );
-            handler.early_error(format!(
+            early_dcx.early_error(format!(
                 "backend implementation limit exceeded while compiling {name}",
                 name = codegened_func.symbol_name
             ));
@@ -353,7 +353,7 @@ fn codegen_fn_body(fx: &mut FunctionCx<'_, '_, '_>, start_block: Block) {
                         fx,
                        rustc_hir::LangItem::PanicBoundsCheck,
                        &[index, len, location],
-                        source_info.span,
+                        Some(source_info.span),
                    );
                }
                AssertKind::MisalignedPointerDereference { ref required, ref found } => {
@@ -365,7 +365,7 @@ fn codegen_fn_body(fx: &mut FunctionCx<'_, '_, '_>, start_block: Block) {
                        fx,
                        rustc_hir::LangItem::PanicMisalignedPointerDereference,
                        &[required, found, location],
-                        source_info.span,
+                        Some(source_info.span),
                    );
                }
                _ => {
@@ -456,7 +456,7 @@ fn codegen_fn_body(fx: &mut FunctionCx<'_, '_, '_>, start_block: Block) {
                    );
                }
 
-                crate::inline_asm::codegen_inline_asm(
+                crate::inline_asm::codegen_inline_asm_terminator(
                    fx,
                    source_info.span,
                    template,
@@ -945,19 +945,19 @@ pub(crate) fn codegen_panic<'tcx>(
     let msg_len = fx.bcx.ins().iconst(fx.pointer_type, i64::try_from(msg_str.len()).unwrap());
 
     let args = [msg_ptr, msg_len, location];
 
-    codegen_panic_inner(fx, rustc_hir::LangItem::Panic, &args, source_info.span);
+    codegen_panic_inner(fx, rustc_hir::LangItem::Panic, &args, Some(source_info.span));
 }
 
 pub(crate) fn codegen_panic_nounwind<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     msg_str: &str,
-    source_info: mir::SourceInfo,
+    span: Option<Span>,
 ) {
     let msg_ptr = fx.anonymous_str(msg_str);
     let msg_len = fx.bcx.ins().iconst(fx.pointer_type, i64::try_from(msg_str.len()).unwrap());
 
     let args = [msg_ptr, msg_len];
 
-    codegen_panic_inner(fx, rustc_hir::LangItem::PanicNounwind, &args, source_info.span);
+    codegen_panic_inner(fx, rustc_hir::LangItem::PanicNounwind, &args, span);
 }
 
 pub(crate) fn codegen_unwind_terminate<'tcx>(
@@ -967,16 +967,16 @@
 ) {
     let args = [];
 
-    codegen_panic_inner(fx, reason.lang_item(), &args, source_info.span);
+    codegen_panic_inner(fx, reason.lang_item(), &args, Some(source_info.span));
 }
 
 fn codegen_panic_inner<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     lang_item: rustc_hir::LangItem,
     args: &[Value],
-    span: Span,
+    span: Option<Span>,
 ) {
-    let def_id = fx.tcx.require_lang_item(lang_item, Some(span));
+    let def_id = fx.tcx.require_lang_item(lang_item, span);
 
     let instance = Instance::mono(fx.tcx, def_id).polymorphize(fx.tcx);
     let symbol_name = fx.tcx.symbol_name(instance).name;
 
diff --git a/compiler/rustc_codegen_cranelift/src/common.rs b/compiler/rustc_codegen_cranelift/src/common.rs
index 63562d335..bd19a7ed0 100644
--- a/compiler/rustc_codegen_cranelift/src/common.rs
+++ b/compiler/rustc_codegen_cranelift/src/common.rs
@@ -98,11 +98,15 @@ fn clif_pair_type_from_ty<'tcx>(
 
 /// Is a pointer to this type a fat ptr?
 pub(crate) fn has_ptr_meta<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> bool {
-    let ptr_ty = Ty::new_ptr(tcx, TypeAndMut { ty, mutbl: rustc_hir::Mutability::Not });
-    match &tcx.layout_of(ParamEnv::reveal_all().and(ptr_ty)).unwrap().abi {
-        Abi::Scalar(_) => false,
-        Abi::ScalarPair(_, _) => true,
-        abi => unreachable!("Abi of ptr to {:?} is {:?}???", ty, abi),
+    if ty.is_sized(tcx, ParamEnv::reveal_all()) {
+        return false;
+    }
+
+    let tail = tcx.struct_tail_erasing_lifetimes(ty, ParamEnv::reveal_all());
+    match tail.kind() {
+        ty::Foreign(..) => false,
+        ty::Str | ty::Slice(..) | ty::Dynamic(..)
 => true,
+        _ => bug!("unexpected unsized tail: {:?}", tail),
     }
 }
 
diff --git a/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs b/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs
index 20f2ee4c7..967896913 100644
--- a/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs
+++ b/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs
@@ -46,7 +46,7 @@ impl ConcurrencyLimiter {
         }
     }
 
-    pub(super) fn acquire(&mut self, handler: &rustc_errors::Handler) -> ConcurrencyLimiterToken {
+    pub(super) fn acquire(&mut self, dcx: &rustc_errors::DiagCtxt) -> ConcurrencyLimiterToken {
         let mut state = self.state.lock().unwrap();
         loop {
             state.assert_invariants();
@@ -64,7 +64,7 @@ impl ConcurrencyLimiter {
                     // Make sure to drop the mutex guard first to prevent poisoning the mutex.
                     drop(state);
                     if let Some(err) = err {
-                        handler.fatal(err).raise();
+                        dcx.fatal(err);
                     } else {
                         // The error was already emitted, but compilation continued. Raise a silent
                         // fatal error.
 
diff --git a/compiler/rustc_codegen_cranelift/src/constant.rs b/compiler/rustc_codegen_cranelift/src/constant.rs
index b0853d30e..9ffa006e5 100644
--- a/compiler/rustc_codegen_cranelift/src/constant.rs
+++ b/compiler/rustc_codegen_cranelift/src/constant.rs
@@ -1,10 +1,12 @@
 //! Handling of `static`s, `const`s and promoted allocations
 
+use std::cmp::Ordering;
+
 use cranelift_module::*;
-use rustc_data_structures::fx::{FxHashMap, FxHashSet};
+use rustc_data_structures::fx::FxHashSet;
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrFlags;
 use rustc_middle::mir::interpret::{read_target_uint, AllocId, GlobalAlloc, Scalar};
-use rustc_middle::mir::ConstValue;
+use rustc_middle::ty::ScalarInt;
 
 use crate::prelude::*;
 
@@ -123,7 +125,8 @@ pub(crate) fn codegen_const_value<'tcx>(
                 }
             }
             Scalar::Ptr(ptr, _size) => {
-                let (alloc_id, offset) = ptr.into_parts(); // we know the `offset` is relative
+                let (prov, offset) = ptr.into_parts(); // we know the `offset` is relative
+                let alloc_id = prov.alloc_id();
                 let base_addr = match fx.tcx.global_alloc(alloc_id) {
                     GlobalAlloc::Memory(alloc) => {
                         let data_id = data_id_for_alloc_id(
@@ -371,7 +374,8 @@ fn define_all_allocs(tcx: TyCtxt<'_>, module: &mut dyn Module, cx: &mut Constant
         let bytes = alloc.inspect_with_uninit_and_ptr_outside_interpreter(0..alloc.len()).to_vec();
         data.define(bytes.into_boxed_slice());
 
-        for &(offset, alloc_id) in alloc.provenance().ptrs().iter() {
+        for &(offset, prov) in alloc.provenance().ptrs().iter() {
+            let alloc_id = prov.alloc_id();
             let addend = {
                 let endianness = tcx.data_layout.endian;
                 let offset = offset.bytes() as usize;
@@ -430,9 +434,9 @@ fn define_all_allocs(tcx: TyCtxt<'_>, module: &mut dyn Module, cx: &mut Constant
 pub(crate) fn mir_operand_get_const_val<'tcx>(
     fx: &FunctionCx<'_, '_, 'tcx>,
     operand: &Operand<'tcx>,
-) -> Option<ConstValue<'tcx>> {
+) -> Option<ScalarInt> {
     match operand {
-        Operand::Constant(const_) => Some(eval_mir_constant(fx, const_).0),
+        Operand::Constant(const_) => eval_mir_constant(fx, const_).0.try_to_scalar_int(),
         // FIXME(rust-lang/rust#85105): Casts like `IMM8 as u32` result in the const being stored
        // inside a temporary before being passed to the intrinsic requiring the const argument.
        // This code tries to find a single constant defining definition of the referenced local.
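        // For example, a wrapper passing `IMM8 as u32` to an intrinsic leaves MIR roughly like
        //     _3 = const 5_u8;
        //     _4 = _3 as u32;
        //     <intrinsic>(.., move _4)
        // and the walk below then recovers `5` for `_4`, giving up if the local is
        // assigned more than once or assigned by anything other than a plain use or
        // an integer-to-integer cast.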
@@ -440,7 +444,7 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
             if !place.projection.is_empty() {
                 return None;
             }
-            let mut computed_const_val = None;
+            let mut computed_scalar_int = None;
             for bb_data in fx.mir.basic_blocks.iter() {
                 for stmt in &bb_data.statements {
                     match &stmt.kind {
@@ -456,22 +460,38 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
                             operand,
                             ty,
                         ) => {
-                            if computed_const_val.is_some() {
+                            if computed_scalar_int.is_some() {
                                 return None; // local assigned twice
                             }
                             if !matches!(ty.kind(), ty::Uint(_) | ty::Int(_)) {
                                 return None;
                             }
-                            let const_val = mir_operand_get_const_val(fx, operand)?;
-                            if fx.layout_of(*ty).size
-                                != const_val.try_to_scalar_int()?.size()
+                            let scalar_int = mir_operand_get_const_val(fx, operand)?;
+                            let scalar_int = match fx
+                                .layout_of(*ty)
+                                .size
+                                .cmp(&scalar_int.size())
                             {
-                                return None;
-                            }
-                            computed_const_val = Some(const_val);
+                                Ordering::Equal => scalar_int,
+                                Ordering::Less => match ty.kind() {
+                                    ty::Uint(_) => ScalarInt::try_from_uint(
+                                        scalar_int.try_to_uint(scalar_int.size()).unwrap(),
+                                        fx.layout_of(*ty).size,
+                                    )
+                                    .unwrap(),
+                                    ty::Int(_) => ScalarInt::try_from_int(
+                                        scalar_int.try_to_int(scalar_int.size()).unwrap(),
+                                        fx.layout_of(*ty).size,
+                                    )
+                                    .unwrap(),
+                                    _ => unreachable!(),
+                                },
+                                Ordering::Greater => return None,
+                            };
+                            computed_scalar_int = Some(scalar_int);
                         }
                         Rvalue::Use(operand) => {
-                            computed_const_val = mir_operand_get_const_val(fx, operand)
+                            computed_scalar_int = mir_operand_get_const_val(fx, operand)
                         }
                         _ => return None,
                     }
@@ -522,7 +542,7 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
                     TerminatorKind::Call { .. } => {}
                 }
             }
-            computed_const_val
+            computed_scalar_int
         }
     }
 }
 
diff --git a/compiler/rustc_codegen_cranelift/src/driver/aot.rs b/compiler/rustc_codegen_cranelift/src/driver/aot.rs
index 11229dd42..b3ab533df 100644
--- a/compiler/rustc_codegen_cranelift/src/driver/aot.rs
+++ b/compiler/rustc_codegen_cranelift/src/driver/aot.rs
@@ -422,7 +422,7 @@ pub(crate) fn run_aot(
                     backend_config.clone(),
                     global_asm_config.clone(),
                     cgu.name(),
-                    concurrency_limiter.acquire(tcx.sess.diagnostic()),
+                    concurrency_limiter.acquire(tcx.sess.dcx()),
                 ),
                 module_codegen,
                 Some(rustc_middle::dep_graph::hash_result),
 
diff --git a/compiler/rustc_codegen_cranelift/src/inline_asm.rs b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
index ce0eecca8..73f4bc7c1 100644
--- a/compiler/rustc_codegen_cranelift/src/inline_asm.rs
+++ b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
@@ -3,14 +3,13 @@
 use std::fmt::Write;
 
 use rustc_ast::ast::{InlineAsmOptions, InlineAsmTemplatePiece};
-use rustc_middle::mir::InlineAsmOperand;
 use rustc_span::sym;
 use rustc_target::asm::*;
 use target_lexicon::BinaryFormat;
 
 use crate::prelude::*;
 
-enum CInlineAsmOperand<'tcx> {
+pub(crate) enum CInlineAsmOperand<'tcx> {
     In {
         reg: InlineAsmRegOrRegClass,
         value: Value,
@@ -34,7 +33,7 @@ enum CInlineAsmOperand<'tcx> {
     },
 }
 
-pub(crate) fn codegen_inline_asm<'tcx>(
+pub(crate) fn codegen_inline_asm_terminator<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     span: Span,
     template: &[InlineAsmTemplatePiece],
@@ -42,8 +41,6 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     options: InlineAsmOptions,
     destination: Option<BasicBlock>,
 ) {
-    // FIXME add .eh_frame unwind info directives
-
     // Used by panic_abort on Windows, but uses a syntax which only happens to work with
     // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for
     // the LLVM backend.
@@ -135,15 +132,33 @@ pub(crate) fn codegen_inline_asm<'tcx>(
         })
         .collect::<Vec<_>>();
 
-    let mut inputs = Vec::new();
-    let mut outputs = Vec::new();
+    codegen_inline_asm_inner(fx, template, &operands, options);
+
+    match destination {
+        Some(destination) => {
+            let destination_block = fx.get_block(destination);
+            fx.bcx.ins().jump(destination_block, &[]);
+        }
+        None => {
+            fx.bcx.ins().trap(TrapCode::UnreachableCodeReached);
+        }
+    }
+}
+
+pub(crate) fn codegen_inline_asm_inner<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    template: &[InlineAsmTemplatePiece],
+    operands: &[CInlineAsmOperand<'tcx>],
+    options: InlineAsmOptions,
+) {
+    // FIXME add .eh_frame unwind info directives
 
     let mut asm_gen = InlineAssemblyGenerator {
         tcx: fx.tcx,
         arch: fx.tcx.sess.asm_arch.unwrap(),
         enclosing_def_id: fx.instance.def_id(),
         template,
-        operands: &operands,
+        operands,
         options,
         registers: Vec::new(),
         stack_slots_clobber: Vec::new(),
@@ -165,6 +180,8 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     let generated_asm = asm_gen.generate_asm_wrapper(&asm_name);
     fx.cx.global_asm.push_str(&generated_asm);
 
+    let mut inputs = Vec::new();
+    let mut outputs = Vec::new();
     for (i, operand) in operands.iter().enumerate() {
         match operand {
             CInlineAsmOperand::In { reg: _, value } => {
@@ -186,16 +203,6 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     }
 
     call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs);
-
-    match destination {
-        Some(destination) => {
-            let destination_block = fx.get_block(destination);
-            fx.bcx.ins().jump(destination_block, &[]);
-        }
-        None => {
-            fx.bcx.ins().trap(TrapCode::UnreachableCodeReached);
-        }
-    }
 }
 
 struct InlineAssemblyGenerator<'a, 'tcx> {
@@ -637,8 +644,21 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(generated_asm, "    movups [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        write!(generated_asm, "xmm{}", reg as u32 - X86InlineAsmReg::xmm0 as u32)
+                            .unwrap();
+                    }
+                    _ => {
+                        write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
+                        reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                    }
+                }
                 generated_asm.push('\n');
             }
             InlineAsmArch::AArch64 => {
@@ -663,8 +683,24 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     ) {
         match arch {
             InlineAsmArch::X86_64 => {
-                generated_asm.push_str("    mov ");
-                reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
+                match reg {
+                    InlineAsmReg::X86(reg)
+                        if reg as u32 >= X86InlineAsmReg::xmm0 as u32
+                            && reg as u32 <= X86InlineAsmReg::xmm15 as u32 =>
+                    {
+                        // rustc emits x0 rather than xmm0
+                        write!(
+                            generated_asm,
+                            "    movups xmm{}",
+                            reg as u32 - X86InlineAsmReg::xmm0 as u32
+                        )
+                        .unwrap();
+                    }
+                    _ => {
+                        generated_asm.push_str("    mov ");
+                        reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap()
+                    }
+                }
                 writeln!(generated_asm, ", [rbx+0x{:x}]", offset.bytes()).unwrap();
             }
             InlineAsmArch::AArch64 => {
@@ -720,7 +756,12 @@ fn call_inline_asm<'tcx>(
     fx.bcx.ins().call(inline_asm_func, &[stack_slot_addr]);
 
     for (offset, place) in outputs {
-        let ty = fx.clif_type(place.layout().ty).unwrap();
+        let ty = if place.layout().ty.is_simd() {
+            let (lane_count, lane_type) = place.layout().ty.simd_size_and_type(fx.tcx);
fx.clif_type(lane_type).unwrap().by(lane_count.try_into().unwrap()).unwrap() + } else { + fx.clif_type(place.layout().ty).unwrap() + }; let value = stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).load( fx, ty, @@ -729,83 +770,3 @@ fn call_inline_asm<'tcx>( place.write_cvalue(fx, CValue::by_val(value, place.layout())); } } - -pub(crate) fn codegen_xgetbv<'tcx>( - fx: &mut FunctionCx<'_, '_, 'tcx>, - xcr_no: Value, - ret: CPlace<'tcx>, -) { - // FIXME add .eh_frame unwind info directives - - let operands = vec![ - CInlineAsmOperand::In { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)), - value: xcr_no, - }, - CInlineAsmOperand::Out { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), - late: true, - place: Some(ret), - }, - CInlineAsmOperand::Out { - reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), - late: true, - place: None, - }, - ]; - let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM; - - let mut inputs = Vec::new(); - let mut outputs = Vec::new(); - - let mut asm_gen = InlineAssemblyGenerator { - tcx: fx.tcx, - arch: fx.tcx.sess.asm_arch.unwrap(), - enclosing_def_id: fx.instance.def_id(), - template: &[InlineAsmTemplatePiece::String( - " - xgetbv - // out = rdx << 32 | rax - shl rdx, 32 - or rax, rdx - " - .to_string(), - )], - operands: &operands, - options, - registers: Vec::new(), - stack_slots_clobber: Vec::new(), - stack_slots_input: Vec::new(), - stack_slots_output: Vec::new(), - stack_slot_size: Size::from_bytes(0), - }; - asm_gen.allocate_registers(); - asm_gen.allocate_stack_slots(); - - let inline_asm_index = fx.cx.inline_asm_index.get(); - fx.cx.inline_asm_index.set(inline_asm_index + 1); - let asm_name = format!( - "__inline_asm_{}_n{}", - fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"), - inline_asm_index - ); - - let generated_asm = asm_gen.generate_asm_wrapper(&asm_name); - fx.cx.global_asm.push_str(&generated_asm); - - for (i, operand) in operands.iter().enumerate() { - match operand { - CInlineAsmOperand::In { reg: _, value } => { - inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value)); - } - CInlineAsmOperand::Out { reg: _, late: _, place } => { - if let Some(place) = place { - outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place)); - } - } - _ => unreachable!(), - } - } - - call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs); -} diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs index e9b7daf14..dbd5db875 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs @@ -1,7 +1,5 @@ //! 
Emulate LLVM intrinsics
 
-use rustc_middle::ty::GenericArgsRef;
-
 use crate::intrinsics::*;
 use crate::prelude::*;
 
@@ -12,6 +10,7 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
     args: &[mir::Operand<'tcx>],
     ret: CPlace<'tcx>,
     target: Option<BasicBlock>,
+    span: Span,
 ) {
     if intrinsic.starts_with("llvm.aarch64") {
         return llvm_aarch64::codegen_aarch64_llvm_intrinsic_call(
@@ -31,6 +30,7 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
             args,
             ret,
             target,
+            span,
         );
     }
 
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
index ee098be1f..e1e514dca 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
@@ -1,7 +1,5 @@
 //! Emulate AArch64 LLVM intrinsics
 
-use rustc_middle::ty::GenericArgsRef;
-
 use crate::intrinsics::*;
 use crate::prelude::*;
 
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
index 4c5360486..99bb5c4ea 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
@@ -1,7 +1,9 @@
 //! Emulate x86 LLVM intrinsics
 
-use rustc_middle::ty::GenericArgsRef;
+use rustc_ast::ast::{InlineAsmOptions, InlineAsmTemplatePiece};
+use rustc_target::asm::*;
 
+use crate::inline_asm::{codegen_inline_asm_inner, CInlineAsmOperand};
 use crate::intrinsics::*;
 use crate::prelude::*;
 
@@ -12,19 +14,53 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
     args: &[mir::Operand<'tcx>],
     ret: CPlace<'tcx>,
     target: Option<BasicBlock>,
+    span: Span,
 ) {
     match intrinsic {
         "llvm.x86.sse2.pause" | "llvm.aarch64.isb" => {
             // Spin loop hint
         }
 
+        "llvm.x86.avx.vzeroupper" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper&ig_expand=7218
+            // Do nothing. It is a perf hint anyway.
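+            // vzeroupper merely zeroes the upper bits of the YMM registers; it exists
+            // to avoid AVX-to-SSE transition penalties, so skipping it can only cost
+            // performance, never correctness.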
+ } + // Used by is_x86_feature_detected!(); "llvm.x86.xgetbv" => { intrinsic_args!(fx, args => (xcr_no); intrinsic); let xcr_no = xcr_no.load_scalar(fx); - crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret); + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String( + " + xgetbv + // out = rdx << 32 | rax + shl rdx, 32 + or rax, rdx + " + .to_string(), + )], + &[ + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)), + value: xcr_no, + }, + CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), + late: true, + place: Some(ret), + }, + CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), + late: true, + place: None, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); } "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => { @@ -37,6 +73,103 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.write_cvalue(fx, val); } + "llvm.x86.avx2.gather.d.d" + | "llvm.x86.avx2.gather.d.q" + | "llvm.x86.avx2.gather.d.ps" + | "llvm.x86.avx2.gather.d.pd" + | "llvm.x86.avx2.gather.d.d.256" + | "llvm.x86.avx2.gather.d.q.256" + | "llvm.x86.avx2.gather.d.ps.256" + | "llvm.x86.avx2.gather.d.pd.256" + | "llvm.x86.avx2.gather.q.d" + | "llvm.x86.avx2.gather.q.q" + | "llvm.x86.avx2.gather.q.ps" + | "llvm.x86.avx2.gather.q.pd" + | "llvm.x86.avx2.gather.q.d.256" + | "llvm.x86.avx2.gather.q.q.256" + | "llvm.x86.avx2.gather.q.ps.256" + | "llvm.x86.avx2.gather.q.pd.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=3818 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd&ig_expand=3819 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd&ig_expand=3821 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd&ig_expand=3822 + // ... 
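+            // Emulated one lane at a time: hardware gathers consult only the sign bit
+            // of each mask element, so each lane branches on that bit and either loads
+            // from `ptr + index * scale` or keeps the corresponding `src` lane; any
+            // destination lanes beyond the shorter of src/index are zeroed.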
+ + intrinsic_args!(fx, args => (src, ptr, index, mask, scale); intrinsic); + + let (src_lane_count, src_lane_ty) = src.layout().ty.simd_size_and_type(fx.tcx); + let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx); + let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(src_lane_ty, ret_lane_ty); + assert!(index_lane_ty.is_integral()); + assert_eq!(src_lane_count, mask_lane_count); + assert_eq!(src_lane_count, ret_lane_count); + + let lane_clif_ty = fx.clif_type(ret_lane_ty).unwrap(); + let index_lane_clif_ty = fx.clif_type(index_lane_ty).unwrap(); + let mask_lane_clif_ty = fx.clif_type(mask_lane_ty).unwrap(); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + + let ptr = ptr.load_scalar(fx); + let scale = scale.load_scalar(fx); + let scale = fx.bcx.ins().uextend(types::I64, scale); + for lane_idx in 0..std::cmp::min(src_lane_count, index_lane_count) { + let src_lane = src.value_lane(fx, lane_idx).load_scalar(fx); + let index_lane = index.value_lane(fx, lane_idx).load_scalar(fx); + let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx); + let mask_lane = + fx.bcx.ins().bitcast(mask_lane_clif_ty.as_int(), MemFlags::new(), mask_lane); + + let if_enabled = fx.bcx.create_block(); + let if_disabled = fx.bcx.create_block(); + let next = fx.bcx.create_block(); + let res_lane = fx.bcx.append_block_param(next, lane_clif_ty); + + let mask_lane = match mask_lane_clif_ty { + types::I32 | types::F32 => { + fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64) + } + types::I64 | types::F64 => { + fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64) + } + _ => unreachable!(), + }; + fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]); + fx.bcx.seal_block(if_enabled); + fx.bcx.seal_block(if_disabled); + + fx.bcx.switch_to_block(if_enabled); + let index_lane = if index_lane_clif_ty != types::I64 { + fx.bcx.ins().sextend(types::I64, index_lane) + } else { + index_lane + }; + let offset = fx.bcx.ins().imul(index_lane, scale); + let lane_ptr = fx.bcx.ins().iadd(ptr, offset); + let res = fx.bcx.ins().load(lane_clif_ty, MemFlags::trusted(), lane_ptr, 0); + fx.bcx.ins().jump(next, &[res]); + + fx.bcx.switch_to_block(if_disabled); + fx.bcx.ins().jump(next, &[src_lane]); + + fx.bcx.seal_block(next); + fx.bcx.switch_to_block(next); + + fx.bcx.ins().nop(); + + ret.place_lane(fx, lane_idx) + .write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout)); + } + + for lane_idx in std::cmp::min(src_lane_count, index_lane_count)..ret_lane_count { + let zero_lane = fx.bcx.ins().iconst(mask_lane_clif_ty.as_int(), 0); + let zero_lane = fx.bcx.ins().bitcast(mask_lane_clif_ty, MemFlags::new(), zero_lane); + ret.place_lane(fx, lane_idx) + .write_cvalue(fx, CValue::by_val(zero_lane, ret_lane_layout)); + } + } + "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => { let (x, y, kind) = match args { [x, y, kind] => (x, y, kind), @@ -241,16 +374,31 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ); } "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => { - let a = match args { - [a] => a, - _ => bug!("wrong number of args for intrinsic {intrinsic}"), - }; - let a = codegen_operand(fx, a); + intrinsic_args!(fx, args => (a); intrinsic); simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| { fx.bcx.ins().iabs(lane) }); } + "llvm.x86.sse2.cvttps2dq" => { + // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32&ig_expand=2429 + intrinsic_args!(fx, args => (a); intrinsic); + let a = a.load_scalar(fx); + + // Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned + // into 0x80000000 for which Cranelift doesn't have a native instruction. + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))], + &[CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } "llvm.x86.addcarry.32" | "llvm.x86.addcarry.64" => { intrinsic_args!(fx, args => (c_in, a, b); intrinsic); let c_in = c_in.load_scalar(fx); @@ -332,9 +480,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( for out_lane_idx in 0..lane_count / 8 { let mut lane_diff_acc = fx.bcx.ins().iconst(types::I64, 0); - for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 1 { + for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 8 { let a_lane = a.value_lane(fx, lane_idx).load_scalar(fx); + let a_lane = fx.bcx.ins().uextend(types::I16, a_lane); let b_lane = b.value_lane(fx, lane_idx).load_scalar(fx); + let b_lane = fx.bcx.ins().uextend(types::I16, b_lane); let lane_diff = fx.bcx.ins().isub(a_lane, b_lane); let abs_lane_diff = fx.bcx.ins().iabs(lane_diff); @@ -405,12 +555,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let ret_lane_layout = fx.layout_of(fx.tcx.types.i32); for out_lane_idx in 0..lane_count / 2 { let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx); - let a_lane0 = fx.bcx.ins().uextend(types::I32, a_lane0); + let a_lane0 = fx.bcx.ins().sextend(types::I32, a_lane0); let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx); let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0); let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx); - let a_lane1 = fx.bcx.ins().uextend(types::I32, a_lane1); + let a_lane1 = fx.bcx.ins().sextend(types::I32, a_lane1); let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx); let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1); @@ -565,14 +715,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( assert_eq!(ret_lane_ty, fx.tcx.types.i16); assert_eq!(lane_count * 2, ret_lane_count); - let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16)); - let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16)); + let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64); + let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64); let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); for idx in 0..lane_count { let lane = a.value_lane(fx, idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); - let sat = fx.bcx.ins().umin(sat, max_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -582,7 +732,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( for idx in 0..lane_count { let lane = b.value_lane(fx, idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); - let sat = fx.bcx.ins().umin(sat, max_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -609,8 +759,8 @@ pub(crate) fn 
codegen_x86_llvm_intrinsic_call<'tcx>( for idx in 0..lane_count { let lane = a.value_lane(fx, idx).load_scalar(fx); - let sat = fx.bcx.ins().umax(lane, min_u16); - let sat = fx.bcx.ins().umin(sat, max_u16); + let sat = fx.bcx.ins().smax(lane, min_u16); + let sat = fx.bcx.ins().smin(sat, max_u16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -619,8 +769,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( for idx in 0..lane_count { let lane = b.value_lane(fx, idx).load_scalar(fx); - let sat = fx.bcx.ins().umax(lane, min_u16); - let sat = fx.bcx.ins().umin(sat, max_u16); + let sat = fx.bcx.ins().smax(lane, min_u16); + let sat = fx.bcx.ins().smin(sat, max_u16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -641,14 +791,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( assert_eq!(ret_lane_ty, fx.tcx.types.i16); assert_eq!(lane_count * 2, ret_lane_count); - let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16)); - let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16)); + let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64); + let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64); let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); for idx in 0..lane_count / 2 { let lane = a.value_lane(fx, idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); - let sat = fx.bcx.ins().umin(sat, max_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -658,7 +808,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( for idx in 0..lane_count / 2 { let lane = b.value_lane(fx, idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); - let sat = fx.bcx.ins().umin(sat, max_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -668,7 +818,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( for idx in 0..lane_count / 2 { let lane = a.value_lane(fx, idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); - let sat = fx.bcx.ins().umin(sat, max_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -678,7 +828,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( for idx in 0..lane_count / 2 { let lane = b.value_lane(fx, idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); - let sat = fx.bcx.ins().umin(sat, max_i16); + let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat); let res_lane = CValue::by_val(res, ret_lane_layout); @@ -686,66 +836,489 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } } - "llvm.x86.pclmulqdq" => { - // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772 - intrinsic_args!(fx, args => (a, b, imm8); intrinsic); + "llvm.x86.fma.vfmaddsub.ps" + | "llvm.x86.fma.vfmaddsub.pd" + | "llvm.x86.fma.vfmaddsub.ps.256" + | "llvm.x86.fma.vfmaddsub.pd.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps&ig_expand=3205 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd&ig_expand=3181 + // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps&ig_expand=3209 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd&ig_expand=3185 + intrinsic_args!(fx, args => (a, b, c); intrinsic); assert_eq!(a.layout(), b.layout()); + assert_eq!(a.layout(), c.layout()); let layout = a.layout(); let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); - assert_eq!(lane_ty, fx.tcx.types.i64); - assert_eq!(ret_lane_ty, fx.tcx.types.i64); - assert_eq!(lane_count, 2); - assert_eq!(ret_lane_count, 2); + assert!(lane_ty.is_floating_point()); + assert!(ret_lane_ty.is_floating_point()); + assert_eq!(lane_count, ret_lane_count); + let ret_lane_layout = fx.layout_of(ret_lane_ty); - let imm8 = imm8.load_scalar(fx); + for idx in 0..lane_count { + let a_lane = a.value_lane(fx, idx).load_scalar(fx); + let b_lane = b.value_lane(fx, idx).load_scalar(fx); + let c_lane = c.value_lane(fx, idx).load_scalar(fx); + + let mul = fx.bcx.ins().fmul(a_lane, b_lane); + let res = if idx & 1 == 0 { + fx.bcx.ins().fsub(mul, c_lane) + } else { + fx.bcx.ins().fadd(mul, c_lane) + }; - let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001); - let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); - let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); - let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0); + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + } - let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000); - let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); - let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); - let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0); + "llvm.x86.fma.vfmsubadd.ps" + | "llvm.x86.fma.vfmsubadd.pd" + | "llvm.x86.fma.vfmsubadd.ps.256" + | "llvm.x86.fma.vfmsubadd.pd.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps&ig_expand=3325 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd&ig_expand=3301 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps&ig_expand=3329 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd&ig_expand=3305 + intrinsic_args!(fx, args => (a, b, c); intrinsic); - fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value { - let tmp = fx.bcx.ins().ushr_imm(val, bit); - fx.bcx.ins().band_imm(tmp, 1) - } + assert_eq!(a.layout(), b.layout()); + assert_eq!(a.layout(), c.layout()); + let layout = a.layout(); - let mut res1 = fx.bcx.ins().iconst(types::I64, 0); - for i in 0..=63 { - let x = extract_bit(fx, temp1, 0); - let y = extract_bit(fx, temp2, i); - let mut temp = fx.bcx.ins().band(x, y); - for j in 1..=i { - let x = extract_bit(fx, temp1, j); - let y = extract_bit(fx, temp2, i - j); - let z = fx.bcx.ins().band(x, y); - temp = fx.bcx.ins().bxor(temp, z); - } - let temp = fx.bcx.ins().ishl_imm(temp, i); - res1 = fx.bcx.ins().bor(res1, temp); + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert!(lane_ty.is_floating_point()); + assert!(ret_lane_ty.is_floating_point()); + assert_eq!(lane_count, ret_lane_count); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + + for idx in 0..lane_count { + let a_lane = 
a.value_lane(fx, idx).load_scalar(fx); + let b_lane = b.value_lane(fx, idx).load_scalar(fx); + let c_lane = c.value_lane(fx, idx).load_scalar(fx); + + let mul = fx.bcx.ins().fmul(a_lane, b_lane); + let res = if idx & 1 == 0 { + fx.bcx.ins().fadd(mul, c_lane) + } else { + fx.bcx.ins().fsub(mul, c_lane) + }; + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); } - ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted()); - - let mut res2 = fx.bcx.ins().iconst(types::I64, 0); - for i in 64..=127 { - let mut temp = fx.bcx.ins().iconst(types::I64, 0); - for j in i - 63..=63 { - let x = extract_bit(fx, temp1, j); - let y = extract_bit(fx, temp2, i - j); - let z = fx.bcx.ins().band(x, y); - temp = fx.bcx.ins().bxor(temp, z); - } - let temp = fx.bcx.ins().ishl_imm(temp, i); - res2 = fx.bcx.ins().bor(res2, temp); + } + + "llvm.x86.fma.vfnmadd.ps" + | "llvm.x86.fma.vfnmadd.pd" + | "llvm.x86.fma.vfnmadd.ps.256" + | "llvm.x86.fma.vfnmadd.pd.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps&ig_expand=3391 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd&ig_expand=3367 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps&ig_expand=3395 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd&ig_expand=3371 + intrinsic_args!(fx, args => (a, b, c); intrinsic); + + assert_eq!(a.layout(), b.layout()); + assert_eq!(a.layout(), c.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert!(lane_ty.is_floating_point()); + assert!(ret_lane_ty.is_floating_point()); + assert_eq!(lane_count, ret_lane_count); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + + for idx in 0..lane_count { + let a_lane = a.value_lane(fx, idx).load_scalar(fx); + let b_lane = b.value_lane(fx, idx).load_scalar(fx); + let c_lane = c.value_lane(fx, idx).load_scalar(fx); + + let mul = fx.bcx.ins().fmul(a_lane, b_lane); + let neg_mul = fx.bcx.ins().fneg(mul); + let res = fx.bcx.ins().fadd(neg_mul, c_lane); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); } - ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted()); + } + + "llvm.x86.sse42.pcmpestri128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri&ig_expand=939 + intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic); + + let a = a.load_scalar(fx); + let la = la.load_scalar(fx); + let b = b.load_scalar(fx); + let lb = lb.load_scalar(fx); + + let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4]) + { + imm8 + } else { + fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestri` is not a constant"); + }; + + let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8)); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String(format!("pcmpestri xmm0, xmm1, {imm8}"))], + &[ + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + value: a, + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + value: b, + }, + // Implicit argument to the pcmpestri intrinsic + CInlineAsmOperand::In { + 
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), + value: la, + }, + // Implicit argument to the pcmpestri intrinsic + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), + value: lb, + }, + // Implicit result of the pcmpestri intrinsic + CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)), + late: true, + place: Some(ret), + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.sse42.pcmpestrm128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm&ig_expand=940 + intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic); + + let a = a.load_scalar(fx); + let la = la.load_scalar(fx); + let b = b.load_scalar(fx); + let lb = lb.load_scalar(fx); + + let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4]) + { + imm8 + } else { + fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestrm` is not a constant"); + }; + + let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8)); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String(format!("pcmpestrm xmm0, xmm1, {imm8}"))], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + value: b, + }, + // Implicit argument to the pcmpestri intrinsic + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), + value: la, + }, + // Implicit argument to the pcmpestri intrinsic + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), + value: lb, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.pclmulqdq" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772 + intrinsic_args!(fx, args => (a, b, _imm8); intrinsic); + + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[2]) + { + imm8 + } else { + fx.tcx.sess.span_fatal( + span, + "Index argument for `_mm_clmulepi64_si128` is not a constant", + ); + }; + + let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8)); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String(format!("pclmulqdq xmm0, xmm1, {imm8}"))], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + value: b, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.aesni.aeskeygenassist" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128&ig_expand=261 + intrinsic_args!(fx, args => (a, _imm8); intrinsic); + + let a = a.load_scalar(fx); + + let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[1]) + { + imm8 + } else { + fx.tcx.sess.span_fatal( + span, + "Index argument for `_mm_aeskeygenassist_si128` is not a 
constant", + ); + }; + + let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8)); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String(format!("aeskeygenassist xmm0, xmm0, {imm8}"))], + &[CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.aesni.aesimc" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128&ig_expand=260 + intrinsic_args!(fx, args => (a); intrinsic); + + let a = a.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("aesimc xmm0, xmm0".to_string())], + &[CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.aesni.aesenc" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenc_si128&ig_expand=252 + intrinsic_args!(fx, args => (a, round_key); intrinsic); + + let a = a.load_scalar(fx); + let round_key = round_key.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("aesenc xmm0, xmm1".to_string())], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + value: round_key, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.aesni.aesenclast" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128&ig_expand=257 + intrinsic_args!(fx, args => (a, round_key); intrinsic); + + let a = a.load_scalar(fx); + let round_key = round_key.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("aesenclast xmm0, xmm1".to_string())], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + value: round_key, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.aesni.aesdec" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128&ig_expand=242 + intrinsic_args!(fx, args => (a, round_key); intrinsic); + + let a = a.load_scalar(fx); + let round_key = round_key.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("aesdec xmm0, xmm1".to_string())], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + value: round_key, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.aesni.aesdeclast" => { + // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128&ig_expand=247 + intrinsic_args!(fx, args => (a, round_key); intrinsic); + + let a = a.load_scalar(fx); + let round_key = round_key.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("aesdeclast xmm0, xmm1".to_string())], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + value: round_key, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.sha256rnds2" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256rnds2_epu32&ig_expand=5977 + intrinsic_args!(fx, args => (a, b, k); intrinsic); + + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + let k = k.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("sha256rnds2 xmm1, xmm2".to_string())], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)), + value: b, + }, + // Implicit argument to the sha256rnds2 instruction + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)), + value: k, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.sha256msg1" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg1_epu32&ig_expand=5975 + intrinsic_args!(fx, args => (a, b); intrinsic); + + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("sha256msg1 xmm1, xmm2".to_string())], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)), + value: b, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); + } + + "llvm.x86.sha256msg2" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sha256msg2_epu32&ig_expand=5976 + intrinsic_args!(fx, args => (a, b); intrinsic); + + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + codegen_inline_asm_inner( + fx, + &[InlineAsmTemplatePiece::String("sha256msg2 xmm1, xmm2".to_string())], + &[ + CInlineAsmOperand::InOut { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)), + _late: true, + in_value: a, + out_place: Some(ret), + }, + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm2)), + value: b, + }, + ], + InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM, + ); } "llvm.x86.avx.ptestz.256" => { diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index bfeeb117f..68126f124 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -487,13 
+487,12 @@ fn codegen_regular_intrinsic_call<'tcx>( let layout = fx.layout_of(generic_args.type_at(0)); // Note: Can't use is_unsized here as truly unsized types need to take the fixed size // branch - let size = if let Abi::ScalarPair(_, _) = ptr.layout().abi { - let (_ptr, info) = ptr.load_scalar_pair(fx); - let (size, _align) = crate::unsize::size_and_align_of_dst(fx, layout, info); - size + let meta = if let Abi::ScalarPair(_, _) = ptr.layout().abi { + Some(ptr.load_scalar_pair(fx).1) } else { - fx.bcx.ins().iconst(fx.pointer_type, layout.size.bytes() as i64) + None }; + let (size, _align) = crate::unsize::size_and_align_of(fx, layout, meta); ret.write_cvalue(fx, CValue::by_val(size, usize_layout)); } sym::min_align_of_val => { @@ -502,13 +501,12 @@ fn codegen_regular_intrinsic_call<'tcx>( let layout = fx.layout_of(generic_args.type_at(0)); // Note: Can't use is_unsized here as truly unsized types need to take the fixed size // branch - let align = if let Abi::ScalarPair(_, _) = ptr.layout().abi { - let (_ptr, info) = ptr.load_scalar_pair(fx); - let (_size, align) = crate::unsize::size_and_align_of_dst(fx, layout, info); - align + let meta = if let Abi::ScalarPair(_, _) = ptr.layout().abi { + Some(ptr.load_scalar_pair(fx).1) } else { - fx.bcx.ins().iconst(fx.pointer_type, layout.align.abi.bytes() as i64) + None }; + let (_size, align) = crate::unsize::size_and_align_of(fx, layout, meta); ret.write_cvalue(fx, CValue::by_val(align, usize_layout)); } @@ -688,7 +686,7 @@ fn codegen_regular_intrinsic_call<'tcx>( } }) }); - crate::base::codegen_panic_nounwind(fx, &msg_str, source_info); + crate::base::codegen_panic_nounwind(fx, &msg_str, Some(source_info.span)); return; } } diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs index ea137c4ca..fe4f073f7 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs @@ -1,7 +1,6 @@ //! Codegen `extern "platform-intrinsic"` intrinsics. 
-use rustc_middle::ty::GenericArgsRef; -use rustc_span::Symbol; +use cranelift_codegen::ir::immediates::Offset32; use rustc_target::abi::Endian; use super::*; @@ -282,11 +281,11 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( fx.tcx.sess.span_fatal(span, "Index argument for `simd_insert` is not a constant"); }; - let idx = idx_const - .try_to_bits(Size::from_bytes(4 /* u32*/)) - .unwrap_or_else(|| panic!("kind not scalar: {:?}", idx_const)); + let idx: u32 = idx_const + .try_to_u32() + .unwrap_or_else(|_| panic!("kind not scalar: {:?}", idx_const)); let (lane_count, _lane_ty) = base.layout().ty.simd_size_and_type(fx.tcx); - if idx >= lane_count.into() { + if u64::from(idx) >= lane_count { fx.tcx.sess.span_fatal( fx.mir.span, format!("[simd_insert] idx {} >= lane_count {}", idx, lane_count), @@ -331,10 +330,10 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( }; let idx = idx_const - .try_to_bits(Size::from_bytes(4 /* u32*/)) - .unwrap_or_else(|| panic!("kind not scalar: {:?}", idx_const)); + .try_to_u32() + .unwrap_or_else(|_| panic!("kind not scalar: {:?}", idx_const)); let (lane_count, _lane_ty) = v.layout().ty.simd_size_and_type(fx.tcx); - if idx >= lane_count.into() { + if u64::from(idx) >= lane_count { fx.tcx.sess.span_fatal( fx.mir.span, format!("[simd_extract] idx {} >= lane_count {}", idx, lane_count), @@ -1008,8 +1007,57 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>( } } + sym::simd_masked_load => { + intrinsic_args!(fx, args => (mask, ptr, val); intrinsic); + + let (val_lane_count, val_lane_ty) = val.layout().ty.simd_size_and_type(fx.tcx); + let (mask_lane_count, _mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(val_lane_count, mask_lane_count); + assert_eq!(val_lane_count, ret_lane_count); + + let lane_clif_ty = fx.clif_type(val_lane_ty).unwrap(); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + let ptr_val = ptr.load_scalar(fx); + + for lane_idx in 0..ret_lane_count { + let val_lane = val.value_lane(fx, lane_idx).load_scalar(fx); + let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx); + + let if_enabled = fx.bcx.create_block(); + let if_disabled = fx.bcx.create_block(); + let next = fx.bcx.create_block(); + let res_lane = fx.bcx.append_block_param(next, lane_clif_ty); + + fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]); + fx.bcx.seal_block(if_enabled); + fx.bcx.seal_block(if_disabled); + + fx.bcx.switch_to_block(if_enabled); + let offset = lane_idx as i32 * lane_clif_ty.bytes() as i32; + let res = fx.bcx.ins().load( + lane_clif_ty, + MemFlags::trusted(), + ptr_val, + Offset32::new(offset), + ); + fx.bcx.ins().jump(next, &[res]); + + fx.bcx.switch_to_block(if_disabled); + fx.bcx.ins().jump(next, &[val_lane]); + + fx.bcx.seal_block(next); + fx.bcx.switch_to_block(next); + + fx.bcx.ins().nop(); + + ret.place_lane(fx, lane_idx) + .write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout)); + } + } + sym::simd_scatter => { - intrinsic_args!(fx, args => (val, ptr, mask); intrinsic); + intrinsic_args!(fx, args => (mask, ptr, val); intrinsic); let (val_lane_count, _val_lane_ty) = val.layout().ty.simd_size_and_type(fx.tcx); let (ptr_lane_count, _ptr_lane_ty) = ptr.layout().ty.simd_size_and_type(fx.tcx); diff --git a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs b/compiler/rustc_codegen_cranelift/src/pretty_clif.rs index da84e54a9..196418023 100644 --- a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs +++ 
diff --git a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs b/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
index da84e54a9..196418023 100644
--- a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
+++ b/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
@@ -58,11 +58,10 @@
 use std::fmt;
 use std::io::Write;
 
-use cranelift_codegen::{
-    entity::SecondaryMap,
-    ir::entities::AnyEntity,
-    write::{FuncWriter, PlainWriter},
-};
+use cranelift_codegen::entity::SecondaryMap;
+use cranelift_codegen::ir::entities::AnyEntity;
+use cranelift_codegen::ir::Fact;
+use cranelift_codegen::write::{FuncWriter, PlainWriter};
 use rustc_middle::ty::layout::FnAbiOf;
 use rustc_middle::ty::print::with_no_trimmed_paths;
 use rustc_session::config::{OutputFilenames, OutputType};
@@ -155,8 +154,13 @@ impl FuncWriter for &'_ CommentWriter {
         _func: &Function,
         entity: AnyEntity,
         value: &dyn fmt::Display,
+        maybe_fact: Option<&Fact>,
     ) -> fmt::Result {
-        write!(w, " {} = {}", entity, value)?;
+        if let Some(fact) = maybe_fact {
+            write!(w, " {} ! {} = {}", entity, fact, value)?;
+        } else {
+            write!(w, " {} = {}", entity, value)?;
+        }
 
         if let Some(comment) = self.entity_comments.get(&entity) {
             writeln!(w, " ; {}", comment.replace('\n', "\n; "))
@@ -227,9 +231,8 @@ pub(crate) fn write_ir_file(
     let res = std::fs::File::create(clif_file_name).and_then(|mut file| write(&mut file));
     if let Err(err) = res {
         // Using early_warn as no Session is available here
-        let handler = rustc_session::EarlyErrorHandler::new(
-            rustc_session::config::ErrorOutputType::default(),
-        );
+        let handler =
+            rustc_session::EarlyDiagCtxt::new(rustc_session::config::ErrorOutputType::default());
         handler.early_warn(format!("error writing ir file: {}", err));
     }
 }
diff --git a/compiler/rustc_codegen_cranelift/src/unsize.rs b/compiler/rustc_codegen_cranelift/src/unsize.rs
index c6133f2b3..f777e1137 100644
--- a/compiler/rustc_codegen_cranelift/src/unsize.rs
+++ b/compiler/rustc_codegen_cranelift/src/unsize.rs
@@ -2,6 +2,9 @@
 //!
 //! [`PointerCoercion::Unsize`]: `rustc_middle::ty::adjustment::PointerCoercion::Unsize`
 
+use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
+
+use crate::base::codegen_panic_nounwind;
 use crate::prelude::*;
 
 // Adapted from https://github.com/rust-lang/rust/blob/2a663555ddf36f6b041445894a8c175cd1bc718c/src/librustc_codegen_ssa/base.rs#L159-L307
@@ -187,63 +190,113 @@ pub(crate) fn coerce_dyn_star<'tcx>(
 
 // Adapted from https://github.com/rust-lang/rust/blob/2a663555ddf36f6b041445894a8c175cd1bc718c/src/librustc_codegen_ssa/glue.rs
 
-pub(crate) fn size_and_align_of_dst<'tcx>(
+pub(crate) fn size_and_align_of<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     layout: TyAndLayout<'tcx>,
-    info: Value,
+    info: Option<Value>,
 ) -> (Value, Value) {
-    assert!(layout.is_unsized() || layout.abi == Abi::Uninhabited);
-    match layout.ty.kind() {
+    if layout.is_sized() {
+        return (
+            fx.bcx.ins().iconst(fx.pointer_type, layout.size.bytes() as i64),
+            fx.bcx.ins().iconst(fx.pointer_type, layout.align.abi.bytes() as i64),
+        );
+    }
+
+    let ty = layout.ty;
+    match ty.kind() {
         ty::Dynamic(..) => {
             // load size/align from vtable
-            (crate::vtable::size_of_obj(fx, info), crate::vtable::min_align_of_obj(fx, info))
+            (
+                crate::vtable::size_of_obj(fx, info.unwrap()),
+                crate::vtable::min_align_of_obj(fx, info.unwrap()),
+            )
         }
         ty::Slice(_) | ty::Str => {
             let unit = layout.field(fx, 0);
             // The info in this case is the length of the str, so the size is that
             // times the unit size.
             (
-                fx.bcx.ins().imul_imm(info, unit.size.bytes() as i64),
+                fx.bcx.ins().imul_imm(info.unwrap(), unit.size.bytes() as i64),
                 fx.bcx.ins().iconst(fx.pointer_type, unit.align.abi.bytes() as i64),
             )
         }
-        _ => {
+        ty::Foreign(_) => {
+            let trap_block = fx.bcx.create_block();
+            let true_ = fx.bcx.ins().iconst(types::I8, 1);
+            let next_block = fx.bcx.create_block();
+            fx.bcx.ins().brif(true_, trap_block, &[], next_block, &[]);
+            fx.bcx.seal_block(trap_block);
+            fx.bcx.seal_block(next_block);
+            fx.bcx.switch_to_block(trap_block);
+
+            // `extern` type. We cannot compute the size, so panic.
+            let msg_str = with_no_visible_paths!({
+                with_no_trimmed_paths!({
+                    format!("attempted to compute the size or alignment of extern type `{ty}`")
+                })
+            });
+
+            codegen_panic_nounwind(fx, &msg_str, None);
+
+            fx.bcx.switch_to_block(next_block);
+
+            // This function does not return so we can now return whatever we want.
+            let size = fx.bcx.ins().iconst(fx.pointer_type, 42);
+            let align = fx.bcx.ins().iconst(fx.pointer_type, 42);
+            (size, align)
+        }
+        ty::Adt(..) | ty::Tuple(..) => {
             // First get the size of all statically known fields.
             // Don't use size_of because it also rounds up to alignment, which we
             // want to avoid, as the unsized field's alignment could be smaller.
             assert!(!layout.ty.is_simd());
 
             let i = layout.fields.count() - 1;
-            let sized_size = layout.fields.offset(i).bytes();
+            let unsized_offset_unadjusted = layout.fields.offset(i).bytes();
+            let unsized_offset_unadjusted =
+                fx.bcx.ins().iconst(fx.pointer_type, unsized_offset_unadjusted as i64);
             let sized_align = layout.align.abi.bytes();
             let sized_align = fx.bcx.ins().iconst(fx.pointer_type, sized_align as i64);
 
             // Recurse to get the size of the dynamically sized field (must be
             // the last field).
             let field_layout = layout.field(fx, i);
-            let (unsized_size, mut unsized_align) = size_and_align_of_dst(fx, field_layout, info);
-
-            // FIXME (#26403, #27023): We should be adding padding
-            // to `sized_size` (to accommodate the `unsized_align`
-            // required of the unsized field that follows) before
-            // summing it with `sized_size`. (Note that since #26403
-            // is unfixed, we do not yet add the necessary padding
-            // here. But this is where the add would go.)
-
-            // Return the sum of sizes and max of aligns.
-            let size = fx.bcx.ins().iadd_imm(unsized_size, sized_size as i64);
-
-            // Packed types ignore the alignment of their fields.
-            if let ty::Adt(def, _) = layout.ty.kind() {
-                if def.repr().packed() {
-                    unsized_align = sized_align;
+            let (unsized_size, mut unsized_align) = size_and_align_of(fx, field_layout, info);
+
+            // # First compute the dynamic alignment
+
+            // For packed types, we need to cap the alignment.
+            if let ty::Adt(def, _) = ty.kind() {
+                if let Some(packed) = def.repr().pack {
+                    if packed.bytes() == 1 {
+                        // We know this will be capped to 1.
+                        unsized_align = fx.bcx.ins().iconst(fx.pointer_type, 1);
+                    } else {
+                        // We have to dynamically compute `min(unsized_align, packed)`.
+                        let packed = fx.bcx.ins().iconst(fx.pointer_type, packed.bytes() as i64);
+                        let cmp = fx.bcx.ins().icmp(IntCC::UnsignedLessThan, unsized_align, packed);
+                        unsized_align = fx.bcx.ins().select(cmp, unsized_align, packed);
+                    }
                 }
             }
 
             // Choose max of two known alignments (combined value must
             // be aligned according to more restrictive of the two).
             let cmp = fx.bcx.ins().icmp(IntCC::UnsignedGreaterThan, sized_align, unsized_align);
-            let align = fx.bcx.ins().select(cmp, sized_align, unsized_align);
+            let full_align = fx.bcx.ins().select(cmp, sized_align, unsized_align);
+
+            // # Then compute the dynamic size
+
+            // The full formula for the size would be:
+            // let unsized_offset_adjusted = unsized_offset_unadjusted.align_to(unsized_align);
+            // let full_size = (unsized_offset_adjusted + unsized_size).align_to(full_align);
+            // However, `unsized_size` is a multiple of `unsized_align`.
+            // Therefore, we can equivalently do the `align_to(unsized_align)` *after* adding `unsized_size`:
+            // let full_size = (unsized_offset_unadjusted + unsized_size).align_to(unsized_align).align_to(full_align);
+            // Furthermore, `align >= unsized_align`, and therefore we only need to do:
+            // let full_size = (unsized_offset_unadjusted + unsized_size).align_to(full_align);
+
+            let full_size = fx.bcx.ins().iadd(unsized_offset_unadjusted, unsized_size);
 
             // Issue #27023: must add any necessary padding to `size`
             // (to make it a multiple of `align`) before returning it.
@@ -255,12 +308,13 @@ pub(crate) fn size_and_align_of_dst<'tcx>(
             // emulated via the semi-standard fast bit trick:
             //
             //   `(size + (align-1)) & -align`
-            let addend = fx.bcx.ins().iadd_imm(align, -1);
-            let add = fx.bcx.ins().iadd(size, addend);
-            let neg = fx.bcx.ins().ineg(align);
-            let size = fx.bcx.ins().band(add, neg);
+            let addend = fx.bcx.ins().iadd_imm(full_align, -1);
+            let add = fx.bcx.ins().iadd(full_size, addend);
+            let neg = fx.bcx.ins().ineg(full_align);
+            let full_size = fx.bcx.ins().band(add, neg);
 
-            (size, align)
+            (full_size, full_align)
         }
+        _ => bug!("size_and_align_of_dst: {ty} not supported"),
     }
 }
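The rewritten tail of `size_and_align_of` leans on the power-of-two rounding identity `(size + (align - 1)) & -align`, which rounds `size` up to the next multiple of `align`. A quick sanity check of both the identity and the `full_size` formula from the comments above, using an illustrative DST (`WithTail` is not from the patch):

    fn align_to(size: u64, align: u64) -> u64 {
        // Only valid because alignments are always powers of two.
        (size + (align - 1)) & align.wrapping_neg()
    }

    struct WithTail<T: ?Sized> {
        _byte: u8,
        tail: T,
    }

    fn main() {
        assert_eq!(align_to(5, 8), 8);   // rounds up
        assert_eq!(align_to(16, 8), 16); // already aligned, unchanged

        // full_size = align_to(unsized_offset_unadjusted + unsized_size, full_align):
        // the tail starts at offset 4, the [u32] data occupies 12 bytes, align is 4.
        let v: &WithTail<[u32]> = &WithTail { _byte: 0, tail: [1u32, 2, 3] };
        assert_eq!(std::mem::size_of_val(v), align_to(4 + 12, 4) as usize);
    }
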
diff --git a/compiler/rustc_codegen_cranelift/src/value_and_place.rs b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
index 21ad2a835..567a5669d 100644
--- a/compiler/rustc_codegen_cranelift/src/value_and_place.rs
+++ b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
@@ -20,34 +20,36 @@ fn codegen_field<'tcx>(
         (base.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap()), field_layout)
     };
 
-    if let Some(extra) = extra {
-        if field_layout.is_sized() {
-            return simple(fx);
-        }
-        match field_layout.ty.kind() {
-            ty::Slice(..) | ty::Str | ty::Foreign(..) => simple(fx),
-            ty::Adt(def, _) if def.repr().packed() => {
-                assert_eq!(layout.align.abi.bytes(), 1);
-                simple(fx)
-            }
-            _ => {
-                // We have to align the offset for DST's
-                let unaligned_offset = field_offset.bytes();
-                let (_, unsized_align) =
-                    crate::unsize::size_and_align_of_dst(fx, field_layout, extra);
+    if field_layout.is_sized() {
+        return simple(fx);
+    }
+    match field_layout.ty.kind() {
+        ty::Slice(..) | ty::Str => simple(fx),
+        _ => {
+            let unaligned_offset = field_offset.bytes();
 
-                let one = fx.bcx.ins().iconst(fx.pointer_type, 1);
-                let align_sub_1 = fx.bcx.ins().isub(unsized_align, one);
-                let and_lhs = fx.bcx.ins().iadd_imm(align_sub_1, unaligned_offset as i64);
-                let zero = fx.bcx.ins().iconst(fx.pointer_type, 0);
-                let and_rhs = fx.bcx.ins().isub(zero, unsized_align);
-                let offset = fx.bcx.ins().band(and_lhs, and_rhs);
+            // Get the alignment of the field
+            let (_, mut unsized_align) = crate::unsize::size_and_align_of(fx, field_layout, extra);
 
-                (base.offset_value(fx, offset), field_layout)
+            // For packed types, we need to cap alignment.
+            if let ty::Adt(def, _) = layout.ty.kind() {
+                if let Some(packed) = def.repr().pack {
+                    let packed = fx.bcx.ins().iconst(fx.pointer_type, packed.bytes() as i64);
+                    let cmp = fx.bcx.ins().icmp(IntCC::UnsignedLessThan, unsized_align, packed);
+                    unsized_align = fx.bcx.ins().select(cmp, unsized_align, packed);
+                }
             }
+
+            // Bump the unaligned offset up to the appropriate alignment
+            let one = fx.bcx.ins().iconst(fx.pointer_type, 1);
+            let align_sub_1 = fx.bcx.ins().isub(unsized_align, one);
+            let and_lhs = fx.bcx.ins().iadd_imm(align_sub_1, unaligned_offset as i64);
+            let zero = fx.bcx.ins().iconst(fx.pointer_type, 0);
+            let and_rhs = fx.bcx.ins().isub(zero, unsized_align);
+            let offset = fx.bcx.ins().band(and_lhs, and_rhs);
+
+            (base.offset_value(fx, offset), field_layout)
         }
-    } else {
-        simple(fx)
     }
 }
@@ -329,7 +331,13 @@ impl<'tcx> CValue<'tcx> {
                 let msb = fx.bcx.ins().iconst(types::I64, (const_val >> 64) as u64 as i64);
                 fx.bcx.ins().iconcat(lsb, msb)
             }
-            ty::Bool | ty::Char | ty::Uint(_) | ty::Int(_) | ty::Ref(..) | ty::RawPtr(..) => {
+            ty::Bool
+            | ty::Char
+            | ty::Uint(_)
+            | ty::Int(_)
+            | ty::Ref(..)
+            | ty::RawPtr(..)
+            | ty::FnPtr(..) => {
                 let raw_val = const_val.size().truncate(const_val.to_bits(layout.size).unwrap());
                 fx.bcx.ins().iconst(clif_ty, raw_val as i64)
             }
@@ -725,13 +733,8 @@ impl<'tcx> CPlace<'tcx> {
         };
 
         let (field_ptr, field_layout) = codegen_field(fx, base, extra, layout, field);
-        if field_layout.is_unsized() {
-            if let ty::Foreign(_) = field_layout.ty.kind() {
-                assert!(extra.is_none());
-                CPlace::for_ptr(field_ptr, field_layout)
-            } else {
-                CPlace::for_ptr_with_extra(field_ptr, extra.unwrap(), field_layout)
-            }
+        if has_ptr_meta(fx.tcx, field_layout.ty) {
+            CPlace::for_ptr_with_extra(field_ptr, extra.unwrap(), field_layout)
         } else {
             CPlace::for_ptr(field_ptr, field_layout)
         }
@@ -971,6 +974,32 @@ pub(crate) fn assert_assignable<'tcx>(
                 }
             }
         }
+        (&ty::Coroutine(def_id_a, args_a, mov_a), &ty::Coroutine(def_id_b, args_b, mov_b))
+            if def_id_a == def_id_b && mov_a == mov_b =>
+        {
+            let mut types_a = args_a.types();
+            let mut types_b = args_b.types();
+            loop {
+                match (types_a.next(), types_b.next()) {
+                    (Some(a), Some(b)) => assert_assignable(fx, a, b, limit - 1),
+                    (None, None) => return,
+                    (Some(_), None) | (None, Some(_)) => panic!("{:#?}/{:#?}", from_ty, to_ty),
+                }
+            }
+        }
+        (&ty::CoroutineWitness(def_id_a, args_a), &ty::CoroutineWitness(def_id_b, args_b))
+            if def_id_a == def_id_b =>
+        {
+            let mut types_a = args_a.types();
+            let mut types_b = args_b.types();
+            loop {
+                match (types_a.next(), types_b.next()) {
+                    (Some(a), Some(b)) => assert_assignable(fx, a, b, limit - 1),
+                    (None, None) => return,
+                    (Some(_), None) | (None, Some(_)) => panic!("{:#?}/{:#?}", from_ty, to_ty),
+                }
+            }
+        }
         (ty::Param(_), _) | (_, ty::Param(_)) if fx.tcx.sess.opts.unstable_opts.polymorphize => {
             // No way to check if it is correct or not with polymorphization enabled
         }
-- 
cgit v1.2.3
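The `field_place` hunk above replaces the hand-written `ty::Foreign` special case with a single question, `has_ptr_meta`: would a pointer to the field's type be fat? Only in that case does the place keep the `extra` metadata. The thin/fat split is observable from safe Rust (a sketch, not the backend's actual test):

    use std::mem::size_of;

    fn main() {
        let thin = size_of::<*const u8>();

        // Sized field types get thin pointers: no metadata to carry around.
        assert_eq!(size_of::<*const [u8; 4]>(), thin);

        // Slices, str and trait objects need fat pointers, so a place for such
        // a field has to keep the length / vtable metadata next to the pointer.
        assert_eq!(size_of::<*const [u8]>(), 2 * thin);
        assert_eq!(size_of::<*const str>(), 2 * thin);
        assert_eq!(size_of::<*const dyn std::fmt::Debug>(), 2 * thin);
    }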