29 files changed, 1298 insertions, 718 deletions
diff --git a/compiler/rustc_codegen_cranelift/src/abi/mod.rs b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
index 5d775b9b5..c4572e035 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/mod.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/mod.rs
@@ -6,6 +6,7 @@ mod returning;
 
 use std::borrow::Cow;
 
+use cranelift_codegen::ir::{AbiParam, SigRef};
 use cranelift_module::ModuleError;
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrFlags;
 use rustc_middle::ty::layout::FnAbiOf;
@@ -13,12 +14,9 @@ use rustc_session::Session;
 use rustc_target::abi::call::{Conv, FnAbi};
 use rustc_target::spec::abi::Abi;
 
-use cranelift_codegen::ir::{AbiParam, SigRef};
-
 use self::pass_mode::*;
-use crate::prelude::*;
-
 pub(crate) use self::returning::codegen_return;
+use crate::prelude::*;
 
 fn clif_sig_from_fn_abi<'tcx>(
     tcx: TyCtxt<'tcx>,
@@ -30,7 +28,7 @@ fn clif_sig_from_fn_abi<'tcx>(
     let inputs = fn_abi.args.iter().flat_map(|arg_abi| arg_abi.get_abi_param(tcx).into_iter());
 
     let (return_ptr, returns) = fn_abi.ret.get_abi_return(tcx);
-    // Sometimes the first param is an pointer to the place where the return value needs to be stored.
+    // Sometimes the first param is a pointer to the place where the return value needs to be stored.
     let params: Vec<_> = return_ptr.into_iter().chain(inputs).collect();
 
     Signature { params, returns, call_conv }
@@ -122,32 +120,25 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> {
         args: &[Value],
     ) -> Cow<'_, [Value]> {
         if self.tcx.sess.target.is_like_windows {
-            let (mut params, mut args): (Vec<_>, Vec<_>) =
-                params
-                    .into_iter()
-                    .zip(args)
-                    .map(|(param, &arg)| {
-                        if param.value_type == types::I128 {
-                            let arg_ptr = Pointer::stack_slot(self.bcx.create_sized_stack_slot(
-                                StackSlotData { kind: StackSlotKind::ExplicitSlot, size: 16 },
-                            ));
-                            arg_ptr.store(self, arg, MemFlags::trusted());
-                            (AbiParam::new(self.pointer_type), arg_ptr.get_addr(self))
-                        } else {
-                            (param, arg)
-                        }
-                    })
-                    .unzip();
+            let (mut params, mut args): (Vec<_>, Vec<_>) = params
+                .into_iter()
+                .zip(args)
+                .map(|(param, &arg)| {
+                    if param.value_type == types::I128 {
+                        let arg_ptr = self.create_stack_slot(16, 16);
+                        arg_ptr.store(self, arg, MemFlags::trusted());
+                        (AbiParam::new(self.pointer_type), arg_ptr.get_addr(self))
+                    } else {
+                        (param, arg)
+                    }
+                })
+                .unzip();
 
             let indirect_ret_val = returns.len() == 1 && returns[0].value_type == types::I128;
 
             if indirect_ret_val {
                 params.insert(0, AbiParam::new(self.pointer_type));
-                let ret_ptr =
-                    Pointer::stack_slot(self.bcx.create_sized_stack_slot(StackSlotData {
-                        kind: StackSlotKind::ExplicitSlot,
-                        size: 16,
-                    }));
+                let ret_ptr = self.create_stack_slot(16, 16);
                 args.insert(0, ret_ptr.get_addr(self));
                 self.lib_call_unadjusted(name, params, vec![], &args);
                 return Cow::Owned(vec![ret_ptr.load(self, types::I128, MemFlags::trusted())]);
diff --git a/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs b/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs
index 0d16da480..065226700 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/pass_mode.rs
@@ -1,14 +1,14 @@
 //! Argument passing
 
-use crate::prelude::*;
-use crate::value_and_place::assert_assignable;
-
 use cranelift_codegen::ir::{ArgumentExtension, ArgumentPurpose};
 use rustc_target::abi::call::{
     ArgAbi, ArgAttributes, ArgExtension as RustcArgExtension, CastTarget, PassMode, Reg, RegKind,
 };
 use smallvec::{smallvec, SmallVec};
 
+use crate::prelude::*;
+use crate::value_and_place::assert_assignable;
+
 pub(super) trait ArgAbiExt<'tcx> {
     fn get_abi_param(&self, tcx: TyCtxt<'tcx>) -> SmallVec<[AbiParam; 2]>;
     fn get_abi_return(&self, tcx: TyCtxt<'tcx>) -> (Option<AbiParam>, Vec<AbiParam>);
@@ -189,16 +189,13 @@ pub(super) fn from_casted_value<'tcx>(
     let abi_params = cast_target_to_abi_params(cast);
     let abi_param_size: u32 = abi_params.iter().map(|param| param.value_type.bytes()).sum();
     let layout_size = u32::try_from(layout.size.bytes()).unwrap();
-    let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-        kind: StackSlotKind::ExplicitSlot,
-        // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-        // specify stack slot alignment.
+    let ptr = fx.create_stack_slot(
         // Stack slot size may be bigger for example `[u8; 3]` which is packed into an `i32`.
         // It may also be smaller for example when the type is a wrapper around an integer with a
         // larger alignment than the integer.
-        size: (std::cmp::max(abi_param_size, layout_size) + 15) / 16 * 16,
-    });
-    let ptr = Pointer::stack_slot(stack_slot);
+        std::cmp::max(abi_param_size, layout_size),
+        u32::try_from(layout.align.pref.bytes()).unwrap(),
+    );
     let mut offset = 0;
     let mut block_params_iter = block_params.iter().copied();
     for param in abi_params {
diff --git a/compiler/rustc_codegen_cranelift/src/abi/returning.rs b/compiler/rustc_codegen_cranelift/src/abi/returning.rs
index 646fb4a3c..0799a22c6 100644
--- a/compiler/rustc_codegen_cranelift/src/abi/returning.rs
+++ b/compiler/rustc_codegen_cranelift/src/abi/returning.rs
@@ -1,10 +1,10 @@
 //! Return value handling
 
-use crate::prelude::*;
-
 use rustc_target::abi::call::{ArgAbi, PassMode};
 use smallvec::{smallvec, SmallVec};
 
+use crate::prelude::*;
+
 /// Return a place where the return value of the current function can be written to. If necessary
 /// this adds an extra parameter pointing to where the return value needs to be stored.
 pub(super) fn codegen_return_param<'tcx>(
diff --git a/compiler/rustc_codegen_cranelift/src/allocator.rs b/compiler/rustc_codegen_cranelift/src/allocator.rs
index 4e4c595de..e8af3e8c2 100644
--- a/compiler/rustc_codegen_cranelift/src/allocator.rs
+++ b/compiler/rustc_codegen_cranelift/src/allocator.rs
@@ -1,8 +1,6 @@
 //! Allocator shim
 // Adapted from rustc
 
-use crate::prelude::*;
-
 use rustc_ast::expand::allocator::{
     alloc_error_handler_name, default_fn_name, global_fn_name, AllocatorKind, AllocatorTy,
     ALLOCATOR_METHODS, NO_ALLOC_SHIM_IS_UNSTABLE,
@@ -10,6 +8,8 @@ use rustc_ast::expand::allocator::{
 use rustc_codegen_ssa::base::allocator_kind_for_codegen;
 use rustc_session::config::OomStrategy;
 
+use crate::prelude::*;
+
 /// Returns whether an allocator shim was created
 pub(crate) fn codegen(
     tcx: TyCtxt<'_>,
diff --git a/compiler/rustc_codegen_cranelift/src/analyze.rs b/compiler/rustc_codegen_cranelift/src/analyze.rs
index 359d581c1..321612238 100644
--- a/compiler/rustc_codegen_cranelift/src/analyze.rs
+++ b/compiler/rustc_codegen_cranelift/src/analyze.rs
@@ -1,11 +1,11 @@
 //! SSA analysis
 
-use crate::prelude::*;
-
 use rustc_index::IndexVec;
 use rustc_middle::mir::StatementKind::*;
 use rustc_middle::ty::Ty;
 
+use crate::prelude::*;
+
 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub(crate) enum SsaKind {
     NotSsa,
diff --git a/compiler/rustc_codegen_cranelift/src/base.rs b/compiler/rustc_codegen_cranelift/src/base.rs
index 0a451dad9..91b1547cb 100644
--- a/compiler/rustc_codegen_cranelift/src/base.rs
+++ b/compiler/rustc_codegen_cranelift/src/base.rs
@@ -1,15 +1,14 @@
 //! Codegen of a single function
 
+use cranelift_codegen::ir::UserFuncName;
+use cranelift_codegen::CodegenError;
+use cranelift_module::ModuleError;
 use rustc_ast::InlineAsmOptions;
 use rustc_index::IndexVec;
 use rustc_middle::ty::adjustment::PointerCoercion;
 use rustc_middle::ty::layout::FnAbiOf;
 use rustc_middle::ty::print::with_no_trimmed_paths;
 
-use cranelift_codegen::ir::UserFuncName;
-use cranelift_codegen::CodegenError;
-use cranelift_module::ModuleError;
-
 use crate::constant::ConstantCx;
 use crate::debuginfo::FunctionDebugContext;
 use crate::prelude::*;
@@ -250,17 +249,6 @@ pub(crate) fn verify_func(
 }
 
 fn codegen_fn_body(fx: &mut FunctionCx<'_, '_, '_>, start_block: Block) {
-    if let Err(err) =
-        fx.mir.post_mono_checks(fx.tcx, ty::ParamEnv::reveal_all(), |c| Ok(fx.monomorphize(c)))
-    {
-        err.emit_err(fx.tcx);
-        fx.bcx.append_block_params_for_function_params(fx.block_map[START_BLOCK]);
-        fx.bcx.switch_to_block(fx.block_map[START_BLOCK]);
-        // compilation should have been aborted
-        fx.bcx.ins().trap(TrapCode::UnreachableCodeReached);
-        return;
-    }
-
     let arg_uninhabited = fx
         .mir
         .args_iter()
@@ -490,7 +478,7 @@ fn codegen_fn_body(fx: &mut FunctionCx<'_, '_, '_>, start_block: Block) {
             TerminatorKind::Yield { .. }
             | TerminatorKind::FalseEdge { .. }
             | TerminatorKind::FalseUnwind { .. }
-            | TerminatorKind::GeneratorDrop => {
+            | TerminatorKind::CoroutineDrop => {
                 bug!("shouldn't exist at codegen {:?}", bb_data.terminator());
             }
             TerminatorKind::Drop { place, target, unwind: _, replace: _ } => {
@@ -778,7 +766,7 @@ fn codegen_stmt<'tcx>(
                         NullOp::SizeOf => layout.size.bytes(),
                         NullOp::AlignOf => layout.align.abi.bytes(),
                         NullOp::OffsetOf(fields) => {
-                            layout.offset_of_subfield(fx, fields.iter().map(|f| f.index())).bytes()
+                            layout.offset_of_subfield(fx, fields.iter()).bytes()
                         }
                     };
                     let val = CValue::by_val(
diff --git a/compiler/rustc_codegen_cranelift/src/cast.rs b/compiler/rustc_codegen_cranelift/src/cast.rs
index 6bf3a866b..0b5cb1547 100644
--- a/compiler/rustc_codegen_cranelift/src/cast.rs
+++ b/compiler/rustc_codegen_cranelift/src/cast.rs
@@ -104,11 +104,7 @@ pub(crate) fn clif_int_or_float_cast(
                     &[from],
                 )[0];
                 // FIXME(bytecodealliance/wasmtime#6104) use bitcast instead of store to get from i64x2 to i128
-                let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-                    kind: StackSlotKind::ExplicitSlot,
-                    size: 16,
-                });
-                let ret_ptr = Pointer::stack_slot(stack_slot);
+                let ret_ptr = fx.create_stack_slot(16, 16);
                 ret_ptr.store(fx, ret, MemFlags::trusted());
                 ret_ptr.load(fx, types::I128, MemFlags::trusted())
             } else {
@@ -129,8 +125,8 @@ pub(crate) fn clif_int_or_float_cast(
             let (min, max) = match (to_ty, to_signed) {
                 (types::I8, false) => (0, i64::from(u8::MAX)),
                 (types::I16, false) => (0, i64::from(u16::MAX)),
-                (types::I8, true) => (i64::from(i8::MIN), i64::from(i8::MAX)),
-                (types::I16, true) => (i64::from(i16::MIN), i64::from(i16::MAX)),
+                (types::I8, true) => (i64::from(i8::MIN as u32), i64::from(i8::MAX as u32)),
+                (types::I16, true) => (i64::from(i16::MIN as u32), i64::from(i16::MAX as u32)),
                 _ => unreachable!(),
             };
             let min_val = fx.bcx.ins().iconst(types::I32, min);
diff --git a/compiler/rustc_codegen_cranelift/src/common.rs b/compiler/rustc_codegen_cranelift/src/common.rs
index 359b430b4..63562d335 100644
--- a/compiler/rustc_codegen_cranelift/src/common.rs
+++ b/compiler/rustc_codegen_cranelift/src/common.rs
@@ -1,6 +1,5 @@
 use cranelift_codegen::isa::TargetFrontendConfig;
 use gimli::write::FileId;
-
 use rustc_data_structures::sync::Lrc;
 use rustc_index::IndexVec;
 use rustc_middle::ty::layout::{
@@ -204,9 +203,9 @@ pub(crate) fn type_min_max_value(
         (types::I8, false) | (types::I16, false) | (types::I32, false) | (types::I64, false) => {
             0i64
         }
-        (types::I8, true) => i64::from(i8::MIN),
-        (types::I16, true) => i64::from(i16::MIN),
-        (types::I32, true) => i64::from(i32::MIN),
+        (types::I8, true) => i64::from(i8::MIN as u8),
+        (types::I16, true) => i64::from(i16::MIN as u16),
+        (types::I32, true) => i64::from(i32::MIN as u32),
         (types::I64, true) => i64::MIN,
         _ => unreachable!(),
     };
@@ -216,9 +215,9 @@ pub(crate) fn type_min_max_value(
         (types::I16, false) => i64::from(u16::MAX),
         (types::I32, false) => i64::from(u32::MAX),
         (types::I64, false) => u64::MAX as i64,
-        (types::I8, true) => i64::from(i8::MAX),
-        (types::I16, true) => i64::from(i16::MAX),
-        (types::I32, true) => i64::from(i32::MAX),
+        (types::I8, true) => i64::from(i8::MAX as u8),
+        (types::I16, true) => i64::from(i16::MAX as u16),
+        (types::I32, true) => i64::from(i32::MAX as u32),
         (types::I64, true) => i64::MAX,
         _ => unreachable!(),
     };
@@ -384,6 +383,25 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> {
         })
     }
 
+    pub(crate) fn create_stack_slot(&mut self, size: u32, align: u32) -> Pointer {
+        if align <= 16 {
+            let stack_slot = self.bcx.create_sized_stack_slot(StackSlotData {
+                kind: StackSlotKind::ExplicitSlot,
+                // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
+                // specify stack slot alignment.
+                size: (size + 15) / 16 * 16,
+            });
+            Pointer::stack_slot(stack_slot)
+        } else {
+            // Alignment is too big to handle using the above hack. Dynamically realign a stack slot
+            // instead. This wastes some space for the realignment.
+            let base_ptr = self.create_stack_slot(size + align, 16).get_addr(self);
+            let misalign_offset = self.bcx.ins().urem_imm(base_ptr, i64::from(align));
+            let realign_offset = self.bcx.ins().irsub_imm(misalign_offset, i64::from(align));
+            Pointer::new(self.bcx.ins().iadd(base_ptr, realign_offset))
+        }
+    }
+
     pub(crate) fn set_debug_loc(&mut self, source_info: mir::SourceInfo) {
         if let Some(debug_context) = &mut self.cx.debug_context {
             let (file, line, column) =
@@ -412,46 +430,11 @@ impl<'tcx> FunctionCx<'_, '_, 'tcx> {
         }
     }
 
-    // Note: must be kept in sync with get_caller_location from cg_ssa
-    pub(crate) fn get_caller_location(&mut self, mut source_info: mir::SourceInfo) -> CValue<'tcx> {
-        let span_to_caller_location = |fx: &mut FunctionCx<'_, '_, 'tcx>, span: Span| {
-            let topmost = span.ctxt().outer_expn().expansion_cause().unwrap_or(span);
-            let caller = fx.tcx.sess.source_map().lookup_char_pos(topmost.lo());
-            let const_loc = fx.tcx.const_caller_location((
-                rustc_span::symbol::Symbol::intern(
-                    &caller.file.name.prefer_remapped().to_string_lossy(),
-                ),
-                caller.line as u32,
-                caller.col_display as u32 + 1,
-            ));
-            crate::constant::codegen_const_value(fx, const_loc, fx.tcx.caller_location_ty())
-        };
-
-        // Walk up the `SourceScope`s, in case some of them are from MIR inlining.
-        // If so, the starting `source_info.span` is in the innermost inlined
-        // function, and will be replaced with outer callsite spans as long
-        // as the inlined functions were `#[track_caller]`.
-        loop {
-            let scope_data = &self.mir.source_scopes[source_info.scope];
-
-            if let Some((callee, callsite_span)) = scope_data.inlined {
-                // Stop inside the most nested non-`#[track_caller]` function,
-                // before ever reaching its caller (which is irrelevant).
-                if !callee.def.requires_caller_location(self.tcx) {
-                    return span_to_caller_location(self, source_info.span);
-                }
-                source_info.span = callsite_span;
-            }
-
-            // Skip past all of the parents with `inlined: None`.
-            match scope_data.inlined_parent_scope {
-                Some(parent) => source_info.scope = parent,
-                None => break,
-            }
-        }
-
-        // No inlined `SourceScope`s, or all of them were `#[track_caller]`.
-        self.caller_location.unwrap_or_else(|| span_to_caller_location(self, source_info.span))
+    pub(crate) fn get_caller_location(&mut self, source_info: mir::SourceInfo) -> CValue<'tcx> {
+        self.mir.caller_location_span(source_info, self.caller_location, self.tcx, |span| {
+            let const_loc = self.tcx.span_as_caller_location(span);
+            crate::constant::codegen_const_value(self, const_loc, self.tcx.caller_location_ty())
+        })
     }
 
     pub(crate) fn anonymous_str(&mut self, msg: &str) -> Value {
diff --git a/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs b/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs
index d2b928db7..20f2ee4c7 100644
--- a/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs
+++ b/compiler/rustc_codegen_cranelift/src/concurrency_limiter.rs
@@ -1,8 +1,7 @@
 use std::sync::{Arc, Condvar, Mutex};
 
-use rustc_session::Session;
-
 use jobserver::HelperThread;
+use rustc_session::Session;
 
 // FIXME don't panic when a worker thread panics
 
diff --git a/compiler/rustc_codegen_cranelift/src/constant.rs b/compiler/rustc_codegen_cranelift/src/constant.rs
index 14b10ed8b..b0853d30e 100644
--- a/compiler/rustc_codegen_cranelift/src/constant.rs
+++ b/compiler/rustc_codegen_cranelift/src/constant.rs
@@ -1,12 +1,11 @@
 //! Handling of `static`s, `const`s and promoted allocations
 
+use cranelift_module::*;
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrFlags;
 use rustc_middle::mir::interpret::{read_target_uint, AllocId, GlobalAlloc, Scalar};
 use rustc_middle::mir::ConstValue;
 
-use cranelift_module::*;
-
 use crate::prelude::*;
 
 pub(crate) struct ConstantCx {
@@ -101,7 +100,7 @@ pub(crate) fn codegen_const_value<'tcx>(
                 if fx.clif_type(layout.ty).is_some() {
                     return CValue::const_val(fx, layout, int);
                 } else {
-                    let raw_val = int.to_bits(int.size()).unwrap();
+                    let raw_val = int.size().truncate(int.to_bits(int.size()).unwrap());
                     let val = match int.size().bytes() {
                         1 => fx.bcx.ins().iconst(types::I8, raw_val as i64),
                         2 => fx.bcx.ins().iconst(types::I16, raw_val as i64),
@@ -187,8 +186,7 @@ pub(crate) fn codegen_const_value<'tcx>(
         ConstValue::Slice { data, meta } => {
             let alloc_id = fx.tcx.reserve_and_set_memory_alloc(data);
             let ptr = pointer_for_allocation(fx, alloc_id).get_addr(fx);
-            // FIXME: the `try_from` here can actually fail, e.g. for very long ZST slices.
-            let len = fx.bcx.ins().iconst(fx.pointer_type, i64::try_from(meta).unwrap());
+            let len = fx.bcx.ins().iconst(fx.pointer_type, meta as i64);
             CValue::by_val_pair(ptr, len, layout)
         }
     }
@@ -512,7 +510,7 @@ pub(crate) fn mir_operand_get_const_val<'tcx>(
                     | TerminatorKind::Drop { .. }
                     | TerminatorKind::Assert { .. } => {}
                     TerminatorKind::Yield { .. }
-                    | TerminatorKind::GeneratorDrop
+                    | TerminatorKind::CoroutineDrop
                     | TerminatorKind::FalseEdge { .. }
                     | TerminatorKind::FalseUnwind { .. } => unreachable!(),
                     TerminatorKind::InlineAsm { .. } => return None,
diff --git a/compiler/rustc_codegen_cranelift/src/debuginfo/emit.rs b/compiler/rustc_codegen_cranelift/src/debuginfo/emit.rs
index c4a5627e6..81b819a55 100644
--- a/compiler/rustc_codegen_cranelift/src/debuginfo/emit.rs
+++ b/compiler/rustc_codegen_cranelift/src/debuginfo/emit.rs
@@ -1,10 +1,9 @@
 //! Write the debuginfo into an object file.
 
 use cranelift_object::ObjectProduct;
-use rustc_data_structures::fx::FxHashMap;
-
 use gimli::write::{Address, AttributeValue, EndianVec, Result, Sections, Writer};
 use gimli::{RunTimeEndian, SectionId};
+use rustc_data_structures::fx::FxHashMap;
 
 use super::object::WriteDebugInfo;
 use super::DebugContext;
diff --git a/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs b/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs
index b19b935a0..6230ca15d 100644
--- a/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs
+++ b/compiler/rustc_codegen_cranelift/src/debuginfo/line_info.rs
@@ -3,20 +3,18 @@
 use std::ffi::OsStr;
 use std::path::{Component, Path};
 
-use crate::debuginfo::FunctionDebugContext;
-use crate::prelude::*;
-
-use rustc_data_structures::sync::Lrc;
-use rustc_span::{
-    FileName, Pos, SourceFile, SourceFileAndLine, SourceFileHash, SourceFileHashAlgorithm,
-};
-
 use cranelift_codegen::binemit::CodeOffset;
 use cranelift_codegen::MachSrcLoc;
-
 use gimli::write::{
     Address, AttributeValue, FileId, FileInfo, LineProgram, LineString, LineStringTable,
 };
+use rustc_data_structures::sync::Lrc;
+use rustc_span::{
+    FileName, Pos, SourceFile, SourceFileAndLine, SourceFileHash, SourceFileHashAlgorithm,
+};
+
+use crate::debuginfo::FunctionDebugContext;
+use crate::prelude::*;
 
 // OPTIMIZATION: It is cheaper to do this in one pass than using `.parent()` and `.file_name()`.
 fn split_path_dir_and_file(path: &Path) -> (&Path, &OsStr) {
@@ -97,7 +95,11 @@ impl DebugContext {
         match &source_file.name {
             FileName::Real(path) => {
                 let (dir_path, file_name) =
-                    split_path_dir_and_file(path.remapped_path_if_available());
+                    split_path_dir_and_file(if self.should_remap_filepaths {
+                        path.remapped_path_if_available()
+                    } else {
+                        path.local_path_if_available()
+                    });
                 let dir_name = osstr_as_utf8_bytes(dir_path.as_os_str());
                 let file_name = osstr_as_utf8_bytes(file_name);
 
@@ -118,7 +120,14 @@ impl DebugContext {
             filename => {
                 let dir_id = line_program.default_directory();
                 let dummy_file_name = LineString::new(
-                    filename.prefer_remapped().to_string().into_bytes(),
+                    filename
+                        .display(if self.should_remap_filepaths {
+                            FileNameDisplayPreference::Remapped
+                        } else {
+                            FileNameDisplayPreference::Local
+                        })
+                        .to_string()
+                        .into_bytes(),
                     line_program.encoding(),
                     line_strings,
                 );
diff --git a/compiler/rustc_codegen_cranelift/src/debuginfo/mod.rs b/compiler/rustc_codegen_cranelift/src/debuginfo/mod.rs
index 8a4b1cccf..e6edc452c 100644
--- a/compiler/rustc_codegen_cranelift/src/debuginfo/mod.rs
+++ b/compiler/rustc_codegen_cranelift/src/debuginfo/mod.rs
@@ -5,11 +5,8 @@ mod line_info;
 mod object;
 mod unwind;
 
-use crate::prelude::*;
-
 use cranelift_codegen::ir::Endianness;
 use cranelift_codegen::isa::TargetIsa;
-
 use gimli::write::{
     Address, AttributeValue, DwarfUnit, FileId, LineProgram, LineString, Range, RangeList,
     UnitEntryId,
@@ -17,12 +14,13 @@ use gimli::write::{
 use gimli::{Encoding, Format, LineEncoding, RunTimeEndian};
 use indexmap::IndexSet;
 
-pub(crate) use emit::{DebugReloc, DebugRelocName};
-pub(crate) use unwind::UnwindContext;
+pub(crate) use self::emit::{DebugReloc, DebugRelocName};
+pub(crate) use self::unwind::UnwindContext;
+use crate::prelude::*;
 
 pub(crate) fn producer() -> String {
     format!(
-        "cg_clif (rustc {}, cranelift {})",
+        "rustc version {} with cranelift {}",
         rustc_interface::util::rustc_version_str().unwrap_or("unknown version"),
         cranelift_codegen::VERSION,
     )
@@ -33,6 +31,8 @@ pub(crate) struct DebugContext {
 
     dwarf: DwarfUnit,
     unit_range_list: RangeList,
+
+    should_remap_filepaths: bool,
 }
 
 pub(crate) struct FunctionDebugContext {
@@ -65,12 +65,18 @@ impl DebugContext {
 
         let mut dwarf = DwarfUnit::new(encoding);
 
+        let should_remap_filepaths = tcx.sess.should_prefer_remapped_for_codegen();
+
         let producer = producer();
         let comp_dir = tcx
             .sess
             .opts
             .working_dir
-            .to_string_lossy(FileNameDisplayPreference::Remapped)
+            .to_string_lossy(if should_remap_filepaths {
+                FileNameDisplayPreference::Remapped
+            } else {
+                FileNameDisplayPreference::Local
+            })
             .into_owned();
         let (name, file_info) = match tcx.sess.local_crate_source_file() {
             Some(path) => {
@@ -104,7 +110,12 @@ impl DebugContext {
             root.set(gimli::DW_AT_low_pc, AttributeValue::Address(Address::Constant(0)));
         }
 
-        DebugContext { endian, dwarf, unit_range_list: RangeList(Vec::new()) }
+        DebugContext {
+            endian,
+            dwarf,
+            unit_range_list: RangeList(Vec::new()),
+            should_remap_filepaths,
+        }
     }
 
     pub(crate) fn define_function(
diff --git a/compiler/rustc_codegen_cranelift/src/debuginfo/object.rs b/compiler/rustc_codegen_cranelift/src/debuginfo/object.rs
index 9dc9b2cf9..f1840a7bf 100644
--- a/compiler/rustc_codegen_cranelift/src/debuginfo/object.rs
+++ b/compiler/rustc_codegen_cranelift/src/debuginfo/object.rs
@@ -1,12 +1,9 @@
-use rustc_data_structures::fx::FxHashMap;
-
 use cranelift_module::FuncId;
 use cranelift_object::ObjectProduct;
-
+use gimli::SectionId;
 use object::write::{Relocation, StandardSegment};
 use object::{RelocationEncoding, SectionKind};
-
-use gimli::SectionId;
+use rustc_data_structures::fx::FxHashMap;
 
 use crate::debuginfo::{DebugReloc, DebugRelocName};
 
diff --git a/compiler/rustc_codegen_cranelift/src/debuginfo/unwind.rs b/compiler/rustc_codegen_cranelift/src/debuginfo/unwind.rs
index 493359c74..35278e6fb 100644
--- a/compiler/rustc_codegen_cranelift/src/debuginfo/unwind.rs
+++ b/compiler/rustc_codegen_cranelift/src/debuginfo/unwind.rs
@@ -1,15 +1,13 @@
 //! Unwind info generation (`.eh_frame`)
 
-use crate::prelude::*;
-
 use cranelift_codegen::ir::Endianness;
 use cranelift_codegen::isa::{unwind::UnwindInfo, TargetIsa};
-
 use cranelift_object::ObjectProduct;
 use gimli::write::{Address, CieId, EhFrame, FrameTable, Section};
 use gimli::RunTimeEndian;
 
 use super::object::WriteDebugInfo;
+use crate::prelude::*;
 
 pub(crate) struct UnwindContext {
     endian: RunTimeEndian,
diff --git a/compiler/rustc_codegen_cranelift/src/driver/aot.rs b/compiler/rustc_codegen_cranelift/src/driver/aot.rs
index 3e9383095..11229dd42 100644
--- a/compiler/rustc_codegen_cranelift/src/driver/aot.rs
+++ b/compiler/rustc_codegen_cranelift/src/driver/aot.rs
@@ -6,19 +6,19 @@ use std::path::PathBuf;
 use std::sync::Arc;
 use std::thread::JoinHandle;
 
+use cranelift_object::{ObjectBuilder, ObjectModule};
+use rustc_codegen_ssa::assert_module_sources::CguReuse;
 use rustc_codegen_ssa::back::metadata::create_compressed_metadata_file;
+use rustc_codegen_ssa::base::determine_cgu_reuse;
 use rustc_codegen_ssa::{CodegenResults, CompiledModule, CrateInfo, ModuleKind};
 use rustc_data_structures::profiling::SelfProfilerRef;
 use rustc_data_structures::stable_hasher::{HashStable, StableHasher};
 use rustc_metadata::EncodedMetadata;
 use rustc_middle::dep_graph::{WorkProduct, WorkProductId};
 use rustc_middle::mir::mono::{CodegenUnit, MonoItem};
-use rustc_session::cgu_reuse_tracker::CguReuse;
 use rustc_session::config::{DebugInfo, OutputFilenames, OutputType};
 use rustc_session::Session;
 
-use cranelift_object::{ObjectBuilder, ObjectModule};
-
 use crate::concurrency_limiter::{ConcurrencyLimiter, ConcurrencyLimiterToken};
 use crate::global_asm::GlobalAsmConfig;
 use crate::{prelude::*, BackendConfig};
@@ -361,12 +361,26 @@ pub(crate) fn run_aot(
     metadata: EncodedMetadata,
     need_metadata_module: bool,
 ) -> Box<OngoingCodegen> {
+    // FIXME handle `-Ctarget-cpu=native`
+    let target_cpu = match tcx.sess.opts.cg.target_cpu {
+        Some(ref name) => name,
+        None => tcx.sess.target.cpu.as_ref(),
+    }
+    .to_owned();
+
     let cgus = if tcx.sess.opts.output_types.should_codegen() {
         tcx.collect_and_partition_mono_items(()).1
     } else {
         // If only `--emit metadata` is used, we shouldn't perform any codegen.
         // Also `tcx.collect_and_partition_mono_items` may panic in that case.
-        &[]
+        return Box::new(OngoingCodegen {
+            modules: vec![],
+            allocator_module: None,
+            metadata_module: None,
+            metadata,
+            crate_info: CrateInfo::new(tcx, target_cpu),
+            concurrency_limiter: ConcurrencyLimiter::new(tcx.sess, 0),
+        });
     };
 
     if tcx.dep_graph.is_fully_enabled() {
@@ -375,20 +389,28 @@ pub(crate) fn run_aot(
         }
     }
 
+    // Calculate the CGU reuse
+    let cgu_reuse = tcx.sess.time("find_cgu_reuse", || {
+        cgus.iter().map(|cgu| determine_cgu_reuse(tcx, &cgu)).collect::<Vec<_>>()
+    });
+
+    rustc_codegen_ssa::assert_module_sources::assert_module_sources(tcx, &|cgu_reuse_tracker| {
+        for (i, cgu) in cgus.iter().enumerate() {
+            let cgu_reuse = cgu_reuse[i];
+            cgu_reuse_tracker.set_actual_reuse(cgu.name().as_str(), cgu_reuse);
+        }
+    });
+
     let global_asm_config = Arc::new(crate::global_asm::GlobalAsmConfig::new(tcx));
 
     let mut concurrency_limiter = ConcurrencyLimiter::new(tcx.sess, cgus.len());
 
     let modules = tcx.sess.time("codegen mono items", || {
         cgus.iter()
-            .map(|cgu| {
-                let cgu_reuse = if backend_config.disable_incr_cache {
-                    CguReuse::No
-                } else {
-                    determine_cgu_reuse(tcx, cgu)
-                };
-                tcx.sess.cgu_reuse_tracker.set_actual_reuse(cgu.name().as_str(), cgu_reuse);
-
+            .enumerate()
+            .map(|(i, cgu)| {
+                let cgu_reuse =
+                    if backend_config.disable_incr_cache { CguReuse::No } else { cgu_reuse[i] };
                 match cgu_reuse {
                     CguReuse::No => {
                         let dep_node = cgu.codegen_dep_node(tcx);
@@ -407,8 +429,7 @@ pub(crate) fn run_aot(
                             )
                             .0
                     }
-                    CguReuse::PreLto => unreachable!(),
-                    CguReuse::PostLto => {
+                    CguReuse::PreLto | CguReuse::PostLto => {
                         concurrency_limiter.job_already_done();
                         OngoingModuleCodegen::Sync(reuse_workproduct_for_cgu(tcx, cgu))
                     }
@@ -474,13 +495,6 @@ pub(crate) fn run_aot(
         None
     };
 
-    // FIXME handle `-Ctarget-cpu=native`
-    let target_cpu = match tcx.sess.opts.cg.target_cpu {
-        Some(ref name) => name,
-        None => tcx.sess.target.cpu.as_ref(),
-    }
-    .to_owned();
-
     Box::new(OngoingCodegen {
         modules,
         allocator_module,
@@ -490,32 +504,3 @@ pub(crate) fn run_aot(
         concurrency_limiter,
     })
 }
-
-// Adapted from https://github.com/rust-lang/rust/blob/303d8aff6092709edd4dbd35b1c88e9aa40bf6d8/src/librustc_codegen_ssa/base.rs#L922-L953
-fn determine_cgu_reuse<'tcx>(tcx: TyCtxt<'tcx>, cgu: &CodegenUnit<'tcx>) -> CguReuse {
-    if !tcx.dep_graph.is_fully_enabled() {
-        return CguReuse::No;
-    }
-
-    let work_product_id = &cgu.work_product_id();
-    if tcx.dep_graph.previous_work_product(work_product_id).is_none() {
-        // We don't have anything cached for this CGU. This can happen
-        // if the CGU did not exist in the previous session.
-        return CguReuse::No;
-    }
-
-    // Try to mark the CGU as green. If it we can do so, it means that nothing
-    // affecting the LLVM module has changed and we can re-use a cached version.
-    // If we compile with any kind of LTO, this means we can re-use the bitcode
-    // of the Pre-LTO stage (possibly also the Post-LTO version but we'll only
-    // know that later). If we are not doing LTO, there is only one optimized
-    // version of each module, so we re-use that.
-    let dep_node = cgu.codegen_dep_node(tcx);
-    assert!(
-        !tcx.dep_graph.dep_node_exists(&dep_node),
-        "CompileCodegenUnit dep-node for CGU `{}` already exists before marking.",
-        cgu.name()
-    );
-
-    if tcx.try_mark_green(&dep_node) { CguReuse::PostLto } else { CguReuse::No }
-}
diff --git a/compiler/rustc_codegen_cranelift/src/driver/jit.rs b/compiler/rustc_codegen_cranelift/src/driver/jit.rs
index 1c606494f..6ee65d12c 100644
--- a/compiler/rustc_codegen_cranelift/src/driver/jit.rs
+++ b/compiler/rustc_codegen_cranelift/src/driver/jit.rs
@@ -6,13 +6,12 @@ use std::ffi::CString;
 use std::os::raw::{c_char, c_int};
 use std::sync::{mpsc, Mutex, OnceLock};
 
+use cranelift_jit::{JITBuilder, JITModule};
 use rustc_codegen_ssa::CrateInfo;
 use rustc_middle::mir::mono::MonoItem;
 use rustc_session::Session;
 use rustc_span::Symbol;
 
-use cranelift_jit::{JITBuilder, JITModule};
-
 use crate::{prelude::*, BackendConfig};
 use crate::{CodegenCx, CodegenMode};
 
diff --git a/compiler/rustc_codegen_cranelift/src/global_asm.rs b/compiler/rustc_codegen_cranelift/src/global_asm.rs
index baadd7a9e..b14007f4e 100644
--- a/compiler/rustc_codegen_cranelift/src/global_asm.rs
+++ b/compiler/rustc_codegen_cranelift/src/global_asm.rs
@@ -9,16 +9,22 @@ use std::sync::Arc;
 use rustc_ast::{InlineAsmOptions, InlineAsmTemplatePiece};
 use rustc_hir::{InlineAsmOperand, ItemId};
 use rustc_session::config::{OutputFilenames, OutputType};
+use rustc_target::asm::InlineAsmArch;
 
 use crate::prelude::*;
 
 pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String, item_id: ItemId) {
     let item = tcx.hir().item(item_id);
     if let rustc_hir::ItemKind::GlobalAsm(asm) = item.kind {
-        if !asm.options.contains(InlineAsmOptions::ATT_SYNTAX) {
-            global_asm.push_str("\n.intel_syntax noprefix\n");
-        } else {
-            global_asm.push_str("\n.att_syntax\n");
+        let is_x86 =
+            matches!(tcx.sess.asm_arch.unwrap(), InlineAsmArch::X86 | InlineAsmArch::X86_64);
+
+        if is_x86 {
+            if !asm.options.contains(InlineAsmOptions::ATT_SYNTAX) {
+                global_asm.push_str("\n.intel_syntax noprefix\n");
+            } else {
+                global_asm.push_str("\n.att_syntax\n");
+            }
         }
         for piece in asm.template {
             match *piece {
@@ -40,6 +46,13 @@ pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String,
                             global_asm.push_str(&string);
                         }
                         InlineAsmOperand::SymFn { anon_const } => {
+                            if cfg!(not(feature = "inline_asm_sym")) {
+                                tcx.sess.span_err(
+                                    item.span,
+                                    "asm! and global_asm! sym operands are not yet supported",
+                                );
+                            }
+
                             let ty = tcx.typeck_body(anon_const.body).node_type(anon_const.hir_id);
                             let instance = match ty.kind() {
                                 &ty::FnDef(def_id, args) => Instance::new(def_id, args),
@@ -51,6 +64,13 @@ pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String,
                             global_asm.push_str(symbol.name);
                         }
                         InlineAsmOperand::SymStatic { path: _, def_id } => {
+                            if cfg!(not(feature = "inline_asm_sym")) {
+                                tcx.sess.span_err(
+                                    item.span,
+                                    "asm! and global_asm! sym operands are not yet supported",
+                                );
+                            }
+
                             let instance = Instance::mono(tcx, def_id).polymorphize(tcx);
                             let symbol = tcx.symbol_name(instance);
                             global_asm.push_str(symbol.name);
@@ -65,7 +85,11 @@ pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String,
                 }
             }
         }
-        global_asm.push_str("\n.att_syntax\n\n");
+
+        global_asm.push('\n');
+        if is_x86 {
+            global_asm.push_str(".att_syntax\n\n");
+        }
     } else {
         bug!("Expected GlobalAsm found {:?}", item);
     }
@@ -73,18 +97,21 @@ pub(crate) fn codegen_global_asm_item(tcx: TyCtxt<'_>, global_asm: &mut String,
 
 #[derive(Debug)]
 pub(crate) struct GlobalAsmConfig {
-    asm_enabled: bool,
     assembler: PathBuf,
+    target: String,
     pub(crate) output_filenames: Arc<OutputFilenames>,
 }
 
 impl GlobalAsmConfig {
     pub(crate) fn new(tcx: TyCtxt<'_>) -> Self {
-        let asm_enabled = cfg!(feature = "inline_asm") && !tcx.sess.target.is_like_windows;
-
         GlobalAsmConfig {
-            asm_enabled,
             assembler: crate::toolchain::get_toolchain_binary(tcx.sess, "as"),
+            target: match &tcx.sess.opts.target_triple {
+                rustc_target::spec::TargetTriple::TargetTriple(triple) => triple.clone(),
+                rustc_target::spec::TargetTriple::TargetJson { path_for_rustdoc, .. } => {
+                    path_for_rustdoc.to_str().unwrap().to_owned()
+                }
+            },
             output_filenames: tcx.output_filenames(()).clone(),
         }
     }
@@ -99,42 +126,75 @@ pub(crate) fn compile_global_asm(
         return Ok(None);
     }
 
-    if !config.asm_enabled {
-        if global_asm.contains("__rust_probestack") {
-            return Ok(None);
-        }
-
-        if cfg!(not(feature = "inline_asm")) {
-            return Err(
-                "asm! and global_asm! support is disabled while compiling rustc_codegen_cranelift"
-                    .to_owned(),
-            );
-        } else {
-            return Err("asm! and global_asm! are not yet supported on Windows".to_owned());
-        }
-    }
-
     // Remove all LLVM style comments
-    let global_asm = global_asm
+    let mut global_asm = global_asm
         .lines()
         .map(|line| if let Some(index) = line.find("//") { &line[0..index] } else { line })
         .collect::<Vec<_>>()
         .join("\n");
+    global_asm.push('\n');
 
-    let output_object_file = config.output_filenames.temp_path(OutputType::Object, Some(cgu_name));
+    let global_asm_object_file = add_file_stem_postfix(
+        config.output_filenames.temp_path(OutputType::Object, Some(cgu_name)),
+        ".asm",
+    );
 
     // Assemble `global_asm`
-    let global_asm_object_file = add_file_stem_postfix(output_object_file, ".asm");
-    let mut child = Command::new(&config.assembler)
-        .arg("-o")
-        .arg(&global_asm_object_file)
-        .stdin(Stdio::piped())
-        .spawn()
-        .expect("Failed to spawn `as`.");
-    child.stdin.take().unwrap().write_all(global_asm.as_bytes()).unwrap();
-    let status = child.wait().expect("Failed to wait for `as`.");
-    if !status.success() {
-        return Err(format!("Failed to assemble `{}`", global_asm));
+    if option_env!("CG_CLIF_FORCE_GNU_AS").is_some() {
+        let mut child = Command::new(&config.assembler)
+            .arg("-o")
+            .arg(&global_asm_object_file)
+            .stdin(Stdio::piped())
+            .spawn()
+            .expect("Failed to spawn `as`.");
+        child.stdin.take().unwrap().write_all(global_asm.as_bytes()).unwrap();
+        let status = child.wait().expect("Failed to wait for `as`.");
+        if !status.success() {
+            return Err(format!("Failed to assemble `{}`", global_asm));
+        }
+    } else {
+        let mut child = Command::new(std::env::current_exe().unwrap())
+            .arg("--target")
+            .arg(&config.target)
+            .arg("--crate-type")
+            .arg("staticlib")
+            .arg("--emit")
+            .arg("obj")
+            .arg("-o")
+            .arg(&global_asm_object_file)
+            .arg("-")
+            .arg("-Abad_asm_style")
+            .arg("-Zcodegen-backend=llvm")
+            .stdin(Stdio::piped())
+            .spawn()
+            .expect("Failed to spawn `as`.");
+        let mut stdin = child.stdin.take().unwrap();
+        stdin
+            .write_all(
+                br####"
+                #![feature(decl_macro, no_core, rustc_attrs)]
+                #![allow(internal_features)]
+                #![no_core]
+                #[rustc_builtin_macro]
+                #[rustc_macro_transparency = "semitransparent"]
+                macro global_asm() { /* compiler built-in */ }
+                global_asm!(r###"
+                "####,
+            )
+            .unwrap();
+        stdin.write_all(global_asm.as_bytes()).unwrap();
+        stdin
+            .write_all(
+                br####"
+                "###);
+                "####,
+            )
+            .unwrap();
+        std::mem::drop(stdin);
+        let status = child.wait().expect("Failed to wait for `as`.");
+        if !status.success() {
+            return Err(format!("Failed to assemble `{}`", global_asm));
+        }
     }
 
     Ok(Some(global_asm_object_file))
diff --git a/compiler/rustc_codegen_cranelift/src/inline_asm.rs b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
index 50bbf8105..ce0eecca8 100644
--- a/compiler/rustc_codegen_cranelift/src/inline_asm.rs
+++ b/compiler/rustc_codegen_cranelift/src/inline_asm.rs
@@ -1,18 +1,19 @@
 //! Codegen of `asm!` invocations.
 
-use crate::prelude::*;
-
 use std::fmt::Write;
 
 use rustc_ast::ast::{InlineAsmOptions, InlineAsmTemplatePiece};
 use rustc_middle::mir::InlineAsmOperand;
 use rustc_span::sym;
 use rustc_target::asm::*;
+use target_lexicon::BinaryFormat;
+
+use crate::prelude::*;
 
 enum CInlineAsmOperand<'tcx> {
     In {
         reg: InlineAsmRegOrRegClass,
-        value: CValue<'tcx>,
+        value: Value,
     },
     Out {
         reg: InlineAsmRegOrRegClass,
@@ -22,7 +23,7 @@ enum CInlineAsmOperand<'tcx> {
     InOut {
         reg: InlineAsmRegOrRegClass,
         _late: bool,
-        in_value: CValue<'tcx>,
+        in_value: Value,
         out_place: Option<CPlace<'tcx>>,
     },
     Const {
@@ -43,191 +44,23 @@ pub(crate) fn codegen_inline_asm<'tcx>(
 ) {
     // FIXME add .eh_frame unwind info directives
 
-    if !template.is_empty() {
-        // Used by panic_abort
-        if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) {
-            fx.bcx.ins().trap(TrapCode::User(1));
-            return;
-        }
-
-        // Used by stdarch
-        if template[0] == InlineAsmTemplatePiece::String("mov ".to_string())
-            && matches!(
-                template[1],
-                InlineAsmTemplatePiece::Placeholder {
-                    operand_idx: 0,
-                    modifier: Some('r'),
-                    span: _
-                }
-            )
-            && template[2] == InlineAsmTemplatePiece::String(", rbx".to_string())
-            && template[3] == InlineAsmTemplatePiece::String("\n".to_string())
-            && template[4] == InlineAsmTemplatePiece::String("cpuid".to_string())
-            && template[5] == InlineAsmTemplatePiece::String("\n".to_string())
-            && template[6] == InlineAsmTemplatePiece::String("xchg ".to_string())
-            && matches!(
-                template[7],
-                InlineAsmTemplatePiece::Placeholder {
-                    operand_idx: 0,
-                    modifier: Some('r'),
-                    span: _
-                }
-            )
-            && template[8] == InlineAsmTemplatePiece::String(", rbx".to_string())
-        {
-            assert_eq!(operands.len(), 4);
-            let (leaf, eax_place) = match operands[1] {
-                InlineAsmOperand::InOut {
-                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
-                    late: _,
-                    ref in_value,
-                    out_place: Some(out_place),
-                } => (
-                    crate::base::codegen_operand(fx, in_value).load_scalar(fx),
-                    crate::base::codegen_place(fx, out_place),
-                ),
-                _ => unreachable!(),
-            };
-            let ebx_place = match operands[0] {
-                InlineAsmOperand::Out {
-                    reg:
-                        InlineAsmRegOrRegClass::RegClass(InlineAsmRegClass::X86(
-                            X86InlineAsmRegClass::reg,
-                        )),
-                    late: _,
-                    place: Some(place),
-                } => crate::base::codegen_place(fx, place),
-                _ => unreachable!(),
-            };
-            let (sub_leaf, ecx_place) = match operands[2] {
-                InlineAsmOperand::InOut {
-                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
-                    late: _,
-                    ref in_value,
-                    out_place: Some(out_place),
-                } => (
-                    crate::base::codegen_operand(fx, in_value).load_scalar(fx),
-                    crate::base::codegen_place(fx, out_place),
-                ),
-                _ => unreachable!(),
-            };
-            let edx_place = match operands[3] {
-                InlineAsmOperand::Out {
-                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
-                    late: _,
-                    place: Some(place),
-                } => crate::base::codegen_place(fx, place),
-                _ => unreachable!(),
-            };
-
-            let (eax, ebx, ecx, edx) = crate::intrinsics::codegen_cpuid_call(fx, leaf, sub_leaf);
-
-            eax_place.write_cvalue(fx, CValue::by_val(eax, fx.layout_of(fx.tcx.types.u32)));
-            ebx_place.write_cvalue(fx, CValue::by_val(ebx, fx.layout_of(fx.tcx.types.u32)));
-            ecx_place.write_cvalue(fx, CValue::by_val(ecx, fx.layout_of(fx.tcx.types.u32)));
-            edx_place.write_cvalue(fx, CValue::by_val(edx, fx.layout_of(fx.tcx.types.u32)));
-            let destination_block = fx.get_block(destination.unwrap());
-            fx.bcx.ins().jump(destination_block, &[]);
-            return;
-        }
-
-        // Used by compiler-builtins
-        if fx.tcx.symbol_name(fx.instance).name.starts_with("___chkstk") {
-            // ___chkstk, ___chkstk_ms and __alloca are only used on Windows
-            crate::trap::trap_unimplemented(fx, "Stack probes are not supported");
-            return;
-        } else if fx.tcx.symbol_name(fx.instance).name == "__alloca" {
-            crate::trap::trap_unimplemented(fx, "Alloca is not supported");
-            return;
-        }
-
-        // Used by measureme
-        if template[0] == InlineAsmTemplatePiece::String("xor %eax, %eax".to_string())
-            && template[1] == InlineAsmTemplatePiece::String("\n".to_string())
-            && template[2] == InlineAsmTemplatePiece::String("mov %rbx, ".to_string())
-            && matches!(
-                template[3],
-                InlineAsmTemplatePiece::Placeholder {
-                    operand_idx: 0,
-                    modifier: Some('r'),
-                    span: _
-                }
-            )
-            && template[4] == InlineAsmTemplatePiece::String("\n".to_string())
-            && template[5] == InlineAsmTemplatePiece::String("cpuid".to_string())
-            && template[6] == InlineAsmTemplatePiece::String("\n".to_string())
-            && template[7] == InlineAsmTemplatePiece::String("mov ".to_string())
-            && matches!(
-                template[8],
-                InlineAsmTemplatePiece::Placeholder {
-                    operand_idx: 0,
-                    modifier: Some('r'),
-                    span: _
-                }
-            )
-            && template[9] == InlineAsmTemplatePiece::String(", %rbx".to_string())
-        {
-            let destination_block = fx.get_block(destination.unwrap());
-            fx.bcx.ins().jump(destination_block, &[]);
-            return;
-        } else if template[0] == InlineAsmTemplatePiece::String("rdpmc".to_string()) {
-            // Return zero dummy values for all performance counters
-            match operands[0] {
-                InlineAsmOperand::In {
-                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
-                    value: _,
-                } => {}
-                _ => unreachable!(),
-            };
-            let lo = match operands[1] {
-                InlineAsmOperand::Out {
-                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
-                    late: true,
-                    place: Some(place),
-                } => crate::base::codegen_place(fx, place),
-                _ => unreachable!(),
-            };
-            let hi = match operands[2] {
-                InlineAsmOperand::Out {
-                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
-                    late: true,
-                    place: Some(place),
-                } => crate::base::codegen_place(fx, place),
-                _ => unreachable!(),
-            };
-
-            let u32_layout = fx.layout_of(fx.tcx.types.u32);
-            let zero = fx.bcx.ins().iconst(types::I32, 0);
-            lo.write_cvalue(fx, CValue::by_val(zero, u32_layout));
-            hi.write_cvalue(fx, CValue::by_val(zero, u32_layout));
-
-            let destination_block = fx.get_block(destination.unwrap());
-            fx.bcx.ins().jump(destination_block, &[]);
-            return;
-        } else if template[0] == InlineAsmTemplatePiece::String("lock xadd ".to_string())
-            && matches!(
-                template[1],
-                InlineAsmTemplatePiece::Placeholder { operand_idx: 1, modifier: None, span: _ }
-            )
-            && template[2] == InlineAsmTemplatePiece::String(", (".to_string())
-            && matches!(
-                template[3],
-                InlineAsmTemplatePiece::Placeholder { operand_idx: 0, modifier: None, span: _ }
-            )
-            && template[4] == InlineAsmTemplatePiece::String(")".to_string())
-        {
-            let destination_block = fx.get_block(destination.unwrap());
-            fx.bcx.ins().jump(destination_block, &[]);
-            return;
-        }
+    // Used by panic_abort on Windows, but uses a syntax which only happens to work with
+    // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for
+    // the LLVM backend.
+    if template.len() == 1
+        && template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string())
+    {
+        fx.bcx.ins().trap(TrapCode::User(1));
+        return;
     }
 
     let operands = operands
         .into_iter()
         .map(|operand| match *operand {
-            InlineAsmOperand::In { reg, ref value } => {
-                CInlineAsmOperand::In { reg, value: crate::base::codegen_operand(fx, value) }
-            }
+            InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In {
+                reg,
+                value: crate::base::codegen_operand(fx, value).load_scalar(fx),
+            },
             InlineAsmOperand::Out { reg, late, ref place } => CInlineAsmOperand::Out {
                 reg,
                 late,
@@ -237,7 +70,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
                 CInlineAsmOperand::InOut {
                     reg,
                     _late: late,
-                    in_value: crate::base::codegen_operand(fx, in_value),
+                    in_value: crate::base::codegen_operand(fx, in_value).load_scalar(fx),
                     out_place: out_place.map(|place| crate::base::codegen_place(fx, place)),
                 }
             }
@@ -252,6 +85,12 @@ pub(crate) fn codegen_inline_asm<'tcx>(
                 CInlineAsmOperand::Const { value }
             }
             InlineAsmOperand::SymFn { ref value } => {
+                if cfg!(not(feature = "inline_asm_sym")) {
+                    fx.tcx
+                        .sess
+                        .span_err(span, "asm! and global_asm! sym operands are not yet supported");
+                }
+
                 let const_ = fx.monomorphize(value.const_);
                 if let ty::FnDef(def_id, args) = *const_.ty().kind() {
                     let instance = ty::Instance::resolve_for_fn_ptr(
@@ -329,7 +168,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
     for (i, operand) in operands.iter().enumerate() {
         match operand {
             CInlineAsmOperand::In { reg: _, value } => {
-                inputs.push((asm_gen.stack_slots_input[i].unwrap(), value.load_scalar(fx)));
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
             }
             CInlineAsmOperand::Out { reg: _, late: _, place } => {
                 if let Some(place) = place {
@@ -337,7 +176,7 @@ pub(crate) fn codegen_inline_asm<'tcx>(
                 }
             }
             CInlineAsmOperand::InOut { reg: _, _late: _, in_value, out_place } => {
-                inputs.push((asm_gen.stack_slots_input[i].unwrap(), in_value.load_scalar(fx)));
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *in_value));
                 if let Some(out_place) = out_place {
                     outputs.push((asm_gen.stack_slots_output[i].unwrap(), *out_place));
                 }
@@ -589,11 +428,29 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
     }
 
     fn generate_asm_wrapper(&self, asm_name: &str) -> String {
+        let binary_format = crate::target_triple(self.tcx.sess).binary_format;
+
         let mut generated_asm = String::new();
-        writeln!(generated_asm, ".globl {}", asm_name).unwrap();
-        writeln!(generated_asm, ".type {},@function", asm_name).unwrap();
-        writeln!(generated_asm, ".section .text.{},\"ax\",@progbits", asm_name).unwrap();
-        writeln!(generated_asm, "{}:", asm_name).unwrap();
+        match binary_format {
+            BinaryFormat::Elf => {
+                writeln!(generated_asm, ".globl {}", asm_name).unwrap();
+                writeln!(generated_asm, ".type {},@function", asm_name).unwrap();
+                writeln!(generated_asm, ".section .text.{},\"ax\",@progbits", asm_name).unwrap();
+                writeln!(generated_asm, "{}:", asm_name).unwrap();
+            }
+            BinaryFormat::Macho => {
+                writeln!(generated_asm, ".globl _{}", asm_name).unwrap();
+                writeln!(generated_asm, "_{}:", asm_name).unwrap();
+            }
+            BinaryFormat::Coff => {
+                writeln!(generated_asm, ".globl {}", asm_name).unwrap();
+                writeln!(generated_asm, "{}:", asm_name).unwrap();
+            }
+            _ => self
+                .tcx
+                .sess
+                .fatal(format!("Unsupported binary format for inline asm: {binary_format:?}")),
+        }
 
         let is_x86 = matches!(self.arch, InlineAsmArch::X86 | InlineAsmArch::X86_64);
 
@@ -690,8 +547,19 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
         if is_x86 {
             generated_asm.push_str(".att_syntax\n");
         }
-        writeln!(generated_asm, ".size {name}, .-{name}", name = asm_name).unwrap();
-        generated_asm.push_str(".text\n");
+
+        match binary_format {
+            BinaryFormat::Elf => {
+                writeln!(generated_asm, ".size {name}, .-{name}", name = asm_name).unwrap();
+                generated_asm.push_str(".text\n");
+            }
+            BinaryFormat::Macho | BinaryFormat::Coff => {}
+            _ => self
+                .tcx
+                .sess
+                .fatal(format!("Unsupported binary format for inline asm: {binary_format:?}")),
+        }
+
         generated_asm.push_str("\n\n");
 
         generated_asm
@@ -699,25 +567,26 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
 
     fn prologue(generated_asm: &mut String, arch: InlineAsmArch) {
         match arch {
-            InlineAsmArch::X86 => {
-                generated_asm.push_str("    push ebp\n");
-                generated_asm.push_str("    mov ebp,[esp+8]\n");
-            }
             InlineAsmArch::X86_64 => {
                 generated_asm.push_str("    push rbp\n");
-                generated_asm.push_str("    mov rbp,rdi\n");
-            }
-            InlineAsmArch::RiscV32 => {
-                generated_asm.push_str("    addi sp, sp, -8\n");
-                generated_asm.push_str("    sw ra, 4(sp)\n");
-                generated_asm.push_str("    sw s0, 0(sp)\n");
-                generated_asm.push_str("    mv s0, a0\n");
+                generated_asm.push_str("    mov rbp,rsp\n");
+                generated_asm.push_str("    push rbx\n"); // rbx is callee saved
+                // rbx is reserved by LLVM for the "base pointer", so rustc doesn't allow using it
+                generated_asm.push_str("    mov rbx,rdi\n");
+            }
+            InlineAsmArch::AArch64 => {
+                generated_asm.push_str("    stp fp, lr, [sp, #-32]!\n");
+                generated_asm.push_str("    mov fp, sp\n");
+                generated_asm.push_str("    str x19, [sp, #24]\n"); // x19 is callee saved
+                // x19 is reserved by LLVM for the "base pointer", so rustc doesn't allow using it
+                generated_asm.push_str("    mov x19, x0\n");
             }
             InlineAsmArch::RiscV64 => {
                 generated_asm.push_str("    addi sp, sp, -16\n");
                 generated_asm.push_str("    sd ra, 8(sp)\n");
-                generated_asm.push_str("    sd s0, 0(sp)\n");
-                generated_asm.push_str("    mv s0, a0\n");
+                generated_asm.push_str("    sd s1, 0(sp)\n"); // s1 is callee saved
+                // s1/x9 is reserved by LLVM for the "base pointer", so rustc doesn't allow using it
+                generated_asm.push_str("    mv s1, a0\n");
             }
             _ => unimplemented!("prologue for {:?}", arch),
         }
@@ -725,22 +594,18 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
 
     fn epilogue(generated_asm: &mut String, arch: InlineAsmArch) {
         match arch {
-            InlineAsmArch::X86 => {
-                generated_asm.push_str("    pop ebp\n");
-                generated_asm.push_str("    ret\n");
-            }
             InlineAsmArch::X86_64 => {
+                generated_asm.push_str("    pop rbx\n");
                 generated_asm.push_str("    pop rbp\n");
                 generated_asm.push_str("    ret\n");
             }
-            InlineAsmArch::RiscV32 => {
-                generated_asm.push_str("    lw s0, 0(sp)\n");
-                generated_asm.push_str("    lw ra, 4(sp)\n");
-                generated_asm.push_str("    addi sp, sp, 8\n");
+            InlineAsmArch::AArch64 => {
+                generated_asm.push_str("    ldr x19, [sp, #24]\n");
+                generated_asm.push_str("    ldp fp, lr, [sp], #32\n");
                 generated_asm.push_str("    ret\n");
             }
             InlineAsmArch::RiscV64 => {
-                generated_asm.push_str("    ld s0, 0(sp)\n");
+                generated_asm.push_str("    ld s1, 0(sp)\n");
                 generated_asm.push_str("    ld ra, 8(sp)\n");
                 generated_asm.push_str("    addi sp, sp, 16\n");
                 generated_asm.push_str("    ret\n");
@@ -751,10 +616,13 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
 
     fn epilogue_noreturn(generated_asm: &mut String, arch: InlineAsmArch) {
         match arch {
-            InlineAsmArch::X86 | InlineAsmArch::X86_64 => {
+            InlineAsmArch::X86_64 => {
                 generated_asm.push_str("    ud2\n");
             }
-            InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => {
+            InlineAsmArch::AArch64 => {
+                generated_asm.push_str("    brk     #0x1\n");
+            }
+            InlineAsmArch::RiscV64 => {
                 generated_asm.push_str("    ebreak\n");
             }
             _ => unimplemented!("epilogue_noreturn for {:?}", arch),
@@ -768,25 +636,20 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
         offset: Size,
     ) {
         match arch {
-            InlineAsmArch::X86 => {
-                write!(generated_asm, "    mov [ebp+0x{:x}], ", offset.bytes()).unwrap();
-                reg.emit(generated_asm, InlineAsmArch::X86, None).unwrap();
-                generated_asm.push('\n');
-            }
             InlineAsmArch::X86_64 => {
-                write!(generated_asm, "    mov [rbp+0x{:x}], ", offset.bytes()).unwrap();
+                write!(generated_asm, "    mov [rbx+0x{:x}], ", offset.bytes()).unwrap();
                 reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
                 generated_asm.push('\n');
             }
-            InlineAsmArch::RiscV32 => {
-                generated_asm.push_str("    sw ");
-                reg.emit(generated_asm, InlineAsmArch::RiscV32, None).unwrap();
-                writeln!(generated_asm, ", 0x{:x}(s0)", offset.bytes()).unwrap();
+            InlineAsmArch::AArch64 => {
+                generated_asm.push_str("    str ");
+                reg.emit(generated_asm, InlineAsmArch::AArch64, None).unwrap();
+                writeln!(generated_asm, ", [x19, 0x{:x}]", offset.bytes()).unwrap();
             }
             InlineAsmArch::RiscV64 => {
                 generated_asm.push_str("    sd ");
                 reg.emit(generated_asm, InlineAsmArch::RiscV64, None).unwrap();
-                writeln!(generated_asm, ", 0x{:x}(s0)", offset.bytes()).unwrap();
+                writeln!(generated_asm, ", 0x{:x}(s1)", offset.bytes()).unwrap();
             }
             _ => unimplemented!("save_register for {:?}", arch),
         }
@@ -799,25 +662,20 @@ impl<'tcx> InlineAssemblyGenerator<'_, 'tcx> {
         offset: Size,
     ) {
         match arch {
-            InlineAsmArch::X86 => {
-                generated_asm.push_str("    mov ");
-                reg.emit(generated_asm, InlineAsmArch::X86, None).unwrap();
-                writeln!(generated_asm, ", [ebp+0x{:x}]", offset.bytes()).unwrap();
-            }
             InlineAsmArch::X86_64 => {
                 generated_asm.push_str("    mov ");
                 reg.emit(generated_asm, InlineAsmArch::X86_64, None).unwrap();
-                writeln!(generated_asm, ", [rbp+0x{:x}]", offset.bytes()).unwrap();
+                writeln!(generated_asm, ", [rbx+0x{:x}]", offset.bytes()).unwrap();
             }
-            InlineAsmArch::RiscV32 => {
-                generated_asm.push_str("    lw ");
-                reg.emit(generated_asm, InlineAsmArch::RiscV32, None).unwrap();
-                writeln!(generated_asm, ", 0x{:x}(s0)", offset.bytes()).unwrap();
+            InlineAsmArch::AArch64 => {
+                generated_asm.push_str("    ldr ");
+                reg.emit(generated_asm, InlineAsmArch::AArch64, None).unwrap();
+                writeln!(generated_asm, ", [x19, 0x{:x}]", offset.bytes()).unwrap();
             }
             InlineAsmArch::RiscV64 => {
                 generated_asm.push_str("    ld ");
                 reg.emit(generated_asm, InlineAsmArch::RiscV64, None).unwrap();
-                writeln!(generated_asm, ", 0x{:x}(s0)", offset.bytes()).unwrap();
+                writeln!(generated_asm, ", 0x{:x}(s1)", offset.bytes()).unwrap();
             }
             _ => unimplemented!("restore_register for {:?}", arch),
         }
@@ -831,13 +689,7 @@ fn call_inline_asm<'tcx>(
     inputs: Vec<(Size, Value)>,
     outputs: Vec<(Size, CPlace<'tcx>)>,
 ) {
-    let stack_slot = fx.bcx.func.create_sized_stack_slot(StackSlotData {
-        kind: StackSlotKind::ExplicitSlot,
-        size: u32::try_from(slot_size.bytes()).unwrap(),
-    });
-    if fx.clif_comments.enabled() {
-        fx.add_comment(stack_slot, "inline asm scratch slot");
-    }
+    let stack_slot = fx.create_stack_slot(u32::try_from(slot_size.bytes()).unwrap(), 16);
 
     let inline_asm_func = fx
         .module
@@ -857,15 +709,103 @@ fn call_inline_asm<'tcx>(
     }
 
     for (offset, value) in inputs {
-        fx.bcx.ins().stack_store(value, stack_slot, i32::try_from(offset.bytes()).unwrap());
+        stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).store(
+            fx,
+            value,
+            MemFlags::trusted(),
+        );
     }
 
-    let stack_slot_addr = fx.bcx.ins().stack_addr(fx.pointer_type, stack_slot, 0);
+    let stack_slot_addr = stack_slot.get_addr(fx);
     fx.bcx.ins().call(inline_asm_func, &[stack_slot_addr]);
 
     for (offset, place) in outputs {
         let ty = fx.clif_type(place.layout().ty).unwrap();
-        let value = fx.bcx.ins().stack_load(ty, stack_slot, i32::try_from(offset.bytes()).unwrap());
+        let value = stack_slot.offset(fx, i32::try_from(offset.bytes()).unwrap().into()).load(
+            fx,
+            ty,
+            MemFlags::trusted(),
+        );
         place.write_cvalue(fx, CValue::by_val(value, place.layout()));
     }
 }
+
+pub(crate) fn codegen_xgetbv<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    xcr_no: Value,
+    ret: CPlace<'tcx>,
+) {
+    // FIXME add .eh_frame unwind info directives
+
+    let operands = vec![
+        CInlineAsmOperand::In {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+            value: xcr_no,
+        },
+        CInlineAsmOperand::Out {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+            late: true,
+            place: Some(ret),
+        },
+        CInlineAsmOperand::Out {
+            reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+            late: true,
+            place: None,
+        },
+    ];
+    let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM;
+
+    let mut inputs = Vec::new();
+    let mut outputs = Vec::new();
+
+    let mut asm_gen = InlineAssemblyGenerator {
+        tcx: fx.tcx,
+        arch: fx.tcx.sess.asm_arch.unwrap(),
+        enclosing_def_id: fx.instance.def_id(),
+        template: &[InlineAsmTemplatePiece::String(
+            "
+            xgetbv
+            // out = rdx << 32 | rax
+            shl rdx, 32
+            or rax, rdx
+            "
+            .to_string(),
+        )],
+        operands: &operands,
+        options,
+        registers: Vec::new(),
+        stack_slots_clobber: Vec::new(),
+        stack_slots_input: Vec::new(),
+        stack_slots_output: Vec::new(),
+        stack_slot_size: Size::from_bytes(0),
+    };
+    asm_gen.allocate_registers();
+    asm_gen.allocate_stack_slots();
+
+    let inline_asm_index = fx.cx.inline_asm_index.get();
+    fx.cx.inline_asm_index.set(inline_asm_index + 1);
+    let asm_name = format!(
+        "__inline_asm_{}_n{}",
+        fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"),
+        inline_asm_index
+    );
+
+    let generated_asm = asm_gen.generate_asm_wrapper(&asm_name);
+    fx.cx.global_asm.push_str(&generated_asm);
+
+    for (i, operand) in operands.iter().enumerate() {
+        match operand {
+            CInlineAsmOperand::In { reg: _, value } => {
+                inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value));
+            }
+            CInlineAsmOperand::Out { reg: _, late: _, place } => {
+                if let Some(place) = place {
+                    outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place));
+                }
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs);
+}
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/cpuid.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/cpuid.rs
deleted file mode 100644
index 5120b89c4..000000000
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/cpuid.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-//! Emulation of a subset of the cpuid x86 instruction.
-
-use crate::prelude::*;
-
-/// Emulates a subset of the cpuid x86 instruction.
-///
-/// This emulates an intel cpu with sse and sse2 support, but which doesn't support anything else.
-pub(crate) fn codegen_cpuid_call<'tcx>(
-    fx: &mut FunctionCx<'_, '_, 'tcx>,
-    leaf: Value,
-    _sub_leaf: Value,
-) -> (Value, Value, Value, Value) {
-    let leaf_0 = fx.bcx.create_block();
-    let leaf_1 = fx.bcx.create_block();
-    let leaf_7 = fx.bcx.create_block();
-    let leaf_8000_0000 = fx.bcx.create_block();
-    let leaf_8000_0001 = fx.bcx.create_block();
-    let unsupported_leaf = fx.bcx.create_block();
-
-    let dest = fx.bcx.create_block();
-    let eax = fx.bcx.append_block_param(dest, types::I32);
-    let ebx = fx.bcx.append_block_param(dest, types::I32);
-    let ecx = fx.bcx.append_block_param(dest, types::I32);
-    let edx = fx.bcx.append_block_param(dest, types::I32);
-
-    let mut switch = cranelift_frontend::Switch::new();
-    switch.set_entry(0, leaf_0);
-    switch.set_entry(1, leaf_1);
-    switch.set_entry(7, leaf_7);
-    switch.set_entry(0x8000_0000, leaf_8000_0000);
-    switch.set_entry(0x8000_0001, leaf_8000_0001);
-    switch.emit(&mut fx.bcx, leaf, unsupported_leaf);
-
-    fx.bcx.switch_to_block(leaf_0);
-    let max_basic_leaf = fx.bcx.ins().iconst(types::I32, 1);
-    let vend0 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"Genu")));
-    let vend2 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"ineI")));
-    let vend1 = fx.bcx.ins().iconst(types::I32, i64::from(u32::from_le_bytes(*b"ntel")));
-    fx.bcx.ins().jump(dest, &[max_basic_leaf, vend0, vend1, vend2]);
-
-    fx.bcx.switch_to_block(leaf_1);
-    let cpu_signature = fx.bcx.ins().iconst(types::I32, 0);
-    let additional_information = fx.bcx.ins().iconst(types::I32, 0);
-    let ecx_features = fx.bcx.ins().iconst(types::I32, 0);
-    let edx_features = fx.bcx.ins().iconst(types::I32, 1 << 25 /* sse */ | 1 << 26 /* sse2 */);
-    fx.bcx.ins().jump(dest, &[cpu_signature, additional_information, ecx_features, edx_features]);
-
-    fx.bcx.switch_to_block(leaf_7);
-    // This leaf technically has subleaves, but we just return zero for all subleaves.
-    let zero = fx.bcx.ins().iconst(types::I32, 0);
-    fx.bcx.ins().jump(dest, &[zero, zero, zero, zero]);
-
-    fx.bcx.switch_to_block(leaf_8000_0000);
-    let extended_max_basic_leaf = fx.bcx.ins().iconst(types::I32, 0);
-    let zero = fx.bcx.ins().iconst(types::I32, 0);
-    fx.bcx.ins().jump(dest, &[extended_max_basic_leaf, zero, zero, zero]);
-
-    fx.bcx.switch_to_block(leaf_8000_0001);
-    let zero = fx.bcx.ins().iconst(types::I32, 0);
-    let proc_info_ecx = fx.bcx.ins().iconst(types::I32, 0);
-    let proc_info_edx = fx.bcx.ins().iconst(types::I32, 0);
-    fx.bcx.ins().jump(dest, &[zero, zero, proc_info_ecx, proc_info_edx]);
-
-    fx.bcx.switch_to_block(unsupported_leaf);
-    crate::trap::trap_unimplemented(
-        fx,
-        "__cpuid_count arch intrinsic doesn't yet support specified leaf",
-    );
-
-    fx.bcx.switch_to_block(dest);
-    fx.bcx.ins().nop();
-
-    (eax, ebx, ecx, edx)
-}
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
index 63b5402f2..e9b7daf14 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs
@@ -1,10 +1,10 @@
 //! Emulate LLVM intrinsics
 
+use rustc_middle::ty::GenericArgsRef;
+
 use crate::intrinsics::*;
 use crate::prelude::*;
 
-use rustc_middle::ty::GenericArgsRef;
-
 pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     intrinsic: &str,
@@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>(
             });
         }
 
+        _ if intrinsic.starts_with("llvm.fma.v") => {
+            intrinsic_args!(fx, args => (x,y,z); intrinsic);
+
+            simd_trio_for_each_lane(
+                fx,
+                x,
+                y,
+                z,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| {
+                    fx.bcx.ins().fma(lane_x, lane_y, lane_z)
+                },
+            );
+        }
+
         _ => {
             fx.tcx
                 .sess
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
index c20a99159..ee098be1f 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs
@@ -1,10 +1,10 @@
 //! Emulate AArch64 LLVM intrinsics
 
+use rustc_middle::ty::GenericArgsRef;
+
 use crate::intrinsics::*;
 use crate::prelude::*;
 
-use rustc_middle::ty::GenericArgsRef;
-
 pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     intrinsic: &str,
@@ -44,7 +44,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             });
         }
 
-        _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => {
+        _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v")
+            || intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") =>
+        {
             intrinsic_args!(fx, args => (x, y); intrinsic);
 
             simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
@@ -52,7 +54,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             });
         }
 
-        _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => {
+        _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v")
+            || intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") =>
+        {
             intrinsic_args!(fx, args => (x, y); intrinsic);
 
             simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| {
@@ -156,6 +160,106 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             });
         }
 
+        _ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane),
+            );
+        }
+
+        _ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => {
+            intrinsic_args!(fx, args => (x, y); intrinsic);
+
+            simd_horizontal_pair_for_each_lane(
+                fx,
+                x,
+                y,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane),
+            );
+        }
+
+        // FIXME generalize vector types
+        "llvm.aarch64.neon.tbl1.v16i8" => {
+            intrinsic_args!(fx, args => (t, idx); intrinsic);
+
+            let zero = fx.bcx.ins().iconst(types::I8, 0);
+            for i in 0..16 {
+                let idx_lane = idx.value_lane(fx, i).load_scalar(fx);
+                let is_zero =
+                    fx.bcx.ins().icmp_imm(IntCC::UnsignedGreaterThanOrEqual, idx_lane, 16);
+                let t_idx = fx.bcx.ins().uextend(fx.pointer_type, idx_lane);
+                let t_lane = t.value_lane_dyn(fx, t_idx).load_scalar(fx);
+                let res = fx.bcx.ins().select(is_zero, zero, t_lane);
+                ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
+            }
+        }
+
         /*
         _ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v")
             || intrinsic.starts_with("llvm.aarch64.neon.sqshl.v")
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
index e62de6b61..4c5360486 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs
@@ -1,10 +1,10 @@
 //! Emulate x86 LLVM intrinsics
 
+use rustc_middle::ty::GenericArgsRef;
+
 use crate::intrinsics::*;
 use crate::prelude::*;
 
-use rustc_middle::ty::GenericArgsRef;
-
 pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     intrinsic: &str,
@@ -20,53 +20,23 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
         // Used by is_x86_feature_detected!();
         "llvm.x86.xgetbv" => {
-            // FIXME use the actual xgetbv instruction
-            intrinsic_args!(fx, args => (v); intrinsic);
-
-            let v = v.load_scalar(fx);
+            intrinsic_args!(fx, args => (xcr_no); intrinsic);
 
-            // As of writing on XCR0 exists
-            fx.bcx.ins().trapnz(v, TrapCode::UnreachableCodeReached);
+            let xcr_no = xcr_no.load_scalar(fx);
 
-            let res = fx.bcx.ins().iconst(types::I64, 1 /* bit 0 must be set */);
-            ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64)));
+            crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret);
         }
 
-        // Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8`
-        "llvm.x86.sse2.pmovmskb.128"
-        | "llvm.x86.avx2.pmovmskb"
-        | "llvm.x86.sse.movmsk.ps"
-        | "llvm.x86.sse2.movmsk.pd" => {
-            intrinsic_args!(fx, args => (a); intrinsic);
-
-            let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);
-            let lane_ty = fx.clif_type(lane_ty).unwrap();
-            assert!(lane_count <= 32);
-
-            let mut res = fx.bcx.ins().iconst(types::I32, 0);
-
-            for lane in (0..lane_count).rev() {
-                let a_lane = a.value_lane(fx, lane).load_scalar(fx);
+        "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128&ig_expand=4009
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256&ig_expand=4010
+            intrinsic_args!(fx, args => (ptr); intrinsic);
 
-                // cast float to int
-                let a_lane = match lane_ty {
-                    types::F32 => codegen_bitcast(fx, types::I32, a_lane),
-                    types::F64 => codegen_bitcast(fx, types::I64, a_lane),
-                    _ => a_lane,
-                };
-
-                // extract sign bit of an int
-                let a_lane_sign = fx.bcx.ins().ushr_imm(a_lane, i64::from(lane_ty.bits() - 1));
-
-                // shift sign bit into result
-                let a_lane_sign = clif_intcast(fx, a_lane_sign, types::I32, false);
-                res = fx.bcx.ins().ishl_imm(res, 1);
-                res = fx.bcx.ins().bor(res, a_lane_sign);
-            }
-
-            let res = CValue::by_val(res, fx.layout_of(fx.tcx.types.i32));
-            ret.write_cvalue(fx, res);
+            // FIXME correctly handle unalignedness
+            let val = CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), ret.layout());
+            ret.write_cvalue(fx, val);
         }
+
         "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
             let (x, y, kind) = match args {
                 [x, y, kind] => (x, y, kind),
@@ -74,8 +44,10 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             };
             let x = codegen_operand(fx, x);
             let y = codegen_operand(fx, y);
-            let kind = crate::constant::mir_operand_get_const_val(fx, kind)
-                .expect("llvm.x86.sse2.cmp.* kind not const");
+            let kind = match kind {
+                Operand::Constant(const_) => crate::constant::eval_mir_constant(fx, const_).0,
+                Operand::Copy(_) | Operand::Move(_) => unreachable!("{kind:?}"),
+            };
 
             let flt_cc = match kind
                 .try_to_bits(Size::from_bytes(1))
@@ -210,8 +182,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 }
             }
         }
-        "llvm.x86.avx2.vperm2i128" => {
+        "llvm.x86.avx2.vperm2i128"
+        | "llvm.x86.avx.vperm2f128.ps.256"
+        | "llvm.x86.avx.vperm2f128.pd.256" => {
             // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd
             let (a, b, imm8) = match args {
                 [a, b, imm8] => (a, b, imm8),
                 _ => bug!("wrong number of args for intrinsic {intrinsic}"),
@@ -220,19 +196,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let b = codegen_operand(fx, b);
             let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
 
-            let a_0 = a.value_lane(fx, 0).load_scalar(fx);
-            let a_1 = a.value_lane(fx, 1).load_scalar(fx);
-            let a_low = fx.bcx.ins().iconcat(a_0, a_1);
-            let a_2 = a.value_lane(fx, 2).load_scalar(fx);
-            let a_3 = a.value_lane(fx, 3).load_scalar(fx);
-            let a_high = fx.bcx.ins().iconcat(a_2, a_3);
+            let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
+            let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
 
-            let b_0 = b.value_lane(fx, 0).load_scalar(fx);
-            let b_1 = b.value_lane(fx, 1).load_scalar(fx);
-            let b_low = fx.bcx.ins().iconcat(b_0, b_1);
-            let b_2 = b.value_lane(fx, 2).load_scalar(fx);
-            let b_3 = b.value_lane(fx, 3).load_scalar(fx);
-            let b_high = fx.bcx.ins().iconcat(b_2, b_3);
+            let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx);
+            let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx);
 
             fn select4(
                 fx: &mut FunctionCx<'_, '_, '_>,
@@ -257,16 +225,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             let control0 = imm8;
             let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
-            let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
 
             let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
             let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
-            let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
 
-            ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
-            ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
-            ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
-            ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
+            ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store(
+                fx,
+                res_low,
+                MemFlags::trusted(),
+            );
+            ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store(
+                fx,
+                res_high,
+                MemFlags::trusted(),
+            );
         }
         "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
             let a = match args {
@@ -308,6 +280,512 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let val = CValue::by_val_pair(cb_out, c, layout);
             ret.write_cvalue(fx, val);
         }
+        "llvm.x86.sse2.pavg.b" | "llvm.x86.sse2.pavg.w" => {
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            // FIXME use vector instructions when possible
+            simd_pair_for_each_lane(
+                fx,
+                a,
+                b,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| {
+                    // (a + b + 1) >> 1
+                    let lane_ty = fx.bcx.func.dfg.value_type(a_lane);
+                    let a_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), a_lane);
+                    let b_lane = fx.bcx.ins().uextend(lane_ty.double_width().unwrap(), b_lane);
+                    let sum = fx.bcx.ins().iadd(a_lane, b_lane);
+                    let num_plus_one = fx.bcx.ins().iadd_imm(sum, 1);
+                    let res = fx.bcx.ins().ushr_imm(num_plus_one, 1);
+                    fx.bcx.ins().ireduce(lane_ty, res)
+                },
+            );
+        }
+        "llvm.x86.sse2.psra.w" => {
+            intrinsic_args!(fx, args => (a, count); intrinsic);
+
+            let count_lane = count.force_stack(fx).0.load(fx, types::I64, MemFlags::trusted());
+            let lane_ty = fx.clif_type(a.layout().ty.simd_size_and_type(fx.tcx).1).unwrap();
+            let max_count = fx.bcx.ins().iconst(types::I64, i64::from(lane_ty.bits() - 1));
+            let saturated_count = fx.bcx.ins().umin(count_lane, max_count);
+
+            // FIXME use vector instructions when possible
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, a_lane| {
+                fx.bcx.ins().sshr(a_lane, saturated_count)
+            });
+        }
+        "llvm.x86.sse2.psad.bw" | "llvm.x86.avx2.psad.bw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8&ig_expand=5770
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8&ig_expand=5771
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.u8);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u64);
+            assert_eq!(lane_count, ret_lane_count * 8);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u64);
+            for out_lane_idx in 0..lane_count / 8 {
+                let mut lane_diff_acc = fx.bcx.ins().iconst(types::I64, 0);
+
+                for lane_idx in out_lane_idx * 8..out_lane_idx * 8 + 1 {
+                    let a_lane = a.value_lane(fx, lane_idx).load_scalar(fx);
+                    let b_lane = b.value_lane(fx, lane_idx).load_scalar(fx);
+
+                    let lane_diff = fx.bcx.ins().isub(a_lane, b_lane);
+                    let abs_lane_diff = fx.bcx.ins().iabs(lane_diff);
+                    let abs_lane_diff = fx.bcx.ins().uextend(types::I64, abs_lane_diff);
+                    lane_diff_acc = fx.bcx.ins().iadd(lane_diff_acc, abs_lane_diff);
+                }
+
+                let res_lane = CValue::by_val(lane_diff_acc, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+        "llvm.x86.ssse3.pmadd.ub.sw.128" | "llvm.x86.avx2.pmadd.ub.sw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16&ig_expand=4267
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&ig_expand=4270
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.u8);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count, ret_lane_count * 2);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+            for out_lane_idx in 0..lane_count / 2 {
+                let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let a_lane0 = fx.bcx.ins().uextend(types::I16, a_lane0);
+                let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let b_lane0 = fx.bcx.ins().sextend(types::I16, b_lane0);
+
+                let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let a_lane1 = fx.bcx.ins().uextend(types::I16, a_lane1);
+                let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let b_lane1 = fx.bcx.ins().sextend(types::I16, b_lane1);
+
+                let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0);
+                let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1);
+
+                let (val, has_overflow) = fx.bcx.ins().sadd_overflow(mul0, mul1);
+
+                let rhs_ge_zero = fx.bcx.ins().icmp_imm(IntCC::SignedGreaterThanOrEqual, mul1, 0);
+
+                let min = fx.bcx.ins().iconst(types::I16, i64::from(i16::MIN as u16));
+                let max = fx.bcx.ins().iconst(types::I16, i64::from(i16::MAX as u16));
+
+                let sat_val = fx.bcx.ins().select(rhs_ge_zero, max, min);
+                let res_lane = fx.bcx.ins().select(has_overflow, sat_val, val);
+
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+        "llvm.x86.sse2.pmadd.wd" | "llvm.x86.avx2.pmadd.wd" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16&ig_expand=4231
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16&ig_expand=4234
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i32);
+            assert_eq!(lane_count, ret_lane_count * 2);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i32);
+            for out_lane_idx in 0..lane_count / 2 {
+                let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let a_lane0 = fx.bcx.ins().uextend(types::I32, a_lane0);
+                let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
+                let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0);
+
+                let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let a_lane1 = fx.bcx.ins().uextend(types::I32, a_lane1);
+                let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
+                let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1);
+
+                let mul0: Value = fx.bcx.ins().imul(a_lane0, b_lane0);
+                let mul1 = fx.bcx.ins().imul(a_lane1, b_lane1);
+
+                let res_lane = fx.bcx.ins().iadd(mul0, mul1);
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.ssse3.pmul.hr.sw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count, ret_lane_count);
+
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+            for out_lane_idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let a_lane = fx.bcx.ins().sextend(types::I32, a_lane);
+                let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx);
+                let b_lane = fx.bcx.ins().sextend(types::I32, b_lane);
+
+                let mul: Value = fx.bcx.ins().imul(a_lane, b_lane);
+                let shifted = fx.bcx.ins().ushr_imm(mul, 14);
+                let incremented = fx.bcx.ins().iadd_imm(shifted, 1);
+                let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1);
+
+                let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again);
+                let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+                ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse2.packuswb.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.avx2.packuswb" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i16);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let zero = fx.bcx.ins().iconst(types::I16, 0);
+            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, zero);
+                let sat = fx.bcx.ins().umin(sat, max_u8);
+                let res = fx.bcx.ins().ireduce(types::I8, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse2.packssdw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.sse41.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.u16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
+            let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
+
+            for idx in 0..lane_count {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().umax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().umax(lane, min_u16);
+                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.avx2.packssdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i32);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
+            assert_eq!(lane_count * 2, ret_lane_count);
+
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = a.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
+            }
+
+            for idx in 0..lane_count / 2 {
+                let lane = b.value_lane(fx, idx).load_scalar(fx);
+                let sat = fx.bcx.ins().smax(lane, min_i16);
+                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let res = fx.bcx.ins().ireduce(types::I16, sat);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.pclmulqdq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
+            intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i64);
+            assert_eq!(ret_lane_ty, fx.tcx.types.i64);
+            assert_eq!(lane_count, 2);
+            assert_eq!(ret_lane_count, 2);
+
+            let imm8 = imm8.load_scalar(fx);
+
+            let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
+            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
+            let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
+
+            let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
+            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
+            let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
+
+            fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
+                let tmp = fx.bcx.ins().ushr_imm(val, bit);
+                fx.bcx.ins().band_imm(tmp, 1)
+            }
+
+            let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 0..=63 {
+                let x = extract_bit(fx, temp1, 0);
+                let y = extract_bit(fx, temp2, i);
+                let mut temp = fx.bcx.ins().band(x, y);
+                for j in 1..=i {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    temp = fx.bcx.ins().bxor(temp, z);
+                }
+                let temp = fx.bcx.ins().ishl_imm(temp, i);
+                res1 = fx.bcx.ins().bor(res1, temp);
+            }
+            ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
+
+            let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
+            for i in 64..=127 {
+                let mut temp = fx.bcx.ins().iconst(types::I64, 0);
+                for j in i - 63..=63 {
+                    let x = extract_bit(fx, temp1, j);
+                    let y = extract_bit(fx, temp2, i - j);
+                    let z = fx.bcx.ins().band(x, y);
+                    temp = fx.bcx.ins().bxor(temp, z);
+                }
+                let temp = fx.bcx.ins().ishl_imm(temp, i);
+                res2 = fx.bcx.ins().bor(res2, temp);
+            }
+            ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
+        }
+
+        "llvm.x86.avx.ptestz.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            assert_eq!(lane_ty, fx.tcx.types.i64);
+            assert_eq!(ret.layout().ty, fx.tcx.types.i32);
+            assert_eq!(lane_count, 4);
+
+            let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
+            let a_lane2 = a.value_lane(fx, 2).load_scalar(fx);
+            let a_lane3 = a.value_lane(fx, 3).load_scalar(fx);
+            let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
+            let b_lane2 = b.value_lane(fx, 2).load_scalar(fx);
+            let b_lane3 = b.value_lane(fx, 3).load_scalar(fx);
+
+            let zero0 = fx.bcx.ins().band(a_lane0, b_lane0);
+            let zero1 = fx.bcx.ins().band(a_lane1, b_lane1);
+            let zero2 = fx.bcx.ins().band(a_lane2, b_lane2);
+            let zero3 = fx.bcx.ins().band(a_lane3, b_lane3);
+
+            let all_zero0 = fx.bcx.ins().bor(zero0, zero1);
+            let all_zero1 = fx.bcx.ins().bor(zero2, zero3);
+            let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1);
+
+            let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0);
+            let res = CValue::by_val(
+                fx.bcx.ins().uextend(types::I32, res),
+                fx.layout_of(fx.tcx.types.i32),
+            );
+            ret.write_cvalue(fx, res);
+        }
+
         _ => {
             fx.tcx
                 .sess
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs
index 36e9ba9c7..bfeeb117f 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs
@@ -12,23 +12,20 @@ macro_rules! intrinsic_args {
     }
 }
 
-mod cpuid;
 mod llvm;
 mod llvm_aarch64;
 mod llvm_x86;
 mod simd;
 
-pub(crate) use cpuid::codegen_cpuid_call;
-pub(crate) use llvm::codegen_llvm_intrinsic_call;
-
+use cranelift_codegen::ir::AtomicRmwOp;
 use rustc_middle::ty;
 use rustc_middle::ty::layout::{HasParamEnv, ValidityRequirement};
 use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
 use rustc_middle::ty::GenericArgsRef;
 use rustc_span::symbol::{kw, sym, Symbol};
 
+pub(crate) use self::llvm::codegen_llvm_intrinsic_call;
 use crate::prelude::*;
-use cranelift_codegen::ir::AtomicRmwOp;
 
 fn bug_on_incorrect_arg_count(intrinsic: impl std::fmt::Display) -> ! {
     bug!("wrong number of args for intrinsic {}", intrinsic);
@@ -135,6 +132,65 @@ fn simd_pair_for_each_lane<'tcx>(
     }
 }
 
+fn simd_horizontal_pair_for_each_lane<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    x: CValue<'tcx>,
+    y: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value,
+) {
+    assert_eq!(x.layout(), y.layout());
+    let layout = x.layout();
+
+    let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let lane_layout = fx.layout_of(lane_ty);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    let ret_lane_layout = fx.layout_of(ret_lane_ty);
+    assert_eq!(lane_count, ret_lane_count);
+
+    for lane_idx in 0..lane_count {
+        let src = if lane_idx < (lane_count / 2) { x } else { y };
+        let src_idx = lane_idx % (lane_count / 2);
+
+        let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx);
+        let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx);
+
+        let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane);
+        let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+        ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+    }
+}
+
+fn simd_trio_for_each_lane<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    x: CValue<'tcx>,
+    y: CValue<'tcx>,
+    z: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value,
+) {
+    assert_eq!(x.layout(), y.layout());
+    let layout = x.layout();
+
+    let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let lane_layout = fx.layout_of(lane_ty);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    let ret_lane_layout = fx.layout_of(ret_lane_ty);
+    assert_eq!(lane_count, ret_lane_count);
+
+    for lane_idx in 0..lane_count {
+        let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx);
+        let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx);
+        let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx);
+
+        let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane);
+        let res_lane = CValue::by_val(res_lane, ret_lane_layout);
+
+        ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+    }
+}
+
 fn simd_reduce<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
     val: CValue<'tcx>,
diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
index 6efbe1498..ea137c4ca 100644
--- a/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
+++ b/compiler/rustc_codegen_cranelift/src/intrinsics/simd.rs
@@ -148,7 +148,7 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
             let total_len = lane_count * 2;
 
             let indexes =
-                idx.iter().map(|idx| idx.unwrap_leaf().try_to_u16().unwrap()).collect::<Vec<u16>>();
+                idx.iter().map(|idx| idx.unwrap_leaf().try_to_u32().unwrap()).collect::<Vec<u32>>();
 
             for &idx in &indexes {
                 assert!(u64::from(idx) < total_len, "idx {} out of range 0..{}", idx, total_len);
@@ -216,8 +216,10 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
 
             let indexes = {
                 use rustc_middle::mir::interpret::*;
-                let idx_const = crate::constant::mir_operand_get_const_val(fx, idx)
-                    .expect("simd_shuffle idx not const");
+                let idx_const = match idx {
+                    Operand::Constant(const_) => crate::constant::eval_mir_constant(fx, const_).0,
+                    Operand::Copy(_) | Operand::Move(_) => unreachable!("{idx:?}"),
+                };
 
                 let idx_bytes = match idx_const {
                     ConstValue::Indirect { alloc_id, offset } => {
@@ -343,7 +345,11 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
             ret.write_cvalue(fx, ret_lane);
         }
 
-        sym::simd_neg => {
+        sym::simd_neg
+        | sym::simd_bswap
+        | sym::simd_bitreverse
+        | sym::simd_ctlz
+        | sym::simd_cttz => {
             intrinsic_args!(fx, args => (a); intrinsic);
 
             if !a.layout().ty.is_simd() {
@@ -351,16 +357,21 @@ pub(super) fn codegen_simd_intrinsic_call<'tcx>(
                 return;
             }
 
-            simd_for_each_lane(
-                fx,
-                a,
-                ret,
-                &|fx, lane_ty, _ret_lane_ty, lane| match lane_ty.kind() {
-                    ty::Int(_) => fx.bcx.ins().ineg(lane),
-                    ty::Float(_) => fx.bcx.ins().fneg(lane),
-                    _ => unreachable!(),
-                },
-            );
+            simd_for_each_lane(fx, a, ret, &|fx, lane_ty, _ret_lane_ty, lane| match (
+                lane_ty.kind(),
+                intrinsic,
+            ) {
+                (ty::Int(_), sym::simd_neg) => fx.bcx.ins().ineg(lane),
+                (ty::Float(_), sym::simd_neg) => fx.bcx.ins().fneg(lane),
+
+                (ty::Uint(ty::UintTy::U8) | ty::Int(ty::IntTy::I8), sym::simd_bswap) => lane,
+                (ty::Uint(_) | ty::Int(_), sym::simd_bswap) => fx.bcx.ins().bswap(lane),
+                (ty::Uint(_) | ty::Int(_), sym::simd_bitreverse) => fx.bcx.ins().bitrev(lane),
+                (ty::Uint(_) | ty::Int(_), sym::simd_ctlz) => fx.bcx.ins().clz(lane),
+                (ty::Uint(_) | ty::Int(_), sym::simd_cttz) => fx.bcx.ins().ctz(lane),
+
+                _ => unreachable!(),
+            });
         }
 
         sym::simd_add
diff --git a/compiler/rustc_codegen_cranelift/src/lib.rs b/compiler/rustc_codegen_cranelift/src/lib.rs
index d01ded8ab..148193b5a 100644
--- a/compiler/rustc_codegen_cranelift/src/lib.rs
+++ b/compiler/rustc_codegen_cranelift/src/lib.rs
@@ -1,3 +1,6 @@
+#![cfg_attr(all(doc, not(bootstrap)), allow(internal_features))]
+#![cfg_attr(all(doc, not(bootstrap)), feature(rustdoc_internals))]
+#![cfg_attr(all(doc, not(bootstrap)), doc(rust_logo))]
 #![feature(rustc_private)]
 // Note: please avoid adding other feature gates where possible
 #![warn(rust_2018_idioms)]
@@ -29,6 +32,8 @@ use std::any::Any;
 use std::cell::{Cell, RefCell};
 use std::sync::Arc;
 
+use cranelift_codegen::isa::TargetIsa;
+use cranelift_codegen::settings::{self, Configurable};
 use rustc_codegen_ssa::traits::CodegenBackend;
 use rustc_codegen_ssa::CodegenResults;
 use rustc_data_structures::profiling::SelfProfilerRef;
@@ -39,9 +44,6 @@ use rustc_session::config::OutputFilenames;
 use rustc_session::Session;
 use rustc_span::Symbol;
 
-use cranelift_codegen::isa::TargetIsa;
-use cranelift_codegen::settings::{self, Configurable};
-
 pub use crate::config::*;
 use crate::prelude::*;
 
@@ -76,22 +78,6 @@ mod value_and_place;
 mod vtable;
 
 mod prelude {
-    pub(crate) use rustc_span::{FileNameDisplayPreference, Span};
-
-    pub(crate) use rustc_hir::def_id::{DefId, LOCAL_CRATE};
-    pub(crate) use rustc_middle::bug;
-    pub(crate) use rustc_middle::mir::{self, *};
-    pub(crate) use rustc_middle::ty::layout::{self, LayoutOf, TyAndLayout};
-    pub(crate) use rustc_middle::ty::{
-        self, FloatTy, Instance, InstanceDef, IntTy, ParamEnv, Ty, TyCtxt, TypeAndMut,
-        TypeFoldable, TypeVisitableExt, UintTy,
-    };
-    pub(crate) use rustc_target::abi::{Abi, FieldIdx, Scalar, Size, VariantIdx, FIRST_VARIANT};
-
-    pub(crate) use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
-
-    pub(crate) use rustc_index::Idx;
-
     pub(crate) use cranelift_codegen::ir::condcodes::{FloatCC, IntCC};
     pub(crate) use cranelift_codegen::ir::function::Function;
     pub(crate) use cranelift_codegen::ir::types;
@@ -103,6 +89,18 @@ mod prelude {
     pub(crate) use cranelift_codegen::Context;
     pub(crate) use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable};
     pub(crate) use cranelift_module::{self, DataDescription, FuncId, Linkage, Module};
+    pub(crate) use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
+    pub(crate) use rustc_hir::def_id::{DefId, LOCAL_CRATE};
+    pub(crate) use rustc_index::Idx;
+    pub(crate) use rustc_middle::bug;
+    pub(crate) use rustc_middle::mir::{self, *};
+    pub(crate) use rustc_middle::ty::layout::{self, LayoutOf, TyAndLayout};
+    pub(crate) use rustc_middle::ty::{
+        self, FloatTy, Instance, InstanceDef, IntTy, ParamEnv, Ty, TyCtxt, TypeAndMut,
+        TypeFoldable, TypeVisitableExt, UintTy,
+    };
+    pub(crate) use rustc_span::{FileNameDisplayPreference, Span};
+    pub(crate) use rustc_target::abi::{Abi, FieldIdx, Scalar, Size, VariantIdx, FIRST_VARIANT};
 
     pub(crate) use crate::abi::*;
     pub(crate) use crate::base::{codegen_operand, codegen_place};
@@ -191,7 +189,7 @@ impl CodegenBackend for CraneliftCodegenBackend {
     }
 
     fn target_features(&self, _sess: &Session, _allow_unstable: bool) -> Vec<rustc_span::Symbol> {
-        vec![]
+        vec![] // FIXME necessary for #[cfg(target_feature]
     }
 
     fn print_version(&self) {
@@ -263,9 +261,9 @@ fn build_isa(sess: &Session, backend_config: &BackendConfig) -> Arc<dyn isa::Tar
     let preserve_frame_pointer = sess.target.options.frame_pointer
         != rustc_target::spec::FramePointer::MayOmit
         || matches!(sess.opts.cg.force_frame_pointers, Some(true));
-    if preserve_frame_pointer {
-        flags_builder.set("preserve_frame_pointers", "true").unwrap();
-    }
+    flags_builder
+        .set("preserve_frame_pointers", if preserve_frame_pointer { "true" } else { "false" })
+        .unwrap();
 
     let tls_model = match target_triple.binary_format {
         BinaryFormat::Elf => "elf_gd",
diff --git a/compiler/rustc_codegen_cranelift/src/pointer.rs b/compiler/rustc_codegen_cranelift/src/pointer.rs
index b60e56720..11ac6b946 100644
--- a/compiler/rustc_codegen_cranelift/src/pointer.rs
+++ b/compiler/rustc_codegen_cranelift/src/pointer.rs
@@ -1,11 +1,10 @@
 //! Defines [`Pointer`] which is used to improve the quality of the generated clif ir for pointer
 //! operations.
 
-use crate::prelude::*;
-
+use cranelift_codegen::ir::immediates::Offset32;
 use rustc_target::abi::Align;
 
-use cranelift_codegen::ir::immediates::Offset32;
+use crate::prelude::*;
 
 /// A pointer pointing either to a certain address, a certain stack slot or nothing.
 #[derive(Copy, Clone, Debug)]
diff --git a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs b/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
index 0ead50c34..da84e54a9 100644
--- a/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
+++ b/compiler/rustc_codegen_cranelift/src/pretty_clif.rs
@@ -63,8 +63,8 @@ use cranelift_codegen::{
     ir::entities::AnyEntity,
     write::{FuncWriter, PlainWriter},
 };
-
 use rustc_middle::ty::layout::FnAbiOf;
+use rustc_middle::ty::print::with_no_trimmed_paths;
 use rustc_session::config::{OutputFilenames, OutputType};
 
 use crate::prelude::*;
@@ -80,15 +80,17 @@ impl CommentWriter {
     pub(crate) fn new<'tcx>(tcx: TyCtxt<'tcx>, instance: Instance<'tcx>) -> Self {
         let enabled = should_write_ir(tcx);
         let global_comments = if enabled {
-            vec![
-                format!("symbol {}", tcx.symbol_name(instance).name),
-                format!("instance {:?}", instance),
-                format!(
-                    "abi {:?}",
-                    RevealAllLayoutCx(tcx).fn_abi_of_instance(instance, ty::List::empty())
-                ),
-                String::new(),
-            ]
+            with_no_trimmed_paths!({
+                vec![
+                    format!("symbol {}", tcx.symbol_name(instance).name),
+                    format!("instance {:?}", instance),
+                    format!(
+                        "abi {:?}",
+                        RevealAllLayoutCx(tcx).fn_abi_of_instance(instance, ty::List::empty())
+                    ),
+                    String::new(),
+                ]
+            })
         } else {
             vec![]
         };
diff --git a/compiler/rustc_codegen_cranelift/src/value_and_place.rs b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
index 45893a4f3..21ad2a835 100644
--- a/compiler/rustc_codegen_cranelift/src/value_and_place.rs
+++ b/compiler/rustc_codegen_cranelift/src/value_and_place.rs
@@ -1,11 +1,10 @@
 //! Definition of [`CValue`] and [`CPlace`]
 
-use crate::prelude::*;
-
-use rustc_middle::ty::FnSig;
-
 use cranelift_codegen::entity::EntityRef;
 use cranelift_codegen::ir::immediates::Offset32;
+use rustc_middle::ty::FnSig;
+
+use crate::prelude::*;
 
 fn codegen_field<'tcx>(
     fx: &mut FunctionCx<'_, '_, 'tcx>,
@@ -133,18 +132,11 @@ impl<'tcx> CValue<'tcx> {
                 (ptr.get_addr(fx), vtable)
             }
             CValueInner::ByValPair(data, vtable) => {
-                let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-                    kind: StackSlotKind::ExplicitSlot,
-                    // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-                    // specify stack slot alignment.
-                    size: (u32::try_from(fx.target_config.pointer_type().bytes()).unwrap() + 15)
-                        / 16
-                        * 16,
-                });
-                let data_ptr = Pointer::stack_slot(stack_slot);
-                let mut flags = MemFlags::new();
-                flags.set_notrap();
-                data_ptr.store(fx, data, flags);
+                let data_ptr = fx.create_stack_slot(
+                    u32::try_from(fx.target_config.pointer_type().bytes()).unwrap(),
+                    u32::try_from(fx.target_config.pointer_type().bytes()).unwrap(),
+                );
+                data_ptr.store(fx, data, MemFlags::trusted());
 
                 (data_ptr.get_addr(fx), vtable)
             }
@@ -251,6 +243,34 @@ impl<'tcx> CValue<'tcx> {
         let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
         let lane_layout = fx.layout_of(lane_ty);
         assert!(lane_idx < lane_count);
+
+        match self.0 {
+            CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
+            CValueInner::ByRef(ptr, None) => {
+                let field_offset = lane_layout.size * lane_idx;
+                let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
+                CValue::by_ref(field_ptr, lane_layout)
+            }
+            CValueInner::ByRef(_, Some(_)) => unreachable!(),
+        }
+    }
+
+    /// Like [`CValue::value_field`] except using the passed type as lane type instead of the one
+    /// specified by the vector type.
+    pub(crate) fn value_typed_lane(
+        self,
+        fx: &mut FunctionCx<'_, '_, 'tcx>,
+        lane_ty: Ty<'tcx>,
+        lane_idx: u64,
+    ) -> CValue<'tcx> {
+        let layout = self.1;
+        assert!(layout.ty.is_simd());
+        let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+        let lane_layout = fx.layout_of(lane_ty);
+        assert!(
+            (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
+        );
+
         match self.0 {
             CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
             CValueInner::ByRef(ptr, None) => {
@@ -310,7 +330,8 @@ impl<'tcx> CValue<'tcx> {
                 fx.bcx.ins().iconcat(lsb, msb)
             }
             ty::Bool | ty::Char | ty::Uint(_) | ty::Int(_) | ty::Ref(..) | ty::RawPtr(..) => {
-                fx.bcx.ins().iconst(clif_ty, const_val.to_bits(layout.size).unwrap() as i64)
+                let raw_val = const_val.size().truncate(const_val.to_bits(layout.size).unwrap());
+                fx.bcx.ins().iconst(clif_ty, raw_val as i64)
             }
             ty::Float(FloatTy::F32) => {
                 fx.bcx.ins().f32const(Ieee32::with_bits(u32::try_from(const_val).unwrap()))
@@ -372,13 +393,11 @@ impl<'tcx> CPlace<'tcx> {
                 .fatal(format!("values of type {} are too big to store on the stack", layout.ty));
         }
 
-        let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-            kind: StackSlotKind::ExplicitSlot,
-            // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-            // specify stack slot alignment.
-            size: (u32::try_from(layout.size.bytes()).unwrap() + 15) / 16 * 16,
-        });
-        CPlace { inner: CPlaceInner::Addr(Pointer::stack_slot(stack_slot), None), layout }
+        let stack_slot = fx.create_stack_slot(
+            u32::try_from(layout.size.bytes()).unwrap(),
+            u32::try_from(layout.align.pref.bytes()).unwrap(),
+        );
+        CPlace { inner: CPlaceInner::Addr(stack_slot, None), layout }
     }
 
     pub(crate) fn new_var(
@@ -543,13 +562,7 @@ impl<'tcx> CPlace<'tcx> {
                 _ if src_ty.is_vector() && dst_ty.is_vector() => codegen_bitcast(fx, dst_ty, data),
                 _ if src_ty.is_vector() || dst_ty.is_vector() => {
                     // FIXME(bytecodealliance/wasmtime#6104) do something more efficient for transmutes between vectors and integers.
-                    let stack_slot = fx.bcx.create_sized_stack_slot(StackSlotData {
-                        kind: StackSlotKind::ExplicitSlot,
-                        // FIXME Don't force the size to a multiple of 16 bytes once Cranelift gets a way to
-                        // specify stack slot alignment.
-                        size: (src_ty.bytes() + 15) / 16 * 16,
-                    });
-                    let ptr = Pointer::stack_slot(stack_slot);
+                    let ptr = fx.create_stack_slot(src_ty.bytes(), src_ty.bytes());
                     ptr.store(fx, data, MemFlags::trusted());
                     ptr.load(fx, dst_ty, MemFlags::trusted())
                 }
@@ -749,6 +762,34 @@ impl<'tcx> CPlace<'tcx> {
         }
     }
 
+    /// Like [`CPlace::place_field`] except using the passed type as lane type instead of the one
+    /// specified by the vector type.
+    pub(crate) fn place_typed_lane(
+        self,
+        fx: &mut FunctionCx<'_, '_, 'tcx>,
+        lane_ty: Ty<'tcx>,
+        lane_idx: u64,
+    ) -> CPlace<'tcx> {
+        let layout = self.layout();
+        assert!(layout.ty.is_simd());
+        let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+        let lane_layout = fx.layout_of(lane_ty);
+        assert!(
+            (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size
+        );
+
+        match self.inner {
+            CPlaceInner::Var(_, _) => unreachable!(),
+            CPlaceInner::VarPair(_, _, _) => unreachable!(),
+            CPlaceInner::Addr(ptr, None) => {
+                let field_offset = lane_layout.size * lane_idx;
+                let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap());
+                CPlace::for_ptr(field_ptr, lane_layout)
+            }
+            CPlaceInner::Addr(_, Some(_)) => unreachable!(),
+        }
+    }
+
     pub(crate) fn place_index(
         self,
         fx: &mut FunctionCx<'_, '_, 'tcx>,