summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig24
-rw-r--r--arch/x86/boot/compressed/efi_mixed.S29
-rw-r--r--arch/x86/coco/core.c7
-rw-r--r--arch/x86/events/amd/core.c1
-rw-r--r--arch/x86/hyperv/hv_vtl.c26
-rw-r--r--arch/x86/include/asm/asm-prototypes.h1
-rw-r--r--arch/x86/include/asm/asm.h14
-rw-r--r--arch/x86/include/asm/coco.h8
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/crash_core.h2
-rw-r--r--arch/x86/include/asm/mem_encrypt.h15
-rw-r--r--arch/x86/include/asm/msr-index.h8
-rw-r--r--arch/x86/include/asm/nospec-branch.h21
-rw-r--r--arch/x86/include/asm/page.h6
-rw-r--r--arch/x86/include/asm/sev.h4
-rw-r--r--arch/x86/include/asm/suspend_32.h10
-rw-r--r--arch/x86/include/asm/vsyscall.h10
-rw-r--r--arch/x86/include/asm/x86_init.h3
-rw-r--r--arch/x86/kernel/acpi/cppc.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c10
-rw-r--r--arch/x86/kernel/cpu/bugs.c92
-rw-r--r--arch/x86/kernel/cpu/common.c38
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c10
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h8
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c48
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c14
-rw-r--r--arch/x86/kernel/eisa.c3
-rw-r--r--arch/x86/kernel/fpu/xstate.c5
-rw-r--r--arch/x86/kernel/fpu/xstate.h14
-rw-r--r--arch/x86/kernel/kprobes/core.c11
-rw-r--r--arch/x86/kernel/mpparse.c10
-rw-r--r--arch/x86/kernel/nmi.c2
-rw-r--r--arch/x86/kernel/probe_roms.c10
-rw-r--r--arch/x86/kernel/setup.c3
-rw-r--r--arch/x86/kernel/sev-shared.c12
-rw-r--r--arch/x86/kernel/sev.c31
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/cpuid.c21
-rw-r--r--arch/x86/kvm/hyperv.c2
-rw-r--r--arch/x86/kvm/lapic.c5
-rw-r--r--arch/x86/kvm/reverse_cpuid.h35
-rw-r--r--arch/x86/kvm/svm/sev.c23
-rw-r--r--arch/x86/kvm/x86.c15
-rw-r--r--arch/x86/kvm/xen.c4
-rw-r--r--arch/x86/kvm/xen.h18
-rw-r--r--arch/x86/lib/retpoline.S11
-rw-r--r--arch/x86/mm/fault.c9
-rw-r--r--arch/x86/mm/maccess.c10
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c18
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c40
-rw-r--r--arch/x86/tools/relocs.c8
-rw-r--r--arch/x86/xen/smp.c12
52 files changed, 513 insertions, 233 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1566748f16..9241274858 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1539,19 +1539,6 @@ config AMD_MEM_ENCRYPT
This requires an AMD processor that supports Secure Memory
Encryption (SME).
-config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
- bool "Activate AMD Secure Memory Encryption (SME) by default"
- depends on AMD_MEM_ENCRYPT
- help
- Say yes to have system memory encrypted by default if running on
- an AMD processor that supports Secure Memory Encryption (SME).
-
- If set to Y, then the encryption of system memory can be
- deactivated with the mem_encrypt=off command line option.
-
- If set to N, then the encryption of system memory can be
- activated with the mem_encrypt=on command line option.
-
# Common NUMA Features
config NUMA
bool "NUMA Memory Allocation and Scheduler Support"
@@ -2609,6 +2596,17 @@ config GDS_FORCE_MITIGATION
If in doubt, say N.
+config MITIGATION_RFDS
+ bool "RFDS Mitigation"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Register File Data Sampling (RFDS) by default.
+ RFDS is a hardware vulnerability which affects Intel Atom CPUs. It
+ allows unprivileged speculative access to stale data previously
+ stored in floating point, vector and integer registers.
+ See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst>
+
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S
index f4e22ef774..876fc6d46a 100644
--- a/arch/x86/boot/compressed/efi_mixed.S
+++ b/arch/x86/boot/compressed/efi_mixed.S
@@ -15,10 +15,12 @@
*/
#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/page_types.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
+#include <asm/setup.h>
.code64
.text
@@ -49,6 +51,11 @@ SYM_FUNC_START(startup_64_mixed_mode)
lea efi32_boot_args(%rip), %rdx
mov 0(%rdx), %edi
mov 4(%rdx), %esi
+
+ /* Switch to the firmware's stack */
+ movl efi32_boot_sp(%rip), %esp
+ andl $~7, %esp
+
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
mov 8(%rdx), %edx // saved bootparams pointer
test %edx, %edx
@@ -144,6 +151,7 @@ SYM_FUNC_END(__efi64_thunk)
SYM_FUNC_START(efi32_stub_entry)
call 1f
1: popl %ecx
+ leal (efi32_boot_args - 1b)(%ecx), %ebx
/* Clear BSS */
xorl %eax, %eax
@@ -158,6 +166,7 @@ SYM_FUNC_START(efi32_stub_entry)
popl %ecx
popl %edx
popl %esi
+ movl %esi, 8(%ebx)
jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
@@ -234,8 +243,6 @@ SYM_FUNC_END(efi_enter32)
*
* Arguments: %ecx image handle
* %edx EFI system table pointer
- * %esi struct bootparams pointer (or NULL when not using
- * the EFI handover protocol)
*
* Since this is the point of no return for ordinary execution, no registers
* are considered live except for the function parameters. [Note that the EFI
@@ -254,13 +261,25 @@ SYM_FUNC_START_LOCAL(efi32_entry)
/* Store firmware IDT descriptor */
sidtl (efi32_boot_idt - 1b)(%ebx)
+ /* Store firmware stack pointer */
+ movl %esp, (efi32_boot_sp - 1b)(%ebx)
+
/* Store boot arguments */
leal (efi32_boot_args - 1b)(%ebx), %ebx
movl %ecx, 0(%ebx)
movl %edx, 4(%ebx)
- movl %esi, 8(%ebx)
movb $0x0, 12(%ebx) // efi_is64
+ /*
+ * Allocate some memory for a temporary struct boot_params, which only
+ * needs the minimal pieces that startup_32() relies on.
+ */
+ subl $PARAM_SIZE, %esp
+ movl %esp, %esi
+ movl $PAGE_SIZE, BP_kernel_alignment(%esi)
+ movl $_end - 1b, BP_init_size(%esi)
+ subl $startup_32 - 1b, BP_init_size(%esi)
+
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
@@ -286,8 +305,7 @@ SYM_FUNC_START(efi32_pe_entry)
movl 8(%ebp), %ecx // image_handle
movl 12(%ebp), %edx // sys_table
- xorl %esi, %esi
- jmp efi32_entry // pass %ecx, %edx, %esi
+ jmp efi32_entry // pass %ecx, %edx
// no other registers remain live
2: popl %edi // restore callee-save registers
@@ -318,5 +336,6 @@ SYM_DATA_END(efi32_boot_idt)
SYM_DATA_LOCAL(efi32_boot_cs, .word 0)
SYM_DATA_LOCAL(efi32_boot_ds, .word 0)
+SYM_DATA_LOCAL(efi32_boot_sp, .long 0)
SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
SYM_DATA(efi_is64, .byte 1)
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index eeec998657..d07be9d05c 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -14,7 +14,7 @@
#include <asm/processor.h>
enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
-static u64 cc_mask __ro_after_init;
+u64 cc_mask __ro_after_init;
static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
@@ -148,8 +148,3 @@ u64 cc_mkdec(u64 val)
}
}
EXPORT_SYMBOL_GPL(cc_mkdec);
-
-__init void cc_set_mask(u64 mask)
-{
- cc_mask = mask;
-}
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index e24976593a..5365d6acbf 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -604,7 +604,6 @@ static void amd_pmu_cpu_dead(int cpu)
kfree(cpuhw->lbr_sel);
cpuhw->lbr_sel = NULL;
- amd_pmu_cpu_reset(cpu);
if (!x86_pmu.amd_nb_constraints)
return;
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 96e6c51515..c8062975a5 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -12,10 +12,16 @@
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
+#include <../kernel/smpboot.h>
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
+static bool __init hv_vtl_msi_ext_dest_id(void)
+{
+ return true;
+}
+
void __init hv_vtl_init_platform(void)
{
pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
@@ -38,6 +44,8 @@ void __init hv_vtl_init_platform(void)
x86_platform.legacy.warm_reset = 0;
x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 0;
+
+ x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id;
}
static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
@@ -57,7 +65,7 @@ static void hv_vtl_ap_entry(void)
((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
}
-static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
+static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored)
{
u64 status;
int ret = 0;
@@ -71,7 +79,9 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
struct ldttss_desc *ldt;
struct desc_struct *gdt;
- u64 rsp = current->thread.sp;
+ struct task_struct *idle = idle_thread_get(cpu);
+ u64 rsp = (unsigned long)idle->thread.sp;
+
u64 rip = (u64)&hv_vtl_ap_entry;
native_store_gdt(&gdt_ptr);
@@ -198,7 +208,15 @@ static int hv_vtl_apicid_to_vp_id(u32 apic_id)
static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
{
- int vp_id;
+ int vp_id, cpu;
+
+ /* Find the logical CPU for the APIC ID */
+ for_each_present_cpu(cpu) {
+ if (arch_match_cpu_phys_id(cpu, apicid))
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
vp_id = hv_vtl_apicid_to_vp_id(apicid);
@@ -212,7 +230,7 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
return -EINVAL;
}
- return hv_vtl_bringup_vcpu(vp_id, start_eip);
+ return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip);
}
int __init hv_vtl_early_init(void)
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index b1a98fa388..0e82074517 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -13,6 +13,7 @@
#include <asm/preempt.h>
#include <asm/asm.h>
#include <asm/gsseg.h>
+#include <asm/nospec-branch.h>
#ifndef CONFIG_X86_CMPXCHG64
extern void cmpxchg8b_emu(void);
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index fbcfec4dc4..ca8eed1d49 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -113,6 +113,20 @@
#endif
+#ifndef __ASSEMBLY__
+#ifndef __pic__
+static __always_inline __pure void *rip_rel_ptr(void *p)
+{
+ asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
+
+ return p;
+}
+#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var)))
+#else
+#define RIP_REL_REF(var) (var)
+#endif
+#endif
+
/*
* Macros to generate condition code outputs from inline assembly,
* The output operand must be type "bool".
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index 6ae2d16a76..21940ef8d2 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_COCO_H
#define _ASM_X86_COCO_H
+#include <asm/asm.h>
#include <asm/types.h>
enum cc_vendor {
@@ -11,9 +12,14 @@ enum cc_vendor {
};
extern enum cc_vendor cc_vendor;
+extern u64 cc_mask;
#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
-void cc_set_mask(u64 mask);
+static inline void cc_set_mask(u64 mask)
+{
+ RIP_REL_REF(cc_mask) = mask;
+}
+
u64 cc_mkenc(u64 val);
u64 cc_mkdec(u64 val);
#else
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index caf4cf2e10..0e4f2da9f6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -499,4 +499,5 @@
/* BUG word 2 */
#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
+#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_core.h
index 76af98f4e8..041020da8d 100644
--- a/arch/x86/include/asm/crash_core.h
+++ b/arch/x86/include/asm/crash_core.h
@@ -39,4 +39,6 @@ static inline unsigned long crash_low_size_default(void)
#endif
}
+#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+
#endif /* _X86_CRASH_CORE_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 359ada486f..b31eb9fd59 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -15,7 +15,8 @@
#include <linux/init.h>
#include <linux/cc_platform.h>
-#include <asm/bootparam.h>
+#include <asm/asm.h>
+struct boot_params;
#ifdef CONFIG_X86_MEM_ENCRYPT
void __init mem_encrypt_init(void);
@@ -58,6 +59,11 @@ void __init mem_encrypt_free_decrypted_mem(void);
void __init sev_es_init_vc_handling(void);
+static inline u64 sme_get_me_mask(void)
+{
+ return RIP_REL_REF(sme_me_mask);
+}
+
#define __bss_decrypted __section(".bss..decrypted")
#else /* !CONFIG_AMD_MEM_ENCRYPT */
@@ -89,6 +95,8 @@ early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool en
static inline void mem_encrypt_free_decrypted_mem(void) { }
+static inline u64 sme_get_me_mask(void) { return 0; }
+
#define __bss_decrypted
#endif /* CONFIG_AMD_MEM_ENCRYPT */
@@ -106,11 +114,6 @@ void add_encrypt_protection_map(void);
extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
-static inline u64 sme_get_me_mask(void)
-{
- return sme_me_mask;
-}
-
#endif /* __ASSEMBLY__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1d51e1850e..857839df66 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -165,6 +165,14 @@
* CPU is not vulnerable to Gather
* Data Sampling (GDS).
*/
+#define ARCH_CAP_RFDS_NO BIT(27) /*
+ * Not susceptible to Register
+ * File Data Sampling.
+ */
+#define ARCH_CAP_RFDS_CLEAR BIT(28) /*
+ * VERW clears CPU Register
+ * File.
+ */
#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
* IA32_XAPIC_DISABLE_STATUS MSR
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index d15b35815e..4e33cc834b 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -271,11 +271,20 @@
.Lskip_rsb_\@:
.endm
+/*
+ * The CALL to srso_alias_untrain_ret() must be patched in directly at
+ * the spot where untraining must be done, i.e., srso_alias_untrain_ret()
+ * must be the target of a CALL instruction instead of indirectly
+ * jumping to a wrapper which then calls it. Therefore, this macro is
+ * called outside of __UNTRAIN_RET below, for the time being, before the
+ * kernel can support nested alternatives with arbitrary nesting.
+ */
+.macro CALL_UNTRAIN_RET
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
-#define CALL_UNTRAIN_RET "call entry_untrain_ret"
-#else
-#define CALL_UNTRAIN_RET ""
+ ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \
+ "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
#endif
+.endm
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the
@@ -291,8 +300,8 @@
.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY)
VALIDATE_UNRET_END
- ALTERNATIVE_3 "", \
- CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ CALL_UNTRAIN_RET
+ ALTERNATIVE_2 "", \
"call entry_ibpb", \ibpb_feature, \
__stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH
#endif
@@ -351,6 +360,8 @@ extern void retbleed_return_thunk(void);
static inline void retbleed_return_thunk(void) {}
#endif
+extern void srso_alias_untrain_ret(void);
+
#ifdef CONFIG_CPU_SRSO
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index d18e5c332c..1b93ff80b4 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* virt_addr_valid(kaddr) returns true.
*/
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
+static __always_inline void *pfn_to_kaddr(unsigned long pfn)
+{
+ return __va(pfn << PAGE_SHIFT);
+}
+
static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
{
return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 5b4a1ce3d3..36f9057970 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -203,12 +203,12 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
unsigned long npages);
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned long npages);
-void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
void __init __noreturn snp_abort(void);
+void snp_dmi_setup(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
@@ -227,12 +227,12 @@ static inline void __init
early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
static inline void __init
early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
-static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { }
static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
static inline void snp_abort(void) { }
+static inline void snp_dmi_setup(void) { }
static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
return -ENOTTY;
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index a800abb1a9..d8416b3bf8 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -12,11 +12,6 @@
/* image of the saved processor state */
struct saved_context {
- /*
- * On x86_32, all segment registers except gs are saved at kernel
- * entry in pt_regs.
- */
- u16 gs;
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
struct saved_msrs saved_msrs;
@@ -27,6 +22,11 @@ struct saved_context {
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ /*
+ * On x86_32, all segment registers except gs are saved at kernel
+ * entry in pt_regs.
+ */
+ u16 gs;
bool misc_enable_saved;
} __attribute__((packed));
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index ab60a71a8d..472f0263db 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -4,6 +4,7 @@
#include <linux/seqlock.h>
#include <uapi/asm/vsyscall.h>
+#include <asm/page_types.h>
#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
@@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code,
}
#endif
+/*
+ * The (legacy) vsyscall page is the lone page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static inline bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+ return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
+
#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c878616a18..550dcbbbb1 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -30,12 +30,13 @@ struct x86_init_mpparse {
* @reserve_resources: reserve the standard resources for the
* platform
* @memory_setup: platform specific memory setup
- *
+ * @dmi_setup: platform specific DMI setup
*/
struct x86_init_resources {
void (*probe_roms)(void);
void (*reserve_resources)(void);
char *(*memory_setup)(void);
+ void (*dmi_setup)(void);
};
/**
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 8d8752b44f..ff8f25faca 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -20,7 +20,7 @@ bool cpc_supported_by_cpu(void)
(boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
return true;
else if (boot_cpu_data.x86 == 0x17 &&
- boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f)
+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
return true;
return boot_cpu_has(X86_FEATURE_CPPC);
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2055fb308f..77a1ceb717 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1002,11 +1002,11 @@ static bool cpu_has_zenbleed_microcode(void)
u32 good_rev = 0;
switch (boot_cpu_data.x86_model) {
- case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
- case 0x60 ... 0x67: good_rev = 0x0860010b; break;
- case 0x68 ... 0x6f: good_rev = 0x08608105; break;
- case 0x70 ... 0x7f: good_rev = 0x08701032; break;
- case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
default:
return false;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 48d049cd74..01ac18f561 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -422,6 +422,13 @@ static void __init mmio_select_mitigation(void)
if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
boot_cpu_has(X86_FEATURE_RTM)))
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+
+ /*
+ * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based
+ * mitigations, disable KVM-only mitigation in that case.
+ */
+ if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
+ static_branch_disable(&mmio_stale_data_clear);
else
static_branch_enable(&mmio_stale_data_clear);
@@ -474,6 +481,57 @@ static int __init mmio_stale_data_parse_cmdline(char *str)
early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
+
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ return;
+ }
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR)
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ else
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+}
+
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
+}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "" fmt
static void __init md_clear_update_mitigation(void)
@@ -498,11 +556,19 @@ static void __init md_clear_update_mitigation(void)
taa_mitigation = TAA_MITIGATION_VERW;
taa_select_mitigation();
}
- if (mmio_mitigation == MMIO_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ /*
+ * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear
+ * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
mmio_mitigation = MMIO_MITIGATION_VERW;
mmio_select_mitigation();
}
+ if (rfds_mitigation == RFDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_RFDS)) {
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ rfds_select_mitigation();
+ }
out:
if (boot_cpu_has_bug(X86_BUG_MDS))
pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
@@ -512,6 +578,8 @@ out:
pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
pr_info("MMIO Stale Data: Unknown: No mitigations\n");
+ if (boot_cpu_has_bug(X86_BUG_RFDS))
+ pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]);
}
static void __init md_clear_select_mitigation(void)
@@ -519,11 +587,12 @@ static void __init md_clear_select_mitigation(void)
mds_select_mitigation();
taa_select_mitigation();
mmio_select_mitigation();
+ rfds_select_mitigation();
/*
- * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
- * and print their mitigation after MDS, TAA and MMIO Stale Data
- * mitigation selection is done.
+ * As these mitigations are inter-related and rely on VERW instruction
+ * to clear the microarchitectural buffers, update and print their status
+ * after mitigation selection is done for each of these vulnerabilities.
*/
md_clear_update_mitigation();
}
@@ -2612,6 +2681,11 @@ static ssize_t mmio_stale_data_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2771,6 +2845,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_GDS:
return gds_show_state(buf);
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
default:
break;
}
@@ -2845,4 +2922,9 @@ ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 34cac9ea19..97ea52a4e8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1274,6 +1274,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define SRSO BIT(5)
/* CPU is affected by GDS */
#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
@@ -1301,9 +1303,18 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
@@ -1337,6 +1348,24 @@ static bool arch_cap_mmio_immune(u64 ia32_cap)
ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
}
+static bool __init vulnerable_to_rfds(u64 ia32_cap)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (ia32_cap & ARCH_CAP_RFDS_NO)
+ return false;
+
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+ * indicate that mitigation is needed because guest is running on a
+ * vulnerable hardware or may migrate to such hardware:
+ */
+ if (ia32_cap & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -1448,6 +1477,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
boot_cpu_has(X86_FEATURE_AVX))
setup_force_cpu_bug(X86_BUG_GDS);
+ if (vulnerable_to_rfds(ia32_cap))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 19e0681f04..d04371e851 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -231,9 +231,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- union cpuid_0x10_3_eax eax;
- union cpuid_0x10_x_edx edx;
- u32 ebx, ecx, subleaf;
+ u32 eax, ebx, ecx, edx, subleaf;
/*
* Query CPUID_Fn80000020_EDX_x01 for MBA and
@@ -241,9 +239,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
*/
subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
- cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full);
- hw_res->num_closid = edx.split.cos_max + 1;
- r->default_ctrl = MAX_MBA_BW_AMD;
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->default_ctrl = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index a4f1aa15f0..52e7e7deee 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -18,7 +18,6 @@
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
-#define MAX_MBA_BW_AMD 0x800
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
@@ -296,14 +295,10 @@ struct rftype {
* struct mbm_state - status for each MBM counter in each domain
* @prev_bw_bytes: Previous bytes value read for bandwidth calculation
* @prev_bw: The most recent bandwidth in MBps
- * @delta_bw: Difference between the current and previous bandwidth
- * @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 prev_bw_bytes;
u32 prev_bw;
- u32 delta_bw;
- bool delta_comp;
};
/**
@@ -395,6 +390,8 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
+ * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
+ * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -409,6 +406,7 @@ struct rdt_hw_resource {
struct rdt_resource *r);
unsigned int mon_scale;
unsigned int mbm_width;
+ unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f136ac0468..3a6c069614 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -440,9 +440,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
cur_bw = bytes / SZ_1M;
- if (m->delta_comp)
- m->delta_bw = abs(cur_bw - m->prev_bw);
- m->delta_comp = false;
m->prev_bw = cur_bw;
}
@@ -520,11 +517,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
- u32 cur_bw, delta_bw, user_bw;
struct rdt_resource *r_mba;
struct rdt_domain *dom_mba;
struct list_head *head;
struct rdtgroup *entry;
+ u32 cur_bw, user_bw;
if (!is_mbm_local_enabled())
return;
@@ -543,7 +540,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
cur_bw = pmbm_data->prev_bw;
user_bw = dom_mba->mbps_val[closid];
- delta_bw = pmbm_data->delta_bw;
/* MBA resource doesn't support CDP */
cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
@@ -555,49 +551,31 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
list_for_each_entry(entry, head, mon.crdtgrp_list) {
cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
cur_bw += cmbm_data->prev_bw;
- delta_bw += cmbm_data->delta_bw;
}
/*
* Scale up/down the bandwidth linearly for the ctrl group. The
* bandwidth step is the bandwidth granularity specified by the
* hardware.
- *
- * The delta_bw is used when increasing the bandwidth so that we
- * dont alternately increase and decrease the control values
- * continuously.
- *
- * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
- * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
- * switching between 90 and 110 continuously if we only check
- * cur_bw < user_bw.
+ * Always increase throttling if current bandwidth is above the
+ * target set by user.
+ * But avoid thrashing up and down on every poll by checking
+ * whether a decrease in throttling is likely to push the group
+ * back over target. E.g. if currently throttling to 30% of bandwidth
+ * on a system with 10% granularity steps, check whether moving to
+ * 40% would go past the limit by multiplying current bandwidth by
+ * "(30 + 10) / 30".
*/
if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
} else if (cur_msr_val < MAX_MBA_BW &&
- (user_bw > (cur_bw + delta_bw))) {
+ (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
} else {
return;
}
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
-
- /*
- * Delta values are updated dynamically package wise for each
- * rdtgrp every time the throttle MSR changes value.
- *
- * This is because (1)the increase in bandwidth is not perfectly
- * linear and only "approximately" linear even when the hardware
- * says it is linear.(2)Also since MBA is a core specific
- * mechanism, the delta values vary based on number of cores used
- * by the rdtgrp.
- */
- pmbm_data->delta_comp = true;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
- cmbm_data->delta_comp = true;
- }
}
static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
@@ -813,6 +791,12 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return ret;
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
mbm_total_event.configurable = true;
mbm_config_rftype_init("mbm_total_bytes_config");
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 69a1de9238..2b69e560b0 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1620,12 +1620,6 @@ static int mbm_config_write_domain(struct rdt_resource *r,
struct mon_config_info mon_info = {0};
int ret = 0;
- /* mon_config cannot be more than the supported set of events */
- if (val > MAX_EVT_CONFIG_BITS) {
- rdt_last_cmd_puts("Invalid event configuration\n");
- return -EINVAL;
- }
-
/*
* Read the current config value first. If both are the same then
* no need to write it again.
@@ -1663,6 +1657,7 @@ out:
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
struct rdt_domain *d;
@@ -1686,6 +1681,13 @@ next:
return -EINVAL;
}
+ /* Value from user cannot be more than the supported set of events */
+ if ((val & hw_res->mbm_cfg_mask) != val) {
+ rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
+ hw_res->mbm_cfg_mask);
+ return -EINVAL;
+ }
+
list_for_each_entry(d, &r->domains, list) {
if (d->id == dom_id) {
ret = mbm_config_write_domain(r, d, evtid, val);
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
index e963344b04..53935b4d62 100644
--- a/arch/x86/kernel/eisa.c
+++ b/arch/x86/kernel/eisa.c
@@ -2,6 +2,7 @@
/*
* EISA specific code
*/
+#include <linux/cc_platform.h>
#include <linux/ioport.h>
#include <linux/eisa.h>
#include <linux/io.h>
@@ -12,7 +13,7 @@ static __init int eisa_bus_probe(void)
{
void __iomem *p;
- if (xen_pv_domain() && !xen_initial_domain())
+ if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return 0;
p = ioremap(0x0FFFD9, 4);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 117e74c44e..33a214b1a4 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -178,10 +178,11 @@ void fpu__init_cpu_xstate(void)
* Must happen after CR4 setup and before xsetbv() to allow KVM
* lazy passthrough. Write independent of the dynamic state static
* key as that does not work on the boot CPU. This also ensures
- * that any stale state is wiped out from XFD.
+ * that any stale state is wiped out from XFD. Reset the per CPU
+ * xfd cache too.
*/
if (cpu_feature_enabled(X86_FEATURE_XFD))
- wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+ xfd_set_state(init_fpstate.xfd);
/*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 3518fb26d0..19ca623ffa 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -148,20 +148,26 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs
#endif
#ifdef CONFIG_X86_64
+static inline void xfd_set_state(u64 xfd)
+{
+ wrmsrl(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+}
+
static inline void xfd_update_state(struct fpstate *fpstate)
{
if (fpu_state_size_dynamic()) {
u64 xfd = fpstate->xfd;
- if (__this_cpu_read(xfd_state) != xfd) {
- wrmsrl(MSR_IA32_XFD, xfd);
- __this_cpu_write(xfd_state, xfd);
- }
+ if (__this_cpu_read(xfd_state) != xfd)
+ xfd_set_state(xfd);
}
}
extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
+static inline void xfd_set_state(u64 xfd) { }
+
static inline void xfd_update_state(struct fpstate *fpstate) { }
static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a0ce46c0a2..a6a3475e1d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -335,7 +335,16 @@ out:
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
bool *on_func_entry)
{
- if (is_endbr(*(u32 *)addr)) {
+ u32 insn;
+
+ /*
+ * Since 'addr' is not guaranteed to be safe to access, use
+ * copy_from_kernel_nofault() to read the instruction:
+ */
+ if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
+ return NULL;
+
+ if (is_endbr(insn)) {
*on_func_entry = !offset || offset == 4;
if (*on_func_entry)
offset = 4;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b223922248..15c700d358 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -196,12 +196,12 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (!smp_check_mpc(mpc, oem, str))
return 0;
- /* Initialize the lapic mapping */
- if (!acpi_lapic)
- register_lapic_address(mpc->lapic);
-
- if (early)
+ if (early) {
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ register_lapic_address(mpc->lapic);
return 1;
+ }
/* Now process the configuration blocks. */
while (count < mpc->length) {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 3082cf24b6..6da2cfa23c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -636,7 +636,7 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
msgp = nmi_check_stall_msg[idx];
if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
modp = ", but OK because ignore_nmis was set";
- if (nmi_seq & ~0x1)
+ if (nmi_seq & 0x1)
msghp = " (CPU currently in NMI handler function)";
else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
msghp = " (CPU exited one NMI handler function)";
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 319fef37d9..cc2c34ba72 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -203,16 +203,6 @@ void __init probe_roms(void)
unsigned char c;
int i;
- /*
- * The ROM memory range is not part of the e820 table and is therefore not
- * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted
- * memory, and SNP requires encrypted memory to be validated before access.
- * Do that here.
- */
- snp_prep_memory(video_rom_resource.start,
- ((system_rom_resource.end + 1) - video_rom_resource.start),
- SNP_PAGE_STATE_PRIVATE);
-
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1526747bed..b002ebf024 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -9,7 +9,6 @@
#include <linux/console.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
-#include <linux/dmi.h>
#include <linux/efi.h>
#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
@@ -904,7 +903,7 @@ void __init setup_arch(char **cmdline_p)
efi_init();
reserve_ibft_region();
- dmi_setup();
+ x86_init.resources.dmi_setup();
/*
* VMware detection requires dmi to be available, so this
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index ccb0915e84..466fe09898 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -556,9 +556,9 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le
leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
/* Skip post-processing for out-of-range zero leafs. */
- if (!(leaf->fn <= cpuid_std_range_max ||
- (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
- (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+ if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max))))
return 0;
}
@@ -1063,11 +1063,11 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
if (fn->eax_in == 0x0)
- cpuid_std_range_max = fn->eax;
+ RIP_REL_REF(cpuid_std_range_max) = fn->eax;
else if (fn->eax_in == 0x40000000)
- cpuid_hyp_range_max = fn->eax;
+ RIP_REL_REF(cpuid_hyp_range_max) = fn->eax;
else if (fn->eax_in == 0x80000000)
- cpuid_ext_range_max = fn->eax;
+ RIP_REL_REF(cpuid_ext_range_max) = fn->eax;
}
}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index c67285824e..0f58242b54 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -23,6 +23,7 @@
#include <linux/platform_device.h>
#include <linux/io.h>
#include <linux/psp-sev.h>
+#include <linux/dmi.h>
#include <uapi/linux/sev-guest.h>
#include <asm/cpu_entry_area.h>
@@ -748,7 +749,7 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
* This eliminates worries about jump tables or checking boot_cpu_data
* in the cc_platform_has() function.
*/
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
return;
/*
@@ -767,28 +768,13 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
* This eliminates worries about jump tables or checking boot_cpu_data
* in the cc_platform_has() function.
*/
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
return;
/* Ask hypervisor to mark the memory pages shared in the RMP table. */
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
}
-void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
-{
- unsigned long vaddr, npages;
-
- vaddr = (unsigned long)__va(paddr);
- npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
-
- if (op == SNP_PAGE_STATE_PRIVATE)
- early_snp_set_memory_private(vaddr, paddr, npages);
- else if (op == SNP_PAGE_STATE_SHARED)
- early_snp_set_memory_shared(vaddr, paddr, npages);
- else
- WARN(1, "invalid memory op %d\n", op);
-}
-
static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
unsigned long vaddr_end, int op)
{
@@ -2112,6 +2098,17 @@ void __init __noreturn snp_abort(void)
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
}
+/*
+ * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are
+ * enabled, as the alternative (fallback) logic for DMI probing in the legacy
+ * ROM region can cause a crash since this region is not pre-validated.
+ */
+void __init snp_dmi_setup(void)
+{
+ if (efi_enabled(EFI_CONFIG_TABLES))
+ dmi_setup();
+}
+
static void dump_cpuid_table(void)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a37ebd3b47..3f0718b4a7 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -3,6 +3,7 @@
*
* For licencing details see kernel-base/COPYING
*/
+#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/export.h>
@@ -66,6 +67,7 @@ struct x86_init_ops x86_init __initdata = {
.probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = e820__memory_setup_default,
+ .dmi_setup = dmi_setup,
},
.mpparse = {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index dda6fc4cfa..1811a9ddfe 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -679,6 +679,11 @@ void kvm_set_cpu_caps(void)
F(AMX_COMPLEX)
);
+ kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
+ F(INTEL_PSFD) | F(IPRED_CTRL) | F(RRSBA_CTRL) | F(DDPD_U) |
+ F(BHI_CTRL) | F(MCDT_NO)
+ );
+
kvm_cpu_cap_mask(CPUID_D_1_EAX,
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
);
@@ -960,13 +965,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
/* function 7 has additional index. */
case 7:
- entry->eax = min(entry->eax, 1u);
+ max_idx = entry->eax = min(entry->eax, 2u);
cpuid_entry_override(entry, CPUID_7_0_EBX);
cpuid_entry_override(entry, CPUID_7_ECX);
cpuid_entry_override(entry, CPUID_7_EDX);
- /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
- if (entry->eax == 1) {
+ /* KVM only supports up to 0x7.2, capped above via min(). */
+ if (max_idx >= 1) {
entry = do_host_cpuid(array, function, 1);
if (!entry)
goto out;
@@ -976,6 +981,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = 0;
entry->ecx = 0;
}
+ if (max_idx >= 2) {
+ entry = do_host_cpuid(array, function, 2);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_7_2_EDX);
+ entry->ecx = 0;
+ entry->ebx = 0;
+ entry->eax = 0;
+ }
break;
case 0xa: { /* Architectural Performance Monitoring */
union cpuid10_eax eax;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 238afd7335..4943f6b2bb 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2388,7 +2388,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
- eventfd_signal(eventfd, 1);
+ eventfd_signal(eventfd);
return HV_STATUS_SUCCESS;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 245b20973c..23fab75993 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
#include "cpuid.h"
#include "hyperv.h"
#include "smm.h"
@@ -499,8 +500,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
/* Check if there are APF page ready requests pending */
- if (enabled)
+ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index b816506783..aadefcaa95 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -16,6 +16,7 @@ enum kvm_only_cpuid_leafs {
CPUID_7_1_EDX,
CPUID_8000_0007_EDX,
CPUID_8000_0022_EAX,
+ CPUID_7_2_EDX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -46,6 +47,14 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
+#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0)
+#define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1)
+#define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
+#define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3)
+#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
+#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+
/* CPUID level 0x80000007 (EDX). */
#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
@@ -80,6 +89,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
+ [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
};
/*
@@ -106,18 +116,19 @@ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
*/
static __always_inline u32 __feature_translate(int x86_feature)
{
- if (x86_feature == X86_FEATURE_SGX1)
- return KVM_X86_FEATURE_SGX1;
- else if (x86_feature == X86_FEATURE_SGX2)
- return KVM_X86_FEATURE_SGX2;
- else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
- return KVM_X86_FEATURE_SGX_EDECCSSA;
- else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
- return KVM_X86_FEATURE_CONSTANT_TSC;
- else if (x86_feature == X86_FEATURE_PERFMON_V2)
- return KVM_X86_FEATURE_PERFMON_V2;
-
- return x86_feature;
+#define KVM_X86_TRANSLATE_FEATURE(f) \
+ case X86_FEATURE_##f: return KVM_X86_FEATURE_##f
+
+ switch (x86_feature) {
+ KVM_X86_TRANSLATE_FEATURE(SGX1);
+ KVM_X86_TRANSLATE_FEATURE(SGX2);
+ KVM_X86_TRANSLATE_FEATURE(SGX_EDECCSSA);
+ KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC);
+ KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
+ KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
+ default:
+ return x86_feature;
+ }
}
static __always_inline u32 __feature_leaf(int x86_feature)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 6ee925d666..1226bb2151 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -57,7 +57,7 @@ static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = true;
+static bool sev_es_debug_swap_enabled = false;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
#else
#define sev_enabled false
@@ -612,8 +612,11 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- if (sev_es_debug_swap_enabled)
+ if (sev_es_debug_swap_enabled) {
save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+ pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
+ "This will not work starting with Linux 6.10\n");
+ }
pr_debug("Virtual Machine Save Area (VMSA):\n");
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
@@ -1975,20 +1978,22 @@ int sev_mem_enc_register_region(struct kvm *kvm,
goto e_free;
}
- region->uaddr = range->addr;
- region->size = range->size;
-
- list_add_tail(&region->list, &sev->regions_list);
- mutex_unlock(&kvm->lock);
-
/*
* The guest may change the memory encryption attribute from C=0 -> C=1
* or vice versa for this memory range. Lets make sure caches are
* flushed to ensure that guest data gets written into memory with
- * correct C-bit.
+ * correct C-bit. Note, this must be done before dropping kvm->lock,
+ * as region and its array of pages can be freed by a different task
+ * once kvm->lock is released.
*/
sev_clflush_pages(region->pages, region->npages);
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
return ret;
e_free:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 468870450b..365caf7328 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1620,7 +1620,8 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1652,6 +1653,8 @@ static u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_SSB_NO;
if (!boot_cpu_has_bug(X86_BUG_MDS))
data |= ARCH_CAP_MDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ data |= ARCH_CAP_RFDS_NO;
if (!boot_cpu_has(X86_FEATURE_RTM)) {
/*
@@ -7948,6 +7951,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (r < 0)
return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+ * successful, as the old value is written back on failure. Note, for
+ * live migration, this is unnecessarily conservative as CMPXCHG writes
+ * back the original value and the access is atomic, but KVM's ABI is
+ * that all writes are dirty logged, regardless of the value written.
+ */
+ kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
if (r)
return X86EMUL_CMPXCHG_FAILED;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index e53fad915a..c069521f24 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -493,7 +493,7 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
int r;
@@ -2088,7 +2088,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
if (ret < 0 && ret != -ENOTCONN)
return false;
} else {
- eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx);
}
*r = 0;
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index f8f1fe22d0..f5841d9000 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -18,6 +18,7 @@ extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
@@ -36,6 +37,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
const struct kvm_irq_routing_entry *ue);
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The local APIC is being enabled. If the per-vCPU upcall vector is
+ * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+ * interrupt.
+ */
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+ kvm_xen_inject_vcpu_vector(vcpu);
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -101,6 +115,10 @@ static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
}
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 7b2589877d..1e59367b46 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -163,6 +163,7 @@ SYM_CODE_START_NOALIGN(srso_alias_untrain_ret)
lfence
jmp srso_alias_return_thunk
SYM_FUNC_END(srso_alias_untrain_ret)
+__EXPORT_THUNK(srso_alias_untrain_ret)
.popsection
.pushsection .text..__x86.rethunk_safe
@@ -224,10 +225,12 @@ SYM_CODE_START(srso_return_thunk)
SYM_CODE_END(srso_return_thunk)
#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret"
-#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret"
#else /* !CONFIG_CPU_SRSO */
#define JMP_SRSO_UNTRAIN_RET "ud2"
-#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2"
+/* Dummy for the alternative in CALL_UNTRAIN_RET. */
+SYM_CODE_START(srso_alias_untrain_ret)
+ RET
+SYM_FUNC_END(srso_alias_untrain_ret)
#endif /* CONFIG_CPU_SRSO */
#ifdef CONFIG_CPU_UNRET_ENTRY
@@ -319,9 +322,7 @@ SYM_FUNC_END(retbleed_untrain_ret)
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
SYM_FUNC_START(entry_untrain_ret)
- ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \
- JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO, \
- JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS
+ ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 679b09cfe2..d6375b3c63 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -798,15 +798,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
show_opcodes(regs, loglvl);
}
-/*
- * The (legacy) vsyscall page is the long page in the kernel portion
- * of the address space that has user-accessible permissions.
- */
-static bool is_vsyscall_vaddr(unsigned long vaddr)
-{
- return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
-}
-
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 pkey, int si_code)
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
index 6993f026ad..42115ac079 100644
--- a/arch/x86/mm/maccess.c
+++ b/arch/x86/mm/maccess.c
@@ -3,6 +3,8 @@
#include <linux/uaccess.h>
#include <linux/kernel.h>
+#include <asm/vsyscall.h>
+
#ifdef CONFIG_X86_64
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
@@ -16,6 +18,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
return false;
/*
+ * Reading from the vsyscall page may cause an unhandled fault in
+ * certain cases. Though it is at an address above TASK_SIZE_MAX, it is
+ * usually considered as a user space address.
+ */
+ if (is_vsyscall_vaddr(vaddr))
+ return false;
+
+ /*
* Allow everything during early boot before 'x86_virt_bits'
* is initialized. Needed for instruction decoding in early
* exception handlers.
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 70b91de2e0..94cd06d4b0 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -492,6 +492,24 @@ void __init sme_early_init(void)
*/
if (sev_status & MSR_AMD64_SEV_ENABLED)
ia32_disable();
+
+ /*
+ * Override init functions that scan the ROM region in SEV-SNP guests,
+ * as this memory is not pre-validated and would thus cause a crash.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.pci.init_irq = x86_init_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
+
+ /*
+ * DMI setup behavior for SEV-SNP guests depends on
+ * efi_enabled(EFI_CONFIG_TABLES), which hasn't been
+ * parsed yet. snp_dmi_setup() will run after that
+ * parsing has happened.
+ */
+ x86_init.resources.dmi_setup = snp_dmi_setup;
+ }
}
void __init mem_encrypt_free_decrypted_mem(void)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index d73aeb1641..0166ab1780 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -97,7 +97,6 @@ static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
-static char sme_cmdline_off[] __initdata = "off";
static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
@@ -305,7 +304,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* instrumentation or checking boot_cpu_data in the cc_platform_has()
* function.
*/
- if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
+ if (!sme_get_me_mask() ||
+ RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED)
return;
/*
@@ -504,10 +504,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
void __init sme_enable(struct boot_params *bp)
{
- const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+ const char *cmdline_ptr, *cmdline_arg, *cmdline_on;
unsigned int eax, ebx, ecx, edx;
unsigned long feature_mask;
- bool active_by_default;
unsigned long me_mask;
char buffer[16];
bool snp;
@@ -543,11 +542,11 @@ void __init sme_enable(struct boot_params *bp)
me_mask = 1UL << (ebx & 0x3f);
/* Check the SEV MSR whether SEV or SME is enabled */
- sev_status = __rdmsr(MSR_AMD64_SEV);
- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+ RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
- if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED))
snp_abort();
/* Check if memory encryption is enabled */
@@ -573,7 +572,6 @@ void __init sme_enable(struct boot_params *bp)
return;
} else {
/* SEV state cannot be controlled by a command line option */
- sme_me_mask = me_mask;
goto out;
}
@@ -588,31 +586,17 @@ void __init sme_enable(struct boot_params *bp)
asm ("lea sme_cmdline_on(%%rip), %0"
: "=r" (cmdline_on)
: "p" (sme_cmdline_on));
- asm ("lea sme_cmdline_off(%%rip), %0"
- : "=r" (cmdline_off)
- : "p" (sme_cmdline_off));
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
- active_by_default = true;
- else
- active_by_default = false;
cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
((u64)bp->ext_cmd_line_ptr << 32));
- if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
+ if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0 ||
+ strncmp(buffer, cmdline_on, sizeof(buffer)))
return;
- if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
- sme_me_mask = me_mask;
- else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
- sme_me_mask = 0;
- else
- sme_me_mask = active_by_default ? me_mask : 0;
out:
- if (sme_me_mask) {
- physical_mask &= ~sme_me_mask;
- cc_vendor = CC_VENDOR_AMD;
- cc_set_mask(sme_me_mask);
- }
+ RIP_REL_REF(sme_me_mask) = me_mask;
+ physical_mask &= ~me_mask;
+ cc_vendor = CC_VENDOR_AMD;
+ cc_set_mask(me_mask);
}
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index d30949e25e..e701328364 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -653,6 +653,14 @@ static void print_absolute_relocs(void)
if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
continue;
}
+ /*
+ * Do not perform relocations in .notes section; any
+ * values there are meant for pre-boot consumption (e.g.
+ * startup_xen).
+ */
+ if (sec_applies->shdr.sh_type == SHT_NOTE) {
+ continue;
+ }
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4b0d6fff88..1fb9a1644d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -65,6 +65,8 @@ int xen_smp_intr_init(unsigned int cpu)
char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ if (!resched_name)
+ goto fail_mem;
per_cpu(xen_resched_irq, cpu).name = resched_name;
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
@@ -77,6 +79,8 @@ int xen_smp_intr_init(unsigned int cpu)
per_cpu(xen_resched_irq, cpu).irq = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
@@ -90,6 +94,9 @@ int xen_smp_intr_init(unsigned int cpu)
if (!xen_fifo_events) {
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ if (!debug_name)
+ goto fail_mem;
+
per_cpu(xen_debug_irq, cpu).name = debug_name;
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu,
xen_debug_interrupt,
@@ -101,6 +108,9 @@ int xen_smp_intr_init(unsigned int cpu)
}
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
+
per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
@@ -114,6 +124,8 @@ int xen_smp_intr_init(unsigned int cpu)
return 0;
+ fail_mem:
+ rc = -ENOMEM;
fail:
xen_smp_intr_free(cpu);
return rc;