author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:49:45 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 18:49:45 +0000
commit     2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree       848558de17fb3008cdf4d861b01ac7781903ce39 /arch/x86/entry
parent     Initial commit. (diff)
Adding upstream version 6.1.76.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--   arch/x86/entry/Makefile                         23
-rw-r--r--   arch/x86/entry/calling.h                       422
-rw-r--r--   arch/x86/entry/common.c                        415
-rw-r--r--   arch/x86/entry/entry.S                          22
-rw-r--r--   arch/x86/entry/entry_32.S                     1252
-rw-r--r--   arch/x86/entry/entry_64.S                     1530
-rw-r--r--   arch/x86/entry/entry_64_compat.S               279
-rw-r--r--   arch/x86/entry/syscall_32.c                     25
-rw-r--r--   arch/x86/entry/syscall_64.c                     18
-rw-r--r--   arch/x86/entry/syscall_x32.c                    18
-rw-r--r--   arch/x86/entry/syscalls/Makefile                78
-rw-r--r--   arch/x86/entry/syscalls/syscall_32.tbl         457
-rw-r--r--   arch/x86/entry/syscalls/syscall_64.tbl         419
-rw-r--r--   arch/x86/entry/thunk_32.S                       36
-rw-r--r--   arch/x86/entry/thunk_64.S                       52
-rw-r--r--   arch/x86/entry/vdso/.gitignore                   8
-rw-r--r--   arch/x86/entry/vdso/Makefile                   220
-rwxr-xr-x   arch/x86/entry/vdso/checkundef.sh               10
-rw-r--r--   arch/x86/entry/vdso/extable.c                   46
-rw-r--r--   arch/x86/entry/vdso/extable.h                   28
-rw-r--r--   arch/x86/entry/vdso/vclock_gettime.c            85
-rw-r--r--   arch/x86/entry/vdso/vdso-layout.lds.S          111
-rw-r--r--   arch/x86/entry/vdso/vdso-note.S                 15
-rw-r--r--   arch/x86/entry/vdso/vdso.lds.S                  35
-rw-r--r--   arch/x86/entry/vdso/vdso2c.c                   254
-rw-r--r--   arch/x86/entry/vdso/vdso2c.h                   222
-rw-r--r--   arch/x86/entry/vdso/vdso32-setup.c              99
-rw-r--r--   arch/x86/entry/vdso/vdso32/.gitignore            2
-rw-r--r--   arch/x86/entry/vdso/vdso32/note.S               18
-rw-r--r--   arch/x86/entry/vdso/vdso32/sigreturn.S         140
-rw-r--r--   arch/x86/entry/vdso/vdso32/system_call.S        85
-rw-r--r--   arch/x86/entry/vdso/vdso32/vclock_gettime.c     29
-rw-r--r--   arch/x86/entry/vdso/vdso32/vdso32.lds.S         40
-rw-r--r--   arch/x86/entry/vdso/vdsox32.lds.S               27
-rw-r--r--   arch/x86/entry/vdso/vgetcpu.c                   22
-rw-r--r--   arch/x86/entry/vdso/vma.c                      459
-rw-r--r--   arch/x86/entry/vdso/vsgx.S                     151
-rw-r--r--   arch/x86/entry/vsyscall/Makefile                 6
-rw-r--r--   arch/x86/entry/vsyscall/vsyscall_64.c          398
-rw-r--r--   arch/x86/entry/vsyscall/vsyscall_emu_64.S       39
-rw-r--r--   arch/x86/entry/vsyscall/vsyscall_trace.h        30
41 files changed, 7625 insertions, 0 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
new file mode 100644
index 000000000..ca2fe1869
--- /dev/null
+++ b/arch/x86/entry/Makefile
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the x86 low level entry code
+#
+
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_common.o += -fno-stack-protector
+
+obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o
+obj-y += common.o
+
+obj-y += vdso/
+obj-y += vsyscall/
+
+obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
+obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
+obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o
+
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
new file mode 100644
index 000000000..f69076271
--- /dev/null
+++ b/arch/x86/entry/calling.h
@@ -0,0 +1,422 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/ptrace-abi.h>
+#include <asm/msr.h>
+#include <asm/nospec-branch.h>
+
+/*
+
+ x86 function call convention, 64-bit:
+ -------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ ---------------------------------------------------------------------------
+ rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**]
+
+ ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
+ functions when it sees tail-call optimization possibilities) rflags is
+ clobbered. Leftover arguments are passed over the stack frame.)
+
+ [*] In the frame-pointers case rbp is fixed to the stack frame.
+
+ [**] for struct return values wider than 64 bits the return convention is a
+ bit more complex: up to 128 bits width we return small structures
+ straight in rax, rdx. For structures larger than that (3 words or
+ larger) the caller puts a pointer to an on-stack return struct
+ [allocated in the caller's stack frame] into the first argument - i.e.
+ into rdi. All other arguments shift up by one in this case.
+ Fortunately this case is rare in the kernel.
+
+For 32-bit we have the following conventions - kernel is built with
+-mregparm=3 and -freg-struct-return:
+
+ x86 function calling convention, 32-bit:
+ ----------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ -------------------------------------------------------------------------
+ eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**]
+
+ ( here too esp is obviously invariant across normal function calls. eflags
+ is clobbered. Leftover arguments are passed over the stack frame. )
+
+ [*] In the frame-pointers case ebp is fixed to the stack frame.
+
+ [**] We build with -freg-struct-return, which on 32-bit means similar
+ semantics as on 64-bit: edx can be used for a second return value
+ (i.e. covering integer and structure sizes up to 64 bits) - after that
+ it gets more complex and more expensive: 3-word or larger struct returns
+ get done in the caller's frame and the pointer to the return struct goes
+ into regparm0, i.e. eax - the other arguments shift up and the
+ function's register parameters degenerate to regparm=2 in essence.
+
+*/
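
As a quick illustration of the 64-bit convention described above, a plain user-space function with seven integer arguments and a 16-byte struct return compiles exactly as the table says: the first six arguments travel in rdi, rsi, rdx, rcx, r8, r9, the seventh spills to the stack, and the two struct words come back in rax:rdx. This is a minimal sketch for reference only, not part of the patch; the names are made up.

    struct pair { long lo, hi; };                /* 128 bits -> returned in rax:rdx */

    struct pair add7(long a, long b, long c,     /* rdi, rsi, rdx          */
                     long d, long e, long f,     /* rcx, r8,  r9           */
                     long g)                     /* 7th argument: on stack */
    {
            struct pair r = { a + b + c + d + e + f + g, 0 };
            return r;                            /* r.lo in rax, r.hi in rdx */
    }
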
+
+#ifdef CONFIG_X86_64
+
+/*
+ * 64-bit system call stack frame layout defines and helpers,
+ * for assembly code:
+ */
+
+.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
+ .if \save_ret
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
+ movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */
+ .else
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ .endif
+ pushq \rdx /* pt_regs->dx */
+ pushq \rcx /* pt_regs->cx */
+ pushq \rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
+
+ .if \save_ret
+ pushq %rsi /* return address on top of stack */
+ .endif
+.endm
+
+.macro CLEAR_REGS
+ /*
+ * Sanitize registers of values that a speculation attack might
+ * otherwise want to exploit. The lower registers are likely clobbered
+ * well before they could be put to use in a speculative execution
+ * gadget.
+ */
+ xorl %esi, %esi /* nospec si */
+ xorl %edx, %edx /* nospec dx */
+ xorl %ecx, %ecx /* nospec cx */
+ xorl %r8d, %r8d /* nospec r8 */
+ xorl %r9d, %r9d /* nospec r9 */
+ xorl %r10d, %r10d /* nospec r10 */
+ xorl %r11d, %r11d /* nospec r11 */
+ xorl %ebx, %ebx /* nospec rbx */
+ xorl %ebp, %ebp /* nospec rbp */
+ xorl %r12d, %r12d /* nospec r12 */
+ xorl %r13d, %r13d /* nospec r13 */
+ xorl %r14d, %r14d /* nospec r14 */
+ xorl %r15d, %r15d /* nospec r15 */
+
+.endm
+
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
+ PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret
+ CLEAR_REGS
+.endm
+
+.macro POP_REGS pop_rdi=1
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ .if \pop_rdi
+ popq %rdi
+ .endif
+.endm
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
+#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
+#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
+#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
+#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
+
+.macro SET_NOFLUSH_BIT reg:req
+ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+ ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+ /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask \
+ PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * Test if the ASID needs a flush.
+ */
+ movq \scratch_reg, \scratch_reg2
+ andq $(0x7FF), \scratch_reg /* mask ASID */
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ /* Flush needed, clear the bit */
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ movq \scratch_reg2, \scratch_reg
+ jmp .Lwrcr3_pcid_\@
+
+.Lnoflush_\@:
+ movq \scratch_reg2, \scratch_reg
+ SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_pcid_\@:
+ /* Flip the ASID to the user version */
+ orq $(PTI_USER_PCID_MASK), \scratch_reg
+
+.Lwrcr3_\@:
+ /* Flip the PGD to the user version */
+ orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ pushq %rax
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+ popq %rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movq %cr3, \scratch_reg
+ movq \scratch_reg, \save_reg
+ /*
+ * Test the user pagetable bit. If set, then the user page tables
+	 * are active. If clear, CR3 already has the kernel page table
+ * active.
+ */
+ bt $PTI_USER_PGTABLE_BIT, \scratch_reg
+ jnc .Ldone_\@
+
+ ADJUST_KERNEL_CR3 \scratch_reg
+ movq \scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * KERNEL pages can always resume with NOFLUSH as we do
+ * explicit flushes.
+ */
+ bt $PTI_USER_PGTABLE_BIT, \save_reg
+ jnc .Lnoflush_\@
+
+ /*
+ * Check if there's a pending flush for the user ASID we're
+ * about to set.
+ */
+ movq \save_reg, \scratch_reg
+ andq $(0x7FF), \scratch_reg
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jmp .Lwrcr3_\@
+
+.Lnoflush_\@:
+ SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+ /*
+ * The CR3 write could be avoided when not changing its value,
+ * but would require a CR3 read *and* a scratch register.
+ */
+ movq \save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
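
The CR3 arithmetic done by ADJUST_KERNEL_CR3 and SWITCH_TO_USER_CR3_NOSTACK above boils down to flipping the PGD-half bit and the user-PCID bit and optionally setting the NOFLUSH bit. Below is a stand-alone C model of just that bit manipulation, as a sketch only: the bit numbers 11 and 63 are assumed values for X86_CR3_PTI_PCID_USER_BIT and X86_CR3_PCID_NOFLUSH_BIT, the PCID/flush handling is simplified, and this is not kernel code.

    #include <stdint.h>
    #include <stdio.h>

    #define PTI_PGTABLE_BIT  12   /* PAGE_SHIFT: selects the user half of the 8k PGD */
    #define PTI_PCID_BIT     11   /* X86_CR3_PTI_PCID_USER_BIT (assumed value)       */
    #define NOFLUSH_BIT      63   /* X86_CR3_PCID_NOFLUSH_BIT  (assumed value)       */

    static uint64_t to_kernel_cr3(uint64_t cr3)
    {
            cr3 |= 1ULL << NOFLUSH_BIT;          /* PCID case: keep the TLB */
            return cr3 & ~((1ULL << PTI_PGTABLE_BIT) | (1ULL << PTI_PCID_BIT));
    }

    static uint64_t to_user_cr3(uint64_t cr3, int need_flush)
    {
            if (!need_flush)
                    cr3 |= 1ULL << NOFLUSH_BIT;
            return cr3 | (1ULL << PTI_PGTABLE_BIT) | (1ULL << PTI_PCID_BIT);
    }

    int main(void)
    {
            uint64_t kcr3 = 0x1000001ULL;        /* fake kernel PGD + ASID 1 */
            printf("user cr3:   %#llx\n", (unsigned long long)to_user_cr3(kcr3, 0));
            printf("kernel cr3: %#llx\n", (unsigned long long)to_kernel_cr3(kcr3));
            return 0;
    }
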
+
+/*
+ * IBRS kernel mitigation for Spectre_v2.
+ *
+ * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers
+ * the regs it uses (AX, CX, DX). Must be called before the first RET
+ * instruction (NOTE! UNTRAIN_RET includes a RET instruction)
+ *
+ * The optional argument is used to save/restore the current value,
+ * which is used on the paranoid paths.
+ *
+ * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
+ */
+.macro IBRS_ENTER save_reg
+#ifdef CONFIG_CPU_IBRS_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+
+.ifnb \save_reg
+ rdmsr
+ shl $32, %rdx
+ or %rdx, %rax
+ mov %rax, \save_reg
+ test $SPEC_CTRL_IBRS, %eax
+ jz .Ldo_wrmsr_\@
+ lfence
+ jmp .Lend_\@
+.Ldo_wrmsr_\@:
+.endif
+
+ movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ movl %edx, %eax
+ shr $32, %rdx
+ wrmsr
+.Lend_\@:
+#endif
+.endm
+
+/*
+ * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX)
+ * regs. Must be called after the last RET.
+ */
+.macro IBRS_EXIT save_reg
+#ifdef CONFIG_CPU_IBRS_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+
+.ifnb \save_reg
+ mov \save_reg, %rdx
+.else
+ movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ andl $(~SPEC_CTRL_IBRS), %edx
+.endif
+
+ movl %edx, %eax
+ shr $32, %rdx
+ wrmsr
+.Lend_\@:
+#endif
+.endm
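
Both IBRS macros end in a WRMSR, which takes its 64-bit payload split across EDX:EAX; that split is exactly what the shl/shr shuffling above prepares. A small user-space model of the two writes follows, as a sketch only: the MSR number 0x48 and the IBRS bit position are assumed from asm/msr-index.h and asm/nospec-branch.h, and model_wrmsr merely prints, it does not touch a real MSR.

    #include <stdint.h>
    #include <stdio.h>

    #define MSR_IA32_SPEC_CTRL 0x48              /* assumed MSR number */
    #define SPEC_CTRL_IBRS     (1ULL << 0)       /* assumed bit 0      */

    static void model_wrmsr(uint32_t msr, uint64_t val)
    {
            uint32_t eax = (uint32_t)val, edx = (uint32_t)(val >> 32);
            printf("wrmsr %#x <- edx=%08x eax=%08x\n", msr, edx, eax);
    }

    int main(void)
    {
            uint64_t spec_ctrl_current = SPEC_CTRL_IBRS;     /* kernel-mode value */

            model_wrmsr(MSR_IA32_SPEC_CTRL, spec_ctrl_current);                   /* IBRS_ENTER */
            model_wrmsr(MSR_IA32_SPEC_CTRL, spec_ctrl_current & ~SPEC_CTRL_IBRS); /* IBRS_EXIT  */
            return 0;
    }
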
+
+/*
+ * Mitigate Spectre v1 for conditional swapgs code paths.
+ *
+ * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to
+ * prevent a speculative swapgs when coming from kernel space.
+ *
+ * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path,
+ * to prevent the swapgs from getting speculatively skipped when coming from
+ * user space.
+ */
+.macro FENCE_SWAPGS_USER_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER
+.endm
+.macro FENCE_SWAPGS_KERNEL_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL
+.endm
+
+.macro STACKLEAK_ERASE_NOCLOBBER
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ PUSH_AND_CLEAR_REGS
+ call stackleak_erase
+ POP_REGS
+#endif
+.endm
+
+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+ rdgsbase \save_reg
+ GET_PERCPU_BASE \scratch_reg
+ wrgsbase \scratch_reg
+.endm
+
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */
+
+.macro STACKLEAK_ERASE
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ call stackleak_erase
+#endif
+.endm
+
+#ifdef CONFIG_SMP
+
+/*
+ * CPU/node NR is loaded from the limit (size) field of a special segment
+ * descriptor entry in GDT.
+ */
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
+ movq $__CPUNODE_SEG, \reg
+ lsl \reg, \reg
+.endm
+
+/*
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
+ * We normally use %gs for accessing per-CPU data, but we are setting up
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
+ *
+ * Do not use RDPID, because KVM loads guest's TSC_AUX on vm-entry and
+ * may not restore the host's value until the CPU returns to userspace.
+ * Thus the kernel would consume a guest's TSC_AUX if an NMI arrives
+ * while running KVM's run loop.
+ */
+.macro GET_PERCPU_BASE reg:req
+ LOAD_CPU_AND_NODE_SEG_LIMIT \reg
+ andq $VDSO_CPUNODE_MASK, \reg
+ movq __per_cpu_offset(, \reg, 8), \reg
+.endm
+
+#else
+
+.macro GET_PERCPU_BASE reg:req
+ movq pcpu_unit_offsets(%rip), \reg
+.endm
+
+#endif /* CONFIG_SMP */
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 000000000..9c0b26ae5
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * common.c - C code for kernel entry and exit
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * Based on asm and ptrace code by many authors. The code here originated
+ * in ptrace.c and signal.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/entry-common.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/export.h>
+#include <linux/nospec.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+#ifdef CONFIG_XEN_PV
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#endif
+
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/traps.h>
+#include <asm/vdso.h>
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/nospec-branch.h>
+#include <asm/io_bitmap.h>
+#include <asm/syscall.h>
+#include <asm/irq_stack.h>
+
+#ifdef CONFIG_X86_64
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = sys_call_table[unr](regs);
+ return true;
+ }
+ return false;
+}
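
The unsigned-comparison trick in the comment above is worth spelling out: assigning a negative int to an unsigned int wraps it to a huge value, so the single `unr < NR_syscalls` check rejects both out-of-range and negative numbers with one compare. A tiny stand-alone demo, not part of the patch (the NR value is just a stand-in for NR_syscalls):

    #include <stdio.h>

    int main(void)
    {
            const unsigned int NR = 451;         /* stand-in for NR_syscalls   */
            int nr = -1;                         /* "skip the syscall" marker  */
            unsigned int unr = nr;               /* wraps to 0xffffffff        */

            printf("unr = %#x, in range: %d\n", unr, unr < NR);   /* in range: 0 */
            return 0;
    }
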
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call_table[xnr](regs);
+ return true;
+ }
+ return false;
+}
+
+__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+{
+ add_random_kstack_offset();
+ nr = syscall_enter_from_user_mode(regs, nr);
+
+ instrumentation_begin();
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
+ }
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ current_thread_info()->status |= TS_COMPAT;
+
+ return (int)regs->orig_ax;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = true;
+#endif
+
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call_table[unr](regs);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
+ }
+}
+
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
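
The offs/bit arithmetic above indexes the APIC in-service registers: each 32-bit ISR register covers 32 vectors and the registers are spaced 0x10 apart, so vector 0x80 lands in register 0x80/32 = 4 (offset 0x40) at bit 0x80%32 = 0. A compile-time check of that arithmetic, as a sketch with nothing kernel-specific in it:

    #include <assert.h>

    static_assert((0x80 / 32) * 0x10 == 0x40, "vector 0x80: ISR register offset");
    static_assert(0x80 % 32 == 0,             "vector 0x80: bit within that register");
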
+
+/**
+ * int80_emulation - 32-bit legacy syscall entry
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+DEFINE_IDTENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+
+ add_random_kstack_offset();
+ /*
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
+ */
+ nr = syscall_enter_from_user_mode(regs, nr);
+ instrumentation_begin();
+
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif /* !CONFIG_IA32_EMULATION */
+
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int nr = syscall_32_enter(regs);
+ int res;
+
+ add_random_kstack_offset();
+ /*
+ * This cannot use syscall_enter_from_user_mode() as it has to
+ * fetch EBP before invoking any of the syscall entry work
+ * functions.
+ */
+ syscall_enter_from_user_mode_prepare(regs);
+
+ instrumentation_begin();
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+
+ local_irq_disable();
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ return false;
+ }
+
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+ return true;
+}
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
+ if (!__do_fast_syscall_32(regs))
+ return 0;
+
+#ifdef CONFIG_X86_64
+ /*
+ * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
+ * SYSRETL is available on all 64-bit CPUs, so we don't need to
+ * bother with SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ */
+ return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
+#else
+ /*
+ * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ *
+ * We don't allow syscalls at all from VM86 mode, but we still
+ * need to check VM, because we might be returning from sys_vm86.
+ */
+ return static_cpu_has(X86_FEATURE_SEP) &&
+ regs->cs == __USER_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
+#endif
+}
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
+{
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+ regs->sp = regs->bp;
+
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
+ regs->flags |= X86_EFLAGS_IF;
+
+ return do_fast_syscall_32(regs);
+}
+#endif
+
+SYSCALL_DEFINE0(ni_syscall)
+{
+ return -ENOSYS;
+}
+
+#ifdef CONFIG_XEN_PV
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+#else
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+#endif
+
+static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_evtchn_do_upcall();
+
+ set_irq_regs(old_regs);
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ bool inhcall;
+
+ instrumentation_begin();
+ run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
+
+ inhcall = get_and_clear_inhcall();
+ if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+ irqentry_exit_cond_resched();
+ instrumentation_end();
+ restore_inhcall(inhcall);
+ } else {
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ }
+}
+#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
new file mode 100644
index 000000000..bfb7bcb36
--- /dev/null
+++ b/arch/x86/entry/entry.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common place for both 32- and 64-bit entry routines.
+ */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+#include <asm/msr-index.h>
+
+.pushsection .noinstr.text, "ax"
+
+SYM_FUNC_START(entry_ibpb)
+ movl $MSR_IA32_PRED_CMD, %ecx
+ movl $PRED_CMD_IBPB, %eax
+ xorl %edx, %edx
+ wrmsr
+ RET
+SYM_FUNC_END(entry_ibpb)
+/* For KVM */
+EXPORT_SYMBOL_GPL(entry_ibpb);
+
+.popsection
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
new file mode 100644
index 000000000..e309e7156
--- /dev/null
+++ b/arch/x86/entry/entry_32.S
@@ -0,0 +1,1252 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1991,1992 Linus Torvalds
+ *
+ * entry_32.S contains the system-call and low-level fault and trap handling routines.
+ *
+ * Stack layout while running C code:
+ * ptrace needs to have all registers on the stack.
+ * If the order here is changed, it needs to be
+ * updated in fork.c:copy_process(), signal.c:do_signal(),
+ * ptrace.c and ptrace.h
+ *
+ * 0(%esp) - %ebx
+ * 4(%esp) - %ecx
+ * 8(%esp) - %edx
+ * C(%esp) - %esi
+ * 10(%esp) - %edi
+ * 14(%esp) - %ebp
+ * 18(%esp) - %eax
+ * 1C(%esp) - %ds
+ * 20(%esp) - %es
+ * 24(%esp) - %fs
+ * 28(%esp) - unused -- was %gs on old stackprotector kernels
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
+ */
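
The offsets in the table above follow from packing 17 four-byte slots back to back. A quick user-space cross-check with a mock layout, as a sketch (this is a local mock for illustration, not the kernel's struct pt_regs; the gs slot is the unused one noted above):

    #include <stdint.h>
    #include <stddef.h>
    #include <assert.h>

    struct mock_pt_regs_32 {
            uint32_t bx, cx, dx, si, di, bp, ax;         /* 0x00 .. 0x18 */
            uint32_t ds, es, fs, gs;                     /* 0x1C .. 0x28 */
            uint32_t orig_ax, ip, cs, flags, sp, ss;     /* 0x2C .. 0x40 */
    };

    static_assert(offsetof(struct mock_pt_regs_32, si) == 0x0C,      "esi at C(%esp)");
    static_assert(offsetof(struct mock_pt_regs_32, orig_ax) == 0x2C, "orig_eax at 2C(%esp)");
    static_assert(offsetof(struct mock_pt_regs_32, ss) == 0x40,      "oldss at 40(%esp)");
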
+
+#include <linux/linkage.h>
+#include <linux/err.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/frame.h>
+#include <asm/trapnr.h>
+#include <asm/nospec-branch.h>
+
+#include "calling.h"
+
+ .section .entry.text, "ax"
+
+#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
+
+/* Unconditionally switch to user cr3 */
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ movl %cr3, \scratch_reg
+ orl $PTI_SWITCH_MASK, \scratch_reg
+ movl \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro BUG_IF_WRONG_CR3 no_user_check=0
+#ifdef CONFIG_DEBUG_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ .if \no_user_check == 0
+ /* coming from usermode? */
+ testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp)
+ jz .Lend_\@
+ .endif
+ /* On user-cr3? */
+ movl %cr3, %eax
+ testl $PTI_SWITCH_MASK, %eax
+ jnz .Lend_\@
+ /* From userspace with kernel cr3 - BUG */
+ ud2
+.Lend_\@:
+#endif
+.endm
+
+/*
+ * Switch to kernel cr3 if not already loaded and return current cr3 in
+ * \scratch_reg
+ */
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ movl %cr3, \scratch_reg
+ /* Test if we are already on kernel CR3 */
+ testl $PTI_SWITCH_MASK, \scratch_reg
+ jz .Lend_\@
+ andl $(~PTI_SWITCH_MASK), \scratch_reg
+ movl \scratch_reg, %cr3
+ /* Return original CR3 in \scratch_reg */
+ orl $PTI_SWITCH_MASK, \scratch_reg
+.Lend_\@:
+.endm
+
+#define CS_FROM_ENTRY_STACK (1 << 31)
+#define CS_FROM_USER_CR3 (1 << 30)
+#define CS_FROM_KERNEL (1 << 29)
+#define CS_FROM_ESPFIX (1 << 28)
+
+.macro FIXUP_FRAME
+ /*
+ * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+ * Clear them in case hardware didn't do this for us.
+ */
+ andl $0x0000ffff, 4*4(%esp)
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, 5*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+#endif
+ testl $USER_SEGMENT_RPL_MASK, 4*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+
+ orl $CS_FROM_KERNEL, 4*4(%esp)
+
+ /*
+ * When we're here from kernel mode, the (exception) stack looks like:
+ *
+ * 6*4(%esp) - <previous context>
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ *
+ * Let's build a 5-entry IRET frame after that, such that struct pt_regs
+ * is complete and in particular regs->sp is correct. This gives us
+ * the original 6 entries as gap:
+ *
+ * 14*4(%esp) - <previous context>
+ * 13*4(%esp) - gap / flags
+ * 12*4(%esp) - gap / cs
+ * 11*4(%esp) - gap / ip
+ * 10*4(%esp) - gap / orig_eax
+ * 9*4(%esp) - gap / gs / function
+ * 8*4(%esp) - gap / fs
+ * 7*4(%esp) - ss
+ * 6*4(%esp) - sp
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ */
+
+ pushl %ss # ss
+ pushl %esp # sp (points at ss)
+ addl $7*4, (%esp) # point sp back at the previous context
+ pushl 7*4(%esp) # flags
+ pushl 7*4(%esp) # cs
+ pushl 7*4(%esp) # ip
+ pushl 7*4(%esp) # orig_eax
+ pushl 7*4(%esp) # gs / function
+ pushl 7*4(%esp) # fs
+.Lfrom_usermode_no_fixup_\@:
+.endm
+
+.macro IRET_FRAME
+ /*
+ * We're called with %ds, %es, %fs, and %gs from the interrupted
+ * frame, so we shouldn't use them. Also, we may be in ESPFIX
+ * mode and therefore have a nonzero SS base and an offset ESP,
+ * so any attempt to access the stack needs to use SS. (except for
+ * accesses through %esp, which automatically use SS.)
+ */
+ testl $CS_FROM_KERNEL, 1*4(%esp)
+ jz .Lfinished_frame_\@
+
+ /*
+ * Reconstruct the 3 entry IRET frame right after the (modified)
+ * regs->sp without lowering %esp in between, such that an NMI in the
+ * middle doesn't scribble our stack.
+ */
+ pushl %eax
+ pushl %ecx
+ movl 5*4(%esp), %eax # (modified) regs->sp
+
+ movl 4*4(%esp), %ecx # flags
+ movl %ecx, %ss:-1*4(%eax)
+
+ movl 3*4(%esp), %ecx # cs
+ andl $0x0000ffff, %ecx
+ movl %ecx, %ss:-2*4(%eax)
+
+ movl 2*4(%esp), %ecx # ip
+ movl %ecx, %ss:-3*4(%eax)
+
+ movl 1*4(%esp), %ecx # eax
+ movl %ecx, %ss:-4*4(%eax)
+
+ popl %ecx
+ lea -4*4(%eax), %esp
+ popl %eax
+.Lfinished_frame_\@:
+.endm
+
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
+ cld
+.if \skip_gs == 0
+ pushl $0
+.endif
+ pushl %fs
+
+ pushl %eax
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+.if \unwind_espfix > 0
+ UNWIND_ESPFIX_STACK
+.endif
+ popl %eax
+
+ FIXUP_FRAME
+ pushl %es
+ pushl %ds
+ pushl \pt_regs_ax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ /* Switch to kernel stack if necessary */
+.if \switch_stacks > 0
+ SWITCH_TO_KERNEL_STACK
+.endif
+.endm
+
+.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
+ SAVE_ALL unwind_espfix=\unwind_espfix
+
+ BUG_IF_WRONG_CR3
+
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We can enter with either user or kernel cr3, the code will
+ * store the old cr3 in \cr3_reg and switches to the kernel cr3
+ * if necessary.
+ */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
+
+.Lend_\@:
+.endm
+
+.macro RESTORE_INT_REGS
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+.endm
+
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+2: popl %es
+3: popl %fs
+4: addl $(4 + \pop), %esp /* pop the unused "gs" slot */
+ IRET_FRAME
+
+ /*
+ * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is
+ * ASM the registers are known and we can trivially hard-code them.
+ */
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS)
+ _ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES)
+ _ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS)
+.endm
+
+.macro RESTORE_ALL_NMI cr3_reg:req pop=0
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We enter with kernel cr3 and switch the cr3 to the value
+ * stored on \cr3_reg, which is either a user or a kernel cr3.
+ */
+ ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
+
+ testl $PTI_SWITCH_MASK, \cr3_reg
+ jz .Lswitched_\@
+
+ /* User cr3 in \cr3_reg - write it to hardware cr3 */
+ movl \cr3_reg, %cr3
+
+.Lswitched_\@:
+
+ BUG_IF_WRONG_CR3
+
+ RESTORE_REGS pop=\pop
+.endm
+
+.macro CHECK_AND_APPLY_ESPFIX
+#ifdef CONFIG_X86_ESPFIX32
+#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET
+
+ ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
+
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /*
+ * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ * are returning to the kernel.
+ * See comments in process.c:copy_thread() for details.
+ */
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ jne .Lend_\@ # returning to user-space with LDT SS
+
+ /*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /*
+ * Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the IRET:
+ */
+ cli
+ lss (%esp), %esp /* switch to espfix segment */
+.Lend_\@:
+#endif /* CONFIG_X86_ESPFIX32 */
+.endm
+
+/*
+ * Called with pt_regs fully populated and kernel segments loaded,
+ * so we can access PER_CPU and use the integer registers.
+ *
+ * We need to be very careful here with the %esp switch, because an NMI
+ * can happen everywhere. If the NMI handler finds itself on the
+ * entry-stack, it will overwrite the task-stack and everything we
+ * copied there. So allocate the stack-frame on the task-stack and
+ * switch to it before we do any copying.
+ */
+
+.macro SWITCH_TO_KERNEL_STACK
+
+ BUG_IF_WRONG_CR3
+
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+
+ /*
+ * %eax now contains the entry cr3 and we carry it forward in
+ * that register for the time this macro runs
+ */
+
+ /* Are we on the entry stack? Bail out if not! */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %esp, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jae .Lend_\@
+
+ /* Load stack pointer into %esi and %edi */
+ movl %esp, %esi
+ movl %esi, %edi
+
+ /* Move %edi to the top of the entry stack */
+ andl $(MASK_entry_stack), %edi
+ addl $(SIZEOF_entry_stack), %edi
+
+ /* Load top of task-stack into %edi */
+ movl TSS_entry2task_stack(%edi), %edi
+
+ /* Special case - entry from kernel mode via entry stack */
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
+ movb PT_CS(%esp), %cl
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
+#else
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+#endif
+ cmpl $USER_RPL, %ecx
+ jb .Lentry_from_kernel_\@
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esi)
+ jz .Lcopy_pt_regs_\@
+
+ /*
+ * Stack-frame contains 4 additional segment registers when
+ * coming from VM86 mode
+ */
+ addl $(4 * 4), %ecx
+
+#endif
+.Lcopy_pt_regs_\@:
+
+ /* Allocate frame on task-stack */
+ subl %ecx, %edi
+
+ /* Switch to task-stack */
+ movl %edi, %esp
+
+ /*
+ * We are now on the task-stack and can safely copy over the
+ * stack-frame
+ */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ jmp .Lend_\@
+
+.Lentry_from_kernel_\@:
+
+ /*
+ * This handles the case when we enter the kernel from
+ * kernel-mode and %esp points to the entry-stack. When this
+ * happens we need to switch to the task-stack to run C code,
+ * but switch back to the entry-stack again when we approach
+ * iret and return to the interrupted code-path. This usually
+ * happens when we hit an exception while restoring user-space
+ * segment registers on the way back to user-space or when the
+ * sysenter handler runs with eflags.tf set.
+ *
+ * When we switch to the task-stack here, we can't trust the
+ * contents of the entry-stack anymore, as the exception handler
+ * might be scheduled out or moved to another CPU. Therefore we
+ * copy the complete entry-stack to the task-stack and set a
+ * marker in the iret-frame (bit 31 of the CS dword) to detect
+ * what we've done on the iret path.
+ *
+ * On the iret path we copy everything back and switch to the
+ * entry-stack, so that the interrupted kernel code-path
+ * continues on the same stack it was interrupted with.
+ *
+ * Be aware that an NMI can happen anytime in this code.
+ *
+ * %esi: Entry-Stack pointer (same as %esp)
+ * %edi: Top of the task stack
+ * %eax: CR3 on kernel entry
+ */
+
+ /* Calculate number of bytes on the entry stack in %ecx */
+ movl %esi, %ecx
+
+ /* %ecx to the top of entry-stack */
+ andl $(MASK_entry_stack), %ecx
+ addl $(SIZEOF_entry_stack), %ecx
+
+ /* Number of bytes on the entry stack to %ecx */
+ sub %esi, %ecx
+
+ /* Mark stackframe as coming from entry stack */
+ orl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+
+ /*
+ * Test the cr3 used to enter the kernel and add a marker
+ * so that we can switch back to it before iret.
+ */
+ testl $PTI_SWITCH_MASK, %eax
+ jz .Lcopy_pt_regs_\@
+ orl $CS_FROM_USER_CR3, PT_CS(%esp)
+
+ /*
+ * %esi and %edi are unchanged, %ecx contains the number of
+ * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
+ * the stack-frame on task-stack and copy everything over
+ */
+ jmp .Lcopy_pt_regs_\@
+
+.Lend_\@:
+.endm
+
+/*
+ * Switch back from the kernel stack to the entry stack.
+ *
+ * The %esp register must point to pt_regs on the task stack. It will
+ * first calculate the size of the stack-frame to copy, depending on
+ * whether we return to VM86 mode or not. With that it uses 'rep movsl'
+ * to copy the contents of the stack over to the entry stack.
+ *
+ * We must be very careful here, as we can't trust the contents of the
+ * task-stack once we switched to the entry-stack. When an NMI happens
+ * while on the entry-stack, the NMI handler will switch back to the top
+ * of the task stack, overwriting our stack-frame we are about to copy.
+ * Therefore we switch the stack only after everything is copied over.
+ */
+.macro SWITCH_TO_ENTRY_STACK
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp)
+ jz .Lcopy_pt_regs_\@
+
+ /* Additional 4 registers to copy when returning to VM86 mode */
+ addl $(4 * 4), %ecx
+
+.Lcopy_pt_regs_\@:
+#endif
+
+ /* Initialize source and destination for movsl */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+ subl %ecx, %edi
+ movl %esp, %esi
+
+ /* Save future stack pointer in %ebx */
+ movl %edi, %ebx
+
+ /* Copy over the stack-frame */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /*
+ * Switch to entry-stack - needs to happen after everything is
+ * copied because the NMI handler will overwrite the task-stack
+ * when on entry-stack
+ */
+ movl %ebx, %esp
+
+.Lend_\@:
+.endm
+
+/*
+ * This macro handles the case when we return to kernel-mode on the iret
+ * path and have to switch back to the entry stack and/or user-cr3
+ *
+ * See the comments below the .Lentry_from_kernel_\@ label in the
+ * SWITCH_TO_KERNEL_STACK macro for more details.
+ */
+.macro PARANOID_EXIT_TO_KERNEL_MODE
+
+ /*
+ * Test if we entered the kernel with the entry-stack. Most
+ * likely we did not, because this code only runs on the
+ * return-to-kernel path.
+ */
+ testl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Unlikely slow-path */
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
+
+ /* Copy the remaining task-stack contents to entry-stack */
+ movl %esp, %esi
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+
+ /* Bytes on the task-stack to ecx */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+ subl %esi, %ecx
+
+ /* Allocate stack-frame on entry-stack */
+ subl %ecx, %edi
+
+ /*
+ * Save future stack-pointer, we must not switch until the
+ * copy is done, otherwise the NMI handler could destroy the
+ * contents of the task-stack we are about to copy.
+ */
+ movl %edi, %ebx
+
+ /* Do the copy */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /* Safe to switch to entry-stack now */
+ movl %ebx, %esp
+
+ /*
+ * We came from entry-stack and need to check if we also need to
+ * switch back to user cr3.
+ */
+ testl $CS_FROM_USER_CR3, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_USER_CR3), PT_CS(%esp)
+
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+.Lend_\@:
+.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushl $0 /* Clear the error code */
+ .endif
+
+ /* Push the C-function address into the GS slot */
+ pushl $\cfunc
+ /* Invoke the common exception entry */
+ jmp handle_exception
+SYM_CODE_END(\asmsym)
+.endm
+
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+ ASM_CLAC
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
+ call \cfunc
+ jmp handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in struct inactive_task_frame
+ */
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ /*
+ * Flags are saved to prevent AC leakage. This could go
+ * away if objtool had 32-bit support to verify
+ * the STAC/CLAC correctness.
+ */
+ pushfl
+
+ /* switch stack */
+ movl %esp, TASK_threadsp(%eax)
+ movl TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_STACKPROTECTOR
+ movl TASK_stack_canary(%edx), %ebx
+ movl %ebx, PER_CPU_VAR(__stack_chk_guard)
+#endif
+
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+
+ /* Restore flags of the incoming task to restore AC state. */
+ popfl
+ /* restore callee-saved registers */
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ jmp __switch_to
+SYM_CODE_END(__switch_to_asm)
+.popsection
+
+/*
+ * The unwinder expects the last frame on the stack to always be at the same
+ * offset from the end of the page, which allows it to validate the stack.
+ * Calling schedule_tail() directly would break that convention because its an
+ * asmlinkage function so its argument has to be pushed on the stack. This
+ * wrapper creates a proper "end of stack" frame header before the call.
+ */
+.pushsection .text, "ax"
+SYM_FUNC_START(schedule_tail_wrapper)
+ FRAME_BEGIN
+
+ pushl %eax
+ call schedule_tail
+ popl %eax
+
+ FRAME_END
+ RET
+SYM_FUNC_END(schedule_tail_wrapper)
+.popsection
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(ret_from_fork)
+ call schedule_tail_wrapper
+
+ testl %ebx, %ebx
+ jnz 1f /* kernel threads are uncommon */
+
+2:
+ /* When we fork, we trace the syscall return in the child, too. */
+ movl %esp, %eax
+ call syscall_exit_to_user_mode
+ jmp .Lsyscall_32_done
+
+ /* kernel thread */
+1: movl %edi, %eax
+ CALL_NOSPEC ebx
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the execve()
+ * syscall.
+ */
+ movl $0, PT_EAX(%esp)
+ jmp 2b
+SYM_CODE_END(ret_from_fork)
+.popsection
+
+SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
+/*
+ * All code from here through __end_SYSENTER_singlestep_region is subject
+ * to being single-stepped if a user program sets TF and executes SYSENTER.
+ * There is absolutely nothing that we can do to prevent this from happening
+ * (thanks Intel!). To keep our handling of this situation as simple as
+ * possible, we handle TF just like AC and NT, except that our #DB handler
+ * will ignore all of the single-step traps generated in this range.
+ */
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * if X86_FEATURE_SEP is available. This is the preferred system call
+ * entry on 32-bit systems.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old EIP (!!!), ESP, or EFLAGS.
+ *
+ * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
+ * user and/or vm86 state), we explicitly disable the SYSENTER
+ * instruction in vm86 mode by reprogramming the MSRs.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ */
+SYM_FUNC_START(entry_SYSENTER_32)
+ /*
+ * On entry-stack with all userspace-regs live - save and
+ * restore eflags and %eax to use it as scratch-reg for the cr3
+ * switch.
+ */
+ pushfl
+ pushl %eax
+ BUG_IF_WRONG_CR3 no_user_check=1
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+ popl %eax
+ popfl
+
+ /* Stack empty again, switch to task stack */
+ movl TSS_entry2task_stack(%esp), %esp
+
+.Lsysenter_past_esp:
+ pushl $__USER_DS /* pt_regs->ss */
+ pushl $0 /* pt_regs->sp (placeholder) */
+ pushfl /* pt_regs->flags (except IF = 0) */
+ pushl $__USER_CS /* pt_regs->cs */
+ pushl $0 /* pt_regs->ip = 0 (placeholder) */
+ pushl %eax /* pt_regs->orig_ax */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
+
+ /*
+ * SYSENTER doesn't filter flags, so we need to clear NT, AC
+ * and TF ourselves. To save a few cycles, we can check whether
+ * either was set instead of doing an unconditional popfl.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+ * single-stepping in general. This allows us to avoid having
+ * more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
+ */
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
+ movl %esp, %eax
+ call do_SYSENTER_32
+ testl %eax, %eax
+ jz .Lsyscall_32_done
+
+ STACKLEAK_ERASE
+
+ /* Opportunistic SYSEXIT */
+
+ /*
+ * Setup entry stack - we keep the pointer in %eax and do the
+ * switch after almost all user-state is restored.
+ */
+
+ /* Load entry stack pointer and allocate frame for eflags/eax */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
+ subl $(2*4), %eax
+
+ /* Copy eflags and eax to entry stack */
+ movl PT_EFLAGS(%esp), %edi
+ movl PT_EAX(%esp), %esi
+ movl %edi, (%eax)
+ movl %esi, 4(%eax)
+
+ /* Restore user registers and segments */
+ movl PT_EIP(%esp), %edx /* pt_regs->ip */
+ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
+1: mov PT_FS(%esp), %fs
+
+ popl %ebx /* pt_regs->bx */
+ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+
+ /* Switch to entry stack */
+ movl %eax, %esp
+
+ /* Now ready to switch the cr3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+ /*
+ * Restore all flags except IF. (We restore IF separately because
+ * STI gives a one-instruction window in which we won't be interrupted,
+ * whereas POPF does not.)
+ */
+ btrl $X86_EFLAGS_IF_BIT, (%esp)
+ BUG_IF_WRONG_CR3 no_user_check=1
+ popfl
+ popl %eax
+
+ /*
+ * Return back to the vDSO, which will pop ecx and edx.
+ * Don't bother with DS and ES (they already contain __USER_DS).
+ */
+ sti
+ sysexit
+
+2: movl $0, PT_FS(%esp)
+ jmp 1b
+ _ASM_EXTABLE(1b, 2b)
+
+.Lsysenter_fix_flags:
+ pushl $X86_EFLAGS_FIXED
+ popfl
+ jmp .Lsysenter_flags_fixed
+SYM_ENTRY(__end_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
+SYM_FUNC_END(entry_SYSENTER_32)
+
+/*
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction. INT $0x80 lands here.
+ *
+ * This entry point can be used by any 32-bit program to perform system calls.
+ * Instances of INT $0x80 can be found inline in various programs and
+ * libraries. It is also used by the vDSO's __kernel_vsyscall
+ * fallback for hardware that doesn't support a faster entry method.
+ * Restarted 32-bit system calls also fall back to INT $0x80
+ * regardless of what instruction was originally used to do the system
+ * call. (64-bit programs can use INT $0x80 as well, but they can
+ * only run on 64-bit kernels and therefore land in
+ * entry_INT80_compat.)
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6
+ */
+SYM_FUNC_START(entry_INT80_32)
+ ASM_CLAC
+ pushl %eax /* pt_regs->orig_ax */
+
+ SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
+
+ movl %esp, %eax
+ call do_int80_syscall_32
+.Lsyscall_32_done:
+ STACKLEAK_ERASE
+
+restore_all_switch_stack:
+ SWITCH_TO_ENTRY_STACK
+ CHECK_AND_APPLY_ESPFIX
+
+ /* Switch back to user CR3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+ BUG_IF_WRONG_CR3
+
+ /* Restore user state */
+ RESTORE_REGS pop=4 # skip orig_eax/error_code
+.Lirq_return:
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
+ * when returning from IPI handler and when returning from
+ * scheduler to user-space.
+ */
+ iret
+
+.Lasm_iret_error:
+ pushl $0 # no error code
+ pushl $iret_error
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * The stack-frame here is the one that iret faulted on, so it's a
+ * return-to-user frame. We are on kernel-cr3 because we come here from
+ * the fixup code. This confuses the CR3 checker, so switch to user-cr3
+ * as the checker expects it.
+ */
+ pushl %eax
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+ popl %eax
+#endif
+
+ jmp handle_exception
+
+ _ASM_EXTABLE(.Lirq_return, .Lasm_iret_error)
+SYM_FUNC_END(entry_INT80_32)
+
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back from the ESPFIX stack to the normal zero-based stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and switches to the
+ * normal stack and adjusts ESP with the matching offset.
+ *
+ * We might be on user CR3 here, so percpu data is not mapped and we can't
+ * access the GDT through the percpu segment. Instead, use SGDT to find
+ * the cpu_entry_area alias of the GDT.
+ */
+#ifdef CONFIG_X86_ESPFIX32
+ /* fixup the stack */
+ pushl %ecx
+ subl $2*4, %esp
+ sgdt (%esp)
+ movl 2(%esp), %ecx /* GDT address */
+ /*
+ * Careful: ECX is a linear pointer, so we need to force base
+ * zero. %cs is the only known-linear segment we have right now.
+ */
+ mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */
+ mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */
+ shl $16, %eax
+ addl $2*4, %esp
+ popl %ecx
+ addl %esp, %eax /* the adjusted stack pointer */
+ pushl $__KERNEL_DS
+ pushl %eax
+ lss (%esp), %esp /* switch to the normal stack segment */
+#endif
+.endm
+
+.macro UNWIND_ESPFIX_STACK
+ /* It's safe to clobber %eax, all other regs need to be preserved */
+#ifdef CONFIG_X86_ESPFIX32
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne .Lno_fixup_\@
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+.Lno_fixup_\@:
+#endif
+.endm
+
+SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
+ /* the function address is in %gs's slot on the stack */
+ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
+ ENCODE_FRAME_POINTER
+
+ movl PT_GS(%esp), %edi # get the function address
+
+ /* fixup orig %eax */
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+
+ movl %esp, %eax # pt_regs pointer
+ CALL_NOSPEC edi
+
+handle_exception_return:
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+ /*
+	 * We can be coming here from a child spawned by kernel_thread().
+ */
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax # returning to v8086 or userspace ?
+ jnb ret_to_user
+
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
+
+ret_to_user:
+ movl %esp, %eax
+ jmp restore_all_switch_stack
+SYM_CODE_END(handle_exception)
+
+SYM_CODE_START(asm_exc_double_fault)
+1:
+ /*
+ * This is a task gate handler, not an interrupt gate handler.
+ * The error code is on the stack, but the stack is otherwise
+ * empty. Interrupts are off. Our state is sane with the following
+ * exceptions:
+ *
+ * - CR0.TS is set. "TS" literally means "task switched".
+ * - EFLAGS.NT is set because we're a "nested task".
+ * - The doublefault TSS has back_link set and has been marked busy.
+ * - TR points to the doublefault TSS and the normal TSS is busy.
+ * - CR3 is the normal kernel PGD. This would be delightful, except
+ * that the CPU didn't bother to save the old CR3 anywhere. This
+ * would make it very awkward to return back to the context we came
+ * from.
+ *
+ * The rest of EFLAGS is sanitized for us, so we don't need to
+ * worry about AC or DF.
+ *
+ * Don't even bother popping the error code. It's always zero,
+ * and ignoring it makes us a bit more robust against buggy
+ * hypervisor task gate implementations.
+ *
+ * We will manually undo the task switch instead of doing a
+ * task-switching IRET.
+ */
+
+ clts /* clear CR0.TS */
+ pushl $X86_EFLAGS_FIXED
+ popfl /* clear EFLAGS.NT */
+
+ call doublefault_shim
+
+ /* We don't support returning, so we have no IRET here. */
+1:
+ hlt
+ jmp 1b
+SYM_CODE_END(asm_exc_double_fault)
+
+/*
+ * NMI is doubly nasty. It can happen on the first instruction of
+ * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
+ * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
+ * switched stacks. We handle both conditions by simply checking whether we
+ * interrupted kernel code running on the SYSENTER stack.
+ */
+SYM_CODE_START(asm_exc_nmi)
+ ASM_CLAC
+
+#ifdef CONFIG_X86_ESPFIX32
+ /*
+ * ESPFIX_SS is only ever set on the return to user path
+ * after we've switched to the entry stack.
+ */
+ pushl %eax
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ je .Lnmi_espfix_stack
+#endif
+
+ pushl %eax # pt_regs->orig_ax
+ SAVE_ALL_NMI cr3_reg=%edi
+ ENCODE_FRAME_POINTER
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+
+ /* Are we currently on the SYSENTER stack? */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jb .Lnmi_from_sysenter_stack
+
+ /* Not on SYSENTER stack. */
+ call exc_nmi
+ jmp .Lnmi_return
+
+.Lnmi_from_sysenter_stack:
+ /*
+ * We're on the SYSENTER stack. Switch off. No one (not even debug)
+ * is using the thread stack right now, so it's safe for us to use it.
+ */
+ movl %esp, %ebx
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
+ call exc_nmi
+ movl %ebx, %esp
+
+.Lnmi_return:
+#ifdef CONFIG_X86_ESPFIX32
+ testl $CS_FROM_ESPFIX, PT_CS(%esp)
+ jnz .Lnmi_from_espfix
+#endif
+
+ CHECK_AND_APPLY_ESPFIX
+ RESTORE_ALL_NMI cr3_reg=%edi pop=4
+ jmp .Lirq_return
+
+#ifdef CONFIG_X86_ESPFIX32
+.Lnmi_espfix_stack:
+ /*
+	 * Build the SS:ESP far pointer used by LSS below to switch back
+ */
+ pushl %ss
+ pushl %esp
+ addl $4, (%esp)
+
+ /* Copy the (short) IRET frame */
+ pushl 4*4(%esp) # flags
+ pushl 4*4(%esp) # cs
+ pushl 4*4(%esp) # ip
+
+ pushl %eax # orig_ax
+
+ SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
+ ENCODE_FRAME_POINTER
+
+ /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
+ xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)
+
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ jmp .Lnmi_from_sysenter_stack
+
+.Lnmi_from_espfix:
+ RESTORE_ALL_NMI cr3_reg=%edi
+ /*
+ * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
+ * fix up the gap and long frame:
+ *
+ * 3 - original frame (exception)
+ * 2 - ESPFIX block (above)
+ * 6 - gap (FIXUP_FRAME)
+ * 5 - long frame (FIXUP_FRAME)
+ * 1 - orig_ax
+ */
+ lss (1+5+6)*4(%esp), %esp # back to espfix stack
+ jmp .Lirq_return
+#endif
+SYM_CODE_END(asm_exc_nmi)
+
+.pushsection .text, "ax"
+SYM_CODE_START(rewind_stack_and_make_dead)
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
+ leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
+
+ call make_task_dead
+1: jmp 1b
+SYM_CODE_END(rewind_stack_and_make_dead)
+.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
new file mode 100644
index 000000000..9953d966d
--- /dev/null
+++ b/arch/x86/entry/entry_64.S
@@ -0,0 +1,1530 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ *
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * Some of this is documented in Documentation/x86/entry_64.rst
+ *
+ * A note on terminology:
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ *
+ * Some macro usage:
+ * - SYM_FUNC_START/END:Define functions in the symbol table.
+ * - idtentry: Define exception entry points.
+ */
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page_types.h>
+#include <asm/irqflags.h>
+#include <asm/paravirt.h>
+#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/export.h>
+#include <asm/frame.h>
+#include <asm/trapnr.h>
+#include <asm/nospec-branch.h>
+#include <asm/fsgsbase.h>
+#include <linux/err.h>
+
+#include "calling.h"
+
+.code64
+.section .entry.text, "ax"
+
+/*
+ * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ *
+ * This is the only entry point used for 64-bit system calls. The
+ * hardware interface is reasonably well designed and the register to
+ * argument mapping Linux uses fits well with the registers that are
+ * available when SYSCALL is used.
+ *
+ * SYSCALL instructions can be found inlined in libc implementations as
+ * well as some other programs and libraries. There are also a handful
+ * of SYSCALL instructions in the vDSO used, for example, as a
+ * clock_gettimeofday fallback.
+ *
+ * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Registers on entry:
+ * rax system call number
+ * rcx return address
+ * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
+ * rdi arg0
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
+ * r8 arg4
+ * r9 arg5
+ * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
+ *
+ * Only called from user space.
+ *
+ * When the user can change pt_regs->foo, always force IRET. That is
+ * because IRET deals with non-canonical addresses better. SYSRET has
+ * trouble with them due to bugs in both AMD and Intel CPUs.
+ */
+
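+/*
+ * Illustration only (not part of the entry code): with the register
+ * convention documented above, user space typically gets here with
+ * something along the lines of
+ *
+ *	movq	$1, %rax		# __NR_write on x86-64
+ *	movq	$1, %rdi		# arg0: fd
+ *	movq	$buf, %rsi		# arg1: buffer (hypothetical symbol)
+ *	movq	$len, %rdx		# arg2: count (hypothetical symbol)
+ *	syscall
+ */
+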
+SYM_CODE_START(entry_SYSCALL_64)
+ UNWIND_HINT_ENTRY
+ ENDBR
+
+ swapgs
+ /* tss.sp2 is scratch space. */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
+ pushq %rax /* pt_regs->orig_ax */
+
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
+
+ /* IRQs are off. */
+ movq %rsp, %rdi
+ /* Sign extend the lower 32bit as syscall numbers are treated as int */
+ movslq %eax, %rsi
+
+ /* clobbers %rax, make sure it is after saving the syscall nr */
+ IBRS_ENTER
+ UNTRAIN_RET
+
+ call do_syscall_64 /* returns with IRQs disabled */
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context. If we're not,
+ * go to the slow exit path.
+ * In the Xen PV case we must use iret anyway.
+ */
+
+ ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
+ X86_FEATURE_XENPV
+
+ movq RCX(%rsp), %rcx
+ movq RIP(%rsp), %r11
+
+ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+	 * If the width of the "canonical tail" ever becomes variable, this
+	 * will need to be updated to remain correct on both old and new CPUs.
+ *
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
+ */
+#ifdef CONFIG_X86_5LEVEL
+ ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+ "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
+ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+ sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
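+	/*
+	 * Worked example (illustrative, 4-level paging / 48-bit addresses):
+	 * a non-canonical RCX of 0x0000800000000000 becomes
+	 * 0xffff800000000000 after the shl/sar pair above, so the compare
+	 * below mismatches and we take the IRET path; a canonical address
+	 * is left unchanged and passes.
+	 */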
+
+ /* If this changed %rcx, it was not canonical */
+ cmpq %rcx, %r11
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ movq R11(%rsp), %r11
+ cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+ * restore RF properly. If the slowpath sets it for whatever reason, we
+ * need to restore it correctly.
+ *
+ * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+ * trap from userspace immediately after SYSRET. This would cause an
+ * infinite loop whenever #DB happens with register state that satisfies
+ * the opportunistic SYSRET conditions. For example, single-stepping
+ * this user code:
+ *
+ * movq $stuck_here, %rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz swapgs_restore_regs_and_return_to_usermode
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ IBRS_EXIT
+ POP_REGS pop_rdi=0
+
+ /*
+ * Now all regs are restored except RSP and RDI.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
+
+ pushq RSP-RDI(%rdi) /* RSP */
+ pushq (%rdi) /* RDI */
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ popq %rdi
+ popq %rsp
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ swapgs
+ sysretq
+SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
+SYM_CODE_END(entry_SYSCALL_64)
+
+/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+.pushsection .text, "ax"
+SYM_FUNC_START(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in inactive_task_frame
+ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* switch stack */
+ movq %rsp, TASK_threadsp(%rdi)
+ movq TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rsi), %rbx
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
+#endif
+
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+
+ /* restore callee-saved registers */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+
+ jmp __switch_to
+SYM_FUNC_END(__switch_to_asm)
+.popsection
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
+ */
+.pushsection .text, "ax"
+SYM_CODE_START(ret_from_fork)
+ UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR // copy_thread
+ movq %rax, %rdi
+ call schedule_tail /* rdi: 'prev' task parameter */
+
+ testq %rbx, %rbx /* from kernel_thread? */
+ jnz 1f /* kernel threads are uncommon */
+
+2:
+ UNWIND_HINT_REGS
+ movq %rsp, %rdi
+ call syscall_exit_to_user_mode /* returns with IRQs disabled */
+ jmp swapgs_restore_regs_and_return_to_usermode
+
+1:
+ /* kernel thread */
+ UNWIND_HINT_EMPTY
+ movq %r12, %rdi
+ CALL_NOSPEC rbx
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the execve()
+ * syscall.
+ */
+ movq $0, RAX(%rsp)
+ jmp 2b
+SYM_CODE_END(ret_from_fork)
+.popsection
+
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+ pushq %rax
+ SAVE_FLAGS
+ testl $X86_EFLAGS_IF, %eax
+ jz .Lokay_\@
+ ud2
+.Lokay_\@:
+ popq %rax
+#endif
+.endm
+
+SYM_CODE_START_LOCAL(xen_error_entry)
+ UNWIND_HINT_FUNC
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+ UNTRAIN_RET
+ RET
+SYM_CODE_END(xen_error_entry)
+
+/**
+ * idtentry_body - Macro to emit code calling the C function
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry_body cfunc has_error_code:req
+
+ /*
+ * Call error_entry() and switch to the task stack if from userspace.
+ *
+ * When in XENPV, it is already in the task stack, and it can't fault
+ * for native_iret() nor native_load_gs_index() since XENPV uses its
+ * own pvops for IRET and load_gs_index(). And it doesn't need to
+ * switch the CR3. So it can skip invoking error_entry().
+ */
+ ALTERNATIVE "call error_entry; movq %rax, %rsp", \
+ "call xen_error_entry", X86_FEATURE_XENPV
+
+ ENCODE_FRAME_POINTER
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
+
+ .if \has_error_code == 1
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ .endif
+
+ call \cfunc
+
+ /* For some configurations \cfunc ends up being a noreturn. */
+ REACHABLE
+
+ jmp error_return
+.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ *
+ * The macro emits code to set up the kernel context for straightforward
+ * and simple IDT entries. No IST stack, no paranoid entry checks.
+ */
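+/*
+ * For illustration only: the actual instantiations live in asm/idtentry.h,
+ * and a typical one expands to roughly
+ *
+ *	idtentry X86_TRAP_DE	asm_exc_divide_error	exc_divide_error	has_error_code=0
+ */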
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+ ENDBR
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
+ .if \vector == X86_TRAP_BP
+ /*
+ * If coming from kernel space, create a 6-word gap to allow the
+ * int3 handler to emulate a call instruction.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+ .endif
+
+ idtentry_body \cfunc \has_error_code
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
+/*
+ * Interrupt entry/exit.
+ *
+ * The interrupt stubs push (vector) onto the stack, which is the error_code
+ * position of idtentry exceptions, and jump to one of the two idtentry points
+ * (common/spurious).
+ *
+ * common_interrupt is a hotpath, align it to a cache line
+ */
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+ idtentry \vector asm_\cfunc \cfunc has_error_code=1
+.endm
+
+/*
+ * System vectors which invoke their handlers directly and are not
+ * going through the regular common device interrupt handling code.
+ */
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/**
+ * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #MC and #DB
+ *
+ * If the entry comes from user space it uses the normal entry path
+ * including the return to user space work and preemption checks on
+ * exit.
+ *
+ * If it hits in kernel mode then it needs to go through the paranoid
+ * entry as the exception can hit any random state. No preemption
+ * check on exit to keep the paranoid path simple.
+ */
+.macro idtentry_mce_db vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS
+ ENDBR
+ ASM_CLAC
+ cld
+
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
+ /*
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
+
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
+ call paranoid_entry
+
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ call \cfunc
+
+ jmp paranoid_exit
+
+ /* Switch to the regular task stack and use the noist entry point */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body noist_\cfunc, has_error_code=0
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/**
+ * idtentry_vc - Macro to generate entry stub for #VC
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #VC. The #VC handler
+ * runs on an IST stack and needs to be able to cause nested #VC exceptions.
+ *
+ * To make this work the #VC entry code tries its best to pretend it doesn't use
+ * an IST stack by switching to the task stack if coming from user-space (which
+ * includes early SYSCALL entry path) or back to the stack in the IRET frame if
+ * entered from kernel-mode.
+ *
+ * If entered from kernel-mode the return stack is validated first, and if it is
+ * not safe to use (e.g. because it points to the entry stack) the #VC handler
+ * will switch to a fall-back stack (VC2) and call a special handler function.
+ *
+ * The macro is only used for one vector, but it is planned to be extended in
+ * the future for the #HV exception.
+ */
+.macro idtentry_vc vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS
+ ENDBR
+ ASM_CLAC
+ cld
+
+ /*
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
+
+ /*
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
+ */
+ call paranoid_entry
+
+ UNWIND_HINT_REGS
+
+ /*
+ * Switch off the IST stack to make it free for nested exceptions. The
+ * vc_switch_off_ist() function will switch back to the interrupted
+ * stack if it is safe to do so. If not it switches to the VC fall-back
+ * stack.
+ */
+ movq %rsp, %rdi /* pt_regs pointer */
+ call vc_switch_off_ist
+ movq %rax, %rsp /* Switch to new stack */
+
+ ENCODE_FRAME_POINTER
+ UNWIND_HINT_REGS
+
+ /* Update pt_regs */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ call kernel_\cfunc
+
+ /*
+ * No need to switch back to the IST stack. The current stack is either
+ * identical to the stack in the IRET frame or the VC fall-back stack,
+ * so it is definitely mapped even with PTI enabled.
+ */
+ jmp paranoid_exit
+
+ /* Switch to the regular task stack */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body user_\cfunc, has_error_code=1
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+#endif
+
+/*
+ * Double fault entry. Straight paranoid. No checks on which context this
+ * comes from, because for the espfix-induced #DF such a check would do the
+ * wrong thing.
+ */
+.macro idtentry_df vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
+ ASM_CLAC
+ cld
+
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
+ call paranoid_entry
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi /* pt_regs pointer into first argument */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ call \cfunc
+
+ /* For some configurations \cfunc ends up being a noreturn. */
+ REACHABLE
+
+ jmp paranoid_exit
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are
+ * shared between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+ ANNOTATE_NOENDBR
+
+SYM_CODE_START_LOCAL(common_interrupt_return)
+SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
+ IBRS_EXIT
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates user mode. */
+ testb $3, CS(%rsp)
+ jnz 1f
+ ud2
+1:
+#endif
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
+ POP_REGS pop_rdi=0
+
+ /*
+ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
+
+ /* Copy the IRET frame to the trampoline stack. */
+ pushq 6*8(%rdi) /* SS */
+ pushq 5*8(%rdi) /* RSP */
+ pushq 4*8(%rdi) /* EFLAGS */
+ pushq 3*8(%rdi) /* CS */
+ pushq 2*8(%rdi) /* RIP */
+
+ /* Push user RDI on the trampoline stack. */
+ pushq (%rdi)
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ /* Restore RDI. */
+ popq %rdi
+ swapgs
+ jmp .Lnative_iret
+
+
+SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates kernel mode. */
+ testb $3, CS(%rsp)
+ jz 1f
+ ud2
+1:
+#endif
+ POP_REGS
+ addq $8, %rsp /* skip regs->orig_ax */
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+ * when returning from IPI handler.
+ */
+#ifdef CONFIG_XEN_PV
+SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ .byte 0xe9
+ .long .Lnative_iret - (. + 4)
+#endif
+
+.Lnative_iret:
+ UNWIND_HINT_IRET_REGS
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4, (SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR // exc_double_fault
+ /*
+ * This may fault. Non-paranoid faults on return to userspace are
+ * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
+ * Double-faults due to espfix64 are handled in exc_double_fault.
+ * Other faults here are fatal.
+ */
+ iretq
+
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ /*
+ * We are running with user GSBASE. All GPRs contain their user
+ * values. We have a percpu ESPFIX stack that is eight slots
+ * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom
+ * of the ESPFIX stack.
+ *
+ * We clobber RAX and RDI in this code. We stash RDI on the
+ * normal stack and RAX on the ESPFIX stack.
+ *
+ * The ESPFIX stack layout we set up looks like this:
+ *
+ * --- top of ESPFIX stack ---
+ * SS
+ * RSP
+ * RFLAGS
+ * CS
+ * RIP <-- RSP points here when we're done
+ * RAX <-- espfix_waddr points here
+ * --- bottom of ESPFIX stack ---
+ */
+
+ pushq %rdi /* Stash user RDI */
+ swapgs /* to kernel GS */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
+
+ movq PER_CPU_VAR(espfix_waddr), %rdi
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
+ movq %rax, (1*8)(%rdi)
+ movq (2*8)(%rsp), %rax /* user CS */
+ movq %rax, (2*8)(%rdi)
+ movq (3*8)(%rsp), %rax /* user RFLAGS */
+ movq %rax, (3*8)(%rdi)
+ movq (5*8)(%rsp), %rax /* user SS */
+ movq %rax, (5*8)(%rdi)
+ movq (4*8)(%rsp), %rax /* user RSP */
+ movq %rax, (4*8)(%rdi)
+ /* Now RAX == RSP. */
+
+ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
+
+ /*
+ * espfix_stack[31:16] == 0. The page tables are set up such that
+ * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
+ * espfix_waddr for any X. That is, there are 65536 RO aliases of
+ * the same page. Set up RSP so that RSP[31:16] contains the
+ * respective 16 bits of the /userspace/ RSP and RSP nonetheless
+ * still points to an RO alias of the ESPFIX stack.
+ */
+ orq PER_CPU_VAR(espfix_stack), %rax
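+	/*
+	 * Example with hypothetical values: if the user RSP was
+	 * 0x00007ffc12345678, RAX was masked to 0x12340000 above and is
+	 * now ORed with the per-CPU espfix_stack base, yielding an RO
+	 * alias whose bits 31:16 match the user RSP as described above.
+	 */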
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ swapgs /* to user GS */
+ popq %rdi /* Restore user RDI */
+
+ movq %rax, %rsp
+ UNWIND_HINT_IRET_REGS offset=8
+
+ /*
+ * At this point, we cannot write to the stack any more, but we can
+ * still read.
+ */
+ popq %rax /* Restore user RAX */
+
+ /*
+ * RSP now points to an ordinary IRET frame, except that the page
+ * is read-only and RSP[31:16] are preloaded with the userspace
+ * values. We can now IRET back to userspace.
+ */
+ jmp native_irq_return_iret
+#endif
+SYM_CODE_END(common_interrupt_return)
+_ASM_NOKPROBE(common_interrupt_return)
+
+/*
+ * Reload gs selector with exception handling
+ * edi: new selector
+ *
+ * Is in entry.text as it shouldn't be instrumented.
+ */
+SYM_FUNC_START(asm_load_gs_index)
+ FRAME_BEGIN
+ swapgs
+.Lgs_change:
+ ANNOTATE_NOENDBR // error_entry
+ movl %edi, %gs
+2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
+ swapgs
+ FRAME_END
+ RET
+
+ /* running with kernelgs */
+.Lbad_gs:
+ swapgs /* switch back to user gs */
+.macro ZAP_GS
+ /* This can't be a string because the preprocessor needs to see it. */
+ movl $__USER_DS, %eax
+ movl %eax, %gs
+.endm
+ ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
+ xorl %eax, %eax
+ movl %eax, %gs
+ jmp 2b
+
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
+
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
+
+#ifdef CONFIG_XEN_PV
+/*
+ * A note on the "critical region" in our callback handler.
+ * We want to avoid stacking callback handlers due to events occurring
+ * during handling of the last event. To do this, we keep events disabled
+ * until we've done all processing. HOWEVER, we must enable events before
+ * popping the stack frame (can't be done atomically) and so it would still
+ * be possible to get enough handler activations to overflow the stack.
+ * Although unlikely, bugs of that kind are hard to track down, so we'd
+ * like to avoid the possibility.
+ * So, on entry to the handler we detect whether we interrupted an
+ * existing activation in its critical region -- if so, we pop the current
+ * activation and restart the handler using the previous one.
+ *
+ * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
+ */
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
+
+/*
+ * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
+ * see the correct pointer to the pt_regs
+ */
+ UNWIND_HINT_FUNC
+ movq %rdi, %rsp /* we don't return, adjust the stack frame */
+ UNWIND_HINT_REGS
+
+ call xen_pv_evtchn_do_upcall
+
+ jmp error_return
+SYM_CODE_END(exc_xen_hypervisor_callback)
+
+/*
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we do not need to fix up as Xen has already reloaded all segment
+ * registers that could be reloaded and zeroed the others.
+ * Category 2 we fix up by killing the current process. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by comparing each saved segment register
+ * with its current contents: any discrepancy means we are in category 1.
+ */
+SYM_CODE_START(xen_failsafe_callback)
+ UNWIND_HINT_EMPTY
+ ENDBR
+ movl %ds, %ecx
+ cmpw %cx, 0x10(%rsp)
+ jne 1f
+ movl %es, %ecx
+ cmpw %cx, 0x18(%rsp)
+ jne 1f
+ movl %fs, %ecx
+ cmpw %cx, 0x20(%rsp)
+ jne 1f
+ movl %gs, %ecx
+ cmpw %cx, 0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ pushq $0 /* RIP */
+ UNWIND_HINT_IRET_REGS offset=8
+ jmp asm_exc_general_protection
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ UNWIND_HINT_IRET_REGS
+ pushq $-1 /* orig_ax = -1 => not a system call */
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
+ jmp error_return
+SYM_CODE_END(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+
+/*
+ * Save all registers in pt_regs. Return GSBASE related information
+ * in EBX depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
+ *
+ * Y GSBASE value at entry, must be restored in paranoid_exit
+ *
+ * R14 - old CR3
+ * R15 - old SPEC_CTRL
+ */
+SYM_CODE_START_LOCAL(paranoid_entry)
+ UNWIND_HINT_FUNC
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+
+ /*
+ * Always stash CR3 in %r14. This value will be restored,
+ * verbatim, at exit. Needed if paranoid_entry interrupted
+ * another entry that already switched to the user CR3 value
+ * but has not yet returned to userspace.
+ *
+ * This is also why CS (stashed in the "iret frame" by the
+ * hardware at entry) can not be used: this may be a return
+ * to kernel code, but with a user CR3 value.
+ *
+ * Switching CR3 does not depend on kernel GSBASE so it can
+ * be done before switching to the kernel GSBASE. This is
+ * required for FSGSBASE because the kernel GSBASE has to
+ * be retrieved from a kernel internal table.
+ */
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+ /*
+ * Handling GSBASE depends on the availability of FSGSBASE.
+ *
+ * Without FSGSBASE the kernel enforces that negative GSBASE
+ * values indicate kernel GSBASE. With FSGSBASE no assumptions
+ * can be made about the GSBASE value when entering from user
+ * space.
+ */
+ ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /*
+ * Read the current GSBASE and store it in %rbx unconditionally,
+	 * retrieve and set the current CPU's kernel GSBASE. The stored value
+ * has to be restored in paranoid_exit unconditionally.
+ *
+ * The unconditional write to GS base below ensures that no subsequent
+ * loads based on a mispredicted GS base can happen, therefore no LFENCE
+ * is needed here.
+ */
+ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
+ jmp .Lparanoid_gsbase_done
+
+.Lparanoid_entry_checkgs:
+ /* EBX = 1 -> kernel GSBASE active, no restore required */
+ movl $1, %ebx
+
+ /*
+ * The kernel-enforced convention is a negative GSBASE indicates
+ * a kernel value. No SWAPGS needed on entry and exit.
+ */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js .Lparanoid_kernel_gsbase
+
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
+ swapgs
+.Lparanoid_kernel_gsbase:
+ FENCE_SWAPGS_KERNEL_ENTRY
+.Lparanoid_gsbase_done:
+
+ /*
+ * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
+ * CR3 above, keep the old value in a callee saved register.
+ */
+ IBRS_ENTER save_reg=%r15
+ UNTRAIN_RET
+
+ RET
+SYM_CODE_END(paranoid_entry)
+
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * R/EBX contains the GSBASE related information depending on the
+ * availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
+ *
+ * Y User space GSBASE, must be restored unconditionally
+ *
+ * R14 - old CR3
+ * R15 - old SPEC_CTRL
+ */
+SYM_CODE_START_LOCAL(paranoid_exit)
+ UNWIND_HINT_REGS
+
+ /*
+ * Must restore IBRS state before both CR3 and %GS since we need access
+ * to the per-CPU x86_spec_ctrl_shadow variable.
+ */
+ IBRS_EXIT save_reg=%r15
+
+ /*
+ * The order of operations is important. RESTORE_CR3 requires
+ * kernel GSBASE.
+ *
+	 * NB to anyone tempted to optimize this code: this code does
+ * not execute at all for exceptions from user mode. Those
+ * exceptions go through error_exit instead.
+ */
+ RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+
+ /* Handle the three GSBASE cases */
+ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /* With FSGSBASE enabled, unconditionally restore GSBASE */
+ wrgsbase %rbx
+ jmp restore_regs_and_return_to_kernel
+
+.Lparanoid_exit_checkgs:
+ /* On non-FSGSBASE systems, conditionally do SWAPGS */
+ testl %ebx, %ebx
+ jnz restore_regs_and_return_to_kernel
+
+ /* We are returning to a context with user GSBASE */
+ swapgs
+ jmp restore_regs_and_return_to_kernel
+SYM_CODE_END(paranoid_exit)
+
+/*
+ * Switch GS and CR3 if needed.
+ */
+SYM_CODE_START_LOCAL(error_entry)
+ UNWIND_HINT_FUNC
+
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+
+ testb $3, CS+8(%rsp)
+ jz .Lerror_kernelspace
+
+ /*
+ * We entered from user mode or we're pretending to have entered
+ * from user mode due to an IRET fault.
+ */
+ swapgs
+ FENCE_SWAPGS_USER_ENTRY
+ /* We have user CR3. Change to kernel CR3. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ IBRS_ENTER
+ UNTRAIN_RET
+
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
+.Lerror_entry_from_usermode_after_swapgs:
+
+ /* Put us onto the real thread stack. */
+ call sync_regs
+ RET
+
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
+.Lerror_kernelspace:
+ leaq native_irq_return_iret(%rip), %rcx
+ cmpq %rcx, RIP+8(%rsp)
+ je .Lerror_bad_iret
+ movl %ecx, %eax /* zero extend */
+ cmpq %rax, RIP+8(%rsp)
+ je .Lbstep_iret
+ cmpq $.Lgs_change, RIP+8(%rsp)
+ jne .Lerror_entry_done_lfence
+
+ /*
+ * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
+ * gsbase and proceed. We'll fix up the exception and land in
+ * .Lgs_change's error handler with kernel gsbase.
+ */
+ swapgs
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ leaq 8(%rsp), %rax /* return pt_regs pointer */
+ ANNOTATE_UNRET_END
+ RET
+
+.Lbstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx, RIP+8(%rsp)
+ /* fall through */
+
+.Lerror_bad_iret:
+ /*
+ * We came from an IRET to user mode, so we have user
+ * gsbase and CR3. Switch to kernel gsbase and CR3:
+ */
+ swapgs
+ FENCE_SWAPGS_USER_ENTRY
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ IBRS_ENTER
+ UNTRAIN_RET
+
+ /*
+ * Pretend that the exception came from user mode: set up pt_regs
+ * as if we faulted immediately after IRET.
+ */
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
+ call fixup_bad_iret
+ mov %rax, %rdi
+ jmp .Lerror_entry_from_usermode_after_swapgs
+SYM_CODE_END(error_entry)
+
+SYM_CODE_START_LOCAL(error_return)
+ UNWIND_HINT_REGS
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ testb $3, CS(%rsp)
+ jz restore_regs_and_return_to_kernel
+ jmp swapgs_restore_regs_and_return_to_usermode
+SYM_CODE_END(error_return)
+
+/*
+ * Runs on exception stack. Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ *
+ * Registers:
+ * %r14: Used to save/restore the CR3 of the interrupted context
+ * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ */
+SYM_CODE_START(asm_exc_nmi)
+ UNWIND_HINT_IRET_REGS
+ ENDBR
+
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+	 * Check a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "iret" location to jump to the repeat_nmi
+ * o return back to the first NMI
+ *
+	 * Now on exit of the first NMI, we first clear the stack variable.
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
+ */
+
+ ASM_CLAC
+ cld
+
+ /* Use %rdx as our temp variable throughout */
+ pushq %rdx
+
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ *
+ * We also must not push anything to the stack before switching
+ * stacks lest we corrupt the "NMI executing" variable.
+ */
+
+ swapgs
+ FENCE_SWAPGS_USER_ENTRY
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ UNWIND_HINT_IRET_REGS
+ pushq $-1 /* pt_regs->orig_ax */
+ PUSH_AND_CLEAR_REGS rdx=(%rdx)
+ ENCODE_FRAME_POINTER
+
+ IBRS_ENTER
+ UNTRAIN_RET
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call exc_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts.
+ */
+ jmp swapgs_restore_regs_and_return_to_usermode
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
+ /*
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+	 * about to call exc_nmi() anyway, so we can just
+ * resume the outer NMI.
+ */
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
+
+ /*
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
+ */
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */
+
+nested_nmi:
+ /*
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
+ */
+ subq $8, %rsp
+ leaq -10*8(%rsp), %rdx
+ pushq $__KERNEL_DS
+ pushq %rdx
+ pushfq
+ pushq $__KERNEL_CS
+ pushq $repeat_nmi
+
+ /* Put stack back */
+ addq $(6*8), %rsp
+
+nested_nmi_out:
+ popq %rdx
+
+ /* We are returning to kernel mode, so this cannot result in a fault. */
+ iretq
+
+first_nmi:
+ /* Restore rdx. */
+ movq (%rsp), %rdx
+
+ /* Make room for "NMI executing". */
+ pushq $0
+
+ /* Leave room for the "iret" frame */
+ subq $(5*8), %rsp
+
+ /* Copy the "original" frame to the "outermost" frame */
+ .rept 5
+ pushq 11*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS
+
+ /* Everything up to here is safe from nested NMIs */
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * For ease of testing, unmask NMIs right away. Disabled by
+ * default because IRET is very expensive.
+ */
+ pushq $0 /* SS */
+ pushq %rsp /* RSP (minus 8 because of the previous push) */
+ addq $8, (%rsp) /* Fix up RSP */
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+ iretq /* continues at repeat_nmi below */
+ UNWIND_HINT_IRET_REGS
+1:
+#endif
+
+repeat_nmi:
+ ANNOTATE_NOENDBR // this code
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call exc_nmi(). "NMI executing"
+ * is zero.
+ */
+ movq $1, 10*8(%rsp) /* Set "NMI executing". */
+
+ /*
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
+ */
+ addq $(10*8), %rsp
+ .rept 5
+ pushq -6*8(%rsp)
+ .endr
+ subq $(5*8), %rsp
+end_repeat_nmi:
+ ANNOTATE_NOENDBR // this code
+
+ /*
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
+ */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
+ /*
+	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+	 * as we should not be calling schedule in NMI context, even with
+	 * normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
+ call paranoid_entry
+ UNWIND_HINT_REGS
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call exc_nmi
+
+ /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
+ IBRS_EXIT save_reg=%r15
+
+ /* Always restore stashed CR3 value (see paranoid_entry) */
+ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
+ /*
+ * The above invocation of paranoid_entry stored the GSBASE
+ * related information in R/EBX depending on the availability
+ * of FSGSBASE.
+ *
+ * If FSGSBASE is enabled, restore the saved GSBASE value
+ * unconditionally, otherwise take the conditional SWAPGS path.
+ */
+ ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
+
+ wrgsbase %rbx
+ jmp nmi_restore
+
+nmi_no_fsgsbase:
+ /* EBX == 0 -> invoke SWAPGS */
+ testl %ebx, %ebx
+ jnz nmi_restore
+
+nmi_swapgs:
+ swapgs
+
+nmi_restore:
+ POP_REGS
+
+ /*
+	 * Skip orig_ax and the "outermost" frame to point RSP at the
+	 * "iret" frame.
+ */
+ addq $6*8, %rsp
+
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths.
+ *
+ * We arguably should just inspect RIP instead, but I (Andy) wrote
+ * this code when I had the misapprehension that Xen PV supported
+ * NMIs, and Xen PV would break that approach.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * iretq reads the "iret" frame and exits the NMI stack in a
+ * single instruction. We are returning to kernel mode, so this
+ * cannot result in a fault. Similarly, we don't need to worry
+ * about espfix64 on the way back to kernel mode.
+ */
+ iretq
+SYM_CODE_END(asm_exc_nmi)
+
+#ifndef CONFIG_IA32_EMULATION
+/*
+ * This handles SYSCALL from 32-bit code. There is no way to program
+ * MSRs to fully disable 32-bit SYSCALL.
+ */
+SYM_CODE_START(ignore_sysret)
+ UNWIND_HINT_EMPTY
+ ENDBR
+ mov $-ENOSYS, %eax
+ sysretl
+SYM_CODE_END(ignore_sysret)
+#endif
+
+.pushsection .text, "ax"
+SYM_CODE_START(rewind_stack_and_make_dead)
+ UNWIND_HINT_FUNC
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ leaq -PTREGS_SIZE(%rax), %rsp
+ UNWIND_HINT_REGS
+
+ call make_task_dead
+SYM_CODE_END(rewind_stack_and_make_dead)
+.popsection
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
new file mode 100644
index 000000000..d6c08d898
--- /dev/null
+++ b/arch/x86/entry/entry_64_compat.S
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/nospec-branch.h>
+#include <linux/linkage.h>
+#include <linux/err.h>
+
+#include "calling.h"
+
+ .section .entry.text, "ax"
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on Intel CPUs.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old RIP (!!!), RSP, or RFLAGS.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ */
+SYM_CODE_START(entry_SYSENTER_compat)
+ UNWIND_HINT_ENTRY
+ ENDBR
+ /* Interrupts are off on entry. */
+ swapgs
+
+ pushq %rax
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ popq %rax
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER32_DS /* pt_regs->ss */
+ pushq $0 /* pt_regs->sp = 0 (placeholder) */
+
+ /*
+ * Push flags. This is nasty. First, interrupts are currently
+	 * off, but we need pt_regs->flags to have IF set. Second, if TF
+	 * was set in usermode, it's still set, and we're single-stepping
+ * through this code. do_SYSENTER_32() will fix up IF.
+ */
+ pushfq /* pt_regs->flags (except IF = 0) */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq $0 /* pt_regs->ip = 0 (placeholder) */
+SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
+
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
+ movl %eax, %eax
+
+ pushq %rax /* pt_regs->orig_ax */
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
+ UNWIND_HINT_REGS
+
+ cld
+
+ IBRS_ENTER
+ UNTRAIN_RET
+
+ /*
+ * SYSENTER doesn't filter flags, so we need to clear NT and AC
+ * ourselves. To save a few cycles, we can check whether
+ * either was set instead of doing an unconditional popfq.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+ * single-stepping in general. This allows us to avoid having
+	 * more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
+ */
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
+ movq %rsp, %rdi
+ call do_SYSENTER_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+ jmp sysret32_from_system_call
+
+.Lsysenter_fix_flags:
+ pushq $X86_EFLAGS_FIXED
+ popfq
+ jmp .Lsysenter_flags_fixed
+SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR // is_sysenter_singlestep
+SYM_CODE_END(entry_SYSENTER_compat)
+
+/*
+ * 32-bit SYSCALL entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on AMD CPUs.
+ *
+ * The SYSCALL instruction, in principle, should *only* occur in the
+ * vDSO. In practice, it appears that this really is the case.
+ * As evidence:
+ *
+ * - The calling convention for SYSCALL has changed several times without
+ * anyone noticing.
+ *
+ * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, any
+ *   user task that did SYSCALL without immediately reloading SS
+ * would randomly crash.
+ *
+ * - Most programmers do not directly target AMD CPUs, and the 32-bit
+ * SYSCALL instruction does not exist on Intel CPUs. Even on AMD
+ * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
+ * because the SYSCALL instruction in legacy/native 32-bit mode (as
+ * opposed to compat mode) is sufficiently poorly designed as to be
+ * essentially unusable.
+ *
+ * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
+ * RFLAGS to R11, then loads new SS, CS, and RIP from previously
+ * programmed MSRs. RFLAGS gets masked by a value from another MSR
+ * (so CLD and CLAC are not needed). SYSCALL does not save anything on
+ * the stack and does not change RSP.
+ *
+ * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
+ * Arguments:
+ * eax system call number
+ * ecx return address
+ * ebx arg1
+ * ebp arg2 (note: not saved in the stack frame, should not be touched)
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * esp user stack
+ * 0(%esp) arg6
+ */
+SYM_CODE_START(entry_SYSCALL_compat)
+ UNWIND_HINT_ENTRY
+ ENDBR
+ /* Interrupts are off on entry. */
+ swapgs
+
+ /* Stash user ESP */
+ movl %esp, %r8d
+
+ /* Use %rsp as scratch reg. User ESP is stashed in r8 */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+ /* Switch to the kernel stack */
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER32_DS /* pt_regs->ss */
+ pushq %r8 /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
+ movl %eax, %eax /* discard orig_ax high bits */
+ pushq %rax /* pt_regs->orig_ax */
+ PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS
+ UNWIND_HINT_REGS
+
+ IBRS_ENTER
+ UNTRAIN_RET
+
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+
+ /* Opportunistic SYSRET */
+sysret32_from_system_call:
+ /*
+ * We are not going to return to userspace from the trampoline
+ * stack. So let's erase the thread stack right now.
+ */
+ STACKLEAK_ERASE
+
+ IBRS_EXIT
+
+ movq RBX(%rsp), %rbx /* pt_regs->rbx */
+ movq RBP(%rsp), %rbp /* pt_regs->rbp */
+ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
+ movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */
+ addq $RAX, %rsp /* Skip r8-r15 */
+ popq %rax /* pt_regs->rax */
+ popq %rdx /* Skip pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+
+ /*
+ * USERGS_SYSRET32 does:
+ * GSBASE = user's GS base
+ * EIP = ECX
+ * RFLAGS = R11
+ * CS = __USER32_CS
+ * SS = __USER_DS
+ *
+ * ECX will not match pt_regs->cx, but we're returning to a vDSO
+ * trampoline that will fix up RCX, so this is okay.
+ *
+ * R12-R15 are callee-saved, so they contain whatever was in them
+ * when the system call started, which is already known to user
+ * code. We zero R8-R10 to avoid info leaks.
+ */
+ movq RSP-ORIG_RAX(%rsp), %rsp
+SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+
+ /*
+ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+ * on the process stack which is not mapped to userspace and
+ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
+	 * switch until after the last reference to the process
+ * stack.
+ *
+ * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+ */
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ swapgs
+ sysretl
+SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
+SYM_CODE_END(entry_SYSCALL_compat)
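A note on the register convention documented above: after SWITCH_TO_KERNEL_CR3 and PUSH_AND_CLEAR_REGS rcx=%rbp, the C entry code sees the usual i386 argument order again (arg2 lands in pt_regs->cx, and do_fast_syscall_32() later fetches arg6 from the user stack into pt_regs->bp). The sketch below is purely illustrative -- it is not the kernel's own helper (the real logic lives in syscall_get_arguments()) and it assumes a kernel build context for struct pt_regs:

/*
 * Illustrative only: how the six ia32 syscall arguments line up in
 * struct pt_regs once the compat entry code above has run.
 */
#include <linux/ptrace.h>	/* struct pt_regs; kernel build context assumed */

static void ia32_collect_args(const struct pt_regs *regs, unsigned long args[6])
{
	args[0] = regs->bx;	/* arg1 */
	args[1] = regs->cx;	/* arg2: was EBP at SYSCALL time, stored here by PUSH_AND_CLEAR_REGS rcx=%rbp */
	args[2] = regs->dx;	/* arg3 */
	args[3] = regs->si;	/* arg4 */
	args[4] = regs->di;	/* arg5 */
	args[5] = regs->bp;	/* arg6: fetched from 0(%esp) by do_fast_syscall_32() */
}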
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
new file mode 100644
index 000000000..8cfc9bc73
--- /dev/null
+++ b/arch/x86/entry/syscall_32.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* System call table for i386. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <linux/syscalls.h>
+#include <asm/syscall.h>
+
+#ifdef CONFIG_IA32_EMULATION
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat)
+#else
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
+#endif
+
+#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
+
+#include <asm/syscalls_32.h>
+#undef __SYSCALL
+
+#define __SYSCALL(nr, sym) __ia32_##sym,
+
+__visible const sys_call_ptr_t ia32_sys_call_table[] = {
+#include <asm/syscalls_32.h>
+};
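The file above uses the classic include-twice X-macro pattern: asm/syscalls_32.h is pulled in once with __SYSCALL defined to emit declarations and once redefined to emit table entries. A standalone toy program (hypothetical names, not kernel code) demonstrating the same technique:

/* Toy demonstration of the include-twice X-macro pattern. */
#include <stdio.h>

/* Stand-in for the generated asm/syscalls_32.h. */
#define TOY_SYSCALLS \
	TOY_SYSCALL(0, toy_restart) \
	TOY_SYSCALL(1, toy_exit) \
	TOY_SYSCALL(2, toy_fork)

/* Pass 1: emit forward declarations. */
#define TOY_SYSCALL(nr, sym) static long sym(void);
TOY_SYSCALLS
#undef TOY_SYSCALL

/* Pass 2: emit the table itself, indexed by call number. */
#define TOY_SYSCALL(nr, sym) [nr] = sym,
static long (*const toy_call_table[])(void) = {
	TOY_SYSCALLS
};
#undef TOY_SYSCALL

static long toy_restart(void) { return 100; }
static long toy_exit(void)    { return 101; }
static long toy_fork(void)    { return 102; }

int main(void)
{
	printf("toy call 1 -> %ld\n", toy_call_table[1]());
	return 0;
}

(The kernel's own table above uses plain positional initializers; the designated [nr] = form here just makes the toy explicit.)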
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
new file mode 100644
index 000000000..be120eec1
--- /dev/null
+++ b/arch/x86/entry/syscall_64.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/* System call table for x86-64. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <linux/syscalls.h>
+#include <asm/syscall.h>
+
+#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#include <asm/syscalls_64.h>
+#undef __SYSCALL
+
+#define __SYSCALL(nr, sym) __x64_##sym,
+
+asmlinkage const sys_call_ptr_t sys_call_table[] = {
+#include <asm/syscalls_64.h>
+};
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
new file mode 100644
index 000000000..bdd0e03a1
--- /dev/null
+++ b/arch/x86/entry/syscall_x32.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/* System call table for x32 ABI. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <linux/syscalls.h>
+#include <asm/syscall.h>
+
+#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#include <asm/syscalls_x32.h>
+#undef __SYSCALL
+
+#define __SYSCALL(nr, sym) __x64_##sym,
+
+asmlinkage const sys_call_ptr_t x32_sys_call_table[] = {
+#include <asm/syscalls_x32.h>
+};
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
new file mode 100644
index 000000000..eca5d6eff
--- /dev/null
+++ b/arch/x86/entry/syscalls/Makefile
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0
+out := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
+
+# Create output directory if not already present
+$(shell mkdir -p $(out) $(uapi))
+
+syscall32 := $(src)/syscall_32.tbl
+syscall64 := $(src)/syscall_64.tbl
+
+syshdr := $(srctree)/scripts/syscallhdr.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
+offset :=
+prefix :=
+
+quiet_cmd_syshdr = SYSHDR $@
+ cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \
+ $(if $(offset),--offset $(offset)) \
+ $(if $(prefix),--prefix $(prefix)) \
+ $< $@
+quiet_cmd_systbl = SYSTBL $@
+ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
+
+quiet_cmd_hypercalls = HYPERCALLS $@
+ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs))
+
+$(uapi)/unistd_32.h: abis := i386
+$(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(out)/unistd_32_ia32.h: abis := i386
+$(out)/unistd_32_ia32.h: prefix := ia32_
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(uapi)/unistd_x32.h: abis := common,x32
+$(uapi)/unistd_x32.h: offset := __X32_SYSCALL_BIT
+$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(uapi)/unistd_64.h: abis := common,64
+$(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(out)/unistd_64_x32.h: abis := x32
+$(out)/unistd_64_x32.h: prefix := x32_
+$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE
+ $(call if_changed,syshdr)
+
+$(out)/syscalls_32.h: abis := i386
+$(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE
+ $(call if_changed,systbl)
+$(out)/syscalls_64.h: abis := common,64
+$(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE
+ $(call if_changed,systbl)
+$(out)/syscalls_x32.h: abis := common,x32
+$(out)/syscalls_x32.h: $(syscall64) $(systbl) FORCE
+ $(call if_changed,systbl)
+
+$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE
+ $(call if_changed,hypercalls)
+
+$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
+
+uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
+syshdr-y += syscalls_32.h
+syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
+syshdr-$(CONFIG_X86_64) += syscalls_64.h
+syshdr-$(CONFIG_X86_X32_ABI) += syscalls_x32.h
+syshdr-$(CONFIG_XEN) += xen-hypercalls.h
+
+uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y))
+syshdr-y := $(addprefix $(out)/, $(syshdr-y))
+targets += $(addprefix ../../../../, $(uapisyshdr-y) $(syshdr-y))
+
+PHONY += all
+all: $(uapisyshdr-y) $(syshdr-y)
+ @:
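The unistd_*.h headers generated here are what ultimately surface to userspace as the __NR_*/SYS_* constants. A minimal usage example (plain glibc interfaces; nothing in it is specific to this Makefile):

/* Userspace: consuming a generated syscall number through syscall(2). */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>	/* SYS_gettid, derived from __NR_gettid */

int main(void)
{
	long tid = syscall(SYS_gettid);

	printf("gettid() via raw syscall number: %ld\n", tid);
	return 0;
}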
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
new file mode 100644
index 000000000..320480a8d
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -0,0 +1,457 @@
+#
+# 32-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> <compat entry point>
+#
+# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for
+# sys_*() system calls and compat_sys_*() compat system calls if
+# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only
+# parameter.
+#
+# The abi is always "i386" for this file.
+#
+0 i386 restart_syscall sys_restart_syscall
+1 i386 exit sys_exit
+2 i386 fork sys_fork
+3 i386 read sys_read
+4 i386 write sys_write
+5 i386 open sys_open compat_sys_open
+6 i386 close sys_close
+7 i386 waitpid sys_waitpid
+8 i386 creat sys_creat
+9 i386 link sys_link
+10 i386 unlink sys_unlink
+11 i386 execve sys_execve compat_sys_execve
+12 i386 chdir sys_chdir
+13 i386 time sys_time32
+14 i386 mknod sys_mknod
+15 i386 chmod sys_chmod
+16 i386 lchown sys_lchown16
+17 i386 break
+18 i386 oldstat sys_stat
+19 i386 lseek sys_lseek compat_sys_lseek
+20 i386 getpid sys_getpid
+21 i386 mount sys_mount
+22 i386 umount sys_oldumount
+23 i386 setuid sys_setuid16
+24 i386 getuid sys_getuid16
+25 i386 stime sys_stime32
+26 i386 ptrace sys_ptrace compat_sys_ptrace
+27 i386 alarm sys_alarm
+28 i386 oldfstat sys_fstat
+29 i386 pause sys_pause
+30 i386 utime sys_utime32
+31 i386 stty
+32 i386 gtty
+33 i386 access sys_access
+34 i386 nice sys_nice
+35 i386 ftime
+36 i386 sync sys_sync
+37 i386 kill sys_kill
+38 i386 rename sys_rename
+39 i386 mkdir sys_mkdir
+40 i386 rmdir sys_rmdir
+41 i386 dup sys_dup
+42 i386 pipe sys_pipe
+43 i386 times sys_times compat_sys_times
+44 i386 prof
+45 i386 brk sys_brk
+46 i386 setgid sys_setgid16
+47 i386 getgid sys_getgid16
+48 i386 signal sys_signal
+49 i386 geteuid sys_geteuid16
+50 i386 getegid sys_getegid16
+51 i386 acct sys_acct
+52 i386 umount2 sys_umount
+53 i386 lock
+54 i386 ioctl sys_ioctl compat_sys_ioctl
+55 i386 fcntl sys_fcntl compat_sys_fcntl64
+56 i386 mpx
+57 i386 setpgid sys_setpgid
+58 i386 ulimit
+59 i386 oldolduname sys_olduname
+60 i386 umask sys_umask
+61 i386 chroot sys_chroot
+62 i386 ustat sys_ustat compat_sys_ustat
+63 i386 dup2 sys_dup2
+64 i386 getppid sys_getppid
+65 i386 getpgrp sys_getpgrp
+66 i386 setsid sys_setsid
+67 i386 sigaction sys_sigaction compat_sys_sigaction
+68 i386 sgetmask sys_sgetmask
+69 i386 ssetmask sys_ssetmask
+70 i386 setreuid sys_setreuid16
+71 i386 setregid sys_setregid16
+72 i386 sigsuspend sys_sigsuspend
+73 i386 sigpending sys_sigpending compat_sys_sigpending
+74 i386 sethostname sys_sethostname
+75 i386 setrlimit sys_setrlimit compat_sys_setrlimit
+76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit
+77 i386 getrusage sys_getrusage compat_sys_getrusage
+78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday
+79 i386 settimeofday sys_settimeofday compat_sys_settimeofday
+80 i386 getgroups sys_getgroups16
+81 i386 setgroups sys_setgroups16
+82 i386 select sys_old_select compat_sys_old_select
+83 i386 symlink sys_symlink
+84 i386 oldlstat sys_lstat
+85 i386 readlink sys_readlink
+86 i386 uselib sys_uselib
+87 i386 swapon sys_swapon
+88 i386 reboot sys_reboot
+89 i386 readdir sys_old_readdir compat_sys_old_readdir
+90 i386 mmap sys_old_mmap compat_sys_ia32_mmap
+91 i386 munmap sys_munmap
+92 i386 truncate sys_truncate compat_sys_truncate
+93 i386 ftruncate sys_ftruncate compat_sys_ftruncate
+94 i386 fchmod sys_fchmod
+95 i386 fchown sys_fchown16
+96 i386 getpriority sys_getpriority
+97 i386 setpriority sys_setpriority
+98 i386 profil
+99 i386 statfs sys_statfs compat_sys_statfs
+100 i386 fstatfs sys_fstatfs compat_sys_fstatfs
+101 i386 ioperm sys_ioperm
+102 i386 socketcall sys_socketcall compat_sys_socketcall
+103 i386 syslog sys_syslog
+104 i386 setitimer sys_setitimer compat_sys_setitimer
+105 i386 getitimer sys_getitimer compat_sys_getitimer
+106 i386 stat sys_newstat compat_sys_newstat
+107 i386 lstat sys_newlstat compat_sys_newlstat
+108 i386 fstat sys_newfstat compat_sys_newfstat
+109 i386 olduname sys_uname
+110 i386 iopl sys_iopl
+111 i386 vhangup sys_vhangup
+112 i386 idle
+113 i386 vm86old sys_vm86old sys_ni_syscall
+114 i386 wait4 sys_wait4 compat_sys_wait4
+115 i386 swapoff sys_swapoff
+116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
+117 i386 ipc sys_ipc compat_sys_ipc
+118 i386 fsync sys_fsync
+119 i386 sigreturn sys_sigreturn compat_sys_sigreturn
+120 i386 clone sys_clone compat_sys_ia32_clone
+121 i386 setdomainname sys_setdomainname
+122 i386 uname sys_newuname
+123 i386 modify_ldt sys_modify_ldt
+124 i386 adjtimex sys_adjtimex_time32
+125 i386 mprotect sys_mprotect
+126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask
+127 i386 create_module
+128 i386 init_module sys_init_module
+129 i386 delete_module sys_delete_module
+130 i386 get_kernel_syms
+131 i386 quotactl sys_quotactl
+132 i386 getpgid sys_getpgid
+133 i386 fchdir sys_fchdir
+134 i386 bdflush sys_ni_syscall
+135 i386 sysfs sys_sysfs
+136 i386 personality sys_personality
+137 i386 afs_syscall
+138 i386 setfsuid sys_setfsuid16
+139 i386 setfsgid sys_setfsgid16
+140 i386 _llseek sys_llseek
+141 i386 getdents sys_getdents compat_sys_getdents
+142 i386 _newselect sys_select compat_sys_select
+143 i386 flock sys_flock
+144 i386 msync sys_msync
+145 i386 readv sys_readv
+146 i386 writev sys_writev
+147 i386 getsid sys_getsid
+148 i386 fdatasync sys_fdatasync
+149 i386 _sysctl sys_ni_syscall
+150 i386 mlock sys_mlock
+151 i386 munlock sys_munlock
+152 i386 mlockall sys_mlockall
+153 i386 munlockall sys_munlockall
+154 i386 sched_setparam sys_sched_setparam
+155 i386 sched_getparam sys_sched_getparam
+156 i386 sched_setscheduler sys_sched_setscheduler
+157 i386 sched_getscheduler sys_sched_getscheduler
+158 i386 sched_yield sys_sched_yield
+159 i386 sched_get_priority_max sys_sched_get_priority_max
+160 i386 sched_get_priority_min sys_sched_get_priority_min
+161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32
+162 i386 nanosleep sys_nanosleep_time32
+163 i386 mremap sys_mremap
+164 i386 setresuid sys_setresuid16
+165 i386 getresuid sys_getresuid16
+166 i386 vm86 sys_vm86 sys_ni_syscall
+167 i386 query_module
+168 i386 poll sys_poll
+169 i386 nfsservctl
+170 i386 setresgid sys_setresgid16
+171 i386 getresgid sys_getresgid16
+172 i386 prctl sys_prctl
+173 i386 rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn
+174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction
+175 i386 rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask
+176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending
+177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 compat_sys_rt_sigtimedwait_time32
+178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo
+179 i386 rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend
+180 i386 pread64 sys_ia32_pread64
+181 i386 pwrite64 sys_ia32_pwrite64
+182 i386 chown sys_chown16
+183 i386 getcwd sys_getcwd
+184 i386 capget sys_capget
+185 i386 capset sys_capset
+186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack
+187 i386 sendfile sys_sendfile compat_sys_sendfile
+188 i386 getpmsg
+189 i386 putpmsg
+190 i386 vfork sys_vfork
+191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit
+192 i386 mmap2 sys_mmap_pgoff
+193 i386 truncate64 sys_ia32_truncate64
+194 i386 ftruncate64 sys_ia32_ftruncate64
+195 i386 stat64 sys_stat64 compat_sys_ia32_stat64
+196 i386 lstat64 sys_lstat64 compat_sys_ia32_lstat64
+197 i386 fstat64 sys_fstat64 compat_sys_ia32_fstat64
+198 i386 lchown32 sys_lchown
+199 i386 getuid32 sys_getuid
+200 i386 getgid32 sys_getgid
+201 i386 geteuid32 sys_geteuid
+202 i386 getegid32 sys_getegid
+203 i386 setreuid32 sys_setreuid
+204 i386 setregid32 sys_setregid
+205 i386 getgroups32 sys_getgroups
+206 i386 setgroups32 sys_setgroups
+207 i386 fchown32 sys_fchown
+208 i386 setresuid32 sys_setresuid
+209 i386 getresuid32 sys_getresuid
+210 i386 setresgid32 sys_setresgid
+211 i386 getresgid32 sys_getresgid
+212 i386 chown32 sys_chown
+213 i386 setuid32 sys_setuid
+214 i386 setgid32 sys_setgid
+215 i386 setfsuid32 sys_setfsuid
+216 i386 setfsgid32 sys_setfsgid
+217 i386 pivot_root sys_pivot_root
+218 i386 mincore sys_mincore
+219 i386 madvise sys_madvise
+220 i386 getdents64 sys_getdents64
+221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64
+# 222 is unused
+# 223 is unused
+224 i386 gettid sys_gettid
+225 i386 readahead sys_ia32_readahead
+226 i386 setxattr sys_setxattr
+227 i386 lsetxattr sys_lsetxattr
+228 i386 fsetxattr sys_fsetxattr
+229 i386 getxattr sys_getxattr
+230 i386 lgetxattr sys_lgetxattr
+231 i386 fgetxattr sys_fgetxattr
+232 i386 listxattr sys_listxattr
+233 i386 llistxattr sys_llistxattr
+234 i386 flistxattr sys_flistxattr
+235 i386 removexattr sys_removexattr
+236 i386 lremovexattr sys_lremovexattr
+237 i386 fremovexattr sys_fremovexattr
+238 i386 tkill sys_tkill
+239 i386 sendfile64 sys_sendfile64
+240 i386 futex sys_futex_time32
+241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity
+242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity
+243 i386 set_thread_area sys_set_thread_area
+244 i386 get_thread_area sys_get_thread_area
+245 i386 io_setup sys_io_setup compat_sys_io_setup
+246 i386 io_destroy sys_io_destroy
+247 i386 io_getevents sys_io_getevents_time32
+248 i386 io_submit sys_io_submit compat_sys_io_submit
+249 i386 io_cancel sys_io_cancel
+250 i386 fadvise64 sys_ia32_fadvise64
+# 251 is available for reuse (was briefly sys_set_zone_reclaim)
+252 i386 exit_group sys_exit_group
+253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
+254 i386 epoll_create sys_epoll_create
+255 i386 epoll_ctl sys_epoll_ctl
+256 i386 epoll_wait sys_epoll_wait
+257 i386 remap_file_pages sys_remap_file_pages
+258 i386 set_tid_address sys_set_tid_address
+259 i386 timer_create sys_timer_create compat_sys_timer_create
+260 i386 timer_settime sys_timer_settime32
+261 i386 timer_gettime sys_timer_gettime32
+262 i386 timer_getoverrun sys_timer_getoverrun
+263 i386 timer_delete sys_timer_delete
+264 i386 clock_settime sys_clock_settime32
+265 i386 clock_gettime sys_clock_gettime32
+266 i386 clock_getres sys_clock_getres_time32
+267 i386 clock_nanosleep sys_clock_nanosleep_time32
+268 i386 statfs64 sys_statfs64 compat_sys_statfs64
+269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
+270 i386 tgkill sys_tgkill
+271 i386 utimes sys_utimes_time32
+272 i386 fadvise64_64 sys_ia32_fadvise64_64
+273 i386 vserver
+274 i386 mbind sys_mbind
+275 i386 get_mempolicy sys_get_mempolicy
+276 i386 set_mempolicy sys_set_mempolicy
+277 i386 mq_open sys_mq_open compat_sys_mq_open
+278 i386 mq_unlink sys_mq_unlink
+279 i386 mq_timedsend sys_mq_timedsend_time32
+280 i386 mq_timedreceive sys_mq_timedreceive_time32
+281 i386 mq_notify sys_mq_notify compat_sys_mq_notify
+282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr
+283 i386 kexec_load sys_kexec_load compat_sys_kexec_load
+284 i386 waitid sys_waitid compat_sys_waitid
+# 285 sys_setaltroot
+286 i386 add_key sys_add_key
+287 i386 request_key sys_request_key
+288 i386 keyctl sys_keyctl compat_sys_keyctl
+289 i386 ioprio_set sys_ioprio_set
+290 i386 ioprio_get sys_ioprio_get
+291 i386 inotify_init sys_inotify_init
+292 i386 inotify_add_watch sys_inotify_add_watch
+293 i386 inotify_rm_watch sys_inotify_rm_watch
+294 i386 migrate_pages sys_migrate_pages
+295 i386 openat sys_openat compat_sys_openat
+296 i386 mkdirat sys_mkdirat
+297 i386 mknodat sys_mknodat
+298 i386 fchownat sys_fchownat
+299 i386 futimesat sys_futimesat_time32
+300 i386 fstatat64 sys_fstatat64 compat_sys_ia32_fstatat64
+301 i386 unlinkat sys_unlinkat
+302 i386 renameat sys_renameat
+303 i386 linkat sys_linkat
+304 i386 symlinkat sys_symlinkat
+305 i386 readlinkat sys_readlinkat
+306 i386 fchmodat sys_fchmodat
+307 i386 faccessat sys_faccessat
+308 i386 pselect6 sys_pselect6_time32 compat_sys_pselect6_time32
+309 i386 ppoll sys_ppoll_time32 compat_sys_ppoll_time32
+310 i386 unshare sys_unshare
+311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list
+312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list
+313 i386 splice sys_splice
+314 i386 sync_file_range sys_ia32_sync_file_range
+315 i386 tee sys_tee
+316 i386 vmsplice sys_vmsplice
+317 i386 move_pages sys_move_pages
+318 i386 getcpu sys_getcpu
+319 i386 epoll_pwait sys_epoll_pwait
+320 i386 utimensat sys_utimensat_time32
+321 i386 signalfd sys_signalfd compat_sys_signalfd
+322 i386 timerfd_create sys_timerfd_create
+323 i386 eventfd sys_eventfd
+324 i386 fallocate sys_ia32_fallocate
+325 i386 timerfd_settime sys_timerfd_settime32
+326 i386 timerfd_gettime sys_timerfd_gettime32
+327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4
+328 i386 eventfd2 sys_eventfd2
+329 i386 epoll_create1 sys_epoll_create1
+330 i386 dup3 sys_dup3
+331 i386 pipe2 sys_pipe2
+332 i386 inotify_init1 sys_inotify_init1
+333 i386 preadv sys_preadv compat_sys_preadv
+334 i386 pwritev sys_pwritev compat_sys_pwritev
+335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
+336 i386 perf_event_open sys_perf_event_open
+337 i386 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32
+338 i386 fanotify_init sys_fanotify_init
+339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark
+340 i386 prlimit64 sys_prlimit64
+341 i386 name_to_handle_at sys_name_to_handle_at
+342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at
+343 i386 clock_adjtime sys_clock_adjtime32
+344 i386 syncfs sys_syncfs
+345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg
+346 i386 setns sys_setns
+347 i386 process_vm_readv sys_process_vm_readv
+348 i386 process_vm_writev sys_process_vm_writev
+349 i386 kcmp sys_kcmp
+350 i386 finit_module sys_finit_module
+351 i386 sched_setattr sys_sched_setattr
+352 i386 sched_getattr sys_sched_getattr
+353 i386 renameat2 sys_renameat2
+354 i386 seccomp sys_seccomp
+355 i386 getrandom sys_getrandom
+356 i386 memfd_create sys_memfd_create
+357 i386 bpf sys_bpf
+358 i386 execveat sys_execveat compat_sys_execveat
+359 i386 socket sys_socket
+360 i386 socketpair sys_socketpair
+361 i386 bind sys_bind
+362 i386 connect sys_connect
+363 i386 listen sys_listen
+364 i386 accept4 sys_accept4
+365 i386 getsockopt sys_getsockopt sys_getsockopt
+366 i386 setsockopt sys_setsockopt sys_setsockopt
+367 i386 getsockname sys_getsockname
+368 i386 getpeername sys_getpeername
+369 i386 sendto sys_sendto
+370 i386 sendmsg sys_sendmsg compat_sys_sendmsg
+371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
+372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
+373 i386 shutdown sys_shutdown
+374 i386 userfaultfd sys_userfaultfd
+375 i386 membarrier sys_membarrier
+376 i386 mlock2 sys_mlock2
+377 i386 copy_file_range sys_copy_file_range
+378 i386 preadv2 sys_preadv2 compat_sys_preadv2
+379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2
+380 i386 pkey_mprotect sys_pkey_mprotect
+381 i386 pkey_alloc sys_pkey_alloc
+382 i386 pkey_free sys_pkey_free
+383 i386 statx sys_statx
+384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents
+386 i386 rseq sys_rseq
+393 i386 semget sys_semget
+394 i386 semctl sys_semctl compat_sys_semctl
+395 i386 shmget sys_shmget
+396 i386 shmctl sys_shmctl compat_sys_shmctl
+397 i386 shmat sys_shmat compat_sys_shmat
+398 i386 shmdt sys_shmdt
+399 i386 msgget sys_msgget
+400 i386 msgsnd sys_msgsnd compat_sys_msgsnd
+401 i386 msgrcv sys_msgrcv compat_sys_msgrcv
+402 i386 msgctl sys_msgctl compat_sys_msgctl
+403 i386 clock_gettime64 sys_clock_gettime
+404 i386 clock_settime64 sys_clock_settime
+405 i386 clock_adjtime64 sys_clock_adjtime
+406 i386 clock_getres_time64 sys_clock_getres
+407 i386 clock_nanosleep_time64 sys_clock_nanosleep
+408 i386 timer_gettime64 sys_timer_gettime
+409 i386 timer_settime64 sys_timer_settime
+410 i386 timerfd_gettime64 sys_timerfd_gettime
+411 i386 timerfd_settime64 sys_timerfd_settime
+412 i386 utimensat_time64 sys_utimensat
+413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64
+414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64
+416 i386 io_pgetevents_time64 sys_io_pgetevents
+417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64
+418 i386 mq_timedsend_time64 sys_mq_timedsend
+419 i386 mq_timedreceive_time64 sys_mq_timedreceive
+420 i386 semtimedop_time64 sys_semtimedop
+421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64
+422 i386 futex_time64 sys_futex
+423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval
+424 i386 pidfd_send_signal sys_pidfd_send_signal
+425 i386 io_uring_setup sys_io_uring_setup
+426 i386 io_uring_enter sys_io_uring_enter
+427 i386 io_uring_register sys_io_uring_register
+428 i386 open_tree sys_open_tree
+429 i386 move_mount sys_move_mount
+430 i386 fsopen sys_fsopen
+431 i386 fsconfig sys_fsconfig
+432 i386 fsmount sys_fsmount
+433 i386 fspick sys_fspick
+434 i386 pidfd_open sys_pidfd_open
+435 i386 clone3 sys_clone3
+436 i386 close_range sys_close_range
+437 i386 openat2 sys_openat2
+438 i386 pidfd_getfd sys_pidfd_getfd
+439 i386 faccessat2 sys_faccessat2
+440 i386 process_madvise sys_process_madvise
+441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+442 i386 mount_setattr sys_mount_setattr
+443 i386 quotactl_fd sys_quotactl_fd
+444 i386 landlock_create_ruleset sys_landlock_create_ruleset
+445 i386 landlock_add_rule sys_landlock_add_rule
+446 i386 landlock_restrict_self sys_landlock_restrict_self
+447 i386 memfd_secret sys_memfd_secret
+448 i386 process_mrelease sys_process_mrelease
+449 i386 futex_waitv sys_futex_waitv
+450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
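For completeness, the i386 numbers in this table can still be exercised directly from 32-bit userspace with int $0x80, which is handled by the compat entry code in entry_64_compat.S earlier in this patch. A small sketch, assuming a 64-bit host with IA32 emulation and a multilib toolchain (build with gcc -m32):

/* 32-bit userspace: invoke table entry 4 (write) via int $0x80. */
int main(void)
{
	static const char msg[] = "hello via int $0x80\n";
	long ret;

	/* i386 convention: eax = number, ebx/ecx/edx = args 1-3 */
	asm volatile ("int $0x80"
		      : "=a" (ret)
		      : "a" (4), "b" (1), "c" (msg), "d" (sizeof(msg) - 1)
		      : "memory");
	return ret < 0;
}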
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
new file mode 100644
index 000000000..c84d12608
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -0,0 +1,419 @@
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
+#
+# The abi is "common", "64" or "x32" for this file.
+#
+0 common read sys_read
+1 common write sys_write
+2 common open sys_open
+3 common close sys_close
+4 common stat sys_newstat
+5 common fstat sys_newfstat
+6 common lstat sys_newlstat
+7 common poll sys_poll
+8 common lseek sys_lseek
+9 common mmap sys_mmap
+10 common mprotect sys_mprotect
+11 common munmap sys_munmap
+12 common brk sys_brk
+13 64 rt_sigaction sys_rt_sigaction
+14 common rt_sigprocmask sys_rt_sigprocmask
+15 64 rt_sigreturn sys_rt_sigreturn
+16 64 ioctl sys_ioctl
+17 common pread64 sys_pread64
+18 common pwrite64 sys_pwrite64
+19 64 readv sys_readv
+20 64 writev sys_writev
+21 common access sys_access
+22 common pipe sys_pipe
+23 common select sys_select
+24 common sched_yield sys_sched_yield
+25 common mremap sys_mremap
+26 common msync sys_msync
+27 common mincore sys_mincore
+28 common madvise sys_madvise
+29 common shmget sys_shmget
+30 common shmat sys_shmat
+31 common shmctl sys_shmctl
+32 common dup sys_dup
+33 common dup2 sys_dup2
+34 common pause sys_pause
+35 common nanosleep sys_nanosleep
+36 common getitimer sys_getitimer
+37 common alarm sys_alarm
+38 common setitimer sys_setitimer
+39 common getpid sys_getpid
+40 common sendfile sys_sendfile64
+41 common socket sys_socket
+42 common connect sys_connect
+43 common accept sys_accept
+44 common sendto sys_sendto
+45 64 recvfrom sys_recvfrom
+46 64 sendmsg sys_sendmsg
+47 64 recvmsg sys_recvmsg
+48 common shutdown sys_shutdown
+49 common bind sys_bind
+50 common listen sys_listen
+51 common getsockname sys_getsockname
+52 common getpeername sys_getpeername
+53 common socketpair sys_socketpair
+54 64 setsockopt sys_setsockopt
+55 64 getsockopt sys_getsockopt
+56 common clone sys_clone
+57 common fork sys_fork
+58 common vfork sys_vfork
+59 64 execve sys_execve
+60 common exit sys_exit
+61 common wait4 sys_wait4
+62 common kill sys_kill
+63 common uname sys_newuname
+64 common semget sys_semget
+65 common semop sys_semop
+66 common semctl sys_semctl
+67 common shmdt sys_shmdt
+68 common msgget sys_msgget
+69 common msgsnd sys_msgsnd
+70 common msgrcv sys_msgrcv
+71 common msgctl sys_msgctl
+72 common fcntl sys_fcntl
+73 common flock sys_flock
+74 common fsync sys_fsync
+75 common fdatasync sys_fdatasync
+76 common truncate sys_truncate
+77 common ftruncate sys_ftruncate
+78 common getdents sys_getdents
+79 common getcwd sys_getcwd
+80 common chdir sys_chdir
+81 common fchdir sys_fchdir
+82 common rename sys_rename
+83 common mkdir sys_mkdir
+84 common rmdir sys_rmdir
+85 common creat sys_creat
+86 common link sys_link
+87 common unlink sys_unlink
+88 common symlink sys_symlink
+89 common readlink sys_readlink
+90 common chmod sys_chmod
+91 common fchmod sys_fchmod
+92 common chown sys_chown
+93 common fchown sys_fchown
+94 common lchown sys_lchown
+95 common umask sys_umask
+96 common gettimeofday sys_gettimeofday
+97 common getrlimit sys_getrlimit
+98 common getrusage sys_getrusage
+99 common sysinfo sys_sysinfo
+100 common times sys_times
+101 64 ptrace sys_ptrace
+102 common getuid sys_getuid
+103 common syslog sys_syslog
+104 common getgid sys_getgid
+105 common setuid sys_setuid
+106 common setgid sys_setgid
+107 common geteuid sys_geteuid
+108 common getegid sys_getegid
+109 common setpgid sys_setpgid
+110 common getppid sys_getppid
+111 common getpgrp sys_getpgrp
+112 common setsid sys_setsid
+113 common setreuid sys_setreuid
+114 common setregid sys_setregid
+115 common getgroups sys_getgroups
+116 common setgroups sys_setgroups
+117 common setresuid sys_setresuid
+118 common getresuid sys_getresuid
+119 common setresgid sys_setresgid
+120 common getresgid sys_getresgid
+121 common getpgid sys_getpgid
+122 common setfsuid sys_setfsuid
+123 common setfsgid sys_setfsgid
+124 common getsid sys_getsid
+125 common capget sys_capget
+126 common capset sys_capset
+127 64 rt_sigpending sys_rt_sigpending
+128 64 rt_sigtimedwait sys_rt_sigtimedwait
+129 64 rt_sigqueueinfo sys_rt_sigqueueinfo
+130 common rt_sigsuspend sys_rt_sigsuspend
+131 64 sigaltstack sys_sigaltstack
+132 common utime sys_utime
+133 common mknod sys_mknod
+134 64 uselib
+135 common personality sys_personality
+136 common ustat sys_ustat
+137 common statfs sys_statfs
+138 common fstatfs sys_fstatfs
+139 common sysfs sys_sysfs
+140 common getpriority sys_getpriority
+141 common setpriority sys_setpriority
+142 common sched_setparam sys_sched_setparam
+143 common sched_getparam sys_sched_getparam
+144 common sched_setscheduler sys_sched_setscheduler
+145 common sched_getscheduler sys_sched_getscheduler
+146 common sched_get_priority_max sys_sched_get_priority_max
+147 common sched_get_priority_min sys_sched_get_priority_min
+148 common sched_rr_get_interval sys_sched_rr_get_interval
+149 common mlock sys_mlock
+150 common munlock sys_munlock
+151 common mlockall sys_mlockall
+152 common munlockall sys_munlockall
+153 common vhangup sys_vhangup
+154 common modify_ldt sys_modify_ldt
+155 common pivot_root sys_pivot_root
+156 64 _sysctl sys_ni_syscall
+157 common prctl sys_prctl
+158 common arch_prctl sys_arch_prctl
+159 common adjtimex sys_adjtimex
+160 common setrlimit sys_setrlimit
+161 common chroot sys_chroot
+162 common sync sys_sync
+163 common acct sys_acct
+164 common settimeofday sys_settimeofday
+165 common mount sys_mount
+166 common umount2 sys_umount
+167 common swapon sys_swapon
+168 common swapoff sys_swapoff
+169 common reboot sys_reboot
+170 common sethostname sys_sethostname
+171 common setdomainname sys_setdomainname
+172 common iopl sys_iopl
+173 common ioperm sys_ioperm
+174 64 create_module
+175 common init_module sys_init_module
+176 common delete_module sys_delete_module
+177 64 get_kernel_syms
+178 64 query_module
+179 common quotactl sys_quotactl
+180 64 nfsservctl
+181 common getpmsg
+182 common putpmsg
+183 common afs_syscall
+184 common tuxcall
+185 common security
+186 common gettid sys_gettid
+187 common readahead sys_readahead
+188 common setxattr sys_setxattr
+189 common lsetxattr sys_lsetxattr
+190 common fsetxattr sys_fsetxattr
+191 common getxattr sys_getxattr
+192 common lgetxattr sys_lgetxattr
+193 common fgetxattr sys_fgetxattr
+194 common listxattr sys_listxattr
+195 common llistxattr sys_llistxattr
+196 common flistxattr sys_flistxattr
+197 common removexattr sys_removexattr
+198 common lremovexattr sys_lremovexattr
+199 common fremovexattr sys_fremovexattr
+200 common tkill sys_tkill
+201 common time sys_time
+202 common futex sys_futex
+203 common sched_setaffinity sys_sched_setaffinity
+204 common sched_getaffinity sys_sched_getaffinity
+205 64 set_thread_area
+206 64 io_setup sys_io_setup
+207 common io_destroy sys_io_destroy
+208 common io_getevents sys_io_getevents
+209 64 io_submit sys_io_submit
+210 common io_cancel sys_io_cancel
+211 64 get_thread_area
+212 common lookup_dcookie sys_lookup_dcookie
+213 common epoll_create sys_epoll_create
+214 64 epoll_ctl_old
+215 64 epoll_wait_old
+216 common remap_file_pages sys_remap_file_pages
+217 common getdents64 sys_getdents64
+218 common set_tid_address sys_set_tid_address
+219 common restart_syscall sys_restart_syscall
+220 common semtimedop sys_semtimedop
+221 common fadvise64 sys_fadvise64
+222 64 timer_create sys_timer_create
+223 common timer_settime sys_timer_settime
+224 common timer_gettime sys_timer_gettime
+225 common timer_getoverrun sys_timer_getoverrun
+226 common timer_delete sys_timer_delete
+227 common clock_settime sys_clock_settime
+228 common clock_gettime sys_clock_gettime
+229 common clock_getres sys_clock_getres
+230 common clock_nanosleep sys_clock_nanosleep
+231 common exit_group sys_exit_group
+232 common epoll_wait sys_epoll_wait
+233 common epoll_ctl sys_epoll_ctl
+234 common tgkill sys_tgkill
+235 common utimes sys_utimes
+236 64 vserver
+237 common mbind sys_mbind
+238 common set_mempolicy sys_set_mempolicy
+239 common get_mempolicy sys_get_mempolicy
+240 common mq_open sys_mq_open
+241 common mq_unlink sys_mq_unlink
+242 common mq_timedsend sys_mq_timedsend
+243 common mq_timedreceive sys_mq_timedreceive
+244 64 mq_notify sys_mq_notify
+245 common mq_getsetattr sys_mq_getsetattr
+246 64 kexec_load sys_kexec_load
+247 64 waitid sys_waitid
+248 common add_key sys_add_key
+249 common request_key sys_request_key
+250 common keyctl sys_keyctl
+251 common ioprio_set sys_ioprio_set
+252 common ioprio_get sys_ioprio_get
+253 common inotify_init sys_inotify_init
+254 common inotify_add_watch sys_inotify_add_watch
+255 common inotify_rm_watch sys_inotify_rm_watch
+256 common migrate_pages sys_migrate_pages
+257 common openat sys_openat
+258 common mkdirat sys_mkdirat
+259 common mknodat sys_mknodat
+260 common fchownat sys_fchownat
+261 common futimesat sys_futimesat
+262 common newfstatat sys_newfstatat
+263 common unlinkat sys_unlinkat
+264 common renameat sys_renameat
+265 common linkat sys_linkat
+266 common symlinkat sys_symlinkat
+267 common readlinkat sys_readlinkat
+268 common fchmodat sys_fchmodat
+269 common faccessat sys_faccessat
+270 common pselect6 sys_pselect6
+271 common ppoll sys_ppoll
+272 common unshare sys_unshare
+273 64 set_robust_list sys_set_robust_list
+274 64 get_robust_list sys_get_robust_list
+275 common splice sys_splice
+276 common tee sys_tee
+277 common sync_file_range sys_sync_file_range
+278 64 vmsplice sys_vmsplice
+279 64 move_pages sys_move_pages
+280 common utimensat sys_utimensat
+281 common epoll_pwait sys_epoll_pwait
+282 common signalfd sys_signalfd
+283 common timerfd_create sys_timerfd_create
+284 common eventfd sys_eventfd
+285 common fallocate sys_fallocate
+286 common timerfd_settime sys_timerfd_settime
+287 common timerfd_gettime sys_timerfd_gettime
+288 common accept4 sys_accept4
+289 common signalfd4 sys_signalfd4
+290 common eventfd2 sys_eventfd2
+291 common epoll_create1 sys_epoll_create1
+292 common dup3 sys_dup3
+293 common pipe2 sys_pipe2
+294 common inotify_init1 sys_inotify_init1
+295 64 preadv sys_preadv
+296 64 pwritev sys_pwritev
+297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo
+298 common perf_event_open sys_perf_event_open
+299 64 recvmmsg sys_recvmmsg
+300 common fanotify_init sys_fanotify_init
+301 common fanotify_mark sys_fanotify_mark
+302 common prlimit64 sys_prlimit64
+303 common name_to_handle_at sys_name_to_handle_at
+304 common open_by_handle_at sys_open_by_handle_at
+305 common clock_adjtime sys_clock_adjtime
+306 common syncfs sys_syncfs
+307 64 sendmmsg sys_sendmmsg
+308 common setns sys_setns
+309 common getcpu sys_getcpu
+310 64 process_vm_readv sys_process_vm_readv
+311 64 process_vm_writev sys_process_vm_writev
+312 common kcmp sys_kcmp
+313 common finit_module sys_finit_module
+314 common sched_setattr sys_sched_setattr
+315 common sched_getattr sys_sched_getattr
+316 common renameat2 sys_renameat2
+317 common seccomp sys_seccomp
+318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
+320 common kexec_file_load sys_kexec_file_load
+321 common bpf sys_bpf
+322 64 execveat sys_execveat
+323 common userfaultfd sys_userfaultfd
+324 common membarrier sys_membarrier
+325 common mlock2 sys_mlock2
+326 common copy_file_range sys_copy_file_range
+327 64 preadv2 sys_preadv2
+328 64 pwritev2 sys_pwritev2
+329 common pkey_mprotect sys_pkey_mprotect
+330 common pkey_alloc sys_pkey_alloc
+331 common pkey_free sys_pkey_free
+332 common statx sys_statx
+333 common io_pgetevents sys_io_pgetevents
+334 common rseq sys_rseq
+# don't use numbers 387 through 423, add new calls after the last
+# 'common' entry
+424 common pidfd_send_signal sys_pidfd_send_signal
+425 common io_uring_setup sys_io_uring_setup
+426 common io_uring_enter sys_io_uring_enter
+427 common io_uring_register sys_io_uring_register
+428 common open_tree sys_open_tree
+429 common move_mount sys_move_mount
+430 common fsopen sys_fsopen
+431 common fsconfig sys_fsconfig
+432 common fsmount sys_fsmount
+433 common fspick sys_fspick
+434 common pidfd_open sys_pidfd_open
+435 common clone3 sys_clone3
+436 common close_range sys_close_range
+437 common openat2 sys_openat2
+438 common pidfd_getfd sys_pidfd_getfd
+439 common faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2
+442 common mount_setattr sys_mount_setattr
+443 common quotactl_fd sys_quotactl_fd
+444 common landlock_create_ruleset sys_landlock_create_ruleset
+445 common landlock_add_rule sys_landlock_add_rule
+446 common landlock_restrict_self sys_landlock_restrict_self
+447 common memfd_secret sys_memfd_secret
+448 common process_mrelease sys_process_mrelease
+449 common futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+
+#
+# Due to a historical design error, certain syscalls are numbered differently
+# in x32 as compared to native x86_64. These syscalls have numbers 512-547.
+# Do not add new syscalls to this range. Numbers 548 and above are available
+# for non-x32 use.
+#
+512 x32 rt_sigaction compat_sys_rt_sigaction
+513 x32 rt_sigreturn compat_sys_x32_rt_sigreturn
+514 x32 ioctl compat_sys_ioctl
+515 x32 readv sys_readv
+516 x32 writev sys_writev
+517 x32 recvfrom compat_sys_recvfrom
+518 x32 sendmsg compat_sys_sendmsg
+519 x32 recvmsg compat_sys_recvmsg
+520 x32 execve compat_sys_execve
+521 x32 ptrace compat_sys_ptrace
+522 x32 rt_sigpending compat_sys_rt_sigpending
+523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait_time64
+524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo
+525 x32 sigaltstack compat_sys_sigaltstack
+526 x32 timer_create compat_sys_timer_create
+527 x32 mq_notify compat_sys_mq_notify
+528 x32 kexec_load compat_sys_kexec_load
+529 x32 waitid compat_sys_waitid
+530 x32 set_robust_list compat_sys_set_robust_list
+531 x32 get_robust_list compat_sys_get_robust_list
+532 x32 vmsplice sys_vmsplice
+533 x32 move_pages sys_move_pages
+534 x32 preadv compat_sys_preadv64
+535 x32 pwritev compat_sys_pwritev64
+536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
+537 x32 recvmmsg compat_sys_recvmmsg_time64
+538 x32 sendmmsg compat_sys_sendmmsg
+539 x32 process_vm_readv sys_process_vm_readv
+540 x32 process_vm_writev sys_process_vm_writev
+541 x32 setsockopt sys_setsockopt
+542 x32 getsockopt sys_getsockopt
+543 x32 io_setup compat_sys_io_setup
+544 x32 io_submit compat_sys_io_submit
+545 x32 execveat compat_sys_execveat
+546 x32 preadv2 compat_sys_preadv64v2
+547 x32 pwritev2 compat_sys_pwritev64v2
+# This is the end of the legacy x32 range. Numbers 548 and above are
+# not special and are not to be used for x32-specific syscalls.
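The x32 rows (512-547), like the 'common' rows, are reached from x32 userspace by setting __X32_SYSCALL_BIT in the syscall number -- the same constant the syscalls/Makefile above passes as --offset when generating unistd_x32.h. A tiny sketch, assuming the usual uapi value of 0x40000000 for that bit and an x32-enabled kernel (CONFIG_X86_X32_ABI):

/* Sketch: composing an x32 syscall number from a table entry. */
#include <stdio.h>

#define X32_SYSCALL_BIT 0x40000000UL	/* assumed value of __X32_SYSCALL_BIT */

int main(void)
{
	unsigned long nr = X32_SYSCALL_BIT + 512;	/* x32 rt_sigaction, row 512 above */

	printf("x32 rt_sigaction is invoked as syscall number %#lx\n", nr);
	return 0;
}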
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
new file mode 100644
index 000000000..ff6e7003d
--- /dev/null
+++ b/arch/x86/entry/thunk_32.S
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ * (inspired by Andi Kleen's thunk_64.S)
+ */
+ #include <linux/linkage.h>
+ #include <asm/asm.h>
+ #include <asm/export.h>
+
+ /* put return address in eax (arg1) */
+ .macro THUNK name, func, put_ret_addr_in_eax=0
+SYM_CODE_START_NOALIGN(\name)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ .if \put_ret_addr_in_eax
+ /* Place EIP in the arg1 */
+ movl 3*4(%esp), %eax
+ .endif
+
+ call \func
+ popl %edx
+ popl %ecx
+ popl %eax
+ RET
+ _ASM_NOKPROBE(\name)
+SYM_CODE_END(\name)
+ .endm
+
+ THUNK preempt_schedule_thunk, preempt_schedule
+ THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
+ EXPORT_SYMBOL(preempt_schedule_thunk)
+ EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
+
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
new file mode 100644
index 000000000..f38b07d27
--- /dev/null
+++ b/arch/x86/entry/thunk_64.S
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ */
+#include <linux/linkage.h>
+#include "calling.h"
+#include <asm/asm.h>
+#include <asm/export.h>
+
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func
+SYM_FUNC_START_NOALIGN(\name)
+ pushq %rbp
+ movq %rsp, %rbp
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+
+ call \func
+ jmp __thunk_restore
+SYM_FUNC_END(\name)
+ _ASM_NOKPROBE(\name)
+ .endm
+
+ THUNK preempt_schedule_thunk, preempt_schedule
+ THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
+ EXPORT_SYMBOL(preempt_schedule_thunk)
+ EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
+
+SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ popq %rbp
+ RET
+ _ASM_NOKPROBE(__thunk_restore)
+SYM_CODE_END(__thunk_restore)
diff --git a/arch/x86/entry/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore
new file mode 100644
index 000000000..37a6129d5
--- /dev/null
+++ b/arch/x86/entry/vdso/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso.lds
+vdsox32.lds
+vdso32-syscall-syms.lds
+vdso32-sysenter-syms.lds
+vdso32-int80-syms.lds
+vdso-image-*.c
+vdso2c
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
new file mode 100644
index 000000000..3e88b9df8
--- /dev/null
+++ b/arch/x86/entry/vdso/Makefile
@@ -0,0 +1,220 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Building vDSO images for x86.
+#
+
+# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
+# the inclusion of the generic Makefile.
+ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE|
+ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
+include $(srctree)/lib/vdso/Makefile
+
+# Sanitizer runtimes are unavailable and cannot be linked here.
+KASAN_SANITIZE := n
+KMSAN_SANITIZE_vclock_gettime.o := n
+KMSAN_SANITIZE_vgetcpu.o := n
+
+UBSAN_SANITIZE := n
+KCSAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
+VDSO64-$(CONFIG_X86_64) := y
+VDSOX32-$(CONFIG_X86_X32_ABI) := y
+VDSO32-$(CONFIG_X86_32) := y
+VDSO32-$(CONFIG_IA32_EMULATION) := y
+
+# files to link into the vdso
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
+vobjs32-y += vdso32/vclock_gettime.o
+vobjs-$(CONFIG_X86_SGX) += vsgx.o
+
+# files to link into kernel
+obj-y += vma.o extable.o
+KASAN_SANITIZE_vma.o := y
+UBSAN_SANITIZE_vma.o := y
+KCSAN_SANITIZE_vma.o := y
+OBJECT_FILES_NON_STANDARD_vma.o := n
+
+# vDSO images to build
+vdso_img-$(VDSO64-y) += 64
+vdso_img-$(VDSOX32-y) += x32
+vdso_img-$(VDSO32-y) += 32
+
+obj-$(VDSO32-y) += vdso32-setup.o
+
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
+
+$(obj)/vdso.o: $(obj)/vdso.so
+
+targets += vdso.lds $(vobjs-y)
+targets += vdso32/vdso32.lds $(vobjs32-y)
+
+# Build the vDSO image C files and link them in.
+vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
+vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
+vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
+obj-y += $(vdso_img_objs)
+targets += $(vdso_img_cfiles)
+targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
+
+CPPFLAGS_vdso.lds += -P -C
+
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
+ -z max-page-size=4096
+
+$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
+ $(call if_changed,vdso_and_check)
+
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
+hostprogs += vdso2c
+
+quiet_cmd_vdso2c = VDSO2C $@
+ cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@
+
+$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
+ $(call if_changed,vdso2c)
+
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
+CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
+ $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ CFL += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
+
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vsgx.o = -pg
+
+#
+# X32 processes use the x32 vDSO to access 64-bit kernel data.
+#
+# Build the x32 vDSO image:
+# 1. Compile the x32 vDSO as 64-bit.
+# 2. Convert the object files to x32.
+# 3. Link the x32 vDSO image from the x32 objects, which contain 64-bit code
+#    so that the vDSO can reach the 64-bit address space with 64-bit pointers.
+#
+
+CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
+ -z max-page-size=4096
+
+# x32-rebranded versions
+vobjx32s-y := $(vobjs-y:.o=-x32.o)
+
+# same thing, but in the output directory
+vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
+
+# Convert 64bit object file to x32 for x32 vDSO.
+quiet_cmd_x32 = X32 $@
+ cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@
+
+$(obj)/%-x32.o: $(obj)/%.o FORCE
+ $(call if_changed,x32)
+
+targets += vdsox32.lds $(vobjx32s-y)
+
+$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
+ $(call if_changed,vdso_and_check)
+
+CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
+$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
+KBUILD_CFLAGS_32 += -fno-stack-protector
+KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
+KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
+$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
+ $(call if_changed,vdso_and_check)
+
+#
+# The DSO images are built using a special linker script.
+#
+quiet_cmd_vdso = VDSO $@
+ cmd_vdso = $(LD) -o $@ \
+ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ -T $(filter %.lds,$^) $(filter %.o,$^) && \
+ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
+
+VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \
+ $(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack
+GCOV_PROFILE := n
+
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
+
+#
+# Install the unstripped copies of vdso*.so. If our toolchain supports
+# build-id, install .build-id links as well.
+#
+quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
+ if readelf -n $< |grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
+
+vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
+
+$(MODLIB)/vdso: FORCE
+ @mkdir -p $(MODLIB)/vdso
+
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
+ $(call cmd,vdso_install)
+
+PHONY += vdso_install $(vdso_img_insttargets)
+vdso_install: $(vdso_img_insttargets)
+
+clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
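The vdso_install rule above only copies the unstripped build artifacts; when debugging a live system it is often just as useful to dump the vDSO that is actually mapped into a process. A self-contained userspace sketch using standard /proc interfaces (nothing here depends on this Makefile):

/* Dump this process's own [vdso] mapping to vdso.dump for offline
 * inspection, e.g. with objdump -d. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *maps = fopen("/proc/self/maps", "r");
	FILE *out;
	char line[512];
	unsigned long start = 0, end = 0;

	if (!maps)
		return 1;
	while (fgets(line, sizeof(line), maps)) {
		if (strstr(line, "[vdso]")) {
			sscanf(line, "%lx-%lx", &start, &end);
			break;
		}
	}
	fclose(maps);
	if (!start)
		return 1;

	out = fopen("vdso.dump", "wb");
	if (!out)
		return 1;
	fwrite((const void *)start, 1, end - start, out);
	fclose(out);
	printf("wrote %lu bytes of vDSO mapped at %#lx\n", end - start, start);
	return 0;
}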
diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh
new file mode 100755
index 000000000..7ee90a9b5
--- /dev/null
+++ b/arch/x86/entry/vdso/checkundef.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+nm="$1"
+file="$2"
+$nm "$file" | grep '^ *U' > /dev/null 2>&1
+if [ $? -eq 1 ]; then
+ exit 0
+else
+ echo "$file: undefined symbols found" >&2
+ exit 1
+fi
diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c
new file mode 100644
index 000000000..afcf5b65b
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <asm/current.h>
+#include <asm/traps.h>
+#include <asm/vdso.h>
+
+struct vdso_exception_table_entry {
+ int insn, fixup;
+};
+
+bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, unsigned long fault_addr)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ const struct vdso_exception_table_entry *extable;
+ unsigned int nr_entries, i;
+ unsigned long base;
+
+ /*
+ * Do not attempt to fixup #DB or #BP. It's impossible to identify
+ * whether or not a #DB/#BP originated from within an SGX enclave and
+ * SGX enclaves are currently the only use case for vDSO fixup.
+ */
+ if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP)
+ return false;
+
+ if (!current->mm->context.vdso)
+ return false;
+
+ base = (unsigned long)current->mm->context.vdso + image->extable_base;
+ nr_entries = image->extable_len / (sizeof(*extable));
+ extable = image->extable;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (regs->ip == base + extable[i].insn) {
+ regs->ip = base + extable[i].fixup;
+ regs->di = trapnr;
+ regs->si = error_code;
+ regs->dx = fault_addr;
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h
new file mode 100644
index 000000000..b56f6b012
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_EXTABLE_H
+#define __VDSO_EXTABLE_H
+
+/*
+ * Inject exception fixup for vDSO code. Unlike normal exception fixup,
+ * the vDSO uses a dedicated handler, and the addresses are relative to the
+ * overall exception table, not to each individual entry.
+ */
+#ifdef __ASSEMBLY__
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ASM_VDSO_EXTABLE_HANDLE from to
+
+.macro ASM_VDSO_EXTABLE_HANDLE from:req to:req
+ .pushsection __ex_table, "a"
+ .long (\from) - __ex_table
+ .long (\to) - __ex_table
+ .popsection
+.endm
+#else
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ".pushsection __ex_table, \"a\"\n" \
+ ".long (" #from ") - __ex_table\n" \
+ ".long (" #to ") - __ex_table\n" \
+ ".popsection\n"
+#endif
+
+#endif /* __VDSO_EXTABLE_H */
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
new file mode 100644
index 000000000..7d70935b6
--- /dev/null
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
+ *
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Copyright 2019 ARM Limited
+ *
+ * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ */
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "../../../../lib/vdso/gettimeofday.c"
+
+extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
+extern __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
+
+int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
+{
+ return __cvdso_gettimeofday(tv, tz);
+}
+
+int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
+
+__kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
+{
+ return __cvdso_time(t);
+}
+
+__kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));
+
+
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+/* both 64-bit and x32 use these */
+extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res);
+
+int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_getres(clockid_t clock,
+ struct __kernel_timespec *res)
+{
+ return __cvdso_clock_getres(clock, res);
+}
+int clock_getres(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+
+#else
+/* i386 only */
+extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res);
+
+int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
+{
+ return __cvdso_clock_gettime32(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime64(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime64")));
+
+int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res)
+{
+ return __cvdso_clock_getres_time32(clock, res);
+}
+
+int clock_getres(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+#endif
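From userspace these entry points are normally reached transparently: glibc resolves __vdso_clock_gettime and friends from the vDSO when available and only falls back to the real system call otherwise. A minimal usage example (plain POSIX; the vDSO involvement is invisible in the source):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* On x86-64 this typically never enters the kernel; glibc ends up
	 * calling __vdso_clock_gettime(), i.e. the code above. */
	if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
		perror("clock_gettime");
		return 1;
	}
	printf("monotonic: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}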
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
new file mode 100644
index 000000000..bafa73f09
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/vdso.h>
+
+/*
+ * Linker script for vDSO. This is an ELF shared object prelinked to
+ * its virtual address, and with only one read-only segment.
+ * This script controls its layout.
+ */
+
+SECTIONS
+{
+ /*
+ * User/kernel shared data is before the vDSO. This may be a little
+ * uglier than putting it after the vDSO, but it avoids issues with
+ * non-allocatable things that dangle past the end of the PT_LOAD
+ * segment.
+ */
+
+ vvar_start = . - 4 * PAGE_SIZE;
+ vvar_page = vvar_start;
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
+#include <asm/vvar.h>
+#undef EMIT_VVAR
+
+ pvclock_page = vvar_start + PAGE_SIZE;
+ hvclock_page = vvar_start + 2 * PAGE_SIZE;
+ timens_page = vvar_start + 3 * PAGE_SIZE;
+
+#undef _ASM_X86_VVAR_H
+ /* Place all vvars in timens too at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset;
+#include <asm/vvar.h>
+#undef EMIT_VVAR
+
+ . = SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+ } :text
+
+ /*
+ * Discard .note.gnu.property sections which are unused and have
+ * different alignment requirement from vDSO note sections.
+ */
+ /DISCARD/ : {
+ *(.note.gnu.property)
+ }
+ .note : { *(.note.*) } :text :note
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+
+ /*
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
+ */
+
+ .text : {
+ *(.text*)
+ } :text =0x90909090,
+
+
+
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
+
+ __ex_table : { *(__ex_table) } :text
+
+ /DISCARD/ : {
+ *(.discard)
+ *(.discard.*)
+ *(__bug_table)
+ }
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
new file mode 100644
index 000000000..794231701
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-note.S
@@ -0,0 +1,15 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
new file mode 100644
index 000000000..e8c60ae7a
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 64-bit vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSO64
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ clock_gettime;
+ __vdso_clock_gettime;
+ gettimeofday;
+ __vdso_gettimeofday;
+ getcpu;
+ __vdso_getcpu;
+ time;
+ __vdso_time;
+ clock_getres;
+ __vdso_clock_getres;
+#ifdef CONFIG_X86_SGX
+ __vdso_sgx_enter_enclave;
+#endif
+ local: *;
+ };
+}
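
The version script above is what makes these names visible to userspace. A hedged sketch (x86-64 only, standard glibc headers assumed) that copies the live vDSO out of memory so the exported LINUX_2.6 symbols can be inspected with readelf or objdump:

/* Hedged sketch: dump this process's vDSO image to a file.  Uses the
 * single PT_LOAD segment the linker script above creates to size it. */
#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	unsigned char *base = (unsigned char *)getauxval(AT_SYSINFO_EHDR);
	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)base;
	Elf64_Phdr *phdr;
	size_t size = 0;
	FILE *out;

	if (!base)
		return 1;

	phdr = (Elf64_Phdr *)(base + ehdr->e_phoff);
	for (int i = 0; i < ehdr->e_phnum; i++)
		if (phdr[i].p_type == PT_LOAD)
			size = phdr[i].p_memsz;

	out = fopen("vdso.so", "w");
	if (!out || fwrite(base, 1, size, out) != size)
		return 1;
	fclose(out);

	printf("wrote %zu bytes; try: readelf --dyn-syms vdso.so\n", size);
	return 0;
}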
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
new file mode 100644
index 000000000..90d15f2a7
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso2c - A vdso image preparation tool
+ * Copyright (c) 2014 Andy Lutomirski and others
+ *
+ * vdso2c requires stripped and unstripped input. It would be trivial
+ * to fully strip the input in here, but, for reasons described below,
+ * we need to write a section table. Doing this is more or less
+ * equivalent to dropping all non-allocatable sections, but it's
+ * easier to let objcopy handle that instead of doing it ourselves.
+ * If we ever need to do something fancier than what objcopy provides,
+ * it would be straightforward to add here.
+ *
+ * We keep a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_DYNSYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <err.h>
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <tools/le_byteshift.h>
+
+#include <linux/elf.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+const char *outfilename;
+
+/* Symbols that we need in vdso2c. */
+enum {
+ sym_vvar_start,
+ sym_vvar_page,
+ sym_pvclock_page,
+ sym_hvclock_page,
+ sym_timens_page,
+};
+
+const int special_pages[] = {
+ sym_vvar_page,
+ sym_pvclock_page,
+ sym_hvclock_page,
+ sym_timens_page,
+};
+
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ [sym_vvar_start] = {"vvar_start", true},
+ [sym_vvar_page] = {"vvar_page", true},
+ [sym_pvclock_page] = {"pvclock_page", true},
+ [sym_hvclock_page] = {"hvclock_page", true},
+ [sym_timens_page] = {"timens_page", true},
+ {"VDSO32_NOTE_MASK", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
+ {"int80_landing_pad", true},
+ {"vdso32_rt_sigreturn_landing_pad", true},
+ {"vdso32_sigreturn_landing_pad", true},
+};
+
+__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
+static void fail(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ fprintf(stderr, "Error: ");
+ vfprintf(stderr, format, ap);
+ if (outfilename)
+ unlink(outfilename);
+ va_end(ap);
+ exit(1);
+}
+
+/*
+ * Evil macros for little-endian reads and writes
+ */
+#define GLE(x, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
+
+extern void bad_get_le(void);
+#define LAST_GLE(x) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
+
+#define GET_LE(x) \
+ GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
+
+#define PLE(x, val, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ put_unaligned_le##bits((val), (x)), ifnot)
+
+extern void bad_put_le(void);
+#define LAST_PLE(x, val) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
+
+#define PUT_LE(x, val) \
+ PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
+
+
+#define NSYMS ARRAY_SIZE(required_syms)
+
+#define BITSFUNC3(name, bits, suffix) name##bits##suffix
+#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
+
+#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
+#include "vdso2c.h"
+#undef ELF_BITS
+
+#define ELF_BITS 32
+#include "vdso2c.h"
+#undef ELF_BITS
+
+static void go(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
+{
+ Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
+
+ if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
+ go64(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
+ go32(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else {
+ fail("unknown ELF class\n");
+ }
+}
+
+static void map_input(const char *name, void **addr, size_t *len, int prot)
+{
+ off_t tmp_len;
+
+ int fd = open(name, O_RDONLY);
+ if (fd == -1)
+ err(1, "open(%s)", name);
+
+ tmp_len = lseek(fd, 0, SEEK_END);
+ if (tmp_len == (off_t)-1)
+ err(1, "lseek");
+ *len = (size_t)tmp_len;
+
+ *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
+ if (*addr == MAP_FAILED)
+ err(1, "mmap");
+
+ close(fd);
+}
+
+int main(int argc, char **argv)
+{
+ size_t raw_len, stripped_len;
+ void *raw_addr, *stripped_addr;
+ FILE *outfile;
+ char *name, *tmp;
+ int namelen;
+
+ if (argc != 4) {
+ printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
+ return 1;
+ }
+
+ /*
+ * Figure out the struct name. If we're writing to a .so file,
+ * generate raw output instead.
+ */
+ name = strdup(argv[3]);
+ namelen = strlen(name);
+ if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
+ name = NULL;
+ } else {
+ tmp = strrchr(name, '/');
+ if (tmp)
+ name = tmp + 1;
+ tmp = strchr(name, '.');
+ if (tmp)
+ *tmp = '\0';
+ for (tmp = name; *tmp; tmp++)
+ if (*tmp == '-')
+ *tmp = '_';
+ }
+
+ map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
+ map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
+
+ outfilename = argv[3];
+ outfile = fopen(outfilename, "w");
+ if (!outfile)
+ err(1, "fopen(%s)", outfilename);
+
+ go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
+
+ munmap(raw_addr, raw_len);
+ munmap(stripped_addr, stripped_len);
+ fclose(outfile);
+
+ return 0;
+}
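
The GET_LE()/PUT_LE() "evil macros" above use __builtin_choose_expr to pick an accessor width from sizeof(*x) at compile time, so vdso2c parses the ELF image correctly regardless of host endianness. A standalone sketch of the same pattern, with trivial byte-shift helpers standing in for tools/le_byteshift.h:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal stand-ins for tools/le_byteshift.h. */
static uint16_t get_unaligned_le16(const void *p)
{
	const uint8_t *b = p;
	return b[0] | (uint16_t)b[1] << 8;
}

static uint32_t get_unaligned_le32(const void *p)
{
	const uint8_t *b = p;
	return get_unaligned_le16(b) | (uint32_t)get_unaligned_le16(b + 2) << 16;
}

static uint64_t get_unaligned_le64(const void *p)
{
	const uint8_t *b = p;
	return get_unaligned_le32(b) | (uint64_t)get_unaligned_le32(b + 4) << 32;
}

/* Never emitted: only referenced in branches __builtin_choose_expr rejects. */
extern void bad_get_le(void);

/* Same shape as GET_LE() in vdso2c.c: dispatch on the pointee's size. */
#define GLE(x, bits, ifnot)						\
	__builtin_choose_expr(						\
		(sizeof(*(x)) == bits/8),				\
		(__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)

#define LAST_GLE(x)							\
	__builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())

#define GET_LE(x) GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))

int main(void)
{
	/* A fake little-endian Elf64 e_phoff field (8 bytes, value 0x40). */
	uint8_t raw[8] = { 0x40, 0, 0, 0, 0, 0, 0, 0 };
	uint64_t e_phoff;

	memcpy(&e_phoff, raw, sizeof(e_phoff));

	/* sizeof(*(&e_phoff)) == 8, so the 64-bit accessor is chosen. */
	printf("e_phoff = %llu\n", (unsigned long long)GET_LE(&e_phoff));
	return 0;
}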
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
new file mode 100644
index 000000000..5264daa88
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -0,0 +1,222 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is included twice from vdso2c.c. It generates code for 32-bit
+ * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
+ * are built for 32-bit userspace.
+ */
+
+static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (i % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ", (int)(data)[i]);
+ }
+}
+
+
+/*
+ * Extract a section from the input data into a standalone blob. Used to
+ * capture kernel-only data that needs to persist indefinitely, e.g. the
+ * exception fixup tables, but only in the kernel, i.e. the section can
+ * be stripped from the final vDSO image.
+ */
+static void BITSFUNC(extract)(const unsigned char *data, size_t data_len,
+ FILE *outfile, ELF(Shdr) *sec, const char *name)
+{
+ unsigned long offset;
+ size_t len;
+
+ offset = (unsigned long)GET_LE(&sec->sh_offset);
+ len = (size_t)GET_LE(&sec->sh_size);
+
+ if (offset + len > data_len)
+ fail("section to extract overruns input data");
+
+ fprintf(outfile, "static const unsigned char %s[%zu] = {", name, len);
+ BITSFUNC(copy)(outfile, data + offset, len);
+ fprintf(outfile, "\n};\n\n");
+}
+
+static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *image_name)
+{
+ int found_load = 0;
+ unsigned long load_size = -1; /* Work around bogus warning */
+ unsigned long mapping_size;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
+ unsigned long i, syms_nr;
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ *alt_sec = NULL, *extable_sec = NULL;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
+ const char *secstrings;
+ INT_BITS syms[NSYMS] = {};
+
+ ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
+
+ if (GET_LE(&hdr->e_type) != ET_DYN)
+ fail("input is not a shared object\n");
+
+ /* Walk the segment table. */
+ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
+ if (GET_LE(&pt[i].p_type) == PT_LOAD) {
+ if (found_load)
+ fail("multiple PT_LOAD segs\n");
+
+ if (GET_LE(&pt[i].p_offset) != 0 ||
+ GET_LE(&pt[i].p_vaddr) != 0)
+ fail("PT_LOAD in wrong place\n");
+
+ if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
+ fail("cannot handle memsz != filesz\n");
+
+ load_size = GET_LE(&pt[i].p_memsz);
+ found_load = 1;
+ } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
+ dyn = raw_addr + GET_LE(&pt[i].p_offset);
+ dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
+ GET_LE(&pt[i].p_memsz);
+ }
+ }
+ if (!found_load)
+ fail("no PT_LOAD seg\n");
+
+ if (stripped_len < load_size)
+ fail("stripped input is too short\n");
+
+ if (!dyn)
+ fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n");
+
+ /* Walk the dynamic table */
+ for (i = 0; dyn + i < dyn_end &&
+ GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
+ typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
+ tag == DT_RELENT || tag == DT_TEXTREL)
+ fail("vdso image contains dynamic relocations\n");
+ }
+
+ /* Walk the section table */
+ secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
+ secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
+ for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
+ ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * i;
+ if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
+ symtab_hdr = sh;
+
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name),
+ ".altinstructions"))
+ alt_sec = sh;
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table"))
+ extable_sec = sh;
+ }
+
+ if (!symtab_hdr)
+ fail("no symbol table\n");
+
+ strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
+
+ syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
+ /* Walk the symbol table */
+ for (i = 0; i < syms_nr; i++) {
+ unsigned int k;
+ ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
+ GET_LE(&symtab_hdr->sh_entsize) * i;
+ const char *sym_name = raw_addr +
+ GET_LE(&strtab_hdr->sh_offset) +
+ GET_LE(&sym->st_name);
+
+ for (k = 0; k < NSYMS; k++) {
+ if (!strcmp(sym_name, required_syms[k].name)) {
+ if (syms[k]) {
+ fail("duplicate symbol %s\n",
+ required_syms[k].name);
+ }
+
+ /*
+ * Careful: we use negative addresses, but
+ * st_value is unsigned, so we rely
+ * on syms[k] being a signed type of the
+ * correct width.
+ */
+ syms[k] = GET_LE(&sym->st_value);
+ }
+ }
+ }
+
+ /* Validate mapping addresses. */
+ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
+ INT_BITS symval = syms[special_pages[i]];
+
+ if (!symval)
+ continue; /* The mapping isn't used; ignore it. */
+
+ if (symval % 4096)
+ fail("%s must be a multiple of 4096\n",
+ required_syms[i].name);
+ if (symval + 4096 < syms[sym_vvar_start])
+ fail("%s underruns vvar_start\n",
+ required_syms[i].name);
+ if (symval + 4096 > 0)
+ fail("%s is on the wrong side of the vdso text\n",
+ required_syms[i].name);
+ }
+ if (syms[sym_vvar_start] % 4096)
+ fail("vvar_begin must be a multiple of 4096\n");
+
+ if (!image_name) {
+ fwrite(stripped_addr, stripped_len, 1, outfile);
+ return;
+ }
+
+ mapping_size = (stripped_len + 4095) / 4096 * 4096;
+
+ fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
+ fprintf(outfile, "#include <linux/linkage.h>\n");
+ fprintf(outfile, "#include <asm/page_types.h>\n");
+ fprintf(outfile, "#include <asm/vdso.h>\n");
+ fprintf(outfile, "\n");
+ fprintf(outfile,
+ "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
+ mapping_size);
+ for (i = 0; i < stripped_len; i++) {
+ if (i % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ",
+ (int)((unsigned char *)stripped_addr)[i]);
+ }
+ fprintf(outfile, "\n};\n\n");
+ if (extable_sec)
+ BITSFUNC(extract)(raw_addr, raw_len, outfile,
+ extable_sec, "extable");
+
+ fprintf(outfile, "const struct vdso_image %s = {\n", image_name);
+ fprintf(outfile, "\t.data = raw_data,\n");
+ fprintf(outfile, "\t.size = %lu,\n", mapping_size);
+ if (alt_sec) {
+ fprintf(outfile, "\t.alt = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_offset));
+ fprintf(outfile, "\t.alt_len = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_size));
+ }
+ if (extable_sec) {
+ fprintf(outfile, "\t.extable_base = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_offset));
+ fprintf(outfile, "\t.extable_len = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_size));
+ fprintf(outfile, "\t.extable = extable,\n");
+ }
+
+ for (i = 0; i < NSYMS; i++) {
+ if (required_syms[i].export && syms[i])
+ fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
+ required_syms[i].name, (int64_t)syms[i]);
+ }
+ fprintf(outfile, "};\n");
+}
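
For orientation, the image file that BITSFUNC(go) emits (e.g. vdso-image-64.c) has roughly the shape sketched below. Every line is pieced together from the fprintf() calls above, but the byte values, sizes, and offsets are made up for illustration; it is not real tool output.

/* AUTOMATICALLY GENERATED -- DO NOT EDIT */

#include <linux/linkage.h>
#include <asm/page_types.h>
#include <asm/vdso.h>

static unsigned char raw_data[8192] __ro_after_init __aligned(PAGE_SIZE) = {
	0x7F, 0x45, 0x4C, 0x46,	/* ...the stripped vdso.so, padded to a page... */
};

static const unsigned char extable[48] = {
	0x00,	/* ...the __ex_table bytes pulled out by BITSFUNC(extract)... */
};

const struct vdso_image vdso_image_64 = {
	.data = raw_data,
	.size = 8192,
	.alt = 3456,			/* offset of .altinstructions */
	.alt_len = 120,
	.extable_base = 3600,		/* offset of __ex_table */
	.extable_len = 48,
	.extable = extable,
	.sym_vvar_start = -16384,	/* negative: vvar pages sit below the text */
	.sym_vvar_page = -16384,
	/* ...one .sym_* entry per exported, non-zero required_syms[] entry... */
};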
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
new file mode 100644
index 000000000..43842fade
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/elf.h>
+
+#include <asm/processor.h>
+#include <asm/vdso.h>
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT 0
+#else
+#define VDSO_DEFAULT 1
+#endif
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT;
+
+static int __init vdso32_setup(char *s)
+{
+ vdso32_enabled = simple_strtoul(s, NULL, 0);
+
+ if (vdso32_enabled > 1) {
+ pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
+ vdso32_enabled = 0;
+ }
+
+ return 1;
+}
+
+/*
+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels.
+ * On 32-bit kernels, vdso=[012] means the same thing.
+ */
+__setup("vdso32=", vdso32_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
+#endif
+
+int __init sysenter_setup(void)
+{
+ init_vdso_image(&vdso_image_32);
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+
+subsys_initcall(sysenter_setup);
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static struct ctl_table abi_table2[] = {
+ {
+ .procname = "vsyscall32",
+ .data = &vdso32_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static struct ctl_table abi_root_table2[] = {
+ {
+ .procname = "abi",
+ .mode = 0555,
+ .child = abi_table2
+ },
+ {}
+};
+
+static __init int ia32_binfmt_init(void)
+{
+ register_sysctl_table(abi_root_table2);
+ return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif /* CONFIG_SYSCTL */
+
+#endif /* CONFIG_X86_64 */
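
On 64-bit kernels the ctl_table above surfaces vdso32_enabled as abi.vsyscall32, i.e. /proc/sys/abi/vsyscall32. A small sketch that reads it back:

/* Hedged sketch: query the abi.vsyscall32 sysctl registered above
 * (present on 64-bit kernels with IA32 emulation). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/abi/vsyscall32", "r");
	int enabled;

	if (!f) {
		perror("abi.vsyscall32 not available");
		return 1;
	}
	if (fscanf(f, "%d", &enabled) == 1)
		printf("32-bit vDSO %s\n", enabled ? "enabled" : "disabled");
	fclose(f);
	return 0;
}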
diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore
new file mode 100644
index 000000000..516738484
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
new file mode 100644
index 000000000..2cbd39939
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+/* Ideally this would use UTS_NAME, but using a quoted string here
+ doesn't work. Remember to change this when changing the
+ kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
new file mode 100644
index 000000000..1bd068f72
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/unistd_32.h>
+#include <asm/asm-offsets.h>
+
+#ifndef SYSCALL_ENTER_KERNEL
+#define SYSCALL_ENTER_KERNEL int $0x80
+#endif
+
+ .text
+ .globl __kernel_sigreturn
+ .type __kernel_sigreturn,@function
+ nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
+ ALIGN
+__kernel_sigreturn:
+.LSTART_sigreturn:
+ popl %eax /* XXX does this mean it needs unwind info? */
+ movl $__NR_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_sigreturn:
+SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL)
+ nop
+ .size __kernel_sigreturn,.-.LSTART_sigreturn
+
+ .globl __kernel_rt_sigreturn
+ .type __kernel_rt_sigreturn,@function
+ ALIGN
+__kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
+ movl $__NR_rt_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_rt_sigreturn:
+SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL)
+ nop
+ .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+ .previous
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI1:
+ .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
+.LSTARTCIEDLSI1:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zRS" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0 /* DW_CFA_nop */
+ .align 4
+.LENDCIEDLSI1:
+ .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
+.LSTARTFDEDLSI1:
+ .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: The dwarf2 unwind routines will subtract 1 from the
+ return address to get an address in the middle of the
+ presumed call instruction. Since we didn't get here via
+ a call, we need to include the nop before the real start
+ to make up for it. */
+ .long .LSTART_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_sigreturn-.LSTART_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ complicated by the fact that the "CFA" is always assumed to
+ be the value of the stack pointer in the caller. This means
+ that we must define the CFA of this body of code to be the
+ saved value of the stack pointer in the sigcontext. Which
+ also means that there is no fixed relation to the other
+ saved registers, which means that we must use DW_CFA_expression
+ to compute their addresses. It also means that when we
+ adjust the stack with the popl, we have to do it all over again. */
+
+#define do_cfa_expr(offset) \
+ .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+ .byte 0x06; /* DW_OP_deref */ \
+1:
+
+#define do_expr(regno, offset) \
+ .byte 0x10; /* DW_CFA_expression */ \
+ .uleb128 regno; /* regno */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+1:
+
+ do_cfa_expr(IA32_SIGCONTEXT_sp+4)
+ do_expr(0, IA32_SIGCONTEXT_ax+4)
+ do_expr(1, IA32_SIGCONTEXT_cx+4)
+ do_expr(2, IA32_SIGCONTEXT_dx+4)
+ do_expr(3, IA32_SIGCONTEXT_bx+4)
+ do_expr(5, IA32_SIGCONTEXT_bp+4)
+ do_expr(6, IA32_SIGCONTEXT_si+4)
+ do_expr(7, IA32_SIGCONTEXT_di+4)
+ do_expr(8, IA32_SIGCONTEXT_ip+4)
+
+ .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
+
+ do_cfa_expr(IA32_SIGCONTEXT_sp)
+ do_expr(0, IA32_SIGCONTEXT_ax)
+ do_expr(1, IA32_SIGCONTEXT_cx)
+ do_expr(2, IA32_SIGCONTEXT_dx)
+ do_expr(3, IA32_SIGCONTEXT_bx)
+ do_expr(5, IA32_SIGCONTEXT_bp)
+ do_expr(6, IA32_SIGCONTEXT_si)
+ do_expr(7, IA32_SIGCONTEXT_di)
+ do_expr(8, IA32_SIGCONTEXT_ip)
+
+ .align 4
+.LENDFDEDLSI1:
+
+ .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
+.LSTARTFDEDLSI2:
+ .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: See above wrt unwind library assumptions. */
+ .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ slightly less complicated than the above, since we don't
+ modify the stack pointer in the process. */
+
+ do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
+ do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
+ do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
+ do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
+ do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
+ do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
+ do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
+ do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
+ do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
+
+ .align 4
+.LENDFDEDLSI2:
+ .previous
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
new file mode 100644
index 000000000..d33c6513f
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AT_SYSINFO entry point
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+ ALIGN
+__kernel_vsyscall:
+ CFI_STARTPROC
+ /*
+ * Reshuffle regs so that any of the entry instructions
+ * will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may elucidate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * personally, I find it hard to understand what's going on there.
+ *
+ * Note to future userspace developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
+ */
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
+
+#ifdef CONFIG_X86_64
+ /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
+#else
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
+#endif
+
+ /* Enter using int $0x80 */
+ int $0x80
+SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL)
+
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %edx
+ CFI_RESTORE edx
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ RET
+ CFI_ENDPROC
+
+ .size __kernel_vsyscall,.-__kernel_vsyscall
+ .previous
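
The comment above is the whole contract for 32-bit userspace: fetch AT_SYSINFO from the auxiliary vector and make an indirect call through it. A hedged sketch, which must be built as 32-bit code (e.g. gcc -m32) and is shown only for a syscall that takes no arguments:

#include <elf.h>         /* AT_SYSINFO */
#include <stdio.h>
#include <sys/auxv.h>    /* getauxval() */
#include <sys/syscall.h> /* SYS_getpid */

int main(void)
{
	void *vsys = (void *)getauxval(AT_SYSINFO);	/* __kernel_vsyscall */
	long ret;

	if (!vsys)
		return 1;

	/* eax holds the syscall number; getpid needs no arguments.
	 * __kernel_vsyscall restores ecx/edx/ebp itself, and the kernel
	 * preserves the remaining registers, so only eax changes. */
	asm volatile("call *%[vsys]"
		     : "=a" (ret)
		     : "a" (SYS_getpid), [vsys] "rm" (vsys)
		     : "memory", "cc");

	printf("getpid() via AT_SYSINFO: %ld\n", ret);
	return 0;
}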
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 000000000..283ed9d00
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BUILD_VDSO32
+
+#ifdef CONFIG_X86_64
+
+/*
+ * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
+ * configuration
+ */
+#undef CONFIG_64BIT
+#undef CONFIG_X86_64
+#undef CONFIG_COMPAT
+#undef CONFIG_PGTABLE_LEVELS
+#undef CONFIG_ILLEGAL_POINTER_VALUE
+#undef CONFIG_SPARSEMEM_VMEMMAP
+#undef CONFIG_NR_CPUS
+#undef CONFIG_PARAVIRT_XXL
+
+#define CONFIG_X86_32 1
+#define CONFIG_PGTABLE_LEVELS 2
+#define CONFIG_PAGE_OFFSET 0
+#define CONFIG_ILLEGAL_POINTER_VALUE 0
+#define CONFIG_NR_CPUS 1
+
+#define BUILD_VDSO32_64
+
+#endif
+
+#include "../vclock_gettime.c"
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
new file mode 100644
index 000000000..c7720995a
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 32-bit vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#include <asm/page.h>
+
+#define BUILD_VDSO32
+
+#include "../vdso-layout.lds.S"
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__kernel_vsyscall);
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION
+{
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_time;
+ __vdso_clock_getres;
+ __vdso_clock_gettime64;
+ };
+
+ LINUX_2.5 {
+ global:
+ __kernel_vsyscall;
+ __kernel_sigreturn;
+ __kernel_rt_sigreturn;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
new file mode 100644
index 000000000..16a8050a4
--- /dev/null
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for x32 vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSOX32
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_getcpu;
+ __vdso_time;
+ __vdso_clock_getres;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
new file mode 100644
index 000000000..b88a82bbc
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ *
+ * Fast user context implementation of getcpu()
+ */
+
+#include <linux/kernel.h>
+#include <linux/getcpu.h>
+#include <linux/time.h>
+#include <asm/vgtod.h>
+
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ vdso_read_cpunode(cpu, node);
+
+ return 0;
+}
+
+long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+ __attribute__((weak, alias("__vdso_getcpu")));
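
__vdso_getcpu() is the function glibc's sched_getcpu() normally resolves to, so querying the current CPU is a couple of loads from the CPU/node cookie the kernel maintains rather than a syscall. A minimal sketch:

#define _GNU_SOURCE
#include <sched.h>	/* sched_getcpu() */
#include <stdio.h>

int main(void)
{
	/* On x86-64 glibc this typically lands in __vdso_getcpu(). */
	int cpu = sched_getcpu();

	if (cpu < 0) {
		perror("sched_getcpu");
		return 1;
	}
	printf("running on CPU %d\n", cpu);
	return 0;
}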
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
new file mode 100644
index 000000000..128866139
--- /dev/null
+++ b/arch/x86/entry/vdso/vma.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2007 Andi Kleen, SUSE Labs.
+ *
+ * This contains most of the x86 vDSO kernel-side code.
+ */
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/cpu.h>
+#include <linux/ptrace.h>
+#include <linux/time_namespace.h>
+
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
+#include <asm/proto.h>
+#include <asm/vdso.h>
+#include <asm/vvar.h>
+#include <asm/tlb.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#include <asm/cpufeature.h>
+#include <clocksource/hyperv_timer.h>
+
+#undef _ASM_X86_VVAR_H
+#define EMIT_VVAR(name, offset) \
+ const size_t name ## _offset = offset;
+#include <asm/vvar.h>
+
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
+{
+ return (struct vdso_data *)(vvar_page + _vdso_data_offset);
+}
+#undef EMIT_VVAR
+
+unsigned int vclocks_used __read_mostly;
+
+#if defined(CONFIG_X86_64)
+unsigned int __read_mostly vdso64_enabled = 1;
+#endif
+
+void __init init_vdso_image(const struct vdso_image *image)
+{
+ BUG_ON(image->size % PAGE_SIZE != 0);
+
+ apply_alternatives((struct alt_instr *)(image->data + image->alt),
+ (struct alt_instr *)(image->data + image->alt +
+ image->alt_len));
+}
+
+static const struct vm_special_mapping vvar_mapping;
+struct linux_binprm;
+
+static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+
+ if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
+ get_page(vmf->page);
+ return 0;
+}
+
+static void vdso_fix_landing(const struct vdso_image *image,
+ struct vm_area_struct *new_vma)
+{
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long vdso_land = image->sym_int80_landing_pad;
+ unsigned long old_land_addr = vdso_land +
+ (unsigned long)current->mm->context.vdso;
+
+ /* Fixing userspace landing - look at do_fast_syscall_32 */
+ if (regs->ip == old_land_addr)
+ regs->ip = new_vma->vm_start + vdso_land;
+ }
+#endif
+}
+
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+
+ vdso_fix_landing(image, new_vma);
+ current->mm->context.vdso = (void __user *)new_vma->vm_start;
+
+ return 0;
+}
+
+#ifdef CONFIG_TIME_NS
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect the .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details, see check_vma_flags() and __access_remote_vm().
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
+ */
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (vma_is_special_mapping(vma, &vvar_mapping))
+ zap_page_range(vma, vma->vm_start, size);
+ }
+ mmap_read_unlock(mm);
+
+ return 0;
+}
+#else
+static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+#endif
+
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+ unsigned long pfn;
+ long sym_offset;
+
+ if (!image)
+ return VM_FAULT_SIGBUS;
+
+ sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
+ image->sym_vvar_start;
+
+ /*
+ * Sanity check: a symbol offset of zero means that the page
+ * does not exist for this vdso image, not that the page is at
+ * offset zero relative to the text mapping. This should be
+ * impossible here, because sym_offset should only be zero for
+ * the page past the end of the vvar mapping.
+ */
+ if (sym_offset == 0)
+ return VM_FAULT_SIGBUS;
+
+ if (sym_offset == image->sym_vvar_page) {
+ struct page *timens_page = find_timens_vvar_page(vma);
+
+ pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+
+ /*
+ * If a task belongs to a time namespace then a namespace
+ * specific VVAR is mapped with the sym_vvar_page offset and
+ * the real VVAR page is mapped with the sym_timens_page
+ * offset.
+ * See also the comment near timens_setup_vdso_data().
+ */
+ if (timens_page) {
+ unsigned long addr;
+ vm_fault_t err;
+
+ /*
+ * Optimization: inside a time namespace, pre-fault the
+ * VVAR page too. Since the timens page only carries the
+ * clock offsets relative to VVAR, the real VVAR page will
+ * be faulted in shortly by the vDSO code anyway.
+ */
+ addr = vmf->address + (image->sym_timens_page - sym_offset);
+ err = vmf_insert_pfn(vma, addr, pfn);
+ if (unlikely(err & VM_FAULT_ERROR))
+ return err;
+
+ pfn = page_to_pfn(timens_page);
+ }
+
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+ } else if (sym_offset == image->sym_pvclock_page) {
+ struct pvclock_vsyscall_time_info *pvti =
+ pvclock_get_pvti_cpu0_va();
+ if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
+ return vmf_insert_pfn_prot(vma, vmf->address,
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
+ }
+ } else if (sym_offset == image->sym_hvclock_page) {
+ struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
+
+ if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
+ return vmf_insert_pfn(vma, vmf->address,
+ virt_to_phys(tsc_pg) >> PAGE_SHIFT);
+ } else if (sym_offset == image->sym_timens_page) {
+ struct page *timens_page = find_timens_vvar_page(vma);
+
+ if (!timens_page)
+ return VM_FAULT_SIGBUS;
+
+ pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ .fault = vdso_fault,
+ .mremap = vdso_mremap,
+};
+static const struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
+
+/*
+ * Add vdso and vvar mappings to current process.
+ * @image - blob to map
+ * @addr - request a specific address (zero to map at free addr)
+ */
+static int map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long text_start;
+ int ret = 0;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ addr = get_unmapped_area(NULL, addr,
+ image->size - image->sym_vvar_start, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ text_start = addr - image->sym_vvar_start;
+
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ */
+ vma = _install_special_mapping(mm,
+ text_start,
+ image->size,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ &vdso_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto up_fail;
+ }
+
+ vma = _install_special_mapping(mm,
+ addr,
+ -image->sym_vvar_start,
+ VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+ VM_PFNMAP,
+ &vvar_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ do_munmap(mm, text_start, image->size, NULL);
+ } else {
+ current->mm->context.vdso = (void __user *)text_start;
+ current->mm->context.vdso_image = image;
+ }
+
+up_fail:
+ mmap_write_unlock(mm);
+ return ret;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Put the vdso above the (randomized) stack with another randomized
+ * offset. This way there is no hole in the middle of address space.
+ * To save memory make sure it is still in the same PTE as the stack
+ * top. This doesn't give that many random bits.
+ *
+ * Note that this algorithm is imperfect: the distribution of the vdso
+ * start address within a PMD is biased toward the end.
+ *
+ * Only used for the 64-bit and x32 vdsos.
+ */
+static unsigned long vdso_addr(unsigned long start, unsigned len)
+{
+ unsigned long addr, end;
+ unsigned offset;
+
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= DEFAULT_MAP_WINDOW)
+ end = DEFAULT_MAP_WINDOW;
+ end -= len;
+
+ if (end > start) {
+ offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+
+ /*
+ * Forcibly align the final address in case we have a hardware
+ * issue that requires alignment for performance reasons.
+ */
+ addr = align_vdso_addr(addr);
+
+ return addr;
+}
+
+static int map_vdso_randomized(const struct vdso_image *image)
+{
+ unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
+
+ return map_vdso(image, addr);
+}
+#endif
+
+int map_vdso_once(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_write_lock(mm);
+ /*
+ * Check if the vdso blob has already been mapped and fail if so,
+ * to keep userspace from abusing install_special_mapping, which
+ * may not do accounting and rlimit checks correctly.
+ * We could search for a VMA near context.vdso, but this is a slow
+ * path, so explicitly check all VMAs to be completely sure.
+ */
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vdso_mapping) ||
+ vma_is_special_mapping(vma, &vvar_mapping)) {
+ mmap_write_unlock(mm);
+ return -EEXIST;
+ }
+ }
+ mmap_write_unlock(mm);
+
+ return map_vdso(image, addr);
+}
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static int load_vdso32(void)
+{
+ if (vdso32_enabled != 1) /* Other values all mean "disabled" */
+ return 0;
+
+ return map_vdso(&vdso_image_32, 0);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ if (!vdso64_enabled)
+ return 0;
+
+ return map_vdso_randomized(&vdso_image_64);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp, bool x32)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (x32) {
+ if (!vdso64_enabled)
+ return 0;
+ return map_vdso_randomized(&vdso_image_x32);
+ }
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ return load_vdso32();
+#else
+ return 0;
+#endif
+}
+#endif
+#else
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ return load_vdso32();
+}
+#endif
+
+bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
+{
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ unsigned long vdso = (unsigned long) current->mm->context.vdso;
+
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad ||
+ regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad)
+ return true;
+ }
+#endif
+ return false;
+}
+
+#ifdef CONFIG_X86_64
+static __init int vdso_setup(char *s)
+{
+ vdso64_enabled = simple_strtoul(s, NULL, 0);
+ return 1;
+}
+__setup("vdso=", vdso_setup);
+
+static int __init init_vdso(void)
+{
+ BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
+
+ init_vdso_image(&vdso_image_64);
+
+#ifdef CONFIG_X86_X32_ABI
+ init_vdso_image(&vdso_image_x32);
+#endif
+
+ return 0;
+}
+subsys_initcall(init_vdso);
+#endif /* CONFIG_X86_64 */
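
The two special mappings installed by map_vdso() are visible as [vdso] and [vvar] in /proc/<pid>/maps, which also makes the randomized placement computed by vdso_addr() easy to observe. A small sketch:

/* Hedged sketch: print this process's [vdso]/[vvar] lines from
 * /proc/self/maps; run it twice to see the ASLR of the text mapping. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *maps = fopen("/proc/self/maps", "r");

	if (!maps)
		return 1;

	while (fgets(line, sizeof(line), maps))
		if (strstr(line, "[vdso]") || strstr(line, "[vvar]"))
			fputs(line, stdout);

	fclose(maps);
	return 0;
}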
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
new file mode 100644
index 000000000..d77d278ee
--- /dev/null
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+#include <asm/errno.h>
+#include <asm/enclu.h>
+
+#include "extable.h"
+
+/* Relative to %rbp. */
+#define SGX_ENCLAVE_OFFSET_OF_RUN 16
+
+/* The offsets relative to struct sgx_enclave_run. */
+#define SGX_ENCLAVE_RUN_TCS 0
+#define SGX_ENCLAVE_RUN_LEAF 8
+#define SGX_ENCLAVE_RUN_EXCEPTION_VECTOR 12
+#define SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE 14
+#define SGX_ENCLAVE_RUN_EXCEPTION_ADDR 16
+#define SGX_ENCLAVE_RUN_USER_HANDLER 24
+#define SGX_ENCLAVE_RUN_USER_DATA 32 /* not used */
+#define SGX_ENCLAVE_RUN_RESERVED_START 40
+#define SGX_ENCLAVE_RUN_RESERVED_END 256
+
+.code64
+.section .text, "ax"
+
+SYM_FUNC_START(__vdso_sgx_enter_enclave)
+ /* Prolog */
+ .cfi_startproc
+ push %rbp
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rbp, 0
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ push %rbx
+ .cfi_rel_offset %rbx, -8
+
+ mov %ecx, %eax
+.Lenter_enclave:
+ /* EENTER <= function <= ERESUME */
+ cmp $EENTER, %eax
+ jb .Linvalid_input
+ cmp $ERESUME, %eax
+ ja .Linvalid_input
+
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rcx
+
+ /* Validate that the reserved area contains only zeros. */
+ mov $SGX_ENCLAVE_RUN_RESERVED_START, %rbx
+1:
+ cmpq $0, (%rcx, %rbx)
+ jne .Linvalid_input
+ add $8, %rbx
+ cmpq $SGX_ENCLAVE_RUN_RESERVED_END, %rbx
+ jne 1b
+
+ /* Load TCS and AEP */
+ mov SGX_ENCLAVE_RUN_TCS(%rcx), %rbx
+ lea .Lasync_exit_pointer(%rip), %rcx
+
+ /* Single ENCLU serving as both EENTER and AEP (ERESUME) */
+.Lasync_exit_pointer:
+.Lenclu_eenter_eresume:
+ enclu
+
+ /* EEXIT jumps here unless the enclave is doing something fancy. */
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set exit_reason. */
+ movl $EEXIT, SGX_ENCLAVE_RUN_LEAF(%rbx)
+
+ /* Invoke userspace's exit handler if one was provided. */
+.Lhandle_exit:
+ cmpq $0, SGX_ENCLAVE_RUN_USER_HANDLER(%rbx)
+ jne .Linvoke_userspace_handler
+
+ /* Success, in the sense that ENCLU was attempted. */
+ xor %eax, %eax
+
+.Lout:
+ pop %rbx
+ leave
+ .cfi_def_cfa %rsp, 8
+ RET
+
+ /* The out-of-line code runs with the pre-leave stack frame. */
+ .cfi_def_cfa %rbp, 16
+
+.Linvalid_input:
+ mov $(-EINVAL), %eax
+ jmp .Lout
+
+.Lhandle_exception:
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set the exception info. */
+ mov %eax, (SGX_ENCLAVE_RUN_LEAF)(%rbx)
+ mov %di, (SGX_ENCLAVE_RUN_EXCEPTION_VECTOR)(%rbx)
+ mov %si, (SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE)(%rbx)
+ mov %rdx, (SGX_ENCLAVE_RUN_EXCEPTION_ADDR)(%rbx)
+ jmp .Lhandle_exit
+
+.Linvoke_userspace_handler:
+ /* Pass the untrusted RSP (at exit) to the callback via %rcx. */
+ mov %rsp, %rcx
+
+ /* Save the struct sgx_enclave_run pointer; %rbx is about to be clobbered. */
+ mov %rbx, %rax
+
+ /* Save the untrusted RSP offset in %rbx (non-volatile register). */
+ mov %rsp, %rbx
+ and $0xf, %rbx
+
+ /*
+ * Align stack per x86_64 ABI. Note, %rsp needs to be 16-byte aligned
+ * _after_ pushing the parameters on the stack, hence the bonus push.
+ */
+ and $-0x10, %rsp
+ push %rax
+
+ /* Push the struct sgx_enclave_run pointer as a param to the callback. */
+ push %rax
+
+ /* Clear RFLAGS.DF per x86_64 ABI */
+ cld
+
+ /*
+ * Load the callback pointer to %rax and lfence for LVI (load value
+ * injection) protection before making the call.
+ */
+ mov SGX_ENCLAVE_RUN_USER_HANDLER(%rax), %rax
+ lfence
+ call *%rax
+
+ /* Undo the post-exit %rsp adjustment. */
+ lea 0x10(%rsp, %rbx), %rsp
+
+ /*
+ * If the return from callback is zero or negative, return immediately,
+ * else re-execute ENCLU with the positive return value interpreted as
+ * the requested ENCLU function.
+ */
+ cmp $0, %eax
+ jle .Lout
+ jmp .Lenter_enclave
+
+ .cfi_endproc
+
+_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
+
+SYM_FUNC_END(__vdso_sgx_enter_enclave)
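
Hooking this up from userspace goes through the __vdso_sgx_enter_enclave symbol exported in vdso.lds.S, and the hard-coded offsets above correspond to struct sgx_enclave_run from the uapi sgx.h header. A heavily hedged sketch of the argument marshalling only: it assumes the uapi <asm/sgx.h> header is installed, that an enclave and its TCS already exist, and that the entry point address has been resolved from the vDSO's dynamic symbol table by the caller.

#include <string.h>
#include <asm/sgx.h>		/* struct sgx_enclave_run (uapi, assumed installed) */

#define EENTER	2		/* ENCLU leaf number, as in asm/enclu.h */

/* Matches the register/stack layout used by the assembly above: rdi/rsi/rdx
 * and r8/r9 pass straight through to the enclave, the leaf arrives in %ecx,
 * and the run struct is the first stack argument (16(%rbp)).  The type name
 * here is our own; the kernel uapi header provides an equivalent typedef. */
typedef int (*sgx_enter_fn_t)(unsigned long rdi, unsigned long rsi,
			      unsigned long rdx, unsigned int leaf,
			      unsigned long r8, unsigned long r9,
			      struct sgx_enclave_run *run);

int enter_enclave(sgx_enter_fn_t vdso_sgx_enter_enclave, unsigned long tcs)
{
	struct sgx_enclave_run run;

	/* The reserved area must be all zeros, otherwise the code above
	 * bails out with -EINVAL before touching the enclave. */
	memset(&run, 0, sizeof(run));
	run.tcs = tcs;		/* address of an initialized enclave TCS */

	/* On return, run.function is EEXIT for a normal exit; on a fixed-up
	 * fault the exception_* fields describe what happened. */
	return vdso_sgx_enter_enclave(0, 0, 0, EENTER, 0, 0, &run);
}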
diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile
new file mode 100644
index 000000000..93c1b3e94
--- /dev/null
+++ b/arch/x86/entry/vsyscall/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the x86 low level vsyscall code
+#
+obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
+
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
new file mode 100644
index 000000000..4af81df13
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
+ *
+ * Based on the original implementation which is:
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ * Parts of the original code have been moved to arch/x86/vdso/vma.c
+ *
+ * This file implements vsyscall emulation. vsyscalls are a legacy ABI:
+ * Userspace can request certain kernel services by calling fixed
+ * addresses. This concept is problematic:
+ *
+ * - It interferes with ASLR.
+ * - It's awkward to write code that lives in kernel addresses but is
+ * callable by userspace at fixed addresses.
+ * - The whole concept is impossible for 32-bit compat userspace.
+ * - UML cannot easily virtualize a vsyscall.
+ *
+ * As of mid-2014, I believe that there is no new userspace code that
+ * will use a vsyscall if the vDSO is present. I hope that there will
+ * soon be no new userspace code that will ever use a vsyscall.
+ *
+ * The code in this file emulates vsyscalls when notified of a page
+ * fault to a vsyscall address.
+ */
+
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/sched/signal.h>
+#include <linux/mm_types.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/traps.h>
+#include <asm/paravirt.h>
+
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"
+
+static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
+#ifdef CONFIG_LEGACY_VSYSCALL_NONE
+ NONE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
+ XONLY;
+#else
+ #error VSYSCALL config is broken
+#endif
+
+static int __init vsyscall_setup(char *str)
+{
+ if (str) {
+ if (!strcmp("emulate", str))
+ vsyscall_mode = EMULATE;
+ else if (!strcmp("xonly", str))
+ vsyscall_mode = XONLY;
+ else if (!strcmp("none", str))
+ vsyscall_mode = NONE;
+ else
+ return -EINVAL;
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+early_param("vsyscall", vsyscall_setup);
+
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+ const char *message)
+{
+ if (!show_unhandled_signals)
+ return;
+
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
+}
+
+static int addr_to_vsyscall_nr(unsigned long addr)
+{
+ int nr;
+
+ if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
+ return -EINVAL;
+
+ nr = (addr & 0xC00UL) >> 10;
+ if (nr >= 3)
+ return -EINVAL;
+
+ return nr;
+}
+
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+ /*
+ * XXX: if access_ok, get_user, and put_user handled
+ * sig_on_uaccess_err, this could go away.
+ */
+
+ if (!access_ok((void __user *)ptr, size)) {
+ struct thread_struct *thread = &current->thread;
+
+ thread->error_code = X86_PF_USER | X86_PF_WRITE;
+ thread->cr2 = ptr;
+ thread->trap_nr = X86_TRAP_PF;
+
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
+{
+ struct task_struct *tsk;
+ unsigned long caller;
+ int vsyscall_nr, syscall_nr, tmp;
+ int prev_sig_on_uaccess_err;
+ long ret;
+ unsigned long orig_dx;
+
+ /* Write faults or kernel-privilege faults never get fixed up. */
+ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
+ return false;
+
+ if (!(error_code & X86_PF_INSTR)) {
+ /* Failed vsyscall read */
+ if (vsyscall_mode == EMULATE)
+ return false;
+
+ /*
+ * User code tried and failed to read the vsyscall page.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
+ return false;
+ }
+
+ /*
+ * No point in checking CS -- the only way to get here is a user mode
+ * trap to a high address, which means that we're in 64-bit user code.
+ */
+
+ WARN_ON_ONCE(address != regs->ip);
+
+ if (vsyscall_mode == NONE) {
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall attempted with vsyscall=none");
+ return false;
+ }
+
+ vsyscall_nr = addr_to_vsyscall_nr(address);
+
+ trace_emulate_vsyscall(vsyscall_nr);
+
+ if (vsyscall_nr < 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
+ goto sigsegv;
+ }
+
+ if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "vsyscall with bad stack (exploit attempt?)");
+ goto sigsegv;
+ }
+
+ tsk = current;
+
+ /*
+ * Check for access_ok violations and find the syscall nr.
+ *
+ * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, NULL means "don't write anything" not "write it at
+ * address 0".
+ */
+ switch (vsyscall_nr) {
+ case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_gettimeofday;
+ break;
+
+ case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_time;
+ break;
+
+ case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_getcpu;
+ break;
+ }
+
+ /*
+ * Handle seccomp. regs->ip must be the original value.
+ * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
+ *
+ * We could optimize the seccomp disabled case, but performance
+ * here doesn't matter.
+ */
+ regs->orig_ax = syscall_nr;
+ regs->ax = -ENOSYS;
+ tmp = secure_computing();
+ if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+ warn_bad_vsyscall(KERN_DEBUG, regs,
+ "seccomp tried to change syscall nr or ip");
+ force_exit_sig(SIGSYS);
+ return true;
+ }
+ regs->orig_ax = -1;
+ if (tmp)
+ goto do_ret; /* skip requested */
+
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV. We want to
+ * preserve that behavior to make writing exploits harder.
+ */
+ prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
+ current->thread.sig_on_uaccess_err = 1;
+
+ ret = -EFAULT;
+ switch (vsyscall_nr) {
+ case 0:
+ /* this decodes regs->di and regs->si on its own */
+ ret = __x64_sys_gettimeofday(regs);
+ break;
+
+ case 1:
+ /* this decodes regs->di on its own */
+ ret = __x64_sys_time(regs);
+ break;
+
+ case 2:
+ /* while we could clobber regs->dx, we didn't in the past... */
+ orig_dx = regs->dx;
+ regs->dx = 0;
+ /* this decodes regs->di, regs->si and regs->dx on its own */
+ ret = __x64_sys_getcpu(regs);
+ regs->dx = orig_dx;
+ break;
+ }
+
+ current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
+
+check_fault:
+ if (ret == -EFAULT) {
+ /* Bad news -- userspace fed a bad pointer to a vsyscall. */
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall fault (exploit attempt?)");
+
+ /*
+ * If we failed to generate a signal for any reason,
+ * generate one here. (This should be impossible.)
+ */
+ if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+ !sigismember(&tsk->pending.signal, SIGSEGV)))
+ goto sigsegv;
+
+ return true; /* Don't emulate the ret. */
+ }
+
+ regs->ax = ret;
+
+do_ret:
+ /* Emulate a ret instruction. */
+ regs->ip = caller;
+ regs->sp += 8;
+ return true;
+
+sigsegv:
+ force_sig(SIGSEGV);
+ return true;
+}
+
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
+ * covers the 64-bit vsyscall page now. 32-bit has a real VMA and does
+ * not need special handling anymore.
+ */
+static const char *gate_vma_name(struct vm_area_struct *vma)
+{
+ return "[vsyscall]";
+}
+static const struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
+static struct vm_area_struct gate_vma __ro_after_init = {
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
+ .vm_page_prot = PAGE_READONLY_EXEC,
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
+};
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+#ifdef CONFIG_COMPAT
+ if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL))
+ return NULL;
+#endif
+ if (vsyscall_mode == NONE)
+ return NULL;
+ return &gate_vma;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = get_gate_vma(mm);
+
+ if (!vma)
+ return 0;
+
+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
+
+/*
+ * Use this when you have no reliable mm, typically from interrupt
+ * context. It is less reliable than using a task's mm and may give
+ * false positives.
+ */
+int in_gate_area_no_mm(unsigned long addr)
+{
+ return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
+}
+
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range. Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present. If so, we skip calling
+ * this.
+ */
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
+ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+ p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+ set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
+#endif
+ pud = pud_offset(p4d, VSYSCALL_ADDR);
+ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+ pmd = pmd_offset(pud, VSYSCALL_ADDR);
+ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
+void __init map_vsyscall(void)
+{
+ extern char __vsyscall_page;
+ unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
+
+ /*
+ * For full emulation, the page needs to exist for real. In
+ * execute-only mode, there is no PTE at all backing the vsyscall
+ * page.
+ */
+ if (vsyscall_mode == EMULATE) {
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
+ PAGE_KERNEL_VVAR);
+ set_vsyscall_pgtable_user_bits(swapper_pg_dir);
+ }
+
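+	/*
+	 * In XONLY mode there is no PTE backing the page at all (see the
+	 * comment above); drop VM_READ from the gate VMA as well so the
+	 * [vsyscall] pseudo-VMA is reported as execute-only.
+	 */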
+ if (vsyscall_mode == XONLY)
+ gate_vma.vm_flags = VM_EXEC;
+
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+ (unsigned long)VSYSCALL_ADDR);
+}
diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
new file mode 100644
index 000000000..ef2dd1827
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/page_types.h>
+#include <asm/unistd_64.h>
+
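+/*
+ * Layout: three entry points (gettimeofday, time, getcpu), spaced
+ * 1024 bytes apart.  The 0xcc fill byte is the int3 opcode, so a stray
+ * jump into the padding traps instead of executing leftover bytes.
+ */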
+__PAGE_ALIGNED_DATA
+ .globl __vsyscall_page
+ .balign PAGE_SIZE, 0xcc
+ .type __vsyscall_page, @object
+__vsyscall_page:
+
+ mov $__NR_gettimeofday, %rax
+ syscall
+ ret
+ int3
+
+ .balign 1024, 0xcc
+ mov $__NR_time, %rax
+ syscall
+ ret
+ int3
+
+ .balign 1024, 0xcc
+ mov $__NR_getcpu, %rax
+ syscall
+ ret
+ int3
+
+ .balign 4096, 0xcc
+
+ .size __vsyscall_page, 4096
diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h
new file mode 100644
index 000000000..3c3f9765a
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_trace.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vsyscall
+
+#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __VSYSCALL_TRACE_H
+
+#include <linux/tracepoint.h>
+
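+/* Defines the trace_emulate_vsyscall(nr) tracepoint used by emulate_vsyscall(). */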
+TRACE_EVENT(emulate_vsyscall,
+
+ TP_PROTO(int nr),
+
+ TP_ARGS(nr),
+
+ TP_STRUCT__entry(__field(int, nr)),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ ),
+
+ TP_printk("nr = %d", __entry->nr)
+);
+
+#endif
+
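+/* The path below is resolved relative to include/trace/define_trace.h. */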
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
+#define TRACE_INCLUDE_FILE vsyscall_trace
+#include <trace/define_trace.h>
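
For context (not part of the patch): a minimal userspace sketch of how a legacy
binary reaches the fixed-address entry points laid out in __vsyscall_page and
serviced by emulate_vsyscall() above. The 0xffffffffff600000 base (VSYSCALL_ADDR)
and the 1024-byte spacing are the long-standing x86-64 vsyscall ABI; the sketch
assumes a kernel booted with vsyscall=emulate or vsyscall=xonly and would simply
get SIGSEGV under vsyscall=none.

	/* Illustration only: call the legacy vsyscall entry points directly. */
	#include <stdio.h>
	#include <sys/time.h>
	#include <time.h>

	#define VSYSCALL_BASE	0xffffffffff600000UL	/* VSYSCALL_ADDR */

	typedef int    (*vgtod_fn)(struct timeval *, struct timezone *);
	typedef time_t (*vtime_fn)(time_t *);

	int main(void)
	{
		/* Entry points sit 1024 (0x400) bytes apart in the page. */
		vgtod_fn vgettimeofday = (vgtod_fn)(VSYSCALL_BASE + 0x000);
		vtime_fn vtime         = (vtime_fn)(VSYSCALL_BASE + 0x400);
		struct timeval tv;

		if (vgettimeofday(&tv, NULL) == 0)	/* traps into emulate_vsyscall() */
			printf("gettimeofday: %ld.%06ld\n",
			       (long)tv.tv_sec, (long)tv.tv_usec);
		printf("time: %ld\n", (long)vtime(NULL));
		return 0;
	}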