diff options
Diffstat (limited to 'arch/x86/entry')
39 files changed, 8190 insertions, 0 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile new file mode 100644 index 000000000..06fc70cf5 --- /dev/null +++ b/arch/x86/entry/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the x86 low level entry code +# + +OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y + +CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o +obj-y += common.o + +obj-y += vdso/ +obj-y += vsyscall/ + +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o + diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h new file mode 100644 index 000000000..993dd06c8 --- /dev/null +++ b/arch/x86/entry/calling.h @@ -0,0 +1,350 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/jump_label.h> +#include <asm/unwind_hints.h> +#include <asm/cpufeatures.h> +#include <asm/page_types.h> +#include <asm/percpu.h> +#include <asm/asm-offsets.h> +#include <asm/processor-flags.h> + +/* + + x86 function call convention, 64-bit: + ------------------------------------- + arguments | callee-saved | extra caller-saved | return + [callee-clobbered] | | [callee-clobbered] | + --------------------------------------------------------------------------- + rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**] + + ( rsp is obviously invariant across normal function calls. (gcc can 'merge' + functions when it sees tail-call optimization possibilities) rflags is + clobbered. Leftover arguments are passed over the stack frame.) + + [*] In the frame-pointers case rbp is fixed to the stack frame. + + [**] for struct return values wider than 64 bits the return convention is a + bit more complex: up to 128 bits width we return small structures + straight in rax, rdx. For structures larger than that (3 words or + larger) the caller puts a pointer to an on-stack return struct + [allocated in the caller's stack frame] into the first argument - i.e. + into rdi. All other arguments shift up by one in this case. + Fortunately this case is rare in the kernel. + +For 32-bit we have the following conventions - kernel is built with +-mregparm=3 and -freg-struct-return: + + x86 function calling convention, 32-bit: + ---------------------------------------- + arguments | callee-saved | extra caller-saved | return + [callee-clobbered] | | [callee-clobbered] | + ------------------------------------------------------------------------- + eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**] + + ( here too esp is obviously invariant across normal function calls. eflags + is clobbered. Leftover arguments are passed over the stack frame. ) + + [*] In the frame-pointers case ebp is fixed to the stack frame. + + [**] We build with -freg-struct-return, which on 32-bit means similar + semantics as on 64-bit: edx can be used for a second return value + (i.e. covering integer and structure sizes up to 64 bits) - after that + it gets more complex and more expensive: 3-word or larger struct returns + get done in the caller's frame and the pointer to the return struct goes + into regparm0, i.e. eax - the other arguments shift up and the + function's register parameters degenerate to regparm=2 in essence. + +*/ + +#ifdef CONFIG_X86_64 + +/* + * 64-bit system call stack frame layout defines and helpers, + * for assembly code: + */ + +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. */ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 + +#define SIZEOF_PTREGS 21*8 + +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 + .if \save_ret + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ + movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */ + .else + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + .endif + pushq \rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq \rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS + + .if \save_ret + pushq %rsi /* return address on top of stack */ + .endif + + /* + * Sanitize registers of values that a speculation attack might + * otherwise want to exploit. The lower registers are likely clobbered + * well before they could be put to use in a speculative execution + * gadget. + */ + xorl %edx, %edx /* nospec dx */ + xorl %ecx, %ecx /* nospec cx */ + xorl %r8d, %r8d /* nospec r8 */ + xorl %r9d, %r9d /* nospec r9 */ + xorl %r10d, %r10d /* nospec r10 */ + xorl %r11d, %r11d /* nospec r11 */ + xorl %ebx, %ebx /* nospec rbx */ + xorl %ebp, %ebp /* nospec rbp */ + xorl %r12d, %r12d /* nospec r12 */ + xorl %r13d, %r13d /* nospec r13 */ + xorl %r14d, %r14d /* nospec r14 */ + xorl %r15d, %r15d /* nospec r15 */ + +.endm + +.macro POP_REGS pop_rdi=1 skip_r11rcx=0 + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .if \skip_r11rcx + popq %rsi + .else + popq %r11 + .endif + popq %r10 + popq %r9 + popq %r8 + popq %rax + .if \skip_r11rcx + popq %rsi + .else + popq %rcx + .endif + popq %rdx + popq %rsi + .if \pop_rdi + popq %rdi + .endif +.endm + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +/* + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * halves: + */ +#define PTI_USER_PGTABLE_BIT PAGE_SHIFT +#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT) +#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT +#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT) +#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK) + +.macro SET_NOFLUSH_BIT reg:req + bts $X86_CR3_PCID_NOFLUSH_BIT, \reg +.endm + +.macro ADJUST_KERNEL_CR3 reg:req + ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID + /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg +.endm + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + ADJUST_KERNEL_CR3 \scratch_reg + mov \scratch_reg, %cr3 +.Lend_\@: +.endm + +#define THIS_CPU_user_pcid_flush_mask \ + PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + + /* + * Test if the ASID needs a flush. + */ + movq \scratch_reg, \scratch_reg2 + andq $(0x7FF), \scratch_reg /* mask ASID */ + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask + jnc .Lnoflush_\@ + + /* Flush needed, clear the bit */ + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask + movq \scratch_reg2, \scratch_reg + jmp .Lwrcr3_pcid_\@ + +.Lnoflush_\@: + movq \scratch_reg2, \scratch_reg + SET_NOFLUSH_BIT \scratch_reg + +.Lwrcr3_pcid_\@: + /* Flip the ASID to the user version */ + orq $(PTI_USER_PCID_MASK), \scratch_reg + +.Lwrcr3_\@: + /* Flip the PGD to the user version */ + orq $(PTI_USER_PGTABLE_MASK), \scratch_reg + mov \scratch_reg, %cr3 +.Lend_\@: +.endm + +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req + pushq %rax + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax + popq %rax +.endm + +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI + movq %cr3, \scratch_reg + movq \scratch_reg, \save_reg + /* + * Test the user pagetable bit. If set, then the user page tables + * are active. If clear CR3 already has the kernel page table + * active. + */ + bt $PTI_USER_PGTABLE_BIT, \scratch_reg + jnc .Ldone_\@ + + ADJUST_KERNEL_CR3 \scratch_reg + movq \scratch_reg, %cr3 + +.Ldone_\@: +.endm + +.macro RESTORE_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + + /* + * KERNEL pages can always resume with NOFLUSH as we do + * explicit flushes. + */ + bt $PTI_USER_PGTABLE_BIT, \save_reg + jnc .Lnoflush_\@ + + /* + * Check if there's a pending flush for the user ASID we're + * about to set. + */ + movq \save_reg, \scratch_reg + andq $(0x7FF), \scratch_reg + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask + jnc .Lnoflush_\@ + + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask + jmp .Lwrcr3_\@ + +.Lnoflush_\@: + SET_NOFLUSH_BIT \save_reg + +.Lwrcr3_\@: + /* + * The CR3 write could be avoided when not changing its value, + * but would require a CR3 read *and* a scratch register. + */ + movq \save_reg, %cr3 +.Lend_\@: +.endm + +#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req +.endm +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req +.endm +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req +.endm +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req +.endm +.macro RESTORE_CR3 scratch_reg:req save_reg:req +.endm + +#endif + +/* + * Mitigate Spectre v1 for conditional swapgs code paths. + * + * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to + * prevent a speculative swapgs when coming from kernel space. + * + * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path, + * to prevent the swapgs from getting speculatively skipped when coming from + * user space. + */ +.macro FENCE_SWAPGS_USER_ENTRY + ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER +.endm +.macro FENCE_SWAPGS_KERNEL_ENTRY + ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL +.endm + +#endif /* CONFIG_X86_64 */ + +/* + * This does 'call enter_from_user_mode' unless we can avoid it based on + * kernel config or using the static jump infrastructure. + */ +.macro CALL_enter_from_user_mode +#ifdef CONFIG_CONTEXT_TRACKING +#ifdef CONFIG_JUMP_LABEL + STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 +#endif + call enter_from_user_mode +.Lafter_call_\@: +#endif +.endm diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 000000000..8353348dd --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,432 @@ +/* + * common.c - C code for kernel entry and exit + * Copyright (c) 2015 Andrew Lutomirski + * GPL v2 + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/tracehook.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> +#include <linux/export.h> +#include <linux/context_tracking.h> +#include <linux/user-return-notifier.h> +#include <linux/nospec.h> +#include <linux/uprobes.h> +#include <linux/livepatch.h> +#include <linux/syscalls.h> + +#include <asm/desc.h> +#include <asm/traps.h> +#include <asm/vdso.h> +#include <linux/uaccess.h> +#include <asm/cpufeature.h> +#include <asm/nospec-branch.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +#ifdef CONFIG_CONTEXT_TRACKING +/* Called on entry from user mode with IRQs off. */ +__visible inline void enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); +} +#else +static inline void enter_from_user_mode(void) {} +#endif + +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + +/* + * Returns the syscall nr to run (which should match regs->orig_ax) or -1 + * to skip the syscall. + */ +static long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + + struct thread_info *ti = current_thread_info(); + unsigned long ret = 0; + bool emulated = false; + u32 work; + + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); + + work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + + if (unlikely(work & _TIF_SYSCALL_EMU)) + emulated = true; + + if ((emulated || (work & _TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + return -1L; + + if (emulated) + return -1L; + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp after ptrace, to catch any tracer changes. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + ret = __secure_computing(&sd); + if (ret == -1) + return ret; + } +#endif + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + do_audit_syscall_entry(regs, arch); + + return ret ?: regs->orig_ax; +} + +#define EXIT_TO_USERMODE_LOOP_FLAGS \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) + +static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) +{ + /* + * In order to return to user mode, we need to have IRQs off with + * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags + * can be set at any time on preemptable kernels if we have IRQs on, + * so we need to loop. Disabling preemption wouldn't help: doing the + * work to clear some of the flags can sleep. + */ + while (true) { + /* We have work to do. */ + local_irq_enable(); + + if (cached_flags & _TIF_NEED_RESCHED) + schedule(); + + if (cached_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (cached_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + + /* deal with pending signal delivery */ + if (cached_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (cached_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); + } + + if (cached_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + /* Disable IRQs and retry */ + local_irq_disable(); + + cached_flags = READ_ONCE(current_thread_info()->flags); + + if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + break; + } +} + +/* Called with IRQs disabled. */ +__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +{ + struct thread_info *ti = current_thread_info(); + u32 cached_flags; + + addr_limit_user_check(); + + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); + + cached_flags = READ_ONCE(ti->flags); + + if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + exit_to_usermode_loop(regs, cached_flags); + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + * + * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer + * special case only applies after poking regs and before the + * very next return to user mode. + */ + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); +#endif + + user_enter_irqoff(); + + mds_user_clear_cpu_buffers(); +} + +#define SYSCALL_EXIT_WORK_FLAGS \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + +static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) +{ + bool step; + + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); +} + +/* + * Called with IRQs on and fully valid regs. Returns with IRQs off in a + * state such that we can immediately switch to user mode. + */ +__visible inline void syscall_return_slowpath(struct pt_regs *regs) +{ + struct thread_info *ti = current_thread_info(); + u32 cached_flags = READ_ONCE(ti->flags); + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) + local_irq_enable(); + + rseq_syscall(regs); + + /* + * First do one-time work. If these work items are enabled, we + * want to run them exactly once per syscall exit with IRQs on. + */ + if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) + syscall_slow_exit_work(regs, cached_flags); + + local_irq_disable(); + prepare_exit_to_usermode(regs); +} + +#ifdef CONFIG_X86_64 +__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) +{ + struct thread_info *ti; + + enter_from_user_mode(); + local_irq_enable(); + ti = current_thread_info(); + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) + nr = syscall_trace_enter(regs); + + /* + * NB: Native and x32 syscalls are dispatched from the same + * table. The only functional difference is the x32 bit in + * regs->orig_ax, which changes the behavior of some syscalls. + */ + nr &= __SYSCALL_MASK; + if (likely(nr < NR_syscalls)) { + nr = array_index_nospec(nr, NR_syscalls); + regs->ax = sys_call_table[nr](regs); + } + + syscall_return_slowpath(regs); +} +#endif + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +/* + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does + * all entry and exit work and returns with IRQs off. This function is + * extremely hot in workloads that use it, and it's usually called from + * do_fast_syscall_32, so forcibly inline it to improve performance. + */ +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +{ + struct thread_info *ti = current_thread_info(); + unsigned int nr = (unsigned int)regs->orig_ax; + +#ifdef CONFIG_IA32_EMULATION + ti->status |= TS_COMPAT; +#endif + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { + /* + * Subtlety here: if ptrace pokes something larger than + * 2^32-1 into orig_ax, this truncates it. This may or + * may not be necessary, but it matches the old asm + * behavior. + */ + nr = syscall_trace_enter(regs); + } + + if (likely(nr < IA32_NR_syscalls)) { + nr = array_index_nospec(nr, IA32_NR_syscalls); +#ifdef CONFIG_IA32_EMULATION + regs->ax = ia32_sys_call_table[nr](regs); +#else + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that + * the high bits are zero. Make sure we zero-extend all + * of the args. + */ + regs->ax = ia32_sys_call_table[nr]( + (unsigned int)regs->bx, (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->si, + (unsigned int)regs->di, (unsigned int)regs->bp); +#endif /* CONFIG_IA32_EMULATION */ + } + + syscall_return_slowpath(regs); +} + +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) +{ + enter_from_user_mode(); + local_irq_enable(); + do_syscall_32_irqs_on(regs); +} + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible long do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + enter_from_user_mode(); + + local_irq_enable(); + + /* Fetch EBP from where the vDSO stashed it. */ + if ( +#ifdef CONFIG_X86_64 + /* + * Micro-optimization: the pointer we're following is explicitly + * 32 bits, so it can't be out of range. + */ + __get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#else + get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#endif + ) { + + /* User code screwed up. */ + local_irq_disable(); + regs->ax = -EFAULT; + prepare_exit_to_usermode(regs); + return 0; /* Keep it simple: use IRET. */ + } + + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs); + +#ifdef CONFIG_X86_64 + /* + * Opportunistic SYSRETL: if possible, try to return using SYSRETL. + * SYSRETL is available on all 64-bit CPUs, so we don't need to + * bother with SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + */ + return regs->cs == __USER32_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; +#else + /* + * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + * + * We don't allow syscalls at all from VM86 mode, but we still + * need to check VM, because we might be returning from sys_vm86. + */ + return static_cpu_has(X86_FEATURE_SEP) && + regs->cs == __USER_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; +#endif +} +#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S new file mode 100644 index 000000000..37d9016d4 --- /dev/null +++ b/arch/x86/entry/entry_32.S @@ -0,0 +1,1514 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1991,1992 Linus Torvalds + * + * entry_32.S contains the system-call and low-level fault and trap handling routines. + * + * Stack layout while running C code: + * ptrace needs to have all registers on the stack. + * If the order here is changed, it needs to be + * updated in fork.c:copy_process(), signal.c:do_signal(), + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + */ + +#include <linux/linkage.h> +#include <linux/err.h> +#include <asm/thread_info.h> +#include <asm/irqflags.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> +#include <asm/irq_vectors.h> +#include <asm/cpufeatures.h> +#include <asm/alternative-asm.h> +#include <asm/asm.h> +#include <asm/smap.h> +#include <asm/frame.h> +#include <asm/nospec-branch.h> + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#ifdef CONFIG_PREEMPT +# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +# define preempt_stop(clobbers) +# define resume_kernel restore_all_kernel +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#define PTI_SWITCH_MASK (1 << PAGE_SHIFT) + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs +.endm + +.macro POP_GS pop=0 +98: popl %gs + .if \pop <> 0 + add $\pop, %esp + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +/* Unconditionally switch to user cr3 */ +.macro SWITCH_TO_USER_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + movl %cr3, \scratch_reg + orl $PTI_SWITCH_MASK, \scratch_reg + movl \scratch_reg, %cr3 +.Lend_\@: +.endm + +.macro BUG_IF_WRONG_CR3 no_user_check=0 +#ifdef CONFIG_DEBUG_ENTRY + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + .if \no_user_check == 0 + /* coming from usermode? */ + testl $SEGMENT_RPL_MASK, PT_CS(%esp) + jz .Lend_\@ + .endif + /* On user-cr3? */ + movl %cr3, %eax + testl $PTI_SWITCH_MASK, %eax + jnz .Lend_\@ + /* From userspace with kernel cr3 - BUG */ + ud2 +.Lend_\@: +#endif +.endm + +/* + * Switch to kernel cr3 if not already loaded and return current cr3 in + * \scratch_reg + */ +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + movl %cr3, \scratch_reg + /* Test if we are already on kernel CR3 */ + testl $PTI_SWITCH_MASK, \scratch_reg + jz .Lend_\@ + andl $(~PTI_SWITCH_MASK), \scratch_reg + movl \scratch_reg, %cr3 + /* Return original CR3 in \scratch_reg */ + orl $PTI_SWITCH_MASK, \scratch_reg +.Lend_\@: +.endm + +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 + cld + PUSH_GS + pushl %fs + pushl %es + pushl %ds + pushl \pt_regs_ax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx + + /* Switch to kernel stack if necessary */ +.if \switch_stacks > 0 + SWITCH_TO_KERNEL_STACK +.endif + +.endm + +.macro SAVE_ALL_NMI cr3_reg:req + SAVE_ALL + + BUG_IF_WRONG_CR3 + + /* + * Now switch the CR3 when PTI is enabled. + * + * We can enter with either user or kernel cr3, the code will + * store the old cr3 in \cr3_reg and switches to the kernel cr3 + * if necessary. + */ + SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg + +.Lend_\@: +.endm + +.macro RESTORE_INT_REGS + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds +2: popl %es +3: popl %fs + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 5b) + _ASM_EXTABLE(3b, 6b) + POP_GS_EX +.endm + +.macro RESTORE_ALL_NMI cr3_reg:req pop=0 + /* + * Now switch the CR3 when PTI is enabled. + * + * We enter with kernel cr3 and switch the cr3 to the value + * stored on \cr3_reg, which is either a user or a kernel cr3. + */ + ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI + + testl $PTI_SWITCH_MASK, \cr3_reg + jz .Lswitched_\@ + + /* User cr3 in \cr3_reg - write it to hardware cr3 */ + movl \cr3_reg, %cr3 + +.Lswitched_\@: + + BUG_IF_WRONG_CR3 + + RESTORE_REGS pop=\pop +.endm + +.macro CHECK_AND_APPLY_ESPFIX +#ifdef CONFIG_X86_ESPFIX32 +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + + ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX + + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. + */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + jne .Lend_\@ # returning to user-space with LDT SS + + /* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the IRET: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + lss (%esp), %esp /* switch to espfix segment */ +.Lend_\@: +#endif /* CONFIG_X86_ESPFIX32 */ +.endm + +/* + * Called with pt_regs fully populated and kernel segments loaded, + * so we can access PER_CPU and use the integer registers. + * + * We need to be very careful here with the %esp switch, because an NMI + * can happen everywhere. If the NMI handler finds itself on the + * entry-stack, it will overwrite the task-stack and everything we + * copied there. So allocate the stack-frame on the task-stack and + * switch to it before we do any copying. + */ + +#define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) + +.macro SWITCH_TO_KERNEL_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + BUG_IF_WRONG_CR3 + + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + + /* + * %eax now contains the entry cr3 and we carry it forward in + * that register for the time this macro runs + */ + + /* + * The high bits of the CS dword (__csh) are used for + * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case + * hardware didn't do this for us. + */ + andl $(0x0000ffff), PT_CS(%esp) + + /* Are we on the entry stack? Bail out if not! */ + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %esp, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx + jae .Lend_\@ + + /* Load stack pointer into %esi and %edi */ + movl %esp, %esi + movl %esi, %edi + + /* Move %edi to the top of the entry stack */ + andl $(MASK_entry_stack), %edi + addl $(SIZEOF_entry_stack), %edi + + /* Load top of task-stack into %edi */ + movl TSS_entry2task_stack(%edi), %edi + + /* Special case - entry from kernel mode via entry stack */ +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS + movb PT_CS(%esp), %cl + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx +#else + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx +#endif + cmpl $USER_RPL, %ecx + jb .Lentry_from_kernel_\@ + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esi) + jz .Lcopy_pt_regs_\@ + + /* + * Stack-frame contains 4 additional segment registers when + * coming from VM86 mode + */ + addl $(4 * 4), %ecx + +#endif +.Lcopy_pt_regs_\@: + + /* Allocate frame on task-stack */ + subl %ecx, %edi + + /* Switch to task-stack */ + movl %edi, %esp + + /* + * We are now on the task-stack and can safely copy over the + * stack-frame + */ + shrl $2, %ecx + cld + rep movsl + + jmp .Lend_\@ + +.Lentry_from_kernel_\@: + + /* + * This handles the case when we enter the kernel from + * kernel-mode and %esp points to the entry-stack. When this + * happens we need to switch to the task-stack to run C code, + * but switch back to the entry-stack again when we approach + * iret and return to the interrupted code-path. This usually + * happens when we hit an exception while restoring user-space + * segment registers on the way back to user-space or when the + * sysenter handler runs with eflags.tf set. + * + * When we switch to the task-stack here, we can't trust the + * contents of the entry-stack anymore, as the exception handler + * might be scheduled out or moved to another CPU. Therefore we + * copy the complete entry-stack to the task-stack and set a + * marker in the iret-frame (bit 31 of the CS dword) to detect + * what we've done on the iret path. + * + * On the iret path we copy everything back and switch to the + * entry-stack, so that the interrupted kernel code-path + * continues on the same stack it was interrupted with. + * + * Be aware that an NMI can happen anytime in this code. + * + * %esi: Entry-Stack pointer (same as %esp) + * %edi: Top of the task stack + * %eax: CR3 on kernel entry + */ + + /* Calculate number of bytes on the entry stack in %ecx */ + movl %esi, %ecx + + /* %ecx to the top of entry-stack */ + andl $(MASK_entry_stack), %ecx + addl $(SIZEOF_entry_stack), %ecx + + /* Number of bytes on the entry stack to %ecx */ + sub %esi, %ecx + + /* Mark stackframe as coming from entry stack */ + orl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + + /* + * Test the cr3 used to enter the kernel and add a marker + * so that we can switch back to it before iret. + */ + testl $PTI_SWITCH_MASK, %eax + jz .Lcopy_pt_regs_\@ + orl $CS_FROM_USER_CR3, PT_CS(%esp) + + /* + * %esi and %edi are unchanged, %ecx contains the number of + * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate + * the stack-frame on task-stack and copy everything over + */ + jmp .Lcopy_pt_regs_\@ + +.Lend_\@: +.endm + +/* + * Switch back from the kernel stack to the entry stack. + * + * The %esp register must point to pt_regs on the task stack. It will + * first calculate the size of the stack-frame to copy, depending on + * whether we return to VM86 mode or not. With that it uses 'rep movsl' + * to copy the contents of the stack over to the entry stack. + * + * We must be very careful here, as we can't trust the contents of the + * task-stack once we switched to the entry-stack. When an NMI happens + * while on the entry-stack, the NMI handler will switch back to the top + * of the task stack, overwriting our stack-frame we are about to copy. + * Therefore we switch the stack only after everything is copied over. + */ +.macro SWITCH_TO_ENTRY_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp) + jz .Lcopy_pt_regs_\@ + + /* Additional 4 registers to copy when returning to VM86 mode */ + addl $(4 * 4), %ecx + +.Lcopy_pt_regs_\@: +#endif + + /* Initialize source and destination for movsl */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + subl %ecx, %edi + movl %esp, %esi + + /* Save future stack pointer in %ebx */ + movl %edi, %ebx + + /* Copy over the stack-frame */ + shrl $2, %ecx + cld + rep movsl + + /* + * Switch to entry-stack - needs to happen after everything is + * copied because the NMI handler will overwrite the task-stack + * when on entry-stack + */ + movl %ebx, %esp + +.Lend_\@: +.endm + +/* + * This macro handles the case when we return to kernel-mode on the iret + * path and have to switch back to the entry stack and/or user-cr3 + * + * See the comments below the .Lentry_from_kernel_\@ label in the + * SWITCH_TO_KERNEL_STACK macro for more details. + */ +.macro PARANOID_EXIT_TO_KERNEL_MODE + + /* + * Test if we entered the kernel with the entry-stack. Most + * likely we did not, because this code only runs on the + * return-to-kernel path. + */ + testl $CS_FROM_ENTRY_STACK, PT_CS(%esp) + jz .Lend_\@ + + /* Unlikely slow-path */ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp) + + /* Copy the remaining task-stack contents to entry-stack */ + movl %esp, %esi + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi + + /* Bytes on the task-stack to ecx */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx + subl %esi, %ecx + + /* Allocate stack-frame on entry-stack */ + subl %ecx, %edi + + /* + * Save future stack-pointer, we must not switch until the + * copy is done, otherwise the NMI handler could destroy the + * contents of the task-stack we are about to copy. + */ + movl %edi, %ebx + + /* Do the copy */ + shrl $2, %ecx + cld + rep movsl + + /* Safe to switch to entry-stack now */ + movl %ebx, %esp + + /* + * We came from entry-stack and need to check if we also need to + * switch back to user cr3. + */ + testl $CS_FROM_USER_CR3, PT_CS(%esp) + jz .Lend_\@ + + /* Clear marker from stack-frame */ + andl $(~CS_FROM_USER_CR3), PT_CS(%esp) + + SWITCH_TO_USER_CR3 scratch_reg=%eax + +.Lend_\@: +.endm +/* + * %eax: prev task + * %edx: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in struct inactive_task_frame + */ + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + pushfl + + /* switch stack */ + movl %esp, TASK_threadsp(%eax) + movl TASK_threadsp(%edx), %esp + +#ifdef CONFIG_STACKPROTECTOR + movl TASK_stack_canary(%edx), %ebx + movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset +#endif + +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + + /* restore callee-saved registers */ + popfl + popl %esi + popl %edi + popl %ebx + popl %ebp + + jmp __switch_to +END(__switch_to_asm) + +/* + * The unwinder expects the last frame on the stack to always be at the same + * offset from the end of the page, which allows it to validate the stack. + * Calling schedule_tail() directly would break that convention because its an + * asmlinkage function so its argument has to be pushed on the stack. This + * wrapper creates a proper "end of stack" frame header before the call. + */ +ENTRY(schedule_tail_wrapper) + FRAME_BEGIN + + pushl %eax + call schedule_tail + popl %eax + + FRAME_END + ret +ENDPROC(schedule_tail_wrapper) +/* + * A newly forked process directly context switches into this address. + * + * eax: prev task we switched from + * ebx: kernel thread func (NULL for user thread) + * edi: kernel thread arg + */ +ENTRY(ret_from_fork) + call schedule_tail_wrapper + + testl %ebx, %ebx + jnz 1f /* kernel threads are uncommon */ + +2: + /* When we fork, we trace the syscall return in the child, too. */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all + + /* kernel thread */ +1: movl %edi, %eax + CALL_NOSPEC %ebx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. + */ + movl $0, PT_EAX(%esp) + jmp 2b +END(ret_from_fork) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl %esp, %eax + call prepare_exit_to_usermode + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +.Lneed_resched: + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz restore_all_kernel + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all_kernel + call preempt_schedule_irq + jmp .Lneed_resched +END(resume_kernel) +#endif + +GLOBAL(__begin_SYSENTER_singlestep_region) +/* + * All code from here through __end_SYSENTER_singlestep_region is subject + * to being single-stepped if a user program sets TF and executes SYSENTER. + * There is absolutely nothing that we can do to prevent this from happening + * (thanks Intel!). To keep our handling of this situation as simple as + * possible, we handle TF just like AC and NT, except that our #DB handler + * will ignore all of the single-step traps generated in this range. + */ + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp .Lsysenter_past_esp +#endif + +/* + * 32-bit SYSENTER entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * if X86_FEATURE_SEP is available. This is the preferred system call + * entry on 32-bit systems. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old EIP (!!!), ESP, or EFLAGS. + * + * To avoid losing track of EFLAGS.VM (and thus potentially corrupting + * user and/or vm86 state), we explicitly disable the SYSENTER + * instruction in vm86 mode by reprogramming the MSRs. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + */ +ENTRY(entry_SYSENTER_32) + /* + * On entry-stack with all userspace-regs live - save and + * restore eflags and %eax to use it as scratch-reg for the cr3 + * switch. + */ + pushfl + pushl %eax + BUG_IF_WRONG_CR3 no_user_check=1 + SWITCH_TO_KERNEL_CR3 scratch_reg=%eax + popl %eax + popfl + + /* Stack empty again, switch to task stack */ + movl TSS_entry2task_stack(%esp), %esp + +.Lsysenter_past_esp: + pushl $__USER_DS /* pt_regs->ss */ + pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushfl /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ + pushl $__USER_CS /* pt_regs->cs */ + pushl $0 /* pt_regs->ip = 0 (placeholder) */ + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */ + + /* + * SYSENTER doesn't filter flags, so we need to clear NT, AC + * and TF ourselves. To save a few cycles, we can check whether + * either was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. + * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: + + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movl %esp, %eax + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + +/* Opportunistic SYSEXIT */ + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + + /* + * Setup entry stack - we keep the pointer in %eax and do the + * switch after almost all user-state is restored. + */ + + /* Load entry stack pointer and allocate frame for eflags/eax */ + movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax + subl $(2*4), %eax + + /* Copy eflags and eax to entry stack */ + movl PT_EFLAGS(%esp), %edi + movl PT_EAX(%esp), %esi + movl %edi, (%eax) + movl %esi, 4(%eax) + + /* Restore user registers and segments */ + movl PT_EIP(%esp), %edx /* pt_regs->ip */ + movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + + popl %ebx /* pt_regs->bx */ + addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + + /* Switch to entry stack */ + movl %eax, %esp + + /* Now ready to switch the cr3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax + + /* + * Restore all flags except IF. (We restore IF separately because + * STI gives a one-instruction window in which we won't be interrupted, + * whereas POPF does not.) + */ + btrl $X86_EFLAGS_IF_BIT, (%esp) + BUG_IF_WRONG_CR3 no_user_check=1 + popfl + popl %eax + + /* + * Return back to the vDSO, which will pop ecx and edx. + * Don't bother with DS and ES (they already contain __USER_DS). + */ + sti + sysexit + +.pushsection .fixup, "ax" +2: movl $0, PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b, 2b) + PTGS_TO_GS_EX + +.Lsysenter_fix_flags: + pushl $X86_EFLAGS_FIXED + popfl + jmp .Lsysenter_flags_fixed +GLOBAL(__end_SYSENTER_singlestep_region) +ENDPROC(entry_SYSENTER_32) + +/* + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by any 32-bit perform system calls. + * Instances of INT $0x80 can be found inline in various programs and + * libraries. It is also used by the vDSO's __kernel_vsyscall + * fallback for hardware that doesn't support a faster entry method. + * Restarted 32-bit system calls also fall back to INT $0x80 + * regardless of what instruction was originally used to do the system + * call. (64-bit programs can use INT $0x80 as well, but they can + * only run on 64-bit kernels and therefore land in + * entry_INT80_compat.) + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 + */ +ENTRY(entry_INT80_32) + ASM_CLAC + pushl %eax /* pt_regs->orig_ax */ + + SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */ + + /* + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. + */ + TRACE_IRQS_OFF + + movl %esp, %eax + call do_int80_syscall_32 +.Lsyscall_32_done: + +restore_all: + TRACE_IRQS_IRET + SWITCH_TO_ENTRY_STACK +.Lrestore_all_notrace: + CHECK_AND_APPLY_ESPFIX +.Lrestore_nocheck: + /* Switch back to user CR3 */ + SWITCH_TO_USER_CR3 scratch_reg=%eax + + BUG_IF_WRONG_CR3 + + /* Restore user state */ + RESTORE_REGS pop=4 # skip orig_eax/error_code +.Lirq_return: + /* + * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization + * when returning from IPI handler and when returning from + * scheduler to user-space. + */ + INTERRUPT_RETURN + +restore_all_kernel: + TRACE_IRQS_IRET + PARANOID_EXIT_TO_KERNEL_MODE + BUG_IF_WRONG_CR3 + RESTORE_REGS 4 + jmp .Lirq_return + +.section .fixup, "ax" +ENTRY(iret_exc ) + pushl $0 # no error code + pushl $do_iret_error + +#ifdef CONFIG_DEBUG_ENTRY + /* + * The stack-frame here is the one that iret faulted on, so its a + * return-to-user frame. We are on kernel-cr3 because we come here from + * the fixup code. This confuses the CR3 checker, so switch to user-cr3 + * as the checker expects it. + */ + pushl %eax + SWITCH_TO_USER_CR3 scratch_reg=%eax + popl %eax +#endif + + jmp common_exception +.previous + _ASM_EXTABLE(.Lirq_return, iret_exc) +ENDPROC(entry_INT80_32) + +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +#ifdef CONFIG_X86_LOCAL_APIC + .align 8 +ENTRY(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_spurious + .align 8 + .endr +END(spurious_entries_start) + +common_spurious: + ASM_CLAC + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + movl %esp, %eax + call smp_spurious_interrupt + jmp ret_from_intr +ENDPROC(common_spurious) +#endif + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ + + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + movl %esp, %eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL switch_stacks=1; \ + ENCODE_FRAME_POINTER; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ +ENDPROC(name) + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + +/* The include is where all of the SMP etc. interrupts come from */ +#include <asm/entry_arch.h> + +ENTRY(coprocessor_error) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_error + jmp common_exception +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + ASM_CLAC + pushl $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM +#else + pushl $do_simd_coprocessor_error +#endif + jmp common_exception +END(simd_coprocessor_error) + +ENTRY(device_not_available) + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp common_exception +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) +#endif + +ENTRY(overflow) + ASM_CLAC + pushl $0 + pushl $do_overflow + jmp common_exception +END(overflow) + +ENTRY(bounds) + ASM_CLAC + pushl $0 + pushl $do_bounds + jmp common_exception +END(bounds) + +ENTRY(invalid_op) + ASM_CLAC + pushl $0 + pushl $do_invalid_op + jmp common_exception +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp common_exception +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + ASM_CLAC + pushl $do_invalid_TSS + jmp common_exception +END(invalid_TSS) + +ENTRY(segment_not_present) + ASM_CLAC + pushl $do_segment_not_present + jmp common_exception +END(segment_not_present) + +ENTRY(stack_segment) + ASM_CLAC + pushl $do_stack_segment + jmp common_exception +END(stack_segment) + +ENTRY(alignment_check) + ASM_CLAC + pushl $do_alignment_check + jmp common_exception +END(alignment_check) + +ENTRY(divide_error) + ASM_CLAC + pushl $0 # no error code + pushl $do_divide_error + jmp common_exception +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + ASM_CLAC + pushl $0 + pushl machine_check_vector + jmp common_exception +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + ASM_CLAC + pushl $0 + pushl $do_spurious_interrupt_bug + jmp common_exception +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + + /* + * Check to see if we got the event in the critical + * region in xen_iret_direct, after we've reenabled + * events and checked for pending events. This simulates + * iret instruction's behaviour where it delivers a + * pending interrupt when enabling interrupts: + */ + movl PT_EIP(%esp), %eax + cmpl $xen_iret_start_crit, %eax + jb 1f + cmpl $xen_iret_end_crit, %eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp ret_from_intr +ENDPROC(xen_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we fix up by reattempting the load, and zeroing the segment + * register if the load fails. + * Category 2 we fix up by jumping to do_iret_error. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by maintaining a status value in EAX. + */ +ENTRY(xen_failsafe_callback) + pushl %eax + movl $1, %eax +1: mov 4(%esp), %ds +2: mov 8(%esp), %es +3: mov 12(%esp), %fs +4: mov 16(%esp), %gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ + testl %eax, %eax + popl %eax + lea 16(%esp), %esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + ENCODE_FRAME_POINTER + jmp ret_from_exception + +.section .fixup, "ax" +6: xorl %eax, %eax + movl %eax, 4(%esp) + jmp 1b +7: xorl %eax, %eax + movl %eax, 8(%esp) + jmp 2b +8: xorl %eax, %eax + movl %eax, 12(%esp) + jmp 3b +9: xorl %eax, %eax + movl %eax, 16(%esp) + jmp 4b +.previous + _ASM_EXTABLE(1b, 6b) + _ASM_EXTABLE(2b, 7b) + _ASM_EXTABLE(3b, 8b) + _ASM_EXTABLE(4b, 9b) +ENDPROC(xen_failsafe_callback) + +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_intr) + +BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, + hv_stimer0_vector_handler) + +#endif /* CONFIG_HYPERV */ + +ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault + ALIGN + jmp common_exception +END(page_fault) + +common_exception: + /* the function address is in %gs's slot on the stack */ + pushl %fs + pushl %es + pushl %ds + pushl %eax + movl $(__USER_DS), %eax + movl %eax, %ds + movl %eax, %es + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + SWITCH_TO_KERNEL_STACK + ENCODE_FRAME_POINTER + cld + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + CALL_NOSPEC %edi + jmp ret_from_exception +END(common_exception) + +ENTRY(debug) + /* + * Entry from sysenter is now handled in common_exception + */ + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_debug + jmp common_exception +END(debug) + +/* + * NMI is doubly nasty. It can happen on the first instruction of + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 + * switched stacks. We handle both conditions by simply checking whether we + * interrupted kernel code running on the SYSENTER stack. + */ +ENTRY(nmi) + ASM_CLAC + +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je .Lnmi_espfix_stack +#endif + + pushl %eax # pt_regs->orig_ax + SAVE_ALL_NMI cr3_reg=%edi + ENCODE_FRAME_POINTER + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. */ + call do_nmi + jmp .Lnmi_return + +.Lnmi_from_sysenter_stack: + /* + * We're on the SYSENTER stack. Switch off. No one (not even debug) + * is using the thread stack right now, so it's safe for us to use it. + */ + movl %esp, %ebx + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + call do_nmi + movl %ebx, %esp + +.Lnmi_return: + CHECK_AND_APPLY_ESPFIX + RESTORE_ALL_NMI cr3_reg=%edi pop=4 + jmp .Lirq_return + +#ifdef CONFIG_X86_ESPFIX32 +.Lnmi_espfix_stack: + /* + * create the pointer to lss back + */ + pushl %ss + pushl %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL_NMI cr3_reg=%edi + ENCODE_FRAME_POINTER + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx, %edx # zero error code + call do_nmi + RESTORE_ALL_NMI cr3_reg=%edi + lss 12+4(%esp), %esp # back to espfix stack + jmp .Lirq_return +#endif +END(nmi) + +ENTRY(int3) + ASM_CLAC + pushl $-1 # mark this as an int + + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_int3 + jmp ret_from_exception +END(int3) + +ENTRY(general_protection) + ASM_CLAC + pushl $do_general_protection + jmp common_exception +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + ASM_CLAC + pushl $do_async_page_fault + jmp common_exception +END(async_page_fault) +#endif + +ENTRY(rewind_stack_do_exit) + /* Prevent any naive code from trying to unwind to our caller. */ + xorl %ebp, %ebp + + movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp + + call do_exit +1: jmp 1b +END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S new file mode 100644 index 000000000..dfe26f3cf --- /dev/null +++ b/arch/x86/entry/entry_64.S @@ -0,0 +1,1751 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * + * entry.S contains the system-call and fault low-level handling routines. + * + * Some of this is documented in Documentation/x86/entry_64.txt + * + * A note on terminology: + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * + * Some macro usage: + * - ENTRY/END: Define functions in the symbol table. + * - TRACE_IRQ_*: Trace hardirq state for lock debugging. + * - idtentry: Define exception entry points. + */ +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/asm-offsets.h> +#include <asm/msr.h> +#include <asm/unistd.h> +#include <asm/thread_info.h> +#include <asm/hw_irq.h> +#include <asm/page_types.h> +#include <asm/irqflags.h> +#include <asm/paravirt.h> +#include <asm/percpu.h> +#include <asm/asm.h> +#include <asm/smap.h> +#include <asm/pgtable_types.h> +#include <asm/export.h> +#include <asm/frame.h> +#include <asm/nospec-branch.h> +#include <linux/err.h> + +#include "calling.h" + +.code64 +.section .entry.text, "ax" + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY + swapgs + sysretq +END(native_usergs_sysret64) +#endif /* CONFIG_PARAVIRT */ + +.macro TRACE_IRQS_FLAGS flags:req +#ifdef CONFIG_TRACE_IRQFLAGS + btl $9, \flags /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +.macro TRACE_IRQS_IRETQ + TRACE_IRQS_FLAGS EFLAGS(%rsp) +.endm + +/* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG + btl $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + +/* + * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. + * + * This is the only entry point used for 64-bit system calls. The + * hardware interface is reasonably well designed and the register to + * argument mapping Linux uses fits well with the registers that are + * available when SYSCALL is used. + * + * SYSCALL instructions can be found inlined in libc implementations as + * well as some other programs and libraries. There are also a handful + * of SYSCALL instructions in the vDSO used, for example, as a + * clock_gettimeofday fallback. + * + * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Registers on entry: + * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) + * rdi arg0 + * rsi arg1 + * rdx arg2 + * r10 arg3 (needs to be moved to rcx to conform to C ABI) + * r8 arg4 + * r9 arg5 + * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) + * + * Only called from user space. + * + * When user can change pt_regs->foo always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + + .pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline. This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address). So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * _entry_trampoline(%rip) refers to the start of the remapped) entry + * trampoline. We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ + SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY + swapgs + + /* Stash the user RSP. */ + movq %rsp, RSP_SCRATCH + + /* Note: using %rsp as a scratch reg. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Load the top of the task stack into RSP */ + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + + /* Start building the simulated IRET frame. */ + pushq $__USER_DS /* pt_regs->ss */ + pushq RSP_SCRATCH /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + + /* + * x86 lacks a near absolute jump, and we can't jump to the real + * entry text with a relative jump. We could push the target + * address and then use retq, but this destroys the pipeline on + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, + * spill RDI and restore it in a second-stage trampoline. + */ + pushq %rdi + movq $entry_SYSCALL_64_stage2, %rdi + JMP_NOSPEC %rdi +END(entry_SYSCALL_64_trampoline) + + .popsection + +ENTRY(entry_SYSCALL_64_stage2) + UNWIND_HINT_EMPTY + popq %rdi + jmp entry_SYSCALL_64_after_hwframe +END(entry_SYSCALL_64_stage2) + +ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + + swapgs + /* + * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it + * is not required to switch CR3. + */ + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* Construct struct pt_regs on stack */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) + pushq %rax /* pt_regs->orig_ax */ + + PUSH_AND_CLEAR_REGS rax=$-ENOSYS + + TRACE_IRQS_OFF + + /* IRQs are off. */ + movq %rax, %rdi + movq %rsp, %rsi + call do_syscall_64 /* returns with IRQs disabled */ + + TRACE_IRQS_IRETQ /* we're about to change IF */ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. If we're not, + * go to the slow exit path. + */ + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 + + cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ + jne swapgs_restore_regs_and_return_to_usermode + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * If width of "canonical tail" ever becomes variable, this will need + * to be updated to remain correct on both old and new CPUs. + * + * Change top bits to match most significant bit (47th or 56th bit + * depending on paging mode) in the address. + */ +#ifdef CONFIG_X86_5LEVEL + ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ + "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 +#else + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx +#endif + + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne swapgs_restore_regs_and_return_to_usermode + + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ + jne swapgs_restore_regs_and_return_to_usermode + + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ + jne swapgs_restore_regs_and_return_to_usermode + + /* + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot + * restore RF properly. If the slowpath sets it for whatever reason, we + * need to restore it correctly. + * + * SYSRET can restore TF, but unlike IRET, restoring TF results in a + * trap from userspace immediately after SYSRET. This would cause an + * infinite loop whenever #DB happens with register state that satisfies + * the opportunistic SYSRET conditions. For example, single-stepping + * this user code: + * + * movq $stuck_here, %rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz swapgs_restore_regs_and_return_to_usermode + + /* nothing to check for RSP */ + + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ + jne swapgs_restore_regs_and_return_to_usermode + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + POP_REGS pop_rdi=0 skip_r11rcx=1 + + /* + * Now all regs are restored except RSP and RDI. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + popq %rdi + popq %rsp + USERGS_SYSRET64 +END(entry_SYSCALL_64) + +/* + * %rdi: prev task + * %rsi: next task + */ +ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC + /* + * Save callee-saved registers + * This must match the order in inactive_task_frame + */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + + /* switch stack */ + movq %rsp, TASK_threadsp(%rdi) + movq TASK_threadsp(%rsi), %rsp + +#ifdef CONFIG_STACKPROTECTOR + movq TASK_stack_canary(%rsi), %rbx + movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset +#endif + +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + + /* restore callee-saved registers */ + popfq + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + + jmp __switch_to +END(__switch_to_asm) + +/* + * A newly forked process directly context switches into this address. + * + * rax: prev task we switched from + * rbx: kernel thread func (NULL for user thread) + * r12: kernel thread arg + */ +ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY + movq %rax, %rdi + call schedule_tail /* rdi: 'prev' task parameter */ + + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ + +2: + UNWIND_HINT_REGS + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + jmp swapgs_restore_regs_and_return_to_usermode + +1: + /* kernel thread */ + UNWIND_HINT_EMPTY + movq %r12, %rdi + CALL_NOSPEC %rbx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. + */ + movq $0, RAX(%rsp) + jmp 2b +END(ret_from_fork) + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS + pushq $(~vector+0x80) /* Note: always in signed byte range */ + jmp common_interrupt + .align 8 + vector=vector+1 + .endr +END(irq_entries_start) + + .align 8 +ENTRY(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + UNWIND_HINT_IRET_REGS + pushq $(~vector+0x80) /* Note: always in signed byte range */ + jmp common_spurious + .align 8 + vector=vector+1 + .endr +END(spurious_entries_start) + +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushq %rax + SAVE_FLAGS(CLBR_RAX) + testl $X86_EFLAGS_IF, %eax + jz .Lokay_\@ + ud2 +.Lokay_\@: + popq %rax +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. + */ +.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0 + DEBUG_ENTRY_ASSERT_IRQS_OFF + + .if \save_ret + /* + * If save_ret is set, the original stack contains one additional + * entry -- the return address. Therefore, move the address one + * entry below %rsp to \old_rsp. + */ + leaq 8(%rsp), \old_rsp + .else + movq %rsp, \old_rsp + .endif + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. + * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. + * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. + */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif + + .if \save_ret + /* + * Push the return address to the stack. This return address can + * be found at the "real" original RSP, which was offset by 8 at + * the beginning of this macro. + */ + pushq -8(\old_rsp) + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + +/* + * Interrupt entry helper function. + * + * Entry runs with interrupts off. Stack layout at entry: + * +----------------------------------------------------+ + * | regs->ss | + * | regs->rsp | + * | regs->eflags | + * | regs->cs | + * | regs->ip | + * +----------------------------------------------------+ + * | regs->orig_ax = ~(interrupt number) | + * +----------------------------------------------------+ + * | return address | + * +----------------------------------------------------+ + */ +ENTRY(interrupt_entry) + UNWIND_HINT_IRET_REGS offset=16 + ASM_CLAC + cld + + testb $3, CS-ORIG_RAX+8(%rsp) + jz 1f + SWAPGS + FENCE_SWAPGS_USER_ENTRY + /* + * Switch to the thread stack. The IRET frame and orig_ax are + * on the stack, as well as the return address. RDI..R12 are + * not (yet) on the stack and space has not (yet) been + * allocated for them. + */ + pushq %rdi + + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* + * We have RDI, return address, and orig_ax on the stack on + * top of the IRET frame. That means offset=24 + */ + UNWIND_HINT_IRET_REGS base=%rdi offset=24 + + pushq 7*8(%rdi) /* regs->ss */ + pushq 6*8(%rdi) /* regs->rsp */ + pushq 5*8(%rdi) /* regs->eflags */ + pushq 4*8(%rdi) /* regs->cs */ + pushq 3*8(%rdi) /* regs->ip */ + UNWIND_HINT_IRET_REGS + pushq 2*8(%rdi) /* regs->orig_ax */ + pushq 8(%rdi) /* return address */ + + movq (%rdi), %rdi + jmp 2f +1: + FENCE_SWAPGS_KERNEL_ENTRY +2: + PUSH_AND_CLEAR_REGS save_ret=1 + ENCODE_FRAME_POINTER 8 + + testb $3, CS+8(%rsp) + jz 1f + + /* + * IRQ from user mode. + * + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF is idempotent, + * the simplest way to handle it is to just call it twice if + * we enter from user mode. There's no reason to optimize this since + * TRACE_IRQS_OFF is a no-op if lockdep is off. + */ + TRACE_IRQS_OFF + + CALL_enter_from_user_mode + +1: + ENTER_IRQ_STACK old_rsp=%rdi save_ret=1 + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + + ret +END(interrupt_entry) +_ASM_NOKPROBE(interrupt_entry) + + +/* Interrupt entry/exit. */ + +/* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_spurious/interrupt. + */ +common_spurious: + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ + call interrupt_entry + UNWIND_HINT_REGS indirect=1 + call smp_spurious_interrupt /* rdi points to pt_regs */ + jmp ret_from_intr +END(common_spurious) +_ASM_NOKPROBE(common_spurious) + +/* common_interrupt is a hotpath. Align it */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ + call interrupt_entry + UNWIND_HINT_REGS indirect=1 + call do_IRQ /* rdi points to pt_regs */ + /* 0(%rsp): old RSP */ +ret_from_intr: + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + + LEAVE_IRQ_STACK + + testb $3, CS(%rsp) + jz retint_kernel + + /* Interrupt came from user space */ +GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode + TRACE_IRQS_IRETQ + +GLOBAL(swapgs_restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testb $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif + POP_REGS pop_rdi=0 + + /* + * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ + pushq 5*8(%rdi) /* RSP */ + pushq 4*8(%rdi) /* EFLAGS */ + pushq 3*8(%rdi) /* CS */ + pushq 2*8(%rdi) /* RIP */ + + /* Push user RDI on the trampoline stack. */ + pushq (%rdi) + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + /* Restore RDI. */ + popq %rdi + SWAPGS + INTERRUPT_RETURN + + +/* Returning to kernel space */ +retint_kernel: +#ifdef CONFIG_PREEMPT + /* Interrupts are off */ + /* Check if we need preemption */ + btl $9, EFLAGS(%rsp) /* were interrupts off? */ + jnc 1f +0: cmpl $0, PER_CPU_VAR(__preempt_count) + jnz 1f + call preempt_schedule_irq + jmp 0b +1: +#endif + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ + +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ + testb $3, CS(%rsp) + jz 1f + ud2 +1: +#endif + POP_REGS + addq $8, %rsp /* skip regs->orig_ax */ + /* + * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization + * when returning from IPI handler. + */ + INTERRUPT_RETURN + +ENTRY(native_iret) + UNWIND_HINT_IRET_REGS + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4, (SS-RIP)(%rsp) + jnz native_irq_return_ldt +#endif + +.global native_irq_return_iret +native_irq_return_iret: + /* + * This may fault. Non-paranoid faults on return to userspace are + * handled by fixup_bad_iret. These include #SS, #GP, and #NP. + * Double-faults due to espfix64 are handled in do_double_fault. + * Other faults here are fatal. + */ + iretq + +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: + /* + * We are running with user GSBASE. All GPRs contain their user + * values. We have a percpu ESPFIX stack that is eight slots + * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom + * of the ESPFIX stack. + * + * We clobber RAX and RDI in this code. We stash RDI on the + * normal stack and RAX on the ESPFIX stack. + * + * The ESPFIX stack layout we set up looks like this: + * + * --- top of ESPFIX stack --- + * SS + * RSP + * RFLAGS + * CS + * RIP <-- RSP points here when we're done + * RAX <-- espfix_waddr points here + * --- bottom of ESPFIX stack --- + */ + + pushq %rdi /* Stash user RDI */ + SWAPGS /* to kernel GS */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ + + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* user RAX */ + movq (1*8)(%rsp), %rax /* user RIP */ + movq %rax, (1*8)(%rdi) + movq (2*8)(%rsp), %rax /* user CS */ + movq %rax, (2*8)(%rdi) + movq (3*8)(%rsp), %rax /* user RFLAGS */ + movq %rax, (3*8)(%rdi) + movq (5*8)(%rsp), %rax /* user SS */ + movq %rax, (5*8)(%rdi) + movq (4*8)(%rsp), %rax /* user RSP */ + movq %rax, (4*8)(%rdi) + /* Now RAX == RSP. */ + + andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ + + /* + * espfix_stack[31:16] == 0. The page tables are set up such that + * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of + * espfix_waddr for any X. That is, there are 65536 RO aliases of + * the same page. Set up RSP so that RSP[31:16] contains the + * respective 16 bits of the /userspace/ RSP and RSP nonetheless + * still points to an RO alias of the ESPFIX stack. + */ + orq PER_CPU_VAR(espfix_stack), %rax + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + SWAPGS /* to user GS */ + popq %rdi /* Restore user RDI */ + + movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 + + /* + * At this point, we cannot write to the stack any more, but we can + * still read. + */ + popq %rax /* Restore user RAX */ + + /* + * RSP now points to an ordinary IRET frame, except that the page + * is read-only and RSP[31:16] are preloaded with the userspace + * values. We can now IRET back to userspace. + */ + jmp native_irq_return_iret +#endif +END(common_interrupt) +_ASM_NOKPROBE(common_interrupt) + +/* + * APIC interrupts. + */ +.macro apicinterrupt3 num sym do_sym +ENTRY(\sym) + UNWIND_HINT_IRET_REGS + pushq $~(\num) +.Lcommon_\sym: + call interrupt_entry + UNWIND_HINT_REGS indirect=1 + call \do_sym /* rdi points to pt_regs */ + jmp ret_from_intr +END(\sym) +_ASM_NOKPROBE(\sym) +.endm + +/* Make sure APIC interrupt handlers end up in the irqentry section: */ +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection + +.macro apicinterrupt num sym do_sym +PUSH_SECTION_IRQENTRY +apicinterrupt3 \num \sym \do_sym +POP_SECTION_IRQENTRY +.endm + +#ifdef CONFIG_SMP +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt +#endif + +apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi + +#ifdef CONFIG_HAVE_KVM +apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi +apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD +apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt +#endif + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt +#endif + +/* + * Exception entry points. + */ +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0 +ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif + + ASM_CLAC + + .if \has_error_code == 0 + pushq $-1 /* ORIG_RAX: no syscall to restart */ + .endif + + .if \paranoid == 1 + testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */ + jnz .Lfrom_usermode_switch_stack_\@ + .endif + + .if \create_gap == 1 + /* + * If coming from kernel space, create a 6-word gap to allow the + * int3 handler to emulate a call instruction. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_\@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_\@: + .endif + + .if \paranoid + call paranoid_entry + .else + call error_entry + .endif + UNWIND_HINT_REGS + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq %rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi, %esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + .endif + + /* these procedures expect "no swapgs" flag in ebx */ + .if \paranoid + jmp paranoid_exit + .else + jmp error_exit + .endif + + .if \paranoid == 1 + /* + * Entry from userspace. Switch stacks and treat it + * as a normal entry. This means that paranoid handlers + * run in real process context if user_mode(regs). + */ +.Lfrom_usermode_switch_stack_\@: + call error_entry + + movq %rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi, %esi /* no error code */ + .endif + + call \do_sym + + jmp error_exit + .endif +_ASM_NOKPROBE(\sym) +END(\sym) +.endm + +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* + * Reload gs selector with exception handling + * edi: new selector + */ +ENTRY(native_load_gs_index) + FRAME_BEGIN + pushfq + DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + TRACE_IRQS_OFF + SWAPGS +.Lgs_change: + movl %edi, %gs +2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE + SWAPGS + TRACE_IRQS_FLAGS (%rsp) + popfq + FRAME_END + ret +ENDPROC(native_load_gs_index) +EXPORT_SYMBOL(native_load_gs_index) + + _ASM_EXTABLE(.Lgs_change, bad_gs) + .section .fixup, "ax" + /* running with kernelgs */ +bad_gs: + SWAPGS /* switch back to user gs */ +.macro ZAP_GS + /* This can't be a string because the preprocessor needs to see it. */ + movl $__USER_DS, %eax + movl %eax, %gs +.endm + ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG + xorl %eax, %eax + movl %eax, %gs + jmp 2b + .previous + +/* Call softirq on interrupt stack. Interrupts are off. */ +ENTRY(do_softirq_own_stack) + pushq %rbp + mov %rsp, %rbp + ENTER_IRQ_STACK regs=0 old_rsp=%r11 + call __do_softirq + LEAVE_IRQ_STACK regs=0 + leaveq + ret +ENDPROC(do_softirq_own_stack) + +#ifdef CONFIG_XEN +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 + +/* + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. + */ +ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ + UNWIND_HINT_FUNC + movq %rdi, %rsp /* we don't return, adjust the stack frame */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 + call xen_evtchn_do_upcall + LEAVE_IRQ_STACK + +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp error_exit +END(xen_do_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. + */ +ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY + movl %ds, %ecx + cmpw %cx, 0x10(%rsp) + jne 1f + movl %es, %ecx + cmpw %cx, 0x18(%rsp) + jne 1f + movl %fs, %ecx + cmpw %cx, 0x20(%rsp) + jne 1f + movl %gs, %ecx + cmpw %cx, 0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $0 /* RIP */ + UNWIND_HINT_IRET_REGS offset=8 + jmp general_protection +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + UNWIND_HINT_IRET_REGS + pushq $-1 /* orig_ax = -1 => not a system call */ + PUSH_AND_CLEAR_REGS + ENCODE_FRAME_POINTER + jmp error_exit +END(xen_failsafe_callback) + +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + xen_hvm_callback_vector xen_evtchn_do_upcall + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + hyperv_callback_vector hyperv_vector_handler + +apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ + hyperv_reenlightenment_vector hyperv_reenlightenment_intr + +apicinterrupt3 HYPERV_STIMER0_VECTOR \ + hv_stimer0_callback_vector hv_stimer0_vector_handler +#endif /* CONFIG_HYPERV */ + +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 create_gap=1 +idtentry stack_segment do_stack_segment has_error_code=1 + +#ifdef CONFIG_XEN +idtentry xennmi do_nmi has_error_code=0 +idtentry xendebug do_debug has_error_code=0 +#endif + +idtentry general_protection do_general_protection has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 + +#ifdef CONFIG_KVM_GUEST +idtentry async_page_fault do_async_page_fault has_error_code=1 +#endif + +#ifdef CONFIG_X86_MCE +idtentry machine_check do_mce has_error_code=0 paranoid=1 +#endif + +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + UNWIND_HINT_FUNC + cld + PUSH_AND_CLEAR_REGS save_ret=1 + ENCODE_FRAME_POINTER 8 + movl $1, %ebx + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx + +1: + /* + * Always stash CR3 in %r14. This value will be restored, + * verbatim, at exit. Needed if paranoid_entry interrupted + * another entry that already switched to the user CR3 value + * but has not yet returned to userspace. + * + * This is also why CS (stashed in the "iret frame" by the + * hardware at entry) can not be used: this may be a return + * to kernel code, but with a user CR3 value. + */ + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + + /* + * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an + * unconditional CR3 write, even in the PTI case. So do an lfence + * to prevent GS speculation, regardless of whether PTI is enabled. + */ + FENCE_SWAPGS_KERNEL_ENTRY + + ret +END(paranoid_entry) + +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. + * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + * + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + */ +ENTRY(paranoid_exit) + UNWIND_HINT_REGS + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF_DEBUG + testl %ebx, %ebx /* swapgs needed? */ + jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + SWAPGS_UNSAFE_STACK + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel +END(paranoid_exit) + +/* + * Save all registers in pt_regs, and switch GS if needed. + */ +ENTRY(error_entry) + UNWIND_HINT_FUNC + cld + PUSH_AND_CLEAR_REGS save_ret=1 + ENCODE_FRAME_POINTER 8 + testb $3, CS+8(%rsp) + jz .Lerror_kernelspace + + /* + * We entered from user mode or we're pretending to have entered + * from user mode due to an IRET fault. + */ + SWAPGS + FENCE_SWAPGS_USER_ENTRY + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + +.Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ + ENCODE_FRAME_POINTER + pushq %r12 + + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). + */ + TRACE_IRQS_OFF + CALL_enter_from_user_mode + ret + +.Lerror_entry_done_lfence: + FENCE_SWAPGS_KERNEL_ENTRY +.Lerror_entry_done: + TRACE_IRQS_OFF + ret + + /* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. + */ +.Lerror_kernelspace: + leaq native_irq_return_iret(%rip), %rcx + cmpq %rcx, RIP+8(%rsp) + je .Lerror_bad_iret + movl %ecx, %eax /* zero extend */ + cmpq %rax, RIP+8(%rsp) + je .Lbstep_iret + cmpq $.Lgs_change, RIP+8(%rsp) + jne .Lerror_entry_done_lfence + + /* + * hack: .Lgs_change can fail with user gsbase. If this happens, fix up + * gsbase and proceed. We'll fix up the exception and land in + * .Lgs_change's error handler with kernel gsbase. + */ + SWAPGS + FENCE_SWAPGS_USER_ENTRY + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + jmp .Lerror_entry_done + +.Lbstep_iret: + /* Fix truncated RIP */ + movq %rcx, RIP+8(%rsp) + /* fall through */ + +.Lerror_bad_iret: + /* + * We came from an IRET to user mode, so we have user + * gsbase and CR3. Switch to kernel gsbase and CR3: + */ + SWAPGS + FENCE_SWAPGS_USER_ENTRY + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + + /* + * Pretend that the exception came from user mode: set up pt_regs + * as if we faulted immediately after IRET. + */ + mov %rsp, %rdi + call fixup_bad_iret + mov %rax, %rsp + jmp .Lerror_entry_from_usermode_after_swapgs +END(error_entry) + +ENTRY(error_exit) + UNWIND_HINT_REGS + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + testb $3, CS(%rsp) + jz retint_kernel + jmp retint_user +END(error_exit) + +/* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. + * + * Registers: + * %r14: Used to save/restore the CR3 of the interrupted context + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + */ +ENTRY(nmi) + UNWIND_HINT_IRET_REGS + + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into an "outermost" location on the + * stack + * o Copy the interrupt frame into an "iret" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "iret" location to jump to the repeat_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + * + * However, espfix prevents us from directly returning to userspace + * with a single IRET instruction. Similarly, IRET to user mode + * can fault. We therefore handle NMIs from user space like + * other IST entries. + */ + + ASM_CLAC + + /* Use %rdx as our temp variable throughout */ + pushq %rdx + + testb $3, CS-RIP+8(%rsp) + jz .Lnmi_from_kernel + + /* + * NMI from user mode. We need to run on the thread stack, but we + * can't go through the normal entry paths: NMIs are masked, and + * we don't want to enable interrupts, because then we'll end + * up in an awkward situation in which IRQs are on but NMIs + * are off. + * + * We also must not push anything to the stack before switching + * stacks lest we corrupt the "NMI executing" variable. + */ + + swapgs + cld + FENCE_SWAPGS_USER_ENTRY + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 + pushq 5*8(%rdx) /* pt_regs->ss */ + pushq 4*8(%rdx) /* pt_regs->rsp */ + pushq 3*8(%rdx) /* pt_regs->flags */ + pushq 2*8(%rdx) /* pt_regs->cs */ + pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS + pushq $-1 /* pt_regs->orig_ax */ + PUSH_AND_CLEAR_REGS rdx=(%rdx) + ENCODE_FRAME_POINTER + + /* + * At this point we no longer need to worry about stack damage + * due to nesting -- we're on the normal thread stack and we're + * done with the NMI stack. + */ + + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi + + /* + * Return back to user mode. We must *not* do the normal exit + * work, because we don't want to enable interrupts. + */ + jmp swapgs_restore_regs_and_return_to_usermode + +.Lnmi_from_kernel: + /* + * Here's what our stack frame will look like: + * +---------------------------------------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +---------------------------------------------------------+ + * | temp storage for rdx | + * +---------------------------------------------------------+ + * | "NMI executing" variable | + * +---------------------------------------------------------+ + * | iret SS } Copied from "outermost" frame | + * | iret Return RSP } on each loop iteration; overwritten | + * | iret RFLAGS } by a nested NMI to force another | + * | iret CS } iteration if needed. | + * | iret RIP } | + * +---------------------------------------------------------+ + * | outermost SS } initialized in first_nmi; | + * | outermost Return RSP } will not be changed before | + * | outermost RFLAGS } NMI processing is done. | + * | outermost CS } Copied to "iret" frame on each | + * | outermost RIP } iteration. | + * +---------------------------------------------------------+ + * | pt_regs | + * +---------------------------------------------------------+ + * + * The "original" frame is used by hardware. Before re-enabling + * NMIs, we need to be done with it, and we need to leave enough + * space for the asm code here. + * + * We return by executing IRET while RSP points to the "iret" frame. + * That will either return for real or it will loop back into NMI + * processing. + * + * The "outermost" frame is copied to the "iret" frame on each + * iteration of the loop, so each iteration starts with the "iret" + * frame pointing to the final return target. + */ + + /* + * Determine whether we're a nested NMI. + * + * If we interrupted kernel code between repeat_nmi and + * end_repeat_nmi, then we are a nested NMI. We must not + * modify the "iret" frame because it's being written by + * the outer NMI. That's okay; the outer NMI handler is + * about to about to call do_nmi anyway, so we can just + * resume the outer NMI. + */ + + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out +1: + + /* + * Now check "NMI executing". If it's set, then we're nested. + * This will not detect if we interrupted an outer NMI just + * before IRET. + */ + cmpl $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. This covers + * the case where we interrupt an outer NMI after it clears + * "NMI executing" but before IRET. We need to be careful, though: + * there is one case in which RSP could point to the NMI stack + * despite there being no NMI active: naughty userspace controls + * RSP at the very beginning of the SYSCALL targets. We can + * pull a fast one on naughty userspace, though: we program + * SYSCALL to mask DF, so userspace cannot cause DF to be set + * if it controls the kernel's RSP. We set DF before we clear + * "NMI executing". + */ + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ + cmpq %rdx, 4*8(%rsp) + /* If the stack pointer is above the NMI stack, this is a normal NMI */ + ja first_nmi + + subq $EXCEPTION_STKSZ, %rdx + cmpq %rdx, 4*8(%rsp) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi + + /* Ah, it is within the NMI stack. */ + + testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) + jz first_nmi /* RSP was user controlled. */ + + /* This is a nested NMI. */ + +nested_nmi: + /* + * Modify the "iret" frame to point to repeat_nmi, forcing another + * iteration of NMI handling. + */ + subq $8, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi + + /* Put stack back */ + addq $(6*8), %rsp + +nested_nmi_out: + popq %rdx + + /* We are returning to kernel mode, so this cannot result in a fault. */ + iretq + +first_nmi: + /* Restore rdx. */ + movq (%rsp), %rdx + + /* Make room for "NMI executing". */ + pushq $0 + + /* Leave room for the "iret" frame */ + subq $(5*8), %rsp + + /* Copy the "original" frame to the "outermost" frame */ + .rept 5 + pushq 11*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS + + /* Everything up to here is safe from nested NMIs */ + +#ifdef CONFIG_DEBUG_ENTRY + /* + * For ease of testing, unmask NMIs right away. Disabled by + * default because IRET is very expensive. + */ + pushq $0 /* SS */ + pushq %rsp /* RSP (minus 8 because of the previous push) */ + addq $8, (%rsp) /* Fix up RSP */ + pushfq /* RFLAGS */ + pushq $__KERNEL_CS /* CS */ + pushq $1f /* RIP */ + iretq /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS +1: +#endif + +repeat_nmi: + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. + * + * RSP is pointing to "outermost RIP". gsbase is unknown, but, if + * we're repeating an NMI, gsbase has the same value that it had on + * the first iteration. paranoid_entry will load the kernel + * gsbase if needed before we call do_nmi. "NMI executing" + * is zero. + */ + movq $1, 10*8(%rsp) /* Set "NMI executing". */ + + /* + * Copy the "outermost" frame to the "iret" frame. NMIs that nest + * here must not modify the "iret" frame while we're writing to + * it or it will end up containing garbage. + */ + addq $(10*8), %rsp + .rept 5 + pushq -6*8(%rsp) + .endr + subq $(5*8), %rsp +end_repeat_nmi: + + /* + * Everything below this point can be preempted by a nested NMI. + * If this happens, then the inner NMI will change the "iret" + * frame to point back to repeat_nmi. + */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ + + /* + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ + call paranoid_entry + UNWIND_HINT_REGS + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi + + /* Always restore stashed CR3 value (see paranoid_entry) */ + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + POP_REGS + + /* + * Skip orig_ax and the "outermost" frame to point RSP at the "iret" + * at the "iret" frame. + */ + addq $6*8, %rsp + + /* + * Clear "NMI executing". Set DF first so that we can easily + * distinguish the remaining code between here and IRET from + * the SYSCALL entry and exit paths. + * + * We arguably should just inspect RIP instead, but I (Andy) wrote + * this code when I had the misapprehension that Xen PV supported + * NMIs, and Xen PV would break that approach. + */ + std + movq $0, 5*8(%rsp) /* clear "NMI executing" */ + + /* + * iretq reads the "iret" frame and exits the NMI stack in a + * single instruction. We are returning to kernel mode, so this + * cannot result in a fault. Similarly, we don't need to worry + * about espfix64 on the way back to kernel mode. + */ + iretq +END(nmi) + +ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY + mov $-ENOSYS, %eax + sysret +END(ignore_sysret) + +ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC + /* Prevent any naive code from trying to unwind to our caller. */ + xorl %ebp, %ebp + + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_REGS + + call do_exit +END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S new file mode 100644 index 000000000..40d2834a8 --- /dev/null +++ b/arch/x86/entry/entry_64_compat.S @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +#include "calling.h" +#include <asm/asm-offsets.h> +#include <asm/current.h> +#include <asm/errno.h> +#include <asm/ia32_unistd.h> +#include <asm/thread_info.h> +#include <asm/segment.h> +#include <asm/irqflags.h> +#include <asm/asm.h> +#include <asm/smap.h> +#include <linux/linkage.h> +#include <linux/err.h> + + .section .entry.text, "ax" + +/* + * 32-bit SYSENTER entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on Intel CPUs. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old RIP (!!!), RSP, or RFLAGS. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + */ +ENTRY(entry_SYSENTER_compat) + /* Interrupts are off on entry. */ + SWAPGS + + /* We are about to clobber %rsp anyway, clobbering here is OK */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp (stashed in bp) */ + + /* + * Push flags. This is nasty. First, interrupts are currently + * off, but we need pt_regs->flags to have IF set. Second, even + * if TF was set when SYSENTER started, it's clear by now. We fix + * that later using TIF_SINGLESTEP. + */ + pushfq /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq $0 /* pt_regs->ip = 0 (placeholder) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 = 0 */ + xorl %r8d, %r8d /* nospec r8 */ + pushq $0 /* pt_regs->r9 = 0 */ + xorl %r9d, %r9d /* nospec r9 */ + pushq $0 /* pt_regs->r10 = 0 */ + xorl %r10d, %r10d /* nospec r10 */ + pushq $0 /* pt_regs->r11 = 0 */ + xorl %r11d, %r11d /* nospec r11 */ + pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ + xorl %ebp, %ebp /* nospec rbp */ + pushq $0 /* pt_regs->r12 = 0 */ + xorl %r12d, %r12d /* nospec r12 */ + pushq $0 /* pt_regs->r13 = 0 */ + xorl %r13d, %r13d /* nospec r13 */ + pushq $0 /* pt_regs->r14 = 0 */ + xorl %r14d, %r14d /* nospec r14 */ + pushq $0 /* pt_regs->r15 = 0 */ + xorl %r15d, %r15d /* nospec r15 */ + cld + + /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC + * ourselves. To save a few cycles, we can check whether + * either was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. + * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: + + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + jmp sysret32_from_system_call + +.Lsysenter_fix_flags: + pushq $X86_EFLAGS_FIXED + popfq + jmp .Lsysenter_flags_fixed +GLOBAL(__end_entry_SYSENTER_compat) +ENDPROC(entry_SYSENTER_compat) + +/* + * 32-bit SYSCALL entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on AMD CPUs. + * + * The SYSCALL instruction, in principle, should *only* occur in the + * vDSO. In practice, it appears that this really is the case. + * As evidence: + * + * - The calling convention for SYSCALL has changed several times without + * anyone noticing. + * + * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything + * user task that did SYSCALL without immediately reloading SS + * would randomly crash. + * + * - Most programmers do not directly target AMD CPUs, and the 32-bit + * SYSCALL instruction does not exist on Intel CPUs. Even on AMD + * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels + * because the SYSCALL instruction in legacy/native 32-bit mode (as + * opposed to compat mode) is sufficiently poorly designed as to be + * essentially unusable. + * + * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves + * RFLAGS to R11, then loads new SS, CS, and RIP from previously + * programmed MSRs. RFLAGS gets masked by a value from another MSR + * (so CLD and CLAC are not needed). SYSCALL does not save anything on + * the stack and does not change RSP. + * + * Note: RFLAGS saving+masking-with-MSR happens only in Long mode + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + */ +ENTRY(entry_SYSCALL_compat) + /* Interrupts are off on entry. */ + swapgs + + /* Stash user ESP */ + movl %esp, %r8d + + /* Use %rsp as scratch reg. User ESP is stashed in r8 */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Switch to the kernel stack */ + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + xorl %esi, %esi /* nospec si */ + pushq %rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ + pushq %rbp /* pt_regs->cx (stashed in bp) */ + xorl %ecx, %ecx /* nospec cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 = 0 */ + xorl %r8d, %r8d /* nospec r8 */ + pushq $0 /* pt_regs->r9 = 0 */ + xorl %r9d, %r9d /* nospec r9 */ + pushq $0 /* pt_regs->r10 = 0 */ + xorl %r10d, %r10d /* nospec r10 */ + pushq $0 /* pt_regs->r11 = 0 */ + xorl %r11d, %r11d /* nospec r11 */ + pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ + xorl %ebp, %ebp /* nospec rbp */ + pushq $0 /* pt_regs->r12 = 0 */ + xorl %r12d, %r12d /* nospec r12 */ + pushq $0 /* pt_regs->r13 = 0 */ + xorl %r13d, %r13d /* nospec r13 */ + pushq $0 /* pt_regs->r14 = 0 */ + xorl %r14d, %r14d /* nospec r14 */ + pushq $0 /* pt_regs->r15 = 0 */ + xorl %r15d, %r15d /* nospec r15 */ + + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + + /* Opportunistic SYSRET */ +sysret32_from_system_call: + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ + movq RBP(%rsp), %rbp /* pt_regs->rbp */ + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ + movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ + addq $RAX, %rsp /* Skip r8-r15 */ + popq %rax /* pt_regs->rax */ + popq %rdx /* Skip pt_regs->cx */ + popq %rdx /* pt_regs->dx */ + popq %rsi /* pt_regs->si */ + popq %rdi /* pt_regs->di */ + + /* + * USERGS_SYSRET32 does: + * GSBASE = user's GS base + * EIP = ECX + * RFLAGS = R11 + * CS = __USER32_CS + * SS = __USER_DS + * + * ECX will not match pt_regs->cx, but we're returning to a vDSO + * trampoline that will fix up RCX, so this is okay. + * + * R12-R15 are callee-saved, so they contain whatever was in them + * when the system call started, which is already known to user + * code. We zero R8-R10 to avoid info leaks. + */ + movq RSP-ORIG_RAX(%rsp), %rsp + + /* + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored + * on the process stack which is not mapped to userspace and + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 + * switch until after after the last reference to the process + * stack. + * + * %r8/%r9 are zeroed before the sysret, thus safe to clobber. + */ + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 + + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + swapgs + sysretl +END(entry_SYSCALL_compat) + +/* + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 + */ +ENTRY(entry_INT80_compat) + /* + * Interrupts are off on entry. + */ + ASM_CLAC /* Do this early to minimize exposure */ + SWAPGS + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + + /* switch to thread stack expects orig_ax and rdi to be pushed */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + /* In the Xen PV case we already run on the thread stack. */ + ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + pushq 6*8(%rdi) /* regs->ss */ + pushq 5*8(%rdi) /* regs->rsp */ + pushq 4*8(%rdi) /* regs->eflags */ + pushq 3*8(%rdi) /* regs->cs */ + pushq 2*8(%rdi) /* regs->ip */ + pushq 1*8(%rdi) /* regs->orig_ax */ + pushq (%rdi) /* pt_regs->di */ +.Lint80_keep_stack: + + pushq %rsi /* pt_regs->si */ + xorl %esi, %esi /* nospec si */ + pushq %rdx /* pt_regs->dx */ + xorl %edx, %edx /* nospec dx */ + pushq %rcx /* pt_regs->cx */ + xorl %ecx, %ecx /* nospec cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + xorl %r8d, %r8d /* nospec r8 */ + pushq %r9 /* pt_regs->r9 */ + xorl %r9d, %r9d /* nospec r9 */ + pushq %r10 /* pt_regs->r10*/ + xorl %r10d, %r10d /* nospec r10 */ + pushq %r11 /* pt_regs->r11 */ + xorl %r11d, %r11d /* nospec r11 */ + pushq %rbx /* pt_regs->rbx */ + xorl %ebx, %ebx /* nospec rbx */ + pushq %rbp /* pt_regs->rbp */ + xorl %ebp, %ebp /* nospec rbp */ + pushq %r12 /* pt_regs->r12 */ + xorl %r12d, %r12d /* nospec r12 */ + pushq %r13 /* pt_regs->r13 */ + xorl %r13d, %r13d /* nospec r13 */ + pushq %r14 /* pt_regs->r14 */ + xorl %r14d, %r14d /* nospec r14 */ + pushq %r15 /* pt_regs->r15 */ + xorl %r15d, %r15d /* nospec r15 */ + cld + + /* + * User mode is traced as though IRQs are on, and the interrupt + * gate turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_int80_syscall_32 +.Lsyscall_32_done: + + /* Go back to user mode. */ + TRACE_IRQS_ON + jmp swapgs_restore_regs_and_return_to_usermode +END(entry_INT80_compat) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c new file mode 100644 index 000000000..aa3336a7c --- /dev/null +++ b/arch/x86/entry/syscall_32.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* System call table for i386. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> +#include <asm/syscall.h> + +#ifdef CONFIG_IA32_EMULATION +/* On X86_64, we use struct pt_regs * to pass parameters to syscalls */ +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); + +/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ +extern asmlinkage long sys_ni_syscall(const struct pt_regs *); + +#else /* CONFIG_IA32_EMULATION */ +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#endif /* CONFIG_IA32_EMULATION */ + +#include <asm/syscalls_32.h> +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, qual) [nr] = sym, + +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, +#include <asm/syscalls_32.h> +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c new file mode 100644 index 000000000..d5252bc1e --- /dev/null +++ b/arch/x86/entry/syscall_64.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* System call table for x86-64. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> +#include <asm/syscall.h> + +/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ +extern asmlinkage long sys_ni_syscall(const struct pt_regs *); +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); +#include <asm/syscalls_64.h> +#undef __SYSCALL_64 + +#define __SYSCALL_64(nr, sym, qual) [nr] = sym, + +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_64.h> +}; diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile new file mode 100644 index 000000000..6fb9b57ed --- /dev/null +++ b/arch/x86/entry/syscalls/Makefile @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: GPL-2.0 +out := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ + $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') + +syscall32 := $(srctree)/$(src)/syscall_32.tbl +syscall64 := $(srctree)/$(src)/syscall_64.tbl + +syshdr := $(srctree)/$(src)/syscallhdr.sh +systbl := $(srctree)/$(src)/syscalltbl.sh + +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ + '$(syshdr_abi_$(basetarget))' \ + '$(syshdr_pfx_$(basetarget))' \ + '$(syshdr_offset_$(basetarget))' +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + +quiet_cmd_hypercalls = HYPERCALLS $@ + cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^) + +syshdr_abi_unistd_32 := i386 +$(uapi)/unistd_32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_32_ia32 := i386 +syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_x32 := common,x32 +syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT +$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64_x32 := x32 +syshdr_pfx_unistd_64_x32 := x32_ +$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +$(out)/syscalls_32.h: $(syscall32) $(systbl) + $(call if_changed,systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) + $(call if_changed,systbl) + +$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh + $(call if_changed,hypercalls) + +$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h + +uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h +syshdr-y += syscalls_32.h +syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h +syshdr-$(CONFIG_X86_64) += syscalls_64.h +syshdr-$(CONFIG_XEN) += xen-hypercalls.h + +targets += $(uapisyshdr-y) $(syshdr-y) + +PHONY += all +all: $(addprefix $(uapi)/,$(uapisyshdr-y)) +all: $(addprefix $(out)/,$(syshdr-y)) + @: diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl new file mode 100644 index 000000000..3cf7b533b --- /dev/null +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -0,0 +1,400 @@ +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> <compat entry point> +# +# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for +# sys_*() system calls and compat_sys_*() compat system calls if +# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only +# parameter. +# +# The abi is always "i386" for this file. +# +0 i386 restart_syscall sys_restart_syscall __ia32_sys_restart_syscall +1 i386 exit sys_exit __ia32_sys_exit +2 i386 fork sys_fork __ia32_sys_fork +3 i386 read sys_read __ia32_sys_read +4 i386 write sys_write __ia32_sys_write +5 i386 open sys_open __ia32_compat_sys_open +6 i386 close sys_close __ia32_sys_close +7 i386 waitpid sys_waitpid __ia32_sys_waitpid +8 i386 creat sys_creat __ia32_sys_creat +9 i386 link sys_link __ia32_sys_link +10 i386 unlink sys_unlink __ia32_sys_unlink +11 i386 execve sys_execve __ia32_compat_sys_execve +12 i386 chdir sys_chdir __ia32_sys_chdir +13 i386 time sys_time __ia32_compat_sys_time +14 i386 mknod sys_mknod __ia32_sys_mknod +15 i386 chmod sys_chmod __ia32_sys_chmod +16 i386 lchown sys_lchown16 __ia32_sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat __ia32_sys_stat +19 i386 lseek sys_lseek __ia32_compat_sys_lseek +20 i386 getpid sys_getpid __ia32_sys_getpid +21 i386 mount sys_mount __ia32_compat_sys_mount +22 i386 umount sys_oldumount __ia32_sys_oldumount +23 i386 setuid sys_setuid16 __ia32_sys_setuid16 +24 i386 getuid sys_getuid16 __ia32_sys_getuid16 +25 i386 stime sys_stime __ia32_compat_sys_stime +26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace +27 i386 alarm sys_alarm __ia32_sys_alarm +28 i386 oldfstat sys_fstat __ia32_sys_fstat +29 i386 pause sys_pause __ia32_sys_pause +30 i386 utime sys_utime __ia32_compat_sys_utime +31 i386 stty +32 i386 gtty +33 i386 access sys_access __ia32_sys_access +34 i386 nice sys_nice __ia32_sys_nice +35 i386 ftime +36 i386 sync sys_sync __ia32_sys_sync +37 i386 kill sys_kill __ia32_sys_kill +38 i386 rename sys_rename __ia32_sys_rename +39 i386 mkdir sys_mkdir __ia32_sys_mkdir +40 i386 rmdir sys_rmdir __ia32_sys_rmdir +41 i386 dup sys_dup __ia32_sys_dup +42 i386 pipe sys_pipe __ia32_sys_pipe +43 i386 times sys_times __ia32_compat_sys_times +44 i386 prof +45 i386 brk sys_brk __ia32_sys_brk +46 i386 setgid sys_setgid16 __ia32_sys_setgid16 +47 i386 getgid sys_getgid16 __ia32_sys_getgid16 +48 i386 signal sys_signal __ia32_sys_signal +49 i386 geteuid sys_geteuid16 __ia32_sys_geteuid16 +50 i386 getegid sys_getegid16 __ia32_sys_getegid16 +51 i386 acct sys_acct __ia32_sys_acct +52 i386 umount2 sys_umount __ia32_sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl __ia32_compat_sys_ioctl +55 i386 fcntl sys_fcntl __ia32_compat_sys_fcntl64 +56 i386 mpx +57 i386 setpgid sys_setpgid __ia32_sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname __ia32_sys_olduname +60 i386 umask sys_umask __ia32_sys_umask +61 i386 chroot sys_chroot __ia32_sys_chroot +62 i386 ustat sys_ustat __ia32_compat_sys_ustat +63 i386 dup2 sys_dup2 __ia32_sys_dup2 +64 i386 getppid sys_getppid __ia32_sys_getppid +65 i386 getpgrp sys_getpgrp __ia32_sys_getpgrp +66 i386 setsid sys_setsid __ia32_sys_setsid +67 i386 sigaction sys_sigaction __ia32_compat_sys_sigaction +68 i386 sgetmask sys_sgetmask __ia32_sys_sgetmask +69 i386 ssetmask sys_ssetmask __ia32_sys_ssetmask +70 i386 setreuid sys_setreuid16 __ia32_sys_setreuid16 +71 i386 setregid sys_setregid16 __ia32_sys_setregid16 +72 i386 sigsuspend sys_sigsuspend __ia32_sys_sigsuspend +73 i386 sigpending sys_sigpending __ia32_compat_sys_sigpending +74 i386 sethostname sys_sethostname __ia32_sys_sethostname +75 i386 setrlimit sys_setrlimit __ia32_compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit __ia32_compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage __ia32_compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday __ia32_compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday __ia32_compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 __ia32_sys_getgroups16 +81 i386 setgroups sys_setgroups16 __ia32_sys_setgroups16 +82 i386 select sys_old_select __ia32_compat_sys_old_select +83 i386 symlink sys_symlink __ia32_sys_symlink +84 i386 oldlstat sys_lstat __ia32_sys_lstat +85 i386 readlink sys_readlink __ia32_sys_readlink +86 i386 uselib sys_uselib __ia32_sys_uselib +87 i386 swapon sys_swapon __ia32_sys_swapon +88 i386 reboot sys_reboot __ia32_sys_reboot +89 i386 readdir sys_old_readdir __ia32_compat_sys_old_readdir +90 i386 mmap sys_old_mmap __ia32_compat_sys_x86_mmap +91 i386 munmap sys_munmap __ia32_sys_munmap +92 i386 truncate sys_truncate __ia32_compat_sys_truncate +93 i386 ftruncate sys_ftruncate __ia32_compat_sys_ftruncate +94 i386 fchmod sys_fchmod __ia32_sys_fchmod +95 i386 fchown sys_fchown16 __ia32_sys_fchown16 +96 i386 getpriority sys_getpriority __ia32_sys_getpriority +97 i386 setpriority sys_setpriority __ia32_sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs __ia32_compat_sys_statfs +100 i386 fstatfs sys_fstatfs __ia32_compat_sys_fstatfs +101 i386 ioperm sys_ioperm __ia32_sys_ioperm +102 i386 socketcall sys_socketcall __ia32_compat_sys_socketcall +103 i386 syslog sys_syslog __ia32_sys_syslog +104 i386 setitimer sys_setitimer __ia32_compat_sys_setitimer +105 i386 getitimer sys_getitimer __ia32_compat_sys_getitimer +106 i386 stat sys_newstat __ia32_compat_sys_newstat +107 i386 lstat sys_newlstat __ia32_compat_sys_newlstat +108 i386 fstat sys_newfstat __ia32_compat_sys_newfstat +109 i386 olduname sys_uname __ia32_sys_uname +110 i386 iopl sys_iopl __ia32_sys_iopl +111 i386 vhangup sys_vhangup __ia32_sys_vhangup +112 i386 idle +113 i386 vm86old sys_vm86old sys_ni_syscall +114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4 +115 i386 swapoff sys_swapoff __ia32_sys_swapoff +116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo +117 i386 ipc sys_ipc __ia32_compat_sys_ipc +118 i386 fsync sys_fsync __ia32_sys_fsync +119 i386 sigreturn sys_sigreturn sys32_sigreturn +120 i386 clone sys_clone __ia32_compat_sys_x86_clone +121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname +122 i386 uname sys_newuname __ia32_sys_newuname +123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt +124 i386 adjtimex sys_adjtimex __ia32_compat_sys_adjtimex +125 i386 mprotect sys_mprotect __ia32_sys_mprotect +126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module __ia32_sys_init_module +129 i386 delete_module sys_delete_module __ia32_sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl __ia32_compat_sys_quotactl32 +132 i386 getpgid sys_getpgid __ia32_sys_getpgid +133 i386 fchdir sys_fchdir __ia32_sys_fchdir +134 i386 bdflush sys_bdflush __ia32_sys_bdflush +135 i386 sysfs sys_sysfs __ia32_sys_sysfs +136 i386 personality sys_personality __ia32_sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 __ia32_sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 __ia32_sys_setfsgid16 +140 i386 _llseek sys_llseek __ia32_sys_llseek +141 i386 getdents sys_getdents __ia32_compat_sys_getdents +142 i386 _newselect sys_select __ia32_compat_sys_select +143 i386 flock sys_flock __ia32_sys_flock +144 i386 msync sys_msync __ia32_sys_msync +145 i386 readv sys_readv __ia32_compat_sys_readv +146 i386 writev sys_writev __ia32_compat_sys_writev +147 i386 getsid sys_getsid __ia32_sys_getsid +148 i386 fdatasync sys_fdatasync __ia32_sys_fdatasync +149 i386 _sysctl sys_sysctl __ia32_compat_sys_sysctl +150 i386 mlock sys_mlock __ia32_sys_mlock +151 i386 munlock sys_munlock __ia32_sys_munlock +152 i386 mlockall sys_mlockall __ia32_sys_mlockall +153 i386 munlockall sys_munlockall __ia32_sys_munlockall +154 i386 sched_setparam sys_sched_setparam __ia32_sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam __ia32_sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler __ia32_sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler __ia32_sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_compat_sys_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep __ia32_compat_sys_nanosleep +163 i386 mremap sys_mremap __ia32_sys_mremap +164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 +165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 +166 i386 vm86 sys_vm86 sys_ni_syscall +167 i386 query_module +168 i386 poll sys_poll __ia32_sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 __ia32_sys_setresgid16 +171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16 +172 i386 prctl sys_prctl __ia32_sys_prctl +173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend +180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread +181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite +182 i386 chown sys_chown16 __ia32_sys_chown16 +183 i386 getcwd sys_getcwd __ia32_sys_getcwd +184 i386 capget sys_capget __ia32_sys_capget +185 i386 capset sys_capset __ia32_sys_capset +186 i386 sigaltstack sys_sigaltstack __ia32_compat_sys_sigaltstack +187 i386 sendfile sys_sendfile __ia32_compat_sys_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork sys_vfork __ia32_sys_vfork +191 i386 ugetrlimit sys_getrlimit __ia32_compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff __ia32_sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 __ia32_compat_sys_x86_truncate64 +194 i386 ftruncate64 sys_ftruncate64 __ia32_compat_sys_x86_ftruncate64 +195 i386 stat64 sys_stat64 __ia32_compat_sys_x86_stat64 +196 i386 lstat64 sys_lstat64 __ia32_compat_sys_x86_lstat64 +197 i386 fstat64 sys_fstat64 __ia32_compat_sys_x86_fstat64 +198 i386 lchown32 sys_lchown __ia32_sys_lchown +199 i386 getuid32 sys_getuid __ia32_sys_getuid +200 i386 getgid32 sys_getgid __ia32_sys_getgid +201 i386 geteuid32 sys_geteuid __ia32_sys_geteuid +202 i386 getegid32 sys_getegid __ia32_sys_getegid +203 i386 setreuid32 sys_setreuid __ia32_sys_setreuid +204 i386 setregid32 sys_setregid __ia32_sys_setregid +205 i386 getgroups32 sys_getgroups __ia32_sys_getgroups +206 i386 setgroups32 sys_setgroups __ia32_sys_setgroups +207 i386 fchown32 sys_fchown __ia32_sys_fchown +208 i386 setresuid32 sys_setresuid __ia32_sys_setresuid +209 i386 getresuid32 sys_getresuid __ia32_sys_getresuid +210 i386 setresgid32 sys_setresgid __ia32_sys_setresgid +211 i386 getresgid32 sys_getresgid __ia32_sys_getresgid +212 i386 chown32 sys_chown __ia32_sys_chown +213 i386 setuid32 sys_setuid __ia32_sys_setuid +214 i386 setgid32 sys_setgid __ia32_sys_setgid +215 i386 setfsuid32 sys_setfsuid __ia32_sys_setfsuid +216 i386 setfsgid32 sys_setfsgid __ia32_sys_setfsgid +217 i386 pivot_root sys_pivot_root __ia32_sys_pivot_root +218 i386 mincore sys_mincore __ia32_sys_mincore +219 i386 madvise sys_madvise __ia32_sys_madvise +220 i386 getdents64 sys_getdents64 __ia32_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 __ia32_compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid __ia32_sys_gettid +225 i386 readahead sys_readahead __ia32_compat_sys_x86_readahead +226 i386 setxattr sys_setxattr __ia32_sys_setxattr +227 i386 lsetxattr sys_lsetxattr __ia32_sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr __ia32_sys_fsetxattr +229 i386 getxattr sys_getxattr __ia32_sys_getxattr +230 i386 lgetxattr sys_lgetxattr __ia32_sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr __ia32_sys_fgetxattr +232 i386 listxattr sys_listxattr __ia32_sys_listxattr +233 i386 llistxattr sys_llistxattr __ia32_sys_llistxattr +234 i386 flistxattr sys_flistxattr __ia32_sys_flistxattr +235 i386 removexattr sys_removexattr __ia32_sys_removexattr +236 i386 lremovexattr sys_lremovexattr __ia32_sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr +238 i386 tkill sys_tkill __ia32_sys_tkill +239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64 +240 i386 futex sys_futex __ia32_compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area +245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup +246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy +247 i386 io_getevents sys_io_getevents __ia32_compat_sys_io_getevents +248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit +249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel +250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group __ia32_sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie __ia32_compat_sys_lookup_dcookie +254 i386 epoll_create sys_epoll_create __ia32_sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl __ia32_sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait __ia32_sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address +259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create +260 i386 timer_settime sys_timer_settime __ia32_compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime __ia32_compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete +264 i386 clock_settime sys_clock_settime __ia32_compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime __ia32_compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres __ia32_compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep __ia32_compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill __ia32_sys_tgkill +271 i386 utimes sys_utimes __ia32_compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind __ia32_sys_mbind +275 i386 get_mempolicy sys_get_mempolicy __ia32_compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy +277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend __ia32_compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive __ia32_compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify +282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load +284 i386 waitid sys_waitid __ia32_compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key __ia32_sys_add_key +287 i386 request_key sys_request_key __ia32_sys_request_key +288 i386 keyctl sys_keyctl __ia32_compat_sys_keyctl +289 i386 ioprio_set sys_ioprio_set __ia32_sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get __ia32_sys_ioprio_get +291 i386 inotify_init sys_inotify_init __ia32_sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch __ia32_sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch __ia32_sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages __ia32_sys_migrate_pages +295 i386 openat sys_openat __ia32_compat_sys_openat +296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat +297 i386 mknodat sys_mknodat __ia32_sys_mknodat +298 i386 fchownat sys_fchownat __ia32_sys_fchownat +299 i386 futimesat sys_futimesat __ia32_compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat +301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat +302 i386 renameat sys_renameat __ia32_sys_renameat +303 i386 linkat sys_linkat __ia32_sys_linkat +304 i386 symlinkat sys_symlinkat __ia32_sys_symlinkat +305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat +306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat +307 i386 faccessat sys_faccessat __ia32_sys_faccessat +308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6 +309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll +310 i386 unshare sys_unshare __ia32_sys_unshare +311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list +313 i386 splice sys_splice __ia32_sys_splice +314 i386 sync_file_range sys_sync_file_range __ia32_compat_sys_x86_sync_file_range +315 i386 tee sys_tee __ia32_sys_tee +316 i386 vmsplice sys_vmsplice __ia32_compat_sys_vmsplice +317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages +318 i386 getcpu sys_getcpu __ia32_sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait +320 i386 utimensat sys_utimensat __ia32_compat_sys_utimensat +321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create +323 i386 eventfd sys_eventfd __ia32_sys_eventfd +324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate +325 i386 timerfd_settime sys_timerfd_settime __ia32_compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime __ia32_compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1 +330 i386 dup3 sys_dup3 __ia32_sys_dup3 +331 i386 pipe2 sys_pipe2 __ia32_sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 __ia32_sys_inotify_init1 +333 i386 preadv sys_preadv __ia32_compat_sys_preadv +334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark +340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime __ia32_compat_sys_clock_adjtime +344 i386 syncfs sys_syncfs __ia32_sys_syncfs +345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg +346 i386 setns sys_setns __ia32_sys_setns +347 i386 process_vm_readv sys_process_vm_readv __ia32_compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev __ia32_compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp __ia32_sys_kcmp +350 i386 finit_module sys_finit_module __ia32_sys_finit_module +351 i386 sched_setattr sys_sched_setattr __ia32_sys_sched_setattr +352 i386 sched_getattr sys_sched_getattr __ia32_sys_sched_getattr +353 i386 renameat2 sys_renameat2 __ia32_sys_renameat2 +354 i386 seccomp sys_seccomp __ia32_sys_seccomp +355 i386 getrandom sys_getrandom __ia32_sys_getrandom +356 i386 memfd_create sys_memfd_create __ia32_sys_memfd_create +357 i386 bpf sys_bpf __ia32_sys_bpf +358 i386 execveat sys_execveat __ia32_compat_sys_execveat +359 i386 socket sys_socket __ia32_sys_socket +360 i386 socketpair sys_socketpair __ia32_sys_socketpair +361 i386 bind sys_bind __ia32_sys_bind +362 i386 connect sys_connect __ia32_sys_connect +363 i386 listen sys_listen __ia32_sys_listen +364 i386 accept4 sys_accept4 __ia32_sys_accept4 +365 i386 getsockopt sys_getsockopt __ia32_compat_sys_getsockopt +366 i386 setsockopt sys_setsockopt __ia32_compat_sys_setsockopt +367 i386 getsockname sys_getsockname __ia32_sys_getsockname +368 i386 getpeername sys_getpeername __ia32_sys_getpeername +369 i386 sendto sys_sendto __ia32_sys_sendto +370 i386 sendmsg sys_sendmsg __ia32_compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom __ia32_compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg __ia32_compat_sys_recvmsg +373 i386 shutdown sys_shutdown __ia32_sys_shutdown +374 i386 userfaultfd sys_userfaultfd __ia32_sys_userfaultfd +375 i386 membarrier sys_membarrier __ia32_sys_membarrier +376 i386 mlock2 sys_mlock2 __ia32_sys_mlock2 +377 i386 copy_file_range sys_copy_file_range __ia32_sys_copy_file_range +378 i386 preadv2 sys_preadv2 __ia32_compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 __ia32_compat_sys_pwritev2 +380 i386 pkey_mprotect sys_pkey_mprotect __ia32_sys_pkey_mprotect +381 i386 pkey_alloc sys_pkey_alloc __ia32_sys_pkey_alloc +382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free +383 i386 statx sys_statx __ia32_sys_statx +384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl +385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents +386 i386 rseq sys_rseq __ia32_sys_rseq diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl new file mode 100644 index 000000000..f0b1709a5 --- /dev/null +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -0,0 +1,388 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> +# +# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls +# +# The abi is "common", "64" or "x32" for this file. +# +0 common read __x64_sys_read +1 common write __x64_sys_write +2 common open __x64_sys_open +3 common close __x64_sys_close +4 common stat __x64_sys_newstat +5 common fstat __x64_sys_newfstat +6 common lstat __x64_sys_newlstat +7 common poll __x64_sys_poll +8 common lseek __x64_sys_lseek +9 common mmap __x64_sys_mmap +10 common mprotect __x64_sys_mprotect +11 common munmap __x64_sys_munmap +12 common brk __x64_sys_brk +13 64 rt_sigaction __x64_sys_rt_sigaction +14 common rt_sigprocmask __x64_sys_rt_sigprocmask +15 64 rt_sigreturn __x64_sys_rt_sigreturn/ptregs +16 64 ioctl __x64_sys_ioctl +17 common pread64 __x64_sys_pread64 +18 common pwrite64 __x64_sys_pwrite64 +19 64 readv __x64_sys_readv +20 64 writev __x64_sys_writev +21 common access __x64_sys_access +22 common pipe __x64_sys_pipe +23 common select __x64_sys_select +24 common sched_yield __x64_sys_sched_yield +25 common mremap __x64_sys_mremap +26 common msync __x64_sys_msync +27 common mincore __x64_sys_mincore +28 common madvise __x64_sys_madvise +29 common shmget __x64_sys_shmget +30 common shmat __x64_sys_shmat +31 common shmctl __x64_sys_shmctl +32 common dup __x64_sys_dup +33 common dup2 __x64_sys_dup2 +34 common pause __x64_sys_pause +35 common nanosleep __x64_sys_nanosleep +36 common getitimer __x64_sys_getitimer +37 common alarm __x64_sys_alarm +38 common setitimer __x64_sys_setitimer +39 common getpid __x64_sys_getpid +40 common sendfile __x64_sys_sendfile64 +41 common socket __x64_sys_socket +42 common connect __x64_sys_connect +43 common accept __x64_sys_accept +44 common sendto __x64_sys_sendto +45 64 recvfrom __x64_sys_recvfrom +46 64 sendmsg __x64_sys_sendmsg +47 64 recvmsg __x64_sys_recvmsg +48 common shutdown __x64_sys_shutdown +49 common bind __x64_sys_bind +50 common listen __x64_sys_listen +51 common getsockname __x64_sys_getsockname +52 common getpeername __x64_sys_getpeername +53 common socketpair __x64_sys_socketpair +54 64 setsockopt __x64_sys_setsockopt +55 64 getsockopt __x64_sys_getsockopt +56 common clone __x64_sys_clone/ptregs +57 common fork __x64_sys_fork/ptregs +58 common vfork __x64_sys_vfork/ptregs +59 64 execve __x64_sys_execve/ptregs +60 common exit __x64_sys_exit +61 common wait4 __x64_sys_wait4 +62 common kill __x64_sys_kill +63 common uname __x64_sys_newuname +64 common semget __x64_sys_semget +65 common semop __x64_sys_semop +66 common semctl __x64_sys_semctl +67 common shmdt __x64_sys_shmdt +68 common msgget __x64_sys_msgget +69 common msgsnd __x64_sys_msgsnd +70 common msgrcv __x64_sys_msgrcv +71 common msgctl __x64_sys_msgctl +72 common fcntl __x64_sys_fcntl +73 common flock __x64_sys_flock +74 common fsync __x64_sys_fsync +75 common fdatasync __x64_sys_fdatasync +76 common truncate __x64_sys_truncate +77 common ftruncate __x64_sys_ftruncate +78 common getdents __x64_sys_getdents +79 common getcwd __x64_sys_getcwd +80 common chdir __x64_sys_chdir +81 common fchdir __x64_sys_fchdir +82 common rename __x64_sys_rename +83 common mkdir __x64_sys_mkdir +84 common rmdir __x64_sys_rmdir +85 common creat __x64_sys_creat +86 common link __x64_sys_link +87 common unlink __x64_sys_unlink +88 common symlink __x64_sys_symlink +89 common readlink __x64_sys_readlink +90 common chmod __x64_sys_chmod +91 common fchmod __x64_sys_fchmod +92 common chown __x64_sys_chown +93 common fchown __x64_sys_fchown +94 common lchown __x64_sys_lchown +95 common umask __x64_sys_umask +96 common gettimeofday __x64_sys_gettimeofday +97 common getrlimit __x64_sys_getrlimit +98 common getrusage __x64_sys_getrusage +99 common sysinfo __x64_sys_sysinfo +100 common times __x64_sys_times +101 64 ptrace __x64_sys_ptrace +102 common getuid __x64_sys_getuid +103 common syslog __x64_sys_syslog +104 common getgid __x64_sys_getgid +105 common setuid __x64_sys_setuid +106 common setgid __x64_sys_setgid +107 common geteuid __x64_sys_geteuid +108 common getegid __x64_sys_getegid +109 common setpgid __x64_sys_setpgid +110 common getppid __x64_sys_getppid +111 common getpgrp __x64_sys_getpgrp +112 common setsid __x64_sys_setsid +113 common setreuid __x64_sys_setreuid +114 common setregid __x64_sys_setregid +115 common getgroups __x64_sys_getgroups +116 common setgroups __x64_sys_setgroups +117 common setresuid __x64_sys_setresuid +118 common getresuid __x64_sys_getresuid +119 common setresgid __x64_sys_setresgid +120 common getresgid __x64_sys_getresgid +121 common getpgid __x64_sys_getpgid +122 common setfsuid __x64_sys_setfsuid +123 common setfsgid __x64_sys_setfsgid +124 common getsid __x64_sys_getsid +125 common capget __x64_sys_capget +126 common capset __x64_sys_capset +127 64 rt_sigpending __x64_sys_rt_sigpending +128 64 rt_sigtimedwait __x64_sys_rt_sigtimedwait +129 64 rt_sigqueueinfo __x64_sys_rt_sigqueueinfo +130 common rt_sigsuspend __x64_sys_rt_sigsuspend +131 64 sigaltstack __x64_sys_sigaltstack +132 common utime __x64_sys_utime +133 common mknod __x64_sys_mknod +134 64 uselib +135 common personality __x64_sys_personality +136 common ustat __x64_sys_ustat +137 common statfs __x64_sys_statfs +138 common fstatfs __x64_sys_fstatfs +139 common sysfs __x64_sys_sysfs +140 common getpriority __x64_sys_getpriority +141 common setpriority __x64_sys_setpriority +142 common sched_setparam __x64_sys_sched_setparam +143 common sched_getparam __x64_sys_sched_getparam +144 common sched_setscheduler __x64_sys_sched_setscheduler +145 common sched_getscheduler __x64_sys_sched_getscheduler +146 common sched_get_priority_max __x64_sys_sched_get_priority_max +147 common sched_get_priority_min __x64_sys_sched_get_priority_min +148 common sched_rr_get_interval __x64_sys_sched_rr_get_interval +149 common mlock __x64_sys_mlock +150 common munlock __x64_sys_munlock +151 common mlockall __x64_sys_mlockall +152 common munlockall __x64_sys_munlockall +153 common vhangup __x64_sys_vhangup +154 common modify_ldt __x64_sys_modify_ldt +155 common pivot_root __x64_sys_pivot_root +156 64 _sysctl __x64_sys_sysctl +157 common prctl __x64_sys_prctl +158 common arch_prctl __x64_sys_arch_prctl +159 common adjtimex __x64_sys_adjtimex +160 common setrlimit __x64_sys_setrlimit +161 common chroot __x64_sys_chroot +162 common sync __x64_sys_sync +163 common acct __x64_sys_acct +164 common settimeofday __x64_sys_settimeofday +165 common mount __x64_sys_mount +166 common umount2 __x64_sys_umount +167 common swapon __x64_sys_swapon +168 common swapoff __x64_sys_swapoff +169 common reboot __x64_sys_reboot +170 common sethostname __x64_sys_sethostname +171 common setdomainname __x64_sys_setdomainname +172 common iopl __x64_sys_iopl/ptregs +173 common ioperm __x64_sys_ioperm +174 64 create_module +175 common init_module __x64_sys_init_module +176 common delete_module __x64_sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 common quotactl __x64_sys_quotactl +180 64 nfsservctl +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid __x64_sys_gettid +187 common readahead __x64_sys_readahead +188 common setxattr __x64_sys_setxattr +189 common lsetxattr __x64_sys_lsetxattr +190 common fsetxattr __x64_sys_fsetxattr +191 common getxattr __x64_sys_getxattr +192 common lgetxattr __x64_sys_lgetxattr +193 common fgetxattr __x64_sys_fgetxattr +194 common listxattr __x64_sys_listxattr +195 common llistxattr __x64_sys_llistxattr +196 common flistxattr __x64_sys_flistxattr +197 common removexattr __x64_sys_removexattr +198 common lremovexattr __x64_sys_lremovexattr +199 common fremovexattr __x64_sys_fremovexattr +200 common tkill __x64_sys_tkill +201 common time __x64_sys_time +202 common futex __x64_sys_futex +203 common sched_setaffinity __x64_sys_sched_setaffinity +204 common sched_getaffinity __x64_sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup __x64_sys_io_setup +207 common io_destroy __x64_sys_io_destroy +208 common io_getevents __x64_sys_io_getevents +209 64 io_submit __x64_sys_io_submit +210 common io_cancel __x64_sys_io_cancel +211 64 get_thread_area +212 common lookup_dcookie __x64_sys_lookup_dcookie +213 common epoll_create __x64_sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 common remap_file_pages __x64_sys_remap_file_pages +217 common getdents64 __x64_sys_getdents64 +218 common set_tid_address __x64_sys_set_tid_address +219 common restart_syscall __x64_sys_restart_syscall +220 common semtimedop __x64_sys_semtimedop +221 common fadvise64 __x64_sys_fadvise64 +222 64 timer_create __x64_sys_timer_create +223 common timer_settime __x64_sys_timer_settime +224 common timer_gettime __x64_sys_timer_gettime +225 common timer_getoverrun __x64_sys_timer_getoverrun +226 common timer_delete __x64_sys_timer_delete +227 common clock_settime __x64_sys_clock_settime +228 common clock_gettime __x64_sys_clock_gettime +229 common clock_getres __x64_sys_clock_getres +230 common clock_nanosleep __x64_sys_clock_nanosleep +231 common exit_group __x64_sys_exit_group +232 common epoll_wait __x64_sys_epoll_wait +233 common epoll_ctl __x64_sys_epoll_ctl +234 common tgkill __x64_sys_tgkill +235 common utimes __x64_sys_utimes +236 64 vserver +237 common mbind __x64_sys_mbind +238 common set_mempolicy __x64_sys_set_mempolicy +239 common get_mempolicy __x64_sys_get_mempolicy +240 common mq_open __x64_sys_mq_open +241 common mq_unlink __x64_sys_mq_unlink +242 common mq_timedsend __x64_sys_mq_timedsend +243 common mq_timedreceive __x64_sys_mq_timedreceive +244 64 mq_notify __x64_sys_mq_notify +245 common mq_getsetattr __x64_sys_mq_getsetattr +246 64 kexec_load __x64_sys_kexec_load +247 64 waitid __x64_sys_waitid +248 common add_key __x64_sys_add_key +249 common request_key __x64_sys_request_key +250 common keyctl __x64_sys_keyctl +251 common ioprio_set __x64_sys_ioprio_set +252 common ioprio_get __x64_sys_ioprio_get +253 common inotify_init __x64_sys_inotify_init +254 common inotify_add_watch __x64_sys_inotify_add_watch +255 common inotify_rm_watch __x64_sys_inotify_rm_watch +256 common migrate_pages __x64_sys_migrate_pages +257 common openat __x64_sys_openat +258 common mkdirat __x64_sys_mkdirat +259 common mknodat __x64_sys_mknodat +260 common fchownat __x64_sys_fchownat +261 common futimesat __x64_sys_futimesat +262 common newfstatat __x64_sys_newfstatat +263 common unlinkat __x64_sys_unlinkat +264 common renameat __x64_sys_renameat +265 common linkat __x64_sys_linkat +266 common symlinkat __x64_sys_symlinkat +267 common readlinkat __x64_sys_readlinkat +268 common fchmodat __x64_sys_fchmodat +269 common faccessat __x64_sys_faccessat +270 common pselect6 __x64_sys_pselect6 +271 common ppoll __x64_sys_ppoll +272 common unshare __x64_sys_unshare +273 64 set_robust_list __x64_sys_set_robust_list +274 64 get_robust_list __x64_sys_get_robust_list +275 common splice __x64_sys_splice +276 common tee __x64_sys_tee +277 common sync_file_range __x64_sys_sync_file_range +278 64 vmsplice __x64_sys_vmsplice +279 64 move_pages __x64_sys_move_pages +280 common utimensat __x64_sys_utimensat +281 common epoll_pwait __x64_sys_epoll_pwait +282 common signalfd __x64_sys_signalfd +283 common timerfd_create __x64_sys_timerfd_create +284 common eventfd __x64_sys_eventfd +285 common fallocate __x64_sys_fallocate +286 common timerfd_settime __x64_sys_timerfd_settime +287 common timerfd_gettime __x64_sys_timerfd_gettime +288 common accept4 __x64_sys_accept4 +289 common signalfd4 __x64_sys_signalfd4 +290 common eventfd2 __x64_sys_eventfd2 +291 common epoll_create1 __x64_sys_epoll_create1 +292 common dup3 __x64_sys_dup3 +293 common pipe2 __x64_sys_pipe2 +294 common inotify_init1 __x64_sys_inotify_init1 +295 64 preadv __x64_sys_preadv +296 64 pwritev __x64_sys_pwritev +297 64 rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo +298 common perf_event_open __x64_sys_perf_event_open +299 64 recvmmsg __x64_sys_recvmmsg +300 common fanotify_init __x64_sys_fanotify_init +301 common fanotify_mark __x64_sys_fanotify_mark +302 common prlimit64 __x64_sys_prlimit64 +303 common name_to_handle_at __x64_sys_name_to_handle_at +304 common open_by_handle_at __x64_sys_open_by_handle_at +305 common clock_adjtime __x64_sys_clock_adjtime +306 common syncfs __x64_sys_syncfs +307 64 sendmmsg __x64_sys_sendmmsg +308 common setns __x64_sys_setns +309 common getcpu __x64_sys_getcpu +310 64 process_vm_readv __x64_sys_process_vm_readv +311 64 process_vm_writev __x64_sys_process_vm_writev +312 common kcmp __x64_sys_kcmp +313 common finit_module __x64_sys_finit_module +314 common sched_setattr __x64_sys_sched_setattr +315 common sched_getattr __x64_sys_sched_getattr +316 common renameat2 __x64_sys_renameat2 +317 common seccomp __x64_sys_seccomp +318 common getrandom __x64_sys_getrandom +319 common memfd_create __x64_sys_memfd_create +320 common kexec_file_load __x64_sys_kexec_file_load +321 common bpf __x64_sys_bpf +322 64 execveat __x64_sys_execveat/ptregs +323 common userfaultfd __x64_sys_userfaultfd +324 common membarrier __x64_sys_membarrier +325 common mlock2 __x64_sys_mlock2 +326 common copy_file_range __x64_sys_copy_file_range +327 64 preadv2 __x64_sys_preadv2 +328 64 pwritev2 __x64_sys_pwritev2 +329 common pkey_mprotect __x64_sys_pkey_mprotect +330 common pkey_alloc __x64_sys_pkey_alloc +331 common pkey_free __x64_sys_pkey_free +332 common statx __x64_sys_statx +333 common io_pgetevents __x64_sys_io_pgetevents +334 common rseq __x64_sys_rseq + +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. The __x32_compat_sys stubs are created +# on-the-fly for compat_sys_*() compatibility system calls if X86_X32 +# is defined. +# +512 x32 rt_sigaction __x32_compat_sys_rt_sigaction +513 x32 rt_sigreturn sys32_x32_rt_sigreturn +514 x32 ioctl __x32_compat_sys_ioctl +515 x32 readv __x32_compat_sys_readv +516 x32 writev __x32_compat_sys_writev +517 x32 recvfrom __x32_compat_sys_recvfrom +518 x32 sendmsg __x32_compat_sys_sendmsg +519 x32 recvmsg __x32_compat_sys_recvmsg +520 x32 execve __x32_compat_sys_execve/ptregs +521 x32 ptrace __x32_compat_sys_ptrace +522 x32 rt_sigpending __x32_compat_sys_rt_sigpending +523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo +525 x32 sigaltstack __x32_compat_sys_sigaltstack +526 x32 timer_create __x32_compat_sys_timer_create +527 x32 mq_notify __x32_compat_sys_mq_notify +528 x32 kexec_load __x32_compat_sys_kexec_load +529 x32 waitid __x32_compat_sys_waitid +530 x32 set_robust_list __x32_compat_sys_set_robust_list +531 x32 get_robust_list __x32_compat_sys_get_robust_list +532 x32 vmsplice __x32_compat_sys_vmsplice +533 x32 move_pages __x32_compat_sys_move_pages +534 x32 preadv __x32_compat_sys_preadv64 +535 x32 pwritev __x32_compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg __x32_compat_sys_recvmmsg +538 x32 sendmmsg __x32_compat_sys_sendmmsg +539 x32 process_vm_readv __x32_compat_sys_process_vm_readv +540 x32 process_vm_writev __x32_compat_sys_process_vm_writev +541 x32 setsockopt __x32_compat_sys_setsockopt +542 x32 getsockopt __x32_compat_sys_getsockopt +543 x32 io_setup __x32_compat_sys_io_setup +544 x32 io_submit __x32_compat_sys_io_submit +545 x32 execveat __x32_compat_sys_execveat/ptregs +546 x32 preadv2 __x32_compat_sys_preadv64v2 +547 x32 pwritev2 __x32_compat_sys_pwritev64v2 diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh new file mode 100644 index 000000000..12fbbcfe7 --- /dev/null +++ b/arch/x86/entry/syscalls/syscallhdr.sh @@ -0,0 +1,28 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +in="$1" +out="$2" +my_abis=`echo "($3)" | tr ',' '|'` +prefix="$4" +offset="$5" + +fileguard=_ASM_X86_`basename "$out" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( + echo "#ifndef ${fileguard}" + echo "#define ${fileguard} 1" + echo "" + + while read nr abi name entry ; do + if [ -z "$offset" ]; then + echo "#define __NR_${prefix}${name} $nr" + else + echo "#define __NR_${prefix}${name} ($offset + $nr)" + fi + done + + echo "" + echo "#endif /* ${fileguard} */" +) > "$out" diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh new file mode 100644 index 000000000..94fcd1951 --- /dev/null +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -0,0 +1,81 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +in="$1" +out="$2" + +syscall_macro() { + abi="$1" + nr="$2" + entry="$3" + + # Entry can be either just a function name or "function/qualifier" + real_entry="${entry%%/*}" + if [ "$entry" = "$real_entry" ]; then + qualifier= + else + qualifier=${entry#*/} + fi + + echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" +} + +emit() { + abi="$1" + nr="$2" + entry="$3" + compat="$4" + umlentry="" + + if [ "$abi" = "64" -a -n "$compat" ]; then + echo "a compat entry for a 64-bit syscall makes no sense" >&2 + exit 1 + fi + + # For CONFIG_UML, we need to strip the __x64_sys prefix + if [ "$abi" = "64" -a "${entry}" != "${entry#__x64_sys}" ]; then + umlentry="sys${entry#__x64_sys}" + fi + + if [ -z "$compat" ]; then + if [ -n "$entry" -a -z "$umlentry" ]; then + syscall_macro "$abi" "$nr" "$entry" + elif [ -n "$umlentry" ]; then # implies -n "$entry" + echo "#ifdef CONFIG_X86" + syscall_macro "$abi" "$nr" "$entry" + echo "#else /* CONFIG_UML */" + syscall_macro "$abi" "$nr" "$umlentry" + echo "#endif" + fi + else + echo "#ifdef CONFIG_X86_32" + if [ -n "$entry" ]; then + syscall_macro "$abi" "$nr" "$entry" + fi + echo "#else" + syscall_macro "$abi" "$nr" "$compat" + echo "#endif" + fi +} + +grep '^[0-9]' "$in" | sort -n | ( + while read nr abi name entry compat; do + abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` + if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then + # COMMON is the same as 64, except that we don't expect X32 + # programs to use it. Our expectation has nothing to do with + # any generated code, so treat them the same. + emit 64 "$nr" "$entry" "$compat" + elif [ "$abi" = "X32" ]; then + # X32 is equivalent to 64 on an X32-compatible kernel. + echo "#ifdef CONFIG_X86_X32_ABI" + emit 64 "$nr" "$entry" "$compat" + echo "#endif" + elif [ "$abi" = "I386" ]; then + emit "$abi" "$nr" "$entry" "$compat" + else + echo "Unknown abi $abi" >&2 + exit 1 + fi + done +) > "$out" diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S new file mode 100644 index 000000000..fee6bc79b --- /dev/null +++ b/arch/x86/entry/thunk_32.S @@ -0,0 +1,43 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + #include <linux/linkage.h> + #include <asm/asm.h> + #include <asm/export.h> + + /* put return address in eax (arg1) */ + .macro THUNK name, func, put_ret_addr_in_eax=0 + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + ret + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule + THUNK ___preempt_schedule_notrace, preempt_schedule_notrace + EXPORT_SYMBOL(___preempt_schedule) + EXPORT_SYMBOL(___preempt_schedule_notrace) +#endif + diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S new file mode 100644 index 000000000..be36bf4e0 --- /dev/null +++ b/arch/x86/entry/thunk_64.S @@ -0,0 +1,73 @@ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ +#include <linux/linkage.h> +#include "calling.h" +#include <asm/asm.h> +#include <asm/export.h> + + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro THUNK name, func, put_ret_addr_in_rdi=0 + .globl \name + .type \name, @function +\name: + pushq %rbp + movq %rsp, %rbp + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + .if \put_ret_addr_in_rdi + /* 8(%rbp) is return addr on stack */ + movq 8(%rbp), %rdi + .endif + + call \func + jmp .L_restore + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + THUNK lockdep_sys_exit_thunk,lockdep_sys_exit +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule + THUNK ___preempt_schedule_notrace, preempt_schedule_notrace + EXPORT_SYMBOL(___preempt_schedule) + EXPORT_SYMBOL(___preempt_schedule_notrace) +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) \ + || defined(CONFIG_DEBUG_LOCK_ALLOC) \ + || defined(CONFIG_PREEMPT) +.L_restore: + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rbp + ret + _ASM_NOKPROBE(.L_restore) +#endif diff --git a/arch/x86/entry/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore new file mode 100644 index 000000000..aae8ffdd5 --- /dev/null +++ b/arch/x86/entry/vdso/.gitignore @@ -0,0 +1,7 @@ +vdso.lds +vdsox32.lds +vdso32-syscall-syms.lds +vdso32-sysenter-syms.lds +vdso32-int80-syms.lds +vdso-image-*.c +vdso2c diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile new file mode 100644 index 000000000..5bfe2243a --- /dev/null +++ b/arch/x86/entry/vdso/Makefile @@ -0,0 +1,205 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Building vDSO images for x86. +# + +KBUILD_CFLAGS += $(DISABLE_LTO) +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +OBJECT_FILES_NON_STANDARD := y + +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + +VDSO64-$(CONFIG_X86_64) := y +VDSOX32-$(CONFIG_X86_X32_ABI) := y +VDSO32-$(CONFIG_X86_32) := y +VDSO32-$(CONFIG_IA32_EMULATION) := y + +# files to link into the vdso +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o + +# files to link into kernel +obj-y += vma.o +OBJECT_FILES_NON_STANDARD_vma.o := n + +# vDSO images to build +vdso_img-$(VDSO64-y) += 64 +vdso_img-$(VDSOX32-y) += x32 +vdso_img-$(VDSO32-y) += 32 + +obj-$(VDSO32-y) += vdso32-setup.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.lds $(vobjs-y) + +# Build the vDSO image C files and link them in. +vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) +vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) +vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) +obj-y += $(vdso_img_objs) +targets += $(vdso_img_cfiles) +targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so) + +CPPFLAGS_vdso.lds += -P -C + +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ + -z max-page-size=4096 + +$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE + $(call if_changed,vdso) + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi +hostprogs-y += vdso2c + +quiet_cmd_vdso2c = VDSO2C $@ + cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@ + +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE + $(call if_changed,vdso2c) + +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# +CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + CFL += $(RETPOLINE_VDSO_CFLAGS) +endif +endif + +$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) + +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vvar.o = -pg + +# +# X32 processes use x32 vDSO to access 64bit kernel data. +# +# Build x32 vDSO image: +# 1. Compile x32 vDSO as 64bit. +# 2. Convert object files to x32. +# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes +# so that it can reach 64bit address space with 64bit pointers. +# + +CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \ + -z max-page-size=4096 + +# x32-rebranded versions +vobjx32s-y := $(vobjs-y:.o=-x32.o) + +# same thing, but in the output directory +vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) + +# Convert 64bit object file to x32 for x32 vDSO. +quiet_cmd_x32 = X32 $@ + cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@ + +$(obj)/%-x32.o: $(obj)/%.o FORCE + $(call if_changed,x32) + +targets += vdsox32.lds $(vobjx32s-y) + +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg + $(call if_changed,objcopy) + +$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE + $(call if_changed,vdso) + +CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 + +targets += vdso32/vdso32.lds +targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o +targets += vdso32/vclock_gettime.o + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO +$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) +KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) +endif +endif + +$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + +$(obj)/vdso32.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/system_call.o \ + $(obj)/vdso32/sigreturn.o + $(call if_changed,vdso) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(LD) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -T $(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + +VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \ + $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \ + -Bsymbolic +GCOV_PROFILE := n + +# +# Install the unstripped copies of vdso*.so. If our toolchain supports +# build-id, install .build-id links as well. +# +quiet_cmd_vdso_install = INSTALL $(@:install_%=%) +define cmd_vdso_install + cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ + if readelf -n $< |grep -q 'Build ID'; then \ + buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ + first=`echo $$buildid | cut -b-2`; \ + last=`echo $$buildid | cut -b3-`; \ + mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ + ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ + fi +endef + +vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) + +$(MODLIB)/vdso: FORCE + @mkdir -p $(MODLIB)/vdso + +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso + $(call cmd,vdso_install) + +PHONY += vdso_install $(vdso_img_insttargets) +vdso_install: $(vdso_img_insttargets) + +clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh new file mode 100755 index 000000000..7ee90a9b5 --- /dev/null +++ b/arch/x86/entry/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +$nm "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c new file mode 100644 index 000000000..8a88e738f --- /dev/null +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -0,0 +1,341 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime, gettimeofday, and time. + * + * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. + */ + +#include <uapi/linux/time.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> +#include <asm/unistd.h> +#include <asm/msr.h> +#include <asm/pvclock.h> +#include <asm/mshyperv.h> +#include <linux/math64.h> +#include <linux/time.h> +#include <linux/kernel.h> + +#define gtod (&VVAR(vsyscall_gtod_data)) + +extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); +extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern time_t __vdso_time(time_t *t); + +#ifdef CONFIG_PARAVIRT_CLOCK +extern u8 pvclock_page[PAGE_SIZE] + __attribute__((visibility("hidden"))); +#endif + +#ifdef CONFIG_HYPERV_TSCPAGE +extern u8 hvclock_page[PAGE_SIZE] + __attribute__((visibility("hidden"))); +#endif + +#ifndef BUILD_VDSO32 + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm ("syscall" : "=a" (ret), "=m" (*ts) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : + "memory", "rcx", "r11"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : + "memory", "rcx", "r11"); + return ret; +} + + +#else + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[clock], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*ts) + : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) + : "memory", "edx"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[tv], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*tv), "=m" (*tz) + : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz) + : "memory", "edx"); + return ret; +} + +#endif + +#ifdef CONFIG_PARAVIRT_CLOCK +static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) +{ + return (const struct pvclock_vsyscall_time_info *)&pvclock_page; +} + +static notrace u64 vread_pvclock(int *mode) +{ + const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; + u64 ret; + u64 last; + u32 version; + + /* + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. + * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. + * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + * + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. + */ + + do { + version = pvclock_read_begin(pvti); + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { + *mode = VCLOCK_NONE; + return 0; + } + + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); + } while (pvclock_read_retry(pvti, version)); + + /* refer to vread_tsc() comment for rationale */ + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif +#ifdef CONFIG_HYPERV_TSCPAGE +static notrace u64 vread_hvclock(int *mode) +{ + const struct ms_hyperv_tsc_page *tsc_pg = + (const struct ms_hyperv_tsc_page *)&hvclock_page; + u64 current_tick = hv_read_tsc_page(tsc_pg); + + if (current_tick != U64_MAX) + return current_tick; + + *mode = VCLOCK_NONE; + return 0; +} +#endif + +notrace static u64 vread_tsc(void) +{ + u64 ret = (u64)rdtsc_ordered(); + u64 last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a function of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +notrace static inline u64 vgetsns(int *mode) +{ + u64 v; + cycles_t cycles; + + if (gtod->vclock_mode == VCLOCK_TSC) + cycles = vread_tsc(); + + /* + * For any memory-mapped vclock type, we need to make sure that gcc + * doesn't cleverly hoist a load before the mode check. Otherwise we + * might end up touching the memory-mapped page even if the vclock in + * question isn't enabled, which will segfault. Hence the barriers. + */ +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->vclock_mode == VCLOCK_PVCLOCK) { + barrier(); + cycles = vread_pvclock(mode); + } +#endif +#ifdef CONFIG_HYPERV_TSCPAGE + else if (gtod->vclock_mode == VCLOCK_HVCLOCK) { + barrier(); + cycles = vread_hvclock(mode); + } +#endif + else + return 0; + v = (cycles - gtod->cycle_last) & gtod->mask; + return v * gtod->mult; +} + +/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ +notrace static int __always_inline do_realtime(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->wall_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static int __always_inline do_monotonic(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static void do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->wall_time_coarse_sec; + ts->tv_nsec = gtod->wall_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace static void do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->monotonic_time_coarse_sec; + ts->tv_nsec = gtod->monotonic_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + switch (clock) { + case CLOCK_REALTIME: + if (do_realtime(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_MONOTONIC: + if (do_monotonic(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_REALTIME_COARSE: + do_realtime_coarse(ts); + break; + case CLOCK_MONOTONIC_COARSE: + do_monotonic_coarse(ts); + break; + default: + goto fallback; + } + + return 0; +fallback: + return vdso_fallback_gettime(clock, ts); +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + if (likely(tv != NULL)) { + if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) + return vdso_fallback_gtod(tv, tz); + tv->tv_usec /= 1000; + } + if (unlikely(tz != NULL)) { + tz->tz_minuteswest = gtod->tz_minuteswest; + tz->tz_dsttime = gtod->tz_dsttime; + } + + return 0; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); + +/* + * This will break when the xtime seconds get inaccurate, but that is + * unlikely + */ +notrace time_t __vdso_time(time_t *t) +{ + /* This is atomic on x86 so we don't need any locks. */ + time_t result = READ_ONCE(gtod->wall_time_sec); + + if (t) + *t = result; + return result; +} +time_t time(time_t *t) + __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S new file mode 100644 index 000000000..acfd5ba7d --- /dev/null +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/vdso.h> + +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +#if defined(BUILD_VDSO64) +# define SHDR_SIZE 64 +#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) +# define SHDR_SIZE 40 +#else +# error unknown VDSO target +#endif + +#define NUM_FAKE_SHDRS 13 + +SECTIONS +{ + /* + * User/kernel shared data is before the vDSO. This may be a little + * uglier than putting it after the vDSO, but it avoids issues with + * non-allocatable things that dangle past the end of the PT_LOAD + * segment. + */ + + vvar_start = . - 3 * PAGE_SIZE; + vvar_page = vvar_start; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + pvclock_page = vvar_start + PAGE_SIZE; + hvclock_page = vvar_start + 2 * PAGE_SIZE; + + . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + + /* + * Ideally this would live in a C file, but that won't + * work cleanly for x32 until we start building the x32 + * C code using an x32 toolchain. + */ + VDSO_FAKE_SECTION_TABLE_START = .; + . = . + NUM_FAKE_SHDRS * SHDR_SIZE; + VDSO_FAKE_SECTION_TABLE_END = .; + } :text + + .fake_shstrtab : { *(.fake_shstrtab) } :text + + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + + /* + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. + */ + + .text : { *(.text*) } :text =0x90909090, + + /* + * At the end so that eu-elflint stays happy when vdso2c strips + * these. A better implementation would avoid allocating space + * for these. + */ + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + /DISCARD/ : { + *(.discard) + *(.discard.*) + *(__bug_table) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S new file mode 100644 index 000000000..794231701 --- /dev/null +++ b/arch/x86/entry/vdso/vdso-note.S @@ -0,0 +1,15 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/build-salt.h> +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END + +BUILD_SALT diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S new file mode 100644 index 000000000..d3a2dce4c --- /dev/null +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSO64 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + time; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c new file mode 100644 index 000000000..4674f5858 --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.c @@ -0,0 +1,260 @@ +/* + * vdso2c - A vdso image preparation tool + * Copyright (c) 2014 Andy Lutomirski and others + * Licensed under the GPL v2 + * + * vdso2c requires stripped and unstripped input. It would be trivial + * to fully strip the input in here, but, for reasons described below, + * we need to write a section table. Doing this is more or less + * equivalent to dropping all non-allocatable sections, but it's + * easier to let objcopy handle that instead of doing it ourselves. + * If we ever need to do something fancier than what objcopy provides, + * it would be straightforward to add here. + * + * We're keep a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. + * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ + +#include <inttypes.h> +#include <stdint.h> +#include <unistd.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <err.h> + +#include <sys/mman.h> +#include <sys/types.h> + +#include <tools/le_byteshift.h> + +#include <linux/elf.h> +#include <linux/types.h> +#include <linux/kernel.h> + +const char *outfilename; + +/* Symbols that we need in vdso2c. */ +enum { + sym_vvar_start, + sym_vvar_page, + sym_hpet_page, + sym_pvclock_page, + sym_hvclock_page, + sym_VDSO_FAKE_SECTION_TABLE_START, + sym_VDSO_FAKE_SECTION_TABLE_END, +}; + +const int special_pages[] = { + sym_vvar_page, + sym_hpet_page, + sym_pvclock_page, + sym_hvclock_page, +}; + +struct vdso_sym { + const char *name; + bool export; +}; + +struct vdso_sym required_syms[] = { + [sym_vvar_start] = {"vvar_start", true}, + [sym_vvar_page] = {"vvar_page", true}, + [sym_hpet_page] = {"hpet_page", true}, + [sym_pvclock_page] = {"pvclock_page", true}, + [sym_hvclock_page] = {"hvclock_page", true}, + [sym_VDSO_FAKE_SECTION_TABLE_START] = { + "VDSO_FAKE_SECTION_TABLE_START", false + }, + [sym_VDSO_FAKE_SECTION_TABLE_END] = { + "VDSO_FAKE_SECTION_TABLE_END", false + }, + {"VDSO32_NOTE_MASK", true}, + {"__kernel_vsyscall", true}, + {"__kernel_sigreturn", true}, + {"__kernel_rt_sigreturn", true}, + {"int80_landing_pad", true}, +}; + +__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) +static void fail(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + fprintf(stderr, "Error: "); + vfprintf(stderr, format, ap); + if (outfilename) + unlink(outfilename); + exit(1); + va_end(ap); +} + +/* + * Evil macros for little-endian reads and writes + */ +#define GLE(x, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot) + +extern void bad_get_le(void); +#define LAST_GLE(x) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le()) + +#define GET_LE(x) \ + GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x)))) + +#define PLE(x, val, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + put_unaligned_le##bits((val), (x)), ifnot) + +extern void bad_put_le(void); +#define LAST_PLE(x, val) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le()) + +#define PUT_LE(x, val) \ + PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) + + +#define NSYMS ARRAY_SIZE(required_syms) + +#define BITSFUNC3(name, bits, suffix) name##bits##suffix +#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) + +#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) + +#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x +#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) +#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) + +#define ELF_BITS 64 +#include "vdso2c.h" +#undef ELF_BITS + +#define ELF_BITS 32 +#include "vdso2c.h" +#undef ELF_BITS + +static void go(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; + + if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { + go64(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { + go32(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else { + fail("unknown ELF class\n"); + } +} + +static void map_input(const char *name, void **addr, size_t *len, int prot) +{ + off_t tmp_len; + + int fd = open(name, O_RDONLY); + if (fd == -1) + err(1, "%s", name); + + tmp_len = lseek(fd, 0, SEEK_END); + if (tmp_len == (off_t)-1) + err(1, "lseek"); + *len = (size_t)tmp_len; + + *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); + if (*addr == MAP_FAILED) + err(1, "mmap"); + + close(fd); +} + +int main(int argc, char **argv) +{ + size_t raw_len, stripped_len; + void *raw_addr, *stripped_addr; + FILE *outfile; + char *name, *tmp; + int namelen; + + if (argc != 4) { + printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); + return 1; + } + + /* + * Figure out the struct name. If we're writing to a .so file, + * generate raw output insted. + */ + name = strdup(argv[3]); + namelen = strlen(name); + if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { + name = NULL; + } else { + tmp = strrchr(name, '/'); + if (tmp) + name = tmp + 1; + tmp = strchr(name, '.'); + if (tmp) + *tmp = '\0'; + for (tmp = name; *tmp; tmp++) + if (*tmp == '-') + *tmp = '_'; + } + + map_input(argv[1], &raw_addr, &raw_len, PROT_READ); + map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); + + outfilename = argv[3]; + outfile = fopen(outfilename, "w"); + if (!outfile) + err(1, "%s", argv[2]); + + go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); + + munmap(raw_addr, raw_len); + munmap(stripped_addr, stripped_len); + fclose(outfile); + + return 0; +} diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h new file mode 100644 index 000000000..fa847a620 --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file is included twice from vdso2c.c. It generates code for 32-bit + * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs + * are built for 32-bit userspace. + */ + +static void BITSFUNC(go)(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + int found_load = 0; + unsigned long load_size = -1; /* Work around bogus warning */ + unsigned long mapping_size; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; + int i; + unsigned long j; + ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + *alt_sec = NULL; + ELF(Dyn) *dyn = 0, *dyn_end = 0; + const char *secstrings; + INT_BITS syms[NSYMS] = {}; + + ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); + + if (GET_LE(&hdr->e_type) != ET_DYN) + fail("input is not a shared object\n"); + + /* Walk the segment table. */ + for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { + if (GET_LE(&pt[i].p_type) == PT_LOAD) { + if (found_load) + fail("multiple PT_LOAD segs\n"); + + if (GET_LE(&pt[i].p_offset) != 0 || + GET_LE(&pt[i].p_vaddr) != 0) + fail("PT_LOAD in wrong place\n"); + + if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz)) + fail("cannot handle memsz != filesz\n"); + + load_size = GET_LE(&pt[i].p_memsz); + found_load = 1; + } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { + dyn = raw_addr + GET_LE(&pt[i].p_offset); + dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + + GET_LE(&pt[i].p_memsz); + } + } + if (!found_load) + fail("no PT_LOAD seg\n"); + + if (stripped_len < load_size) + fail("stripped input is too short\n"); + + if (!dyn) + fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n"); + + /* Walk the dynamic table */ + for (i = 0; dyn + i < dyn_end && + GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { + typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); + if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || + tag == DT_RELENT || tag == DT_TEXTREL) + fail("vdso image contains dynamic relocations\n"); + } + + /* Walk the section table */ + secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); + secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); + for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { + ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * i; + if (GET_LE(&sh->sh_type) == SHT_SYMTAB) + symtab_hdr = sh; + + if (!strcmp(secstrings + GET_LE(&sh->sh_name), + ".altinstructions")) + alt_sec = sh; + } + + if (!symtab_hdr) + fail("no symbol table\n"); + + strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); + + /* Walk the symbol table */ + for (i = 0; + i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); + i++) { + int k; + ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + + GET_LE(&symtab_hdr->sh_entsize) * i; + const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + + GET_LE(&sym->st_name); + + for (k = 0; k < NSYMS; k++) { + if (!strcmp(name, required_syms[k].name)) { + if (syms[k]) { + fail("duplicate symbol %s\n", + required_syms[k].name); + } + + /* + * Careful: we use negative addresses, but + * st_value is unsigned, so we rely + * on syms[k] being a signed type of the + * correct width. + */ + syms[k] = GET_LE(&sym->st_value); + } + } + } + + /* Validate mapping addresses. */ + for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { + INT_BITS symval = syms[special_pages[i]]; + + if (!symval) + continue; /* The mapping isn't used; ignore it. */ + + if (symval % 4096) + fail("%s must be a multiple of 4096\n", + required_syms[i].name); + if (symval + 4096 < syms[sym_vvar_start]) + fail("%s underruns vvar_start\n", + required_syms[i].name); + if (symval + 4096 > 0) + fail("%s is on the wrong side of the vdso text\n", + required_syms[i].name); + } + if (syms[sym_vvar_start] % 4096) + fail("vvar_begin must be a multiple of 4096\n"); + + if (!name) { + fwrite(stripped_addr, stripped_len, 1, outfile); + return; + } + + mapping_size = (stripped_len + 4095) / 4096 * 4096; + + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); + fprintf(outfile, "#include <linux/linkage.h>\n"); + fprintf(outfile, "#include <asm/page_types.h>\n"); + fprintf(outfile, "#include <asm/vdso.h>\n"); + fprintf(outfile, "\n"); + fprintf(outfile, + "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", + mapping_size); + for (j = 0; j < stripped_len; j++) { + if (j % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", + (int)((unsigned char *)stripped_addr)[j]); + } + fprintf(outfile, "\n};\n\n"); + + fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "\t.data = raw_data,\n"); + fprintf(outfile, "\t.size = %lu,\n", mapping_size); + if (alt_sec) { + fprintf(outfile, "\t.alt = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_offset)); + fprintf(outfile, "\t.alt_len = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_size)); + } + for (i = 0; i < NSYMS; i++) { + if (required_syms[i].export && syms[i]) + fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", + required_syms[i].name, (int64_t)syms[i]); + } + fprintf(outfile, "};\n"); +} diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c new file mode 100644 index 000000000..ddff0ca6f --- /dev/null +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar + * + * This file contains the needed initializations to support sysenter. + */ + +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/mm_types.h> +#include <linux/elf.h> + +#include <asm/processor.h> +#include <asm/vdso.h> + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT 0 +#else +#define VDSO_DEFAULT 1 +#endif + +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; + +static int __init vdso32_setup(char *s) +{ + vdso32_enabled = simple_strtoul(s, NULL, 0); + + if (vdso32_enabled > 1) { + pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + vdso32_enabled = 0; + } + + return 1; +} + +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. + */ +__setup("vdso32=", vdso32_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); +#endif + +int __init sysenter_setup(void) +{ + init_vdso_image(&vdso_image_32); + + return 0; +} + +#ifdef CONFIG_X86_64 + +subsys_initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include <linux/sysctl.h> + +static const int zero; +static const int one = 1; + +static struct ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &vdso32_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&zero, + .extra2 = (int *)&one, + }, + {} +}; + +static struct ctl_table abi_root_table2[] = { + { + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif /* CONFIG_SYSCTL */ + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore new file mode 100644 index 000000000..e45fba9d0 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S new file mode 100644 index 000000000..e78047d11 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/build-salt.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END + +BUILD_SALT + +#ifdef CONFIG_XEN +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + * + * At runtime, the fake hardware feature will be considered to be present + * if its bit is set in the mask word. So, we start with the mask 0, and + * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. + */ + +#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ + +ELFNOTE_START(GNU, 2, "a") + .long 1 /* ncaps */ +VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ + .long 0 /* mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S new file mode 100644 index 000000000..c3233ee98 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/unistd_32.h> +#include <asm/asm-offsets.h> + +#ifndef SYSCALL_ENTER_KERNEL +#define SYSCALL_ENTER_KERNEL int $0x80 +#endif + + .text + .globl __kernel_sigreturn + .type __kernel_sigreturn,@function + nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ + ALIGN +__kernel_sigreturn: +.LSTART_sigreturn: + popl %eax /* XXX does this mean it needs unwind info? */ + movl $__NR_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_sigreturn: + nop + .size __kernel_sigreturn,.-.LSTART_sigreturn + + .globl __kernel_rt_sigreturn + .type __kernel_rt_sigreturn,@function + ALIGN +__kernel_rt_sigreturn: +.LSTART_rt_sigreturn: + movl $__NR_rt_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_rt_sigreturn: + nop + .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI1: + .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 +.LSTARTCIEDLSI1: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0 /* DW_CFA_nop */ + .align 4 +.LENDCIEDLSI1: + .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ +.LSTARTFDEDLSI1: + .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: The dwarf2 unwind routines will subtract 1 from the + return address to get an address in the middle of the + presumed call instruction. Since we didn't get here via + a call, we need to include the nop before the real start + to make up for it. */ + .long .LSTART_sigreturn-1-. /* PC-relative start address */ + .long .LEND_sigreturn-.LSTART_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + complicated by the fact that the "CFA" is always assumed to + be the value of the stack pointer in the caller. This means + that we must define the CFA of this body of code to be the + saved value of the stack pointer in the sigcontext. Which + also means that there is no fixed relation to the other + saved registers, which means that we must use DW_CFA_expression + to compute their addresses. It also means that when we + adjust the stack with the popl, we have to do it all over again. */ + +#define do_cfa_expr(offset) \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ + .byte 0x06; /* DW_OP_deref */ \ +1: + +#define do_expr(regno, offset) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ +1: + + do_cfa_expr(IA32_SIGCONTEXT_sp+4) + do_expr(0, IA32_SIGCONTEXT_ax+4) + do_expr(1, IA32_SIGCONTEXT_cx+4) + do_expr(2, IA32_SIGCONTEXT_dx+4) + do_expr(3, IA32_SIGCONTEXT_bx+4) + do_expr(5, IA32_SIGCONTEXT_bp+4) + do_expr(6, IA32_SIGCONTEXT_si+4) + do_expr(7, IA32_SIGCONTEXT_di+4) + do_expr(8, IA32_SIGCONTEXT_ip+4) + + .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ + + do_cfa_expr(IA32_SIGCONTEXT_sp) + do_expr(0, IA32_SIGCONTEXT_ax) + do_expr(1, IA32_SIGCONTEXT_cx) + do_expr(2, IA32_SIGCONTEXT_dx) + do_expr(3, IA32_SIGCONTEXT_bx) + do_expr(5, IA32_SIGCONTEXT_bp) + do_expr(6, IA32_SIGCONTEXT_si) + do_expr(7, IA32_SIGCONTEXT_di) + do_expr(8, IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI1: + + .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ +.LSTARTFDEDLSI2: + .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: See above wrt unwind library assumptions. */ + .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ + .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + slightly less complicated than the above, since we don't + modify the stack pointer in the process. */ + + do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) + do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) + do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) + do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) + do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) + do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) + do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) + do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) + do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI2: + .previous diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S new file mode 100644 index 000000000..263d7433d --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AT_SYSINFO entry point +*/ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeatures.h> +#include <asm/alternative-asm.h> + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: + CFI_STARTPROC + /* + * Reshuffle regs so that all of any of the entry instructions + * will preserve enough state. + * + * A really nice entry sequence would be: + * pushl %edx + * pushl %ecx + * movl %esp, %ecx + * + * Unfortunately, naughty Android versions between July and December + * 2015 actually hardcode the traditional Linux SYSENTER entry + * sequence. That is severely broken for a number of reasons (ask + * anyone with an AMD CPU, for example). Nonetheless, we try to keep + * it working approximately as well as it ever worked. + * + * This link may eludicate some of the history: + * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 + * personally, I find it hard to understand what's going on there. + * + * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE. + * Execute an indirect call to the address in the AT_SYSINFO auxv + * entry. That is the ONLY correct way to make a fast 32-bit system + * call on Linux. (Open-coding int $0x80 is also fine, but it's + * slow.) + */ + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + + #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" + #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" + +#ifdef CONFIG_X86_64 + /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ + ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \ + SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 +#else + ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP +#endif + + /* Enter using int $0x80 */ + int $0x80 +GLOBAL(int80_landing_pad) + + /* + * Restore EDX and ECX in case they were clobbered. EBP is not + * clobbered (the kernel restores it), but it's cleaner and + * probably faster to pop it than to adjust ESP using addl. + */ + popl %ebp + CFI_RESTORE ebp + CFI_ADJUST_CFA_OFFSET -4 + popl %edx + CFI_RESTORE edx + CFI_ADJUST_CFA_OFFSET -4 + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + + .size __kernel_vsyscall,.-__kernel_vsyscall + .previous diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c new file mode 100644 index 000000000..9242b2841 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +#define BUILD_VDSO32 + +#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE +#undef CONFIG_OPTIMIZE_INLINING +#endif + +#ifdef CONFIG_X86_64 + +/* + * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel + * configuration + */ +#undef CONFIG_64BIT +#undef CONFIG_X86_64 +#undef CONFIG_PGTABLE_LEVELS +#undef CONFIG_ILLEGAL_POINTER_VALUE +#undef CONFIG_SPARSEMEM_VMEMMAP +#undef CONFIG_NR_CPUS + +#define CONFIG_X86_32 1 +#define CONFIG_PGTABLE_LEVELS 2 +#define CONFIG_PAGE_OFFSET 0 +#define CONFIG_ILLEGAL_POINTER_VALUE 0 +#define CONFIG_NR_CPUS 1 + +#define BUILD_VDSO32_64 + +#endif + +#include "../vclock_gettime.c" diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S new file mode 100644 index 000000000..422764a81 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linker script for 32-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#include <asm/page.h> + +#define BUILD_VDSO32 + +#include "../vdso-layout.lds.S" + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION +{ + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + }; + + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S new file mode 100644 index 000000000..05cd1c5c4 --- /dev/null +++ b/arch/x86/entry/vdso/vdsox32.lds.S @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linker script for x32 vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSOX32 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_getcpu; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c new file mode 100644 index 000000000..8ec3d1f4c --- /dev/null +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -0,0 +1,28 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of getcpu() + */ + +#include <linux/kernel.h> +#include <linux/getcpu.h> +#include <linux/time.h> +#include <asm/vgtod.h> + +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + unsigned int p; + + p = __getcpu(); + + if (cpu) + *cpu = p & VGETCPU_CPU_MASK; + if (node) + *node = p >> 12; + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c new file mode 100644 index 000000000..a1c31bb23 --- /dev/null +++ b/arch/x86/entry/vdso/vma.c @@ -0,0 +1,383 @@ +/* + * Copyright 2007 Andi Kleen, SUSE Labs. + * Subject to the GPL, v.2 + * + * This contains most of the x86 vDSO kernel-side code. + */ +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/random.h> +#include <linux/elf.h> +#include <linux/cpu.h> +#include <linux/ptrace.h> +#include <asm/pvclock.h> +#include <asm/vgtod.h> +#include <asm/proto.h> +#include <asm/vdso.h> +#include <asm/vvar.h> +#include <asm/page.h> +#include <asm/desc.h> +#include <asm/cpufeature.h> +#include <asm/mshyperv.h> + +#if defined(CONFIG_X86_64) +unsigned int __read_mostly vdso64_enabled = 1; +#endif + +void __init init_vdso_image(const struct vdso_image *image) +{ + BUG_ON(image->size % PAGE_SIZE != 0); + + apply_alternatives((struct alt_instr *)(image->data + image->alt), + (struct alt_instr *)(image->data + image->alt + + image->alt_len)); +} + +struct linux_binprm; + +static int vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + + if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); + get_page(vmf->page); + return 0; +} + +static void vdso_fix_landing(const struct vdso_image *image, + struct vm_area_struct *new_vma) +{ +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + if (in_ia32_syscall() && image == &vdso_image_32) { + struct pt_regs *regs = current_pt_regs(); + unsigned long vdso_land = image->sym_int80_landing_pad; + unsigned long old_land_addr = vdso_land + + (unsigned long)current->mm->context.vdso; + + /* Fixing userspace landing - look at do_fast_syscall_32 */ + if (regs->ip == old_land_addr) + regs->ip = new_vma->vm_start + vdso_land; + } +#endif +} + +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + const struct vdso_image *image = current->mm->context.vdso_image; + + if (image->size != new_size) + return -EINVAL; + + vdso_fix_landing(image, new_vma); + current->mm->context.vdso = (void __user *)new_vma->vm_start; + + return 0; +} + +static int vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + long sym_offset; + int ret = -EFAULT; + + if (!image) + return VM_FAULT_SIGBUS; + + sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + + image->sym_vvar_start; + + /* + * Sanity check: a symbol offset of zero means that the page + * does not exist for this vdso image, not that the page is at + * offset zero relative to the text mapping. This should be + * impossible here, because sym_offset should only be zero for + * the page past the end of the vvar mapping. + */ + if (sym_offset == 0) + return VM_FAULT_SIGBUS; + + if (sym_offset == image->sym_vvar_page) { + ret = vm_insert_pfn(vma, vmf->address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_pvclock_page) { + struct pvclock_vsyscall_time_info *pvti = + pvclock_get_pvti_cpu0_va(); + if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { + ret = vm_insert_pfn_prot( + vma, + vmf->address, + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); + } + } else if (sym_offset == image->sym_hvclock_page) { + struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); + + if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) + ret = vm_insert_pfn(vma, vmf->address, + vmalloc_to_pfn(tsc_pg)); + } + + if (ret == 0 || ret == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; +} + +static const struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .fault = vdso_fault, + .mremap = vdso_mremap, +}; +static const struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; + +/* + * Add vdso and vvar mappings to current process. + * @image - blob to map + * @addr - request a specific address (zero to map at free addr) + */ +static int map_vdso(const struct vdso_image *image, unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long text_start; + int ret = 0; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + addr = get_unmapped_area(NULL, addr, + image->size - image->sym_vvar_start, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + text_start = addr - image->sym_vvar_start; + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + vma = _install_special_mapping(mm, + text_start, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &vdso_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + vma = _install_special_mapping(mm, + addr, + -image->sym_vvar_start, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, + &vvar_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + do_munmap(mm, text_start, image->size, NULL); + } else { + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; + } + +up_fail: + up_write(&mm->mmap_sem); + return ret; +} + +#ifdef CONFIG_X86_64 +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ + unsigned long addr, end; + unsigned offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; + end -= len; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + + /* + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. + */ + addr = align_vdso_addr(addr); + + return addr; +} + +static int map_vdso_randomized(const struct vdso_image *image) +{ + unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); + + return map_vdso(image, addr); +} +#endif + +int map_vdso_once(const struct vdso_image *image, unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + /* + * Check if we have already mapped vdso blob - fail to prevent + * abusing from userspace install_speciall_mapping, which may + * not do accounting and rlimit right. + * We could search vma near context.vdso, but it's a slowpath, + * so let's explicitely check all VMAs to be completely sure. + */ + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma_is_special_mapping(vma, &vdso_mapping) || + vma_is_special_mapping(vma, &vvar_mapping)) { + up_write(&mm->mmap_sem); + return -EEXIST; + } + } + up_write(&mm->mmap_sem); + + return map_vdso(image, addr); +} + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +static int load_vdso32(void) +{ + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ + return 0; + + return map_vdso(&vdso_image_32, 0); +} +#endif + +#ifdef CONFIG_X86_64 +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + if (!vdso64_enabled) + return 0; + + return map_vdso_randomized(&vdso_image_64); +} + +#ifdef CONFIG_COMPAT +int compat_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) +{ +#ifdef CONFIG_X86_X32_ABI + if (test_thread_flag(TIF_X32)) { + if (!vdso64_enabled) + return 0; + return map_vdso_randomized(&vdso_image_x32); + } +#endif +#ifdef CONFIG_IA32_EMULATION + return load_vdso32(); +#else + return 0; +#endif +} +#endif +#else +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + return load_vdso32(); +} +#endif + +#ifdef CONFIG_X86_64 +static __init int vdso_setup(char *s) +{ + vdso64_enabled = simple_strtoul(s, NULL, 0); + return 1; +} +__setup("vdso=", vdso_setup); +#endif + +#ifdef CONFIG_X86_64 +static void vgetcpu_cpu_init(void *arg) +{ + int cpu = smp_processor_id(); + struct desc_struct d = { }; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) + write_rdtscp_aux((node << 12) | cpu); + + /* + * Store cpu number in limit so that it can be loaded + * quickly in user space in vgetcpu. (12 bits for the CPU + * and 8 bits for the node) + */ + d.limit0 = cpu | ((node & 0xf) << 12); + d.limit1 = node >> 4; + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static int vgetcpu_online(unsigned int cpu) +{ + return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); +} + +static int __init init_vdso(void) +{ + init_vdso_image(&vdso_image_64); + +#ifdef CONFIG_X86_X32_ABI + init_vdso_image(&vdso_image_x32); +#endif + + /* notifier priority > KVM */ + return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, + "x86/vdso/vma:online", vgetcpu_online, NULL); +} +subsys_initcall(init_vdso); +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000..a9f4856f6 --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c new file mode 100644 index 000000000..82ed001e8 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> + * + * Based on the original implementation which is: + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: + * + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. + * + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. + * + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. + */ + +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/sched/signal.h> +#include <linux/mm_types.h> +#include <linux/syscalls.h> +#include <linux/ratelimit.h> + +#include <asm/vsyscall.h> +#include <asm/unistd.h> +#include <asm/fixmap.h> +#include <asm/traps.h> +#include <asm/paravirt.h> + +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + +static enum { EMULATE, NONE } vsyscall_mode = +#ifdef CONFIG_LEGACY_VSYSCALL_NONE + NONE; +#else + EMULATE; +#endif + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) +{ + if (!show_unhandled_signals) + return; + + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; +} + +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_err, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_nr = X86_TRAP_PF; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr, syscall_nr, tmp; + int prev_sig_on_uaccess_err; + long ret; + unsigned long orig_dx; + + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ + + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; + } + + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); + goto sigsegv; + } + + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); + goto sigsegv; + } + + tsk = current; + + /* + * Check for access_ok violations and find the syscall nr. + * + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, NULL means "don't write anything" not "write it at + * address 0". + */ + switch (vsyscall_nr) { + case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(NULL); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + regs->orig_ax = -1; + if (tmp) + goto do_ret; /* skip requested */ + + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; + current->thread.sig_on_uaccess_err = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: + /* this decodes regs->di and regs->si on its own */ + ret = __x64_sys_gettimeofday(regs); + break; + + case 1: + /* this decodes regs->di on its own */ + ret = __x64_sys_time(regs); + break; + + case 2: + /* while we could clobber regs->dx, we didn't in the past... */ + orig_dx = regs->dx; + regs->dx = 0; + /* this decodes regs->di, regs->si and regs->dx on its own */ + ret = __x64_sys_getcpu(regs); + regs->dx = orig_dx; + break; + } + + current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; + +check_fault: + if (ret == -EFAULT) { + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ + } + + regs->ax = ret; + +do_ret: + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; + return true; + +sigsegv: + force_sig(SIGSEGV, current); + return true; +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static const struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_COMPAT + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +/* + * The VSYSCALL page is the only user-accessible page in the kernel address + * range. Normally, the kernel page tables can have _PAGE_USER clear, but + * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls + * are enabled. + * + * Some day we may create a "minimal" vsyscall mode in which we emulate + * vsyscalls but leave the page not present. If so, we skip calling + * this. + */ +void __init set_vsyscall_pgtable_user_bits(pgd_t *root) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); + p4d = p4d_offset(pgd, VSYSCALL_ADDR); +#if CONFIG_PGTABLE_LEVELS >= 5 + set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); +#endif + pud = pud_offset(p4d, VSYSCALL_ADDR); + set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); + pmd = pmd_offset(pud, VSYSCALL_ADDR); + set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + + if (vsyscall_mode != NONE) { + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + PAGE_KERNEL_VVAR); + set_vsyscall_pgtable_user_bits(swapper_pg_dir); + } + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S new file mode 100644 index 000000000..c9596a9af --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -0,0 +1,37 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include <linux/linkage.h> + +#include <asm/irq_vectors.h> +#include <asm/page_types.h> +#include <asm/unistd_64.h> + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret + + .balign 4096, 0xcc + + .size __vsyscall_page, 4096 diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c new file mode 100644 index 000000000..e1216dd95 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + */ + +#include <linux/timekeeper_internal.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> + +int vclocks_used __read_mostly; + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + /* Mark the new vclock used. */ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->tkr_mono.shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h new file mode 100644 index 000000000..3c3f9765a --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ +#define TRACE_INCLUDE_FILE vsyscall_trace +#include <trace/define_trace.h> |