diff options
Diffstat (limited to '')
-rw-r--r-- | arch/x86/entry/common.c | 432 |
1 files changed, 432 insertions, 0 deletions
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 000000000..8353348dd --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,432 @@ +/* + * common.c - C code for kernel entry and exit + * Copyright (c) 2015 Andrew Lutomirski + * GPL v2 + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/tracehook.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> +#include <linux/export.h> +#include <linux/context_tracking.h> +#include <linux/user-return-notifier.h> +#include <linux/nospec.h> +#include <linux/uprobes.h> +#include <linux/livepatch.h> +#include <linux/syscalls.h> + +#include <asm/desc.h> +#include <asm/traps.h> +#include <asm/vdso.h> +#include <linux/uaccess.h> +#include <asm/cpufeature.h> +#include <asm/nospec-branch.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +#ifdef CONFIG_CONTEXT_TRACKING +/* Called on entry from user mode with IRQs off. */ +__visible inline void enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); +} +#else +static inline void enter_from_user_mode(void) {} +#endif + +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + +/* + * Returns the syscall nr to run (which should match regs->orig_ax) or -1 + * to skip the syscall. + */ +static long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + + struct thread_info *ti = current_thread_info(); + unsigned long ret = 0; + bool emulated = false; + u32 work; + + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); + + work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + + if (unlikely(work & _TIF_SYSCALL_EMU)) + emulated = true; + + if ((emulated || (work & _TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + return -1L; + + if (emulated) + return -1L; + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp after ptrace, to catch any tracer changes. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + ret = __secure_computing(&sd); + if (ret == -1) + return ret; + } +#endif + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + do_audit_syscall_entry(regs, arch); + + return ret ?: regs->orig_ax; +} + +#define EXIT_TO_USERMODE_LOOP_FLAGS \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) + +static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) +{ + /* + * In order to return to user mode, we need to have IRQs off with + * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags + * can be set at any time on preemptable kernels if we have IRQs on, + * so we need to loop. Disabling preemption wouldn't help: doing the + * work to clear some of the flags can sleep. + */ + while (true) { + /* We have work to do. */ + local_irq_enable(); + + if (cached_flags & _TIF_NEED_RESCHED) + schedule(); + + if (cached_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (cached_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + + /* deal with pending signal delivery */ + if (cached_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (cached_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); + } + + if (cached_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + /* Disable IRQs and retry */ + local_irq_disable(); + + cached_flags = READ_ONCE(current_thread_info()->flags); + + if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + break; + } +} + +/* Called with IRQs disabled. */ +__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +{ + struct thread_info *ti = current_thread_info(); + u32 cached_flags; + + addr_limit_user_check(); + + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); + + cached_flags = READ_ONCE(ti->flags); + + if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + exit_to_usermode_loop(regs, cached_flags); + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + * + * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer + * special case only applies after poking regs and before the + * very next return to user mode. + */ + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); +#endif + + user_enter_irqoff(); + + mds_user_clear_cpu_buffers(); +} + +#define SYSCALL_EXIT_WORK_FLAGS \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + +static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) +{ + bool step; + + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); +} + +/* + * Called with IRQs on and fully valid regs. Returns with IRQs off in a + * state such that we can immediately switch to user mode. + */ +__visible inline void syscall_return_slowpath(struct pt_regs *regs) +{ + struct thread_info *ti = current_thread_info(); + u32 cached_flags = READ_ONCE(ti->flags); + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) + local_irq_enable(); + + rseq_syscall(regs); + + /* + * First do one-time work. If these work items are enabled, we + * want to run them exactly once per syscall exit with IRQs on. + */ + if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) + syscall_slow_exit_work(regs, cached_flags); + + local_irq_disable(); + prepare_exit_to_usermode(regs); +} + +#ifdef CONFIG_X86_64 +__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) +{ + struct thread_info *ti; + + enter_from_user_mode(); + local_irq_enable(); + ti = current_thread_info(); + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) + nr = syscall_trace_enter(regs); + + /* + * NB: Native and x32 syscalls are dispatched from the same + * table. The only functional difference is the x32 bit in + * regs->orig_ax, which changes the behavior of some syscalls. + */ + nr &= __SYSCALL_MASK; + if (likely(nr < NR_syscalls)) { + nr = array_index_nospec(nr, NR_syscalls); + regs->ax = sys_call_table[nr](regs); + } + + syscall_return_slowpath(regs); +} +#endif + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +/* + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does + * all entry and exit work and returns with IRQs off. This function is + * extremely hot in workloads that use it, and it's usually called from + * do_fast_syscall_32, so forcibly inline it to improve performance. + */ +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +{ + struct thread_info *ti = current_thread_info(); + unsigned int nr = (unsigned int)regs->orig_ax; + +#ifdef CONFIG_IA32_EMULATION + ti->status |= TS_COMPAT; +#endif + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { + /* + * Subtlety here: if ptrace pokes something larger than + * 2^32-1 into orig_ax, this truncates it. This may or + * may not be necessary, but it matches the old asm + * behavior. + */ + nr = syscall_trace_enter(regs); + } + + if (likely(nr < IA32_NR_syscalls)) { + nr = array_index_nospec(nr, IA32_NR_syscalls); +#ifdef CONFIG_IA32_EMULATION + regs->ax = ia32_sys_call_table[nr](regs); +#else + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that + * the high bits are zero. Make sure we zero-extend all + * of the args. + */ + regs->ax = ia32_sys_call_table[nr]( + (unsigned int)regs->bx, (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->si, + (unsigned int)regs->di, (unsigned int)regs->bp); +#endif /* CONFIG_IA32_EMULATION */ + } + + syscall_return_slowpath(regs); +} + +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) +{ + enter_from_user_mode(); + local_irq_enable(); + do_syscall_32_irqs_on(regs); +} + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible long do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + enter_from_user_mode(); + + local_irq_enable(); + + /* Fetch EBP from where the vDSO stashed it. */ + if ( +#ifdef CONFIG_X86_64 + /* + * Micro-optimization: the pointer we're following is explicitly + * 32 bits, so it can't be out of range. + */ + __get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#else + get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#endif + ) { + + /* User code screwed up. */ + local_irq_disable(); + regs->ax = -EFAULT; + prepare_exit_to_usermode(regs); + return 0; /* Keep it simple: use IRET. */ + } + + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs); + +#ifdef CONFIG_X86_64 + /* + * Opportunistic SYSRETL: if possible, try to return using SYSRETL. + * SYSRETL is available on all 64-bit CPUs, so we don't need to + * bother with SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + */ + return regs->cs == __USER32_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; +#else + /* + * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + * + * We don't allow syscalls at all from VM86 mode, but we still + * need to check VM, because we might be returning from sys_vm86. + */ + return static_cpu_has(X86_FEATURE_SEP) && + regs->cs == __USER_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; +#endif +} +#endif |