diff options
Diffstat (limited to '')
39 files changed, 6003 insertions, 0 deletions
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile new file mode 100644 index 000000000..1c2d4b29a --- /dev/null +++ b/arch/um/kernel/Makefile @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux,intel}.com) +# + +# Don't instrument UML-specific code; without this, we may crash when +# accessing the instrumentation buffer for the first time from the +# kernel. +KCOV_INSTRUMENT := n + +CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ + -DELF_ARCH=$(LDS_ELF_ARCH) \ + -DELF_FORMAT=$(LDS_ELF_FORMAT) \ + $(LDS_EXTRA) +extra-y := vmlinux.lds + +obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ + physmem.o process.o ptrace.o reboot.o sigio.o \ + signal.o sysrq.o time.o tlb.o trap.o \ + um_arch.o umid.o maccess.o kmsg_dump.o capflags.o skas/ +obj-y += load_file.o + +obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o +obj-$(CONFIG_GPROF) += gprof_syms.o +obj-$(CONFIG_OF) += dtb.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o + +USER_OBJS := config.o + +include arch/um/scripts/Makefile.rules + +targets := config.c config.tmp capflags.c + +# Be careful with the below Sed code - sed is pitfall-rich! +# We use sed to lower build requirements, for "embedded" builders for instance. + +$(obj)/config.tmp: $(objtree)/.config FORCE + $(call if_changed,quote1) + +quiet_cmd_quote1 = QUOTE $@ + cmd_quote1 = sed -e 's/"/\\"/g' -e 's/^/"/' -e 's/$$/\\n",/' \ + $< > $@ + +$(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE + $(call if_changed,quote2) + +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ + +cpufeature = $(src)/../../x86/include/asm/cpufeatures.h +vmxfeature = $(src)/../../x86/include/asm/vmxfeatures.h + +$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/../../x86/kernel/cpu/mkcapflags.sh FORCE + $(call if_changed,mkcapflags) + +quiet_cmd_quote2 = QUOTE $@ + cmd_quote2 = sed -e '/CONFIG/{' \ + -e 's/"CONFIG"//' \ + -e 'r $(obj)/config.tmp' \ + -e 'a \' \ + -e '""' \ + -e '}' \ + $< > $@ diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c new file mode 100644 index 000000000..1fb12235a --- /dev/null +++ b/arch/um/kernel/asm-offsets.c @@ -0,0 +1 @@ +#include <sysdep/kernel-offsets.h> diff --git a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in new file mode 100644 index 000000000..3ece3c3b3 --- /dev/null +++ b/arch/um/kernel/config.c.in @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + */ + +#include <stdio.h> +#include <stdlib.h> +#include <init.h> + +static __initdata const char *config[] = { +"CONFIG" +}; + +static int __init print_config(char *line, int *add) +{ + int i; + for (i = 0; i < sizeof(config)/sizeof(config[0]); i++) + printf("%s", config[i]); + exit(0); +} + +__uml_setup("--showconfig", print_config, +"--showconfig\n" +" Prints the config file that this UML binary was generated from.\n\n" +); + diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c new file mode 100644 index 000000000..484141b06 --- /dev/null +++ b/arch/um/kernel/dtb.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/init.h> +#include <linux/of_fdt.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <init.h> + +#include "um_arch.h" + +static char *dtb __initdata; + +void uml_dtb_init(void) +{ + long long size; + void *area; + + area = uml_load_file(dtb, &size); + if (!area) + return; + + if (!early_init_dt_scan(area)) { + pr_err("invalid DTB %s\n", dtb); + memblock_free(area, size); + return; + } + + early_init_fdt_scan_reserved_mem(); + unflatten_device_tree(); +} + +static int __init uml_dtb_setup(char *line, int *add) +{ + dtb = line; + return 0; +} + +__uml_setup("dtb=", uml_dtb_setup, +"dtb=<file>\n" +" Boot the kernel with the devicetree blob from the specified file.\n" +); diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S new file mode 100644 index 000000000..2b7fc5b54 --- /dev/null +++ b/arch/um/kernel/dyn.lds.S @@ -0,0 +1,181 @@ +#include <asm/vmlinux.lds.h> +#include <asm/page.h> + +OUTPUT_FORMAT(ELF_FORMAT) +OUTPUT_ARCH(ELF_ARCH) +ENTRY(_start) +jiffies = jiffies_64; + +VERSION { + { + local: *; + }; +} + +SECTIONS +{ + PROVIDE (__executable_start = START); + . = START + SIZEOF_HEADERS; + .interp : { *(.interp) } + __binary_start = .; + . = ALIGN(4096); /* Init code and data */ + _text = .; + INIT_TEXT_SECTION(PAGE_SIZE) + + . = ALIGN(PAGE_SIZE); + + /* Read-only sections, merged into text segment: */ + .hash : { *(.hash) } + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .rel.init : { *(.rel.init) } + .rela.init : { *(.rela.init) } + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } + .rel.fini : { *(.rel.fini) } + .rela.fini : { *(.rela.fini) } + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } + .rel.ctors : { *(.rel.ctors) } + .rela.ctors : { *(.rela.ctors) } + .rel.dtors : { *(.rel.dtors) } + .rela.dtors : { *(.rela.dtors) } + .rel.got : { *(.rel.got) } + .rela.got : { *(.rela.got) } + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } + .init : { + KEEP (*(.init)) + } =0x90909090 + .plt : { *(.plt) } + .text : { + _stext = .; + TEXT_TEXT + SCHED_TEXT + CPUIDLE_TEXT + LOCK_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT + *(.fixup) + *(.stub .text.* .gnu.linkonce.t.*) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + + . = ALIGN(PAGE_SIZE); + } =0x90909090 + . = ALIGN(PAGE_SIZE); + .syscall_stub : { + __syscall_stub_start = .; + *(.__syscall_stub*) + __syscall_stub_end = .; + } + .fini : { + KEEP (*(.fini)) + } =0x90909090 + + .kstrtab : { *(.kstrtab) } + + #include <asm/common.lds.S> + + __init_begin = .; + init.data : { INIT_DATA } + __init_end = .; + + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . = ALIGN(32 / 8); + .preinit_array : { *(.preinit_array) } + .init_array : { + *(.kasan_init) + *(.init_array.*) + *(.init_array) + } + .fini_array : { *(.fini_array) } + .data : { + INIT_TASK_DATA(KERNEL_STACK_SIZE) + . = ALIGN(KERNEL_STACK_SIZE); + *(.data..init_irqstack) + DATA_DATA + *(.data.* .gnu.linkonce.d.*) + SORT(CONSTRUCTORS) + } + .data1 : { *(.data1) } + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } + .eh_frame : { KEEP (*(.eh_frame)) } + .gcc_except_table : { *(.gcc_except_table) } + .dynamic : { *(.dynamic) } + .ctors : { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } + .dtors : { + KEEP (*crtbegin.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } + .jcr : { KEEP (*(.jcr)) } + .got : { *(.got.plt) *(.got) } + _edata = .; + PROVIDE (edata = .); + .bss : { + __bss_start = .; + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. */ + . = ALIGN(32 / 8); + . = ALIGN(32 / 8); + } + __bss_stop = .; + _end = .; + PROVIDE (end = .); + + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + + DISCARDS +} diff --git a/arch/um/kernel/early_printk.c b/arch/um/kernel/early_printk.c new file mode 100644 index 000000000..c350c2331 --- /dev/null +++ b/arch/um/kernel/early_printk.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> + */ + +#include <linux/kernel.h> +#include <linux/console.h> +#include <linux/init.h> +#include <os.h> + +static void early_console_write(struct console *con, const char *s, unsigned int n) +{ + um_early_printk(s, n); +} + +static struct console early_console_dev = { + .name = "earlycon", + .write = early_console_write, + .flags = CON_BOOT, + .index = -1, +}; + +static int __init setup_early_printk(char *buf) +{ + if (!early_console) { + early_console = &early_console_dev; + register_console(&early_console_dev); + } + return 0; +} + +early_param("earlyprintk", setup_early_printk); diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c new file mode 100644 index 000000000..58938d758 --- /dev/null +++ b/arch/um/kernel/exec.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/stddef.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/ptrace.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/slab.h> +#include <asm/current.h> +#include <asm/processor.h> +#include <linux/uaccess.h> +#include <as-layout.h> +#include <mem_user.h> +#include <registers.h> +#include <skas.h> +#include <os.h> + +void flush_thread(void) +{ + void *data = NULL; + int ret; + + arch_flush_thread(¤t->thread.arch); + + ret = unmap(¤t->mm->context.id, 0, TASK_SIZE, 1, &data); + if (ret) { + printk(KERN_ERR "flush_thread - clearing address space failed, " + "err = %d\n", ret); + force_sig(SIGKILL); + } + get_safe_registers(current_pt_regs()->regs.gp, + current_pt_regs()->regs.fp); + + __switch_mm(¤t->mm->context.id); +} + +void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) +{ + PT_REGS_IP(regs) = eip; + PT_REGS_SP(regs) = esp; + clear_thread_flag(TIF_SINGLESTEP); +#ifdef SUBARCH_EXECVE1 + SUBARCH_EXECVE1(regs->regs); +#endif +} +EXPORT_SYMBOL(start_thread); diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c new file mode 100644 index 000000000..43edc2aa5 --- /dev/null +++ b/arch/um/kernel/exitcode.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/types.h> +#include <linux/uaccess.h> + +/* + * If read and write race, the read will still atomically read a valid + * value. + */ +int uml_exitcode = 0; + +static int exitcode_proc_show(struct seq_file *m, void *v) +{ + int val; + + /* + * Save uml_exitcode in a local so that we don't need to guarantee + * that sprintf accesses it atomically. + */ + val = uml_exitcode; + seq_printf(m, "%d\n", val); + return 0; +} + +static int exitcode_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, exitcode_proc_show, NULL); +} + +static ssize_t exitcode_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + char *end, buf[sizeof("nnnnn\0")]; + size_t size; + int tmp; + + size = min(count, sizeof(buf)); + if (copy_from_user(buf, buffer, size)) + return -EFAULT; + + tmp = simple_strtol(buf, &end, 0); + if ((*end != '\0') && !isspace(*end)) + return -EINVAL; + + uml_exitcode = tmp; + return count; +} + +static const struct proc_ops exitcode_proc_ops = { + .proc_open = exitcode_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = exitcode_proc_write, +}; + +static int make_proc_exitcode(void) +{ + struct proc_dir_entry *ent; + + ent = proc_create("exitcode", 0600, NULL, &exitcode_proc_ops); + if (ent == NULL) { + printk(KERN_WARNING "make_proc_exitcode : Failed to register " + "/proc/exitcode\n"); + return 0; + } + return 0; +} + +__initcall(make_proc_exitcode); diff --git a/arch/um/kernel/gprof_syms.c b/arch/um/kernel/gprof_syms.c new file mode 100644 index 000000000..84d536908 --- /dev/null +++ b/arch/um/kernel/gprof_syms.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/module.h> + +extern void mcount(void); +EXPORT_SYMBOL(mcount); diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c new file mode 100644 index 000000000..47b8cb1a1 --- /dev/null +++ b/arch/um/kernel/initrd.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/initrd.h> +#include <asm/types.h> +#include <init.h> +#include <os.h> + +#include "um_arch.h" + +/* Changed by uml_initrd_setup, which is a setup */ +static char *initrd __initdata = NULL; + +int __init read_initrd(void) +{ + unsigned long long size; + void *area; + + if (!initrd) + return 0; + + area = uml_load_file(initrd, &size); + if (!area) + return 0; + + initrd_start = (unsigned long) area; + initrd_end = initrd_start + size; + return 0; +} + +static int __init uml_initrd_setup(char *line, int *add) +{ + initrd = line; + return 0; +} + +__uml_setup("initrd=", uml_initrd_setup, +"initrd=<initrd image>\n" +" This is used to boot UML from an initrd image. The argument is the\n" +" name of the file containing the image.\n\n" +); diff --git a/arch/um/kernel/ioport.c b/arch/um/kernel/ioport.c new file mode 100644 index 000000000..7220615b3 --- /dev/null +++ b/arch/um/kernel/ioport.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ +#include <asm/iomap.h> +#include <asm-generic/pci_iomap.h> + +void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port, + unsigned int nr) +{ + return NULL; +} diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c new file mode 100644 index 000000000..a8873d9bc --- /dev/null +++ b/arch/um/kernel/irq.c @@ -0,0 +1,768 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 - Cambridge Greys Ltd + * Copyright (C) 2011 - 2014 Cisco Systems Inc + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Derived (i.e. mostly copied) from arch/i386/kernel/irq.c: + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + */ + +#include <linux/cpumask.h> +#include <linux/hardirq.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <as-layout.h> +#include <kern_util.h> +#include <os.h> +#include <irq_user.h> +#include <irq_kern.h> +#include <linux/time-internal.h> + + +extern void free_irqs(void); + +/* When epoll triggers we do not know why it did so + * we can also have different IRQs for read and write. + * This is why we keep a small irq_reg array for each fd - + * one entry per IRQ type + */ +struct irq_reg { + void *id; + int irq; + /* it's cheaper to store this than to query it */ + int events; + bool active; + bool pending; + bool wakeup; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + bool pending_on_resume; + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *); + struct time_travel_event event; +#endif +}; + +struct irq_entry { + struct list_head list; + int fd; + struct irq_reg reg[NUM_IRQ_TYPES]; + bool suspended; + bool sigio_workaround; +}; + +static DEFINE_SPINLOCK(irq_lock); +static LIST_HEAD(active_fds); +static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); +static bool irqs_suspended; + +static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) +{ +/* + * irq->active guards against reentry + * irq->pending accumulates pending requests + * if pending is raised the irq_handler is re-run + * until pending is cleared + */ + if (irq->active) { + irq->active = false; + + do { + irq->pending = false; + do_IRQ(irq->irq, regs); + } while (irq->pending); + + irq->active = true; + } else { + irq->pending = true; + } +} + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static void irq_event_handler(struct time_travel_event *ev) +{ + struct irq_reg *reg = container_of(ev, struct irq_reg, event); + + /* do nothing if suspended - just to cause a wakeup */ + if (irqs_suspended) + return; + + generic_handle_irq(reg->irq); +} + +static bool irq_do_timetravel_handler(struct irq_entry *entry, + enum um_irq_type t) +{ + struct irq_reg *reg = &entry->reg[t]; + + if (!reg->timetravel_handler) + return false; + + /* + * Handle all messages - we might get multiple even while + * interrupts are already suspended, due to suspend order + * etc. Note that time_travel_add_irq_event() will not add + * an event twice, if it's pending already "first wins". + */ + reg->timetravel_handler(reg->irq, entry->fd, reg->id, ®->event); + + if (!reg->event.pending) + return false; + + if (irqs_suspended) + reg->pending_on_resume = true; + return true; +} +#else +static bool irq_do_timetravel_handler(struct irq_entry *entry, + enum um_irq_type t) +{ + return false; +} +#endif + +static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t, + struct uml_pt_regs *regs, + bool timetravel_handlers_only) +{ + struct irq_reg *reg = &entry->reg[t]; + + if (!reg->events) + return; + + if (os_epoll_triggered(idx, reg->events) <= 0) + return; + + if (irq_do_timetravel_handler(entry, t)) + return; + + /* + * If we're called to only run time-travel handlers then don't + * actually proceed but mark sigio as pending (if applicable). + * For suspend/resume, timetravel_handlers_only may be true + * despite time-travel not being configured and used. + */ + if (timetravel_handlers_only) { +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + mark_sigio_pending(); +#endif + return; + } + + irq_io_loop(reg, regs); +} + +static void _sigio_handler(struct uml_pt_regs *regs, + bool timetravel_handlers_only) +{ + struct irq_entry *irq_entry; + int n, i; + + if (timetravel_handlers_only && !um_irq_timetravel_handler_used()) + return; + + while (1) { + /* This is now lockless - epoll keeps back-referencesto the irqs + * which have trigger it so there is no need to walk the irq + * list and lock it every time. We avoid locking by turning off + * IO for a specific fd by executing os_del_epoll_fd(fd) before + * we do any changes to the actual data structures + */ + n = os_waiting_for_events_epoll(); + + if (n <= 0) { + if (n == -EINTR) + continue; + else + break; + } + + for (i = 0; i < n ; i++) { + enum um_irq_type t; + + irq_entry = os_epoll_get_data_pointer(i); + + for (t = 0; t < NUM_IRQ_TYPES; t++) + sigio_reg_handler(i, irq_entry, t, regs, + timetravel_handlers_only); + } + } + + if (!timetravel_handlers_only) + free_irqs(); +} + +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +{ + _sigio_handler(regs, irqs_suspended); +} + +static struct irq_entry *get_irq_entry_by_fd(int fd) +{ + struct irq_entry *walk; + + lockdep_assert_held(&irq_lock); + + list_for_each_entry(walk, &active_fds, list) { + if (walk->fd == fd) + return walk; + } + + return NULL; +} + +static void free_irq_entry(struct irq_entry *to_free, bool remove) +{ + if (!to_free) + return; + + if (remove) + os_del_epoll_fd(to_free->fd); + list_del(&to_free->list); + kfree(to_free); +} + +static bool update_irq_entry(struct irq_entry *entry) +{ + enum um_irq_type i; + int events = 0; + + for (i = 0; i < NUM_IRQ_TYPES; i++) + events |= entry->reg[i].events; + + if (events) { + /* will modify (instead of add) if needed */ + os_add_epoll_fd(events, entry->fd, entry); + return true; + } + + os_del_epoll_fd(entry->fd); + return false; +} + +static void update_or_free_irq_entry(struct irq_entry *entry) +{ + if (!update_irq_entry(entry)) + free_irq_entry(entry, false); +} + +static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) +{ + struct irq_entry *irq_entry; + int err, events = os_event_mask(type); + unsigned long flags; + + err = os_set_fd_async(fd); + if (err < 0) + goto out; + + spin_lock_irqsave(&irq_lock, flags); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + /* cannot register the same FD twice with the same type */ + if (WARN_ON(irq_entry->reg[type].events)) { + err = -EALREADY; + goto out_unlock; + } + + /* temporarily disable to avoid IRQ-side locking */ + os_del_epoll_fd(fd); + } else { + irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!irq_entry) { + err = -ENOMEM; + goto out_unlock; + } + irq_entry->fd = fd; + list_add_tail(&irq_entry->list, &active_fds); + maybe_sigio_broken(fd); + } + + irq_entry->reg[type].id = dev_id; + irq_entry->reg[type].irq = irq; + irq_entry->reg[type].active = true; + irq_entry->reg[type].events = events; + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + if (um_irq_timetravel_handler_used()) { + irq_entry->reg[type].timetravel_handler = timetravel_handler; + irq_entry->reg[type].event.fn = irq_event_handler; + } +#endif + + WARN_ON(!update_irq_entry(irq_entry)); + spin_unlock_irqrestore(&irq_lock, flags); + + return 0; +out_unlock: + spin_unlock_irqrestore(&irq_lock, flags); +out: + return err; +} + +/* + * Remove the entry or entries for a specific FD, if you + * don't want to remove all the possible entries then use + * um_free_irq() or deactivate_fd() instead. + */ +void free_irq_by_fd(int fd) +{ + struct irq_entry *to_free; + unsigned long flags; + + spin_lock_irqsave(&irq_lock, flags); + to_free = get_irq_entry_by_fd(fd); + free_irq_entry(to_free, true); + spin_unlock_irqrestore(&irq_lock, flags); +} +EXPORT_SYMBOL(free_irq_by_fd); + +static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) +{ + struct irq_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type i; + + for (i = 0; i < NUM_IRQ_TYPES; i++) { + struct irq_reg *reg = &entry->reg[i]; + + if (!reg->events) + continue; + if (reg->irq != irq) + continue; + if (reg->id != dev) + continue; + + os_del_epoll_fd(entry->fd); + reg->events = 0; + update_or_free_irq_entry(entry); + goto out; + } + } +out: + spin_unlock_irqrestore(&irq_lock, flags); +} + +void deactivate_fd(int fd, int irqnum) +{ + struct irq_entry *entry; + unsigned long flags; + enum um_irq_type i; + + os_del_epoll_fd(fd); + + spin_lock_irqsave(&irq_lock, flags); + entry = get_irq_entry_by_fd(fd); + if (!entry) + goto out; + + for (i = 0; i < NUM_IRQ_TYPES; i++) { + if (!entry->reg[i].events) + continue; + if (entry->reg[i].irq == irqnum) + entry->reg[i].events = 0; + } + + update_or_free_irq_entry(entry); +out: + spin_unlock_irqrestore(&irq_lock, flags); + + ignore_sigio_fd(fd); +} +EXPORT_SYMBOL(deactivate_fd); + +/* + * Called just before shutdown in order to provide a clean exec + * environment in case the system is rebooting. No locking because + * that would cause a pointless shutdown hang if something hadn't + * released the lock. + */ +int deactivate_all_fds(void) +{ + struct irq_entry *entry; + + /* Stop IO. The IRQ loop has no lock so this is our + * only way of making sure we are safe to dispose + * of all IRQ handlers + */ + os_set_ioignore(); + + /* we can no longer call kfree() here so just deactivate */ + list_for_each_entry(entry, &active_fds, list) + os_del_epoll_fd(entry->fd); + os_close_epoll_fd(); + return 0; +} + +/* + * do_IRQ handles all normal device IRQs (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int do_IRQ(int irq, struct uml_pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs((struct pt_regs *)regs); + irq_enter(); + generic_handle_irq(irq); + irq_exit(); + set_irq_regs(old_regs); + return 1; +} + +void um_free_irq(int irq, void *dev) +{ + if (WARN(irq < 0 || irq > UM_LAST_SIGNAL_IRQ, + "freeing invalid irq %d", irq)) + return; + + free_irq_by_irq_and_dev(irq, dev); + free_irq(irq, dev); + clear_bit(irq, irqs_allocated); +} +EXPORT_SYMBOL(um_free_irq); + +static int +_um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) +{ + int err; + + if (irq == UM_IRQ_ALLOC) { + int i; + + for (i = UM_FIRST_DYN_IRQ; i < NR_IRQS; i++) { + if (!test_and_set_bit(i, irqs_allocated)) { + irq = i; + break; + } + } + } + + if (irq < 0) + return -ENOSPC; + + if (fd != -1) { + err = activate_fd(irq, fd, type, dev_id, timetravel_handler); + if (err) + goto error; + } + + err = request_irq(irq, handler, irqflags, devname, dev_id); + if (err < 0) + goto error; + + return irq; +error: + clear_bit(irq, irqs_allocated); + return err; +} + +int um_request_irq(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + return _um_request_irq(irq, fd, type, handler, irqflags, + devname, dev_id, NULL); +} +EXPORT_SYMBOL(um_request_irq); + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +int um_request_irq_tt(int irq, int fd, enum um_irq_type type, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, + void (*timetravel_handler)(int, int, void *, + struct time_travel_event *)) +{ + return _um_request_irq(irq, fd, type, handler, irqflags, + devname, dev_id, timetravel_handler); +} +EXPORT_SYMBOL(um_request_irq_tt); + +void sigio_run_timetravel_handlers(void) +{ + _sigio_handler(NULL, true); +} +#endif + +#ifdef CONFIG_PM_SLEEP +void um_irqs_suspend(void) +{ + struct irq_entry *entry; + unsigned long flags; + + irqs_suspended = true; + + spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + bool clear = true; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + /* + * For the SIGIO_WRITE_IRQ, which is used to handle the + * SIGIO workaround thread, we need special handling: + * enable wake for it itself, but below we tell it about + * any FDs that should be suspended. + */ + if (entry->reg[t].wakeup || + entry->reg[t].irq == SIGIO_WRITE_IRQ +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + || entry->reg[t].timetravel_handler +#endif + ) { + clear = false; + break; + } + } + + if (clear) { + entry->suspended = true; + os_clear_fd_async(entry->fd); + entry->sigio_workaround = + !__ignore_sigio_fd(entry->fd); + } + } + spin_unlock_irqrestore(&irq_lock, flags); +} + +void um_irqs_resume(void) +{ + struct irq_entry *entry; + unsigned long flags; + + + local_irq_save(flags); +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + /* + * We don't need to lock anything here since we're in resume + * and nothing else is running, but have disabled IRQs so we + * don't try anything else with the interrupt list from there. + */ + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + struct irq_reg *reg = &entry->reg[t]; + + if (reg->pending_on_resume) { + irq_enter(); + generic_handle_irq(reg->irq); + irq_exit(); + reg->pending_on_resume = false; + } + } + } +#endif + + spin_lock(&irq_lock); + list_for_each_entry(entry, &active_fds, list) { + if (entry->suspended) { + int err = os_set_fd_async(entry->fd); + + WARN(err < 0, "os_set_fd_async returned %d\n", err); + entry->suspended = false; + + if (entry->sigio_workaround) { + err = __add_sigio_fd(entry->fd); + WARN(err < 0, "add_sigio_returned %d\n", err); + } + } + } + spin_unlock_irqrestore(&irq_lock, flags); + + irqs_suspended = false; + send_sigio_to_self(); +} + +static int normal_irq_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&irq_lock, flags); + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + if (!entry->reg[t].events) + continue; + + if (entry->reg[t].irq != d->irq) + continue; + entry->reg[t].wakeup = on; + goto unlock; + } + } +unlock: + spin_unlock_irqrestore(&irq_lock, flags); + return 0; +} +#else +#define normal_irq_set_wake NULL +#endif + +/* + * irq_chip must define at least enable/disable and ack when + * the edge handler is used. + */ +static void dummy(struct irq_data *d) +{ +} + +/* This is used for everything other than the timer. */ +static struct irq_chip normal_irq_type = { + .name = "SIGIO", + .irq_disable = dummy, + .irq_enable = dummy, + .irq_ack = dummy, + .irq_mask = dummy, + .irq_unmask = dummy, + .irq_set_wake = normal_irq_set_wake, +}; + +static struct irq_chip alarm_irq_type = { + .name = "SIGALRM", + .irq_disable = dummy, + .irq_enable = dummy, + .irq_ack = dummy, + .irq_mask = dummy, + .irq_unmask = dummy, +}; + +void __init init_IRQ(void) +{ + int i; + + irq_set_chip_and_handler(TIMER_IRQ, &alarm_irq_type, handle_edge_irq); + + for (i = 1; i < UM_LAST_SIGNAL_IRQ; i++) + irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); + /* Initialize EPOLL Loop */ + os_setup_epoll(); +} + +/* + * IRQ stack entry and exit: + * + * Unlike i386, UML doesn't receive IRQs on the normal kernel stack + * and switch over to the IRQ stack after some preparation. We use + * sigaltstack to receive signals on a separate stack from the start. + * These two functions make sure the rest of the kernel won't be too + * upset by being on a different stack. The IRQ stack has a + * thread_info structure at the bottom so that current et al continue + * to work. + * + * to_irq_stack copies the current task's thread_info to the IRQ stack + * thread_info and sets the tasks's stack to point to the IRQ stack. + * + * from_irq_stack copies the thread_info struct back (flags may have + * been modified) and resets the task's stack pointer. + * + * Tricky bits - + * + * What happens when two signals race each other? UML doesn't block + * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal + * could arrive while a previous one is still setting up the + * thread_info. + * + * There are three cases - + * The first interrupt on the stack - sets up the thread_info and + * handles the interrupt + * A nested interrupt interrupting the copying of the thread_info - + * can't handle the interrupt, as the stack is in an unknown state + * A nested interrupt not interrupting the copying of the + * thread_info - doesn't do any setup, just handles the interrupt + * + * The first job is to figure out whether we interrupted stack setup. + * This is done by xchging the signal mask with thread_info->pending. + * If the value that comes back is zero, then there is no setup in + * progress, and the interrupt can be handled. If the value is + * non-zero, then there is stack setup in progress. In order to have + * the interrupt handled, we leave our signal in the mask, and it will + * be handled by the upper handler after it has set up the stack. + * + * Next is to figure out whether we are the outer handler or a nested + * one. As part of setting up the stack, thread_info->real_thread is + * set to non-NULL (and is reset to NULL on exit). This is the + * nesting indicator. If it is non-NULL, then the stack is already + * set up and the handler can run. + */ + +static unsigned long pending_mask; + +unsigned long to_irq_stack(unsigned long *mask_out) +{ + struct thread_info *ti; + unsigned long mask, old; + int nested; + + mask = xchg(&pending_mask, *mask_out); + if (mask != 0) { + /* + * If any interrupts come in at this point, we want to + * make sure that their bits aren't lost by our + * putting our bit in. So, this loop accumulates bits + * until xchg returns the same value that we put in. + * When that happens, there were no new interrupts, + * and pending_mask contains a bit for each interrupt + * that came in. + */ + old = *mask_out; + do { + old |= mask; + mask = xchg(&pending_mask, old); + } while (mask != old); + return 1; + } + + ti = current_thread_info(); + nested = (ti->real_thread != NULL); + if (!nested) { + struct task_struct *task; + struct thread_info *tti; + + task = cpu_tasks[ti->cpu].task; + tti = task_thread_info(task); + + *ti = *tti; + ti->real_thread = tti; + task->stack = ti; + } + + mask = xchg(&pending_mask, 0); + *mask_out |= mask | nested; + return 0; +} + +unsigned long from_irq_stack(int nested) +{ + struct thread_info *ti, *to; + unsigned long mask; + + ti = current_thread_info(); + + pending_mask = 1; + + to = ti->real_thread; + current->stack = to; + ti->real_thread = NULL; + *to = *ti; + + mask = xchg(&pending_mask, 0); + return mask & ~1; +} + diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c new file mode 100644 index 000000000..0224fcb36 --- /dev/null +++ b/arch/um/kernel/kmsg_dump.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kmsg_dump.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/string.h> +#include <shared/init.h> +#include <shared/kern.h> +#include <os.h> + +static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + static struct kmsg_dump_iter iter; + static DEFINE_SPINLOCK(lock); + static char line[1024]; + struct console *con; + unsigned long flags; + size_t len = 0; + + /* only dump kmsg when no console is available */ + if (!console_trylock()) + return; + + for_each_console(con) { + if(strcmp(con->name, "tty") == 0 && + (con->flags & (CON_ENABLED | CON_CONSDEV)) != 0) { + break; + } + } + + console_unlock(); + + if (con) + return; + + if (!spin_trylock_irqsave(&lock, flags)) + return; + + kmsg_dump_rewind(&iter); + + printf("kmsg_dump:\n"); + while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len)) { + line[len] = '\0'; + printf("%s", line); + } + + spin_unlock_irqrestore(&lock, flags); +} + +static struct kmsg_dumper kmsg_dumper = { + .dump = kmsg_dumper_stdout +}; + +int __init kmsg_dumper_stdout_init(void) +{ + return kmsg_dump_register(&kmsg_dumper); +} + +__uml_postsetup(kmsg_dumper_stdout_init); diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c new file mode 100644 index 000000000..3a85bde3e --- /dev/null +++ b/arch/um/kernel/ksyms.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/module.h> +#include <os.h> + +EXPORT_SYMBOL(um_set_signals); +EXPORT_SYMBOL(signals_enabled); + +EXPORT_SYMBOL(os_stat_fd); +EXPORT_SYMBOL(os_stat_file); +EXPORT_SYMBOL(os_access); +EXPORT_SYMBOL(os_set_exec_close); +EXPORT_SYMBOL(os_getpid); +EXPORT_SYMBOL(os_open_file); +EXPORT_SYMBOL(os_read_file); +EXPORT_SYMBOL(os_write_file); +EXPORT_SYMBOL(os_seek_file); +EXPORT_SYMBOL(os_lock_file); +EXPORT_SYMBOL(os_ioctl_generic); +EXPORT_SYMBOL(os_pipe); +EXPORT_SYMBOL(os_file_type); +EXPORT_SYMBOL(os_file_mode); +EXPORT_SYMBOL(os_file_size); +EXPORT_SYMBOL(os_flush_stdout); +EXPORT_SYMBOL(os_close_file); +EXPORT_SYMBOL(os_set_fd_async); +EXPORT_SYMBOL(os_set_fd_block); +EXPORT_SYMBOL(helper_wait); +EXPORT_SYMBOL(os_shutdown_socket); +EXPORT_SYMBOL(os_create_unix_socket); +EXPORT_SYMBOL(os_connect_socket); +EXPORT_SYMBOL(os_accept_connection); +EXPORT_SYMBOL(os_rcv_fd); +EXPORT_SYMBOL(run_helper); +EXPORT_SYMBOL(os_major); +EXPORT_SYMBOL(os_minor); +EXPORT_SYMBOL(os_makedev); +EXPORT_SYMBOL(os_eventfd); +EXPORT_SYMBOL(os_sendmsg_fds); + +EXPORT_SYMBOL(add_sigio_fd); +EXPORT_SYMBOL(ignore_sigio_fd); +EXPORT_SYMBOL(sigio_broken); + +EXPORT_SYMBOL(syscall); diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c new file mode 100644 index 000000000..5cecd0e29 --- /dev/null +++ b/arch/um/kernel/load_file.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ +#include <linux/memblock.h> +#include <os.h> + +#include "um_arch.h" + +static int __init __uml_load_file(const char *filename, void *buf, int size) +{ + int fd, n; + + fd = os_open_file(filename, of_read(OPENFLAGS()), 0); + if (fd < 0) { + printk(KERN_ERR "Opening '%s' failed - err = %d\n", filename, + -fd); + return -1; + } + n = os_read_file(fd, buf, size); + if (n != size) { + printk(KERN_ERR "Read of %d bytes from '%s' failed, " + "err = %d\n", size, + filename, -n); + return -1; + } + + os_close_file(fd); + return 0; +} + +void *uml_load_file(const char *filename, unsigned long long *size) +{ + void *area; + int err; + + *size = 0; + + if (!filename) + return NULL; + + err = os_file_size(filename, size); + if (err) + return NULL; + + if (*size == 0) { + printk(KERN_ERR "\"%s\" is empty\n", filename); + return NULL; + } + + area = memblock_alloc(*size, SMP_CACHE_BYTES); + if (!area) + panic("%s: Failed to allocate %llu bytes\n", __func__, *size); + + if (__uml_load_file(filename, area, *size)) { + memblock_free(area, *size); + return NULL; + } + + return area; +} diff --git a/arch/um/kernel/maccess.c b/arch/um/kernel/maccess.c new file mode 100644 index 000000000..8ccd56813 --- /dev/null +++ b/arch/um/kernel/maccess.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013 Richard Weinberger <richrd@nod.at> + */ + +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <os.h> + +bool copy_from_kernel_nofault_allowed(const void *src, size_t size) +{ + void *psrc = (void *)rounddown((unsigned long)src, PAGE_SIZE); + + if ((unsigned long)src < PAGE_SIZE || size <= 0) + return false; + if (os_mincore(psrc, size + src - psrc) <= 0) + return false; + return true; +} diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c new file mode 100644 index 000000000..38d5a71a5 --- /dev/null +++ b/arch/um/kernel/mem.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/stddef.h> +#include <linux/module.h> +#include <linux/memblock.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <asm/fixmap.h> +#include <asm/page.h> +#include <as-layout.h> +#include <init.h> +#include <kern.h> +#include <kern_util.h> +#include <mem_user.h> +#include <os.h> +#include <linux/sched/task.h> + +#ifdef CONFIG_KASAN +int kasan_um_is_ready; +void kasan_init(void) +{ + /* + * kasan_map_memory will map all of the required address space and + * the host machine will allocate physical memory as necessary. + */ + kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE); + init_task.kasan_depth = 0; + kasan_um_is_ready = true; +} + +static void (*kasan_init_ptr)(void) +__section(".kasan_init") __used += kasan_init; +#endif + +/* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ +unsigned long *empty_zero_page = NULL; +EXPORT_SYMBOL(empty_zero_page); + +/* + * Initialized during boot, and readonly for initializing page tables + * afterwards + */ +pgd_t swapper_pg_dir[PTRS_PER_PGD]; + +/* Initialized at boot time, and readonly after that */ +unsigned long long highmem; +EXPORT_SYMBOL(highmem); +int kmalloc_ok = 0; + +/* Used during early boot */ +static unsigned long brk_end; + +void __init mem_init(void) +{ + /* clear the zero-page */ + memset(empty_zero_page, 0, PAGE_SIZE); + + /* Map in the area just after the brk now that kmalloc is about + * to be turned on. + */ + brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); + map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); + memblock_free((void *)brk_end, uml_reserved - brk_end); + uml_reserved = brk_end; + + /* this will put all low memory onto the freelists */ + memblock_free_all(); + max_low_pfn = totalram_pages(); + max_pfn = max_low_pfn; + kmalloc_ok = 1; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry. + */ +static void __init one_page_table_init(pmd_t *pmd) +{ + if (pmd_none(*pmd)) { + pte_t *pte = (pte_t *) memblock_alloc_low(PAGE_SIZE, + PAGE_SIZE); + if (!pte) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + + set_pmd(pmd, __pmd(_KERNPG_TABLE + + (unsigned long) __pa(pte))); + BUG_ON(pte != pte_offset_kernel(pmd, 0)); + } +} + +static void __init one_md_table_init(pud_t *pud) +{ +#ifdef CONFIG_3_LEVEL_PGTABLES + pmd_t *pmd_table = (pmd_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + if (!pmd_table) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + + set_pud(pud, __pud(_KERNPG_TABLE + (unsigned long) __pa(pmd_table))); + BUG_ON(pmd_table != pmd_offset(pud, 0)); +#endif +} + +static void __init fixrange_init(unsigned long start, unsigned long end, + pgd_t *pgd_base) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + int i, j; + unsigned long vaddr; + + vaddr = start; + i = pgd_index(vaddr); + j = pmd_index(vaddr); + pgd = pgd_base + i; + + for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + if (pud_none(*pud)) + one_md_table_init(pud); + pmd = pmd_offset(pud, vaddr); + for (; (j < PTRS_PER_PMD) && (vaddr < end); pmd++, j++) { + one_page_table_init(pmd); + vaddr += PMD_SIZE; + } + j = 0; + } +} + +static void __init fixaddr_user_init( void) +{ +#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA + long size = FIXADDR_USER_END - FIXADDR_USER_START; + pte_t *pte; + phys_t p; + unsigned long v, vaddr = FIXADDR_USER_START; + + if (!size) + return; + + fixrange_init( FIXADDR_USER_START, FIXADDR_USER_END, swapper_pg_dir); + v = (unsigned long) memblock_alloc_low(size, PAGE_SIZE); + if (!v) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, size, PAGE_SIZE); + + memcpy((void *) v , (void *) FIXADDR_USER_START, size); + p = __pa(v); + for ( ; size > 0; size -= PAGE_SIZE, vaddr += PAGE_SIZE, + p += PAGE_SIZE) { + pte = virt_to_kpte(vaddr); + pte_set_val(*pte, p, PAGE_READONLY); + } +#endif +} + +void __init paging_init(void) +{ + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + unsigned long vaddr; + + empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, + PAGE_SIZE); + if (!empty_zero_page) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + + max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT; + free_area_init(max_zone_pfn); + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); + + fixaddr_user_init(); +} + +/* + * This can't do anything because nothing in the kernel image can be freed + * since it's not in kernel physical memory. + */ + +void free_initmem(void) +{ +} + +/* Allocate and free page tables. */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + + if (pgd) { + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + return pgd; +} + +void *uml_kmalloc(int size, int flags) +{ + return kmalloc(size, flags); +} + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c new file mode 100644 index 000000000..91485119a --- /dev/null +++ b/arch/um/kernel/physmem.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/module.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <linux/pfn.h> +#include <asm/page.h> +#include <asm/sections.h> +#include <as-layout.h> +#include <init.h> +#include <kern.h> +#include <mem_user.h> +#include <os.h> + +static int physmem_fd = -1; + +/* Changed during early boot */ +unsigned long high_physmem; +EXPORT_SYMBOL(high_physmem); + +extern unsigned long long physmem_size; + +void __init mem_total_pages(unsigned long physmem, unsigned long iomem, + unsigned long highmem) +{ + unsigned long phys_pages, highmem_pages; + unsigned long iomem_pages, total_pages; + + phys_pages = physmem >> PAGE_SHIFT; + iomem_pages = iomem >> PAGE_SHIFT; + highmem_pages = highmem >> PAGE_SHIFT; + + total_pages = phys_pages + iomem_pages + highmem_pages; + + max_mapnr = total_pages; +} + +void map_memory(unsigned long virt, unsigned long phys, unsigned long len, + int r, int w, int x) +{ + __u64 offset; + int fd, err; + + fd = phys_mapping(phys, &offset); + err = os_map_memory((void *) virt, fd, offset, len, r, w, x); + if (err) { + if (err == -ENOMEM) + printk(KERN_ERR "try increasing the host's " + "/proc/sys/vm/max_map_count to <physical " + "memory size>/4096\n"); + panic("map_memory(0x%lx, %d, 0x%llx, %ld, %d, %d, %d) failed, " + "err = %d\n", virt, fd, offset, len, r, w, x, err); + } +} + +/** + * setup_physmem() - Setup physical memory for UML + * @start: Start address of the physical kernel memory, + * i.e start address of the executable image. + * @reserve_end: end address of the physical kernel memory. + * @len: Length of total physical memory that should be mapped/made + * available, in bytes. + * @highmem: Number of highmem bytes that should be mapped/made available. + * + * Creates an unlinked temporary file of size (len + highmem) and memory maps + * it on the last executable image address (uml_reserved). + * + * The offset is needed as the length of the total physical memory + * (len + highmem) includes the size of the memory used be the executable image, + * but the mapped-to address is the last address of the executable image + * (uml_reserved == end address of executable image). + * + * The memory mapped memory of the temporary file is used as backing memory + * of all user space processes/kernel tasks. + */ +void __init setup_physmem(unsigned long start, unsigned long reserve_end, + unsigned long len, unsigned long long highmem) +{ + unsigned long reserve = reserve_end - start; + long map_size = len - reserve; + int err; + + if(map_size <= 0) { + os_warn("Too few physical memory! Needed=%lu, given=%lu\n", + reserve, len); + exit(1); + } + + physmem_fd = create_mem_file(len + highmem); + + err = os_map_memory((void *) reserve_end, physmem_fd, reserve, + map_size, 1, 1, 1); + if (err < 0) { + os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " + "failed - errno = %d\n", map_size, + (void *) reserve_end, err); + exit(1); + } + + /* + * Special kludge - This page will be mapped in to userspace processes + * from physmem_fd, so it needs to be written out there. + */ + os_seek_file(physmem_fd, __pa(__syscall_stub_start)); + os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); + os_fsync_file(physmem_fd); + + memblock_add(__pa(start), len + highmem); + memblock_reserve(__pa(start), reserve); + + min_low_pfn = PFN_UP(__pa(reserve_end)); + max_low_pfn = min_low_pfn + (map_size >> PAGE_SHIFT); +} + +int phys_mapping(unsigned long phys, unsigned long long *offset_out) +{ + int fd = -1; + + if (phys < physmem_size) { + fd = physmem_fd; + *offset_out = phys; + } + else if (phys < __pa(end_iomem)) { + struct iomem_region *region = iomem_regions; + + while (region != NULL) { + if ((phys >= region->phys) && + (phys < region->phys + region->size)) { + fd = region->fd; + *offset_out = phys - region->phys; + break; + } + region = region->next; + } + } + else if (phys < __pa(end_iomem) + highmem) { + fd = physmem_fd; + *offset_out = phys - iomem_size; + } + + return fd; +} +EXPORT_SYMBOL(phys_mapping); + +static int __init uml_mem_setup(char *line, int *add) +{ + char *retptr; + physmem_size = memparse(line,&retptr); + return 0; +} +__uml_setup("mem=", uml_mem_setup, +"mem=<Amount of desired ram>\n" +" This controls how much \"physical\" memory the kernel allocates\n" +" for the system. The size is specified as a number followed by\n" +" one of 'k', 'K', 'm', 'M', which have the obvious meanings.\n" +" This is not related to the amount of memory in the host. It can\n" +" be more, and the excess, if it's ever used, will just be swapped out.\n" +" Example: mem=64M\n\n" +); + +extern int __init parse_iomem(char *str, int *add); + +__uml_setup("iomem=", parse_iomem, +"iomem=<name>,<file>\n" +" Configure <file> as an IO memory region named <name>.\n\n" +); + +/* + * This list is constructed in parse_iomem and addresses filled in + * setup_iomem, both of which run during early boot. Afterwards, it's + * unchanged. + */ +struct iomem_region *iomem_regions; + +/* Initialized in parse_iomem and unchanged thereafter */ +int iomem_size; + +unsigned long find_iomem(char *driver, unsigned long *len_out) +{ + struct iomem_region *region = iomem_regions; + + while (region != NULL) { + if (!strcmp(region->driver, driver)) { + *len_out = region->size; + return region->virt; + } + + region = region->next; + } + + return 0; +} +EXPORT_SYMBOL(find_iomem); + +static int setup_iomem(void) +{ + struct iomem_region *region = iomem_regions; + unsigned long iomem_start = high_physmem + PAGE_SIZE; + int err; + + while (region != NULL) { + err = os_map_memory((void *) iomem_start, region->fd, 0, + region->size, 1, 1, 0); + if (err) + printk(KERN_ERR "Mapping iomem region for driver '%s' " + "failed, errno = %d\n", region->driver, -err); + else { + region->virt = iomem_start; + region->phys = __pa(region->virt); + } + + iomem_start += region->size + PAGE_SIZE; + region = region->next; + } + + return 0; +} + +__initcall(setup_iomem); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c new file mode 100644 index 000000000..010bc422a --- /dev/null +++ b/arch/um/kernel/process.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright 2003 PathScale, Inc. + */ + +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/hardirq.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/personality.h> +#include <linux/proc_fs.h> +#include <linux/ptrace.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/seq_file.h> +#include <linux/tick.h> +#include <linux/threads.h> +#include <linux/resume_user_mode.h> +#include <asm/current.h> +#include <asm/mmu_context.h> +#include <linux/uaccess.h> +#include <as-layout.h> +#include <kern_util.h> +#include <os.h> +#include <skas.h> +#include <registers.h> +#include <linux/time-internal.h> + +/* + * This is a per-cpu array. A processor only modifies its entry and it only + * cares about its entry, so it's OK if another processor is modifying its + * entry. + */ +struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } }; + +static inline int external_pid(void) +{ + /* FIXME: Need to look up userspace_pid by cpu */ + return userspace_pid[0]; +} + +int pid_to_processor_id(int pid) +{ + int i; + + for (i = 0; i < ncpus; i++) { + if (cpu_tasks[i].pid == pid) + return i; + } + return -1; +} + +void free_stack(unsigned long stack, int order) +{ + free_pages(stack, order); +} + +unsigned long alloc_stack(int order, int atomic) +{ + unsigned long page; + gfp_t flags = GFP_KERNEL; + + if (atomic) + flags = GFP_ATOMIC; + page = __get_free_pages(flags, order); + + return page; +} + +static inline void set_current(struct task_struct *task) +{ + cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task) + { external_pid(), task }); +} + +extern void arch_switch_to(struct task_struct *to); + +void *__switch_to(struct task_struct *from, struct task_struct *to) +{ + to->thread.prev_sched = from; + set_current(to); + + switch_threads(&from->thread.switch_buf, &to->thread.switch_buf); + arch_switch_to(current); + + return current->thread.prev_sched; +} + +void interrupt_end(void) +{ + struct pt_regs *regs = ¤t->thread.regs; + + if (need_resched()) + schedule(); + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) + do_signal(regs); + if (test_thread_flag(TIF_NOTIFY_RESUME)) + resume_user_mode_work(regs); +} + +int get_current_pid(void) +{ + return task_pid_nr(current); +} + +/* + * This is called magically, by its address being stuffed in a jmp_buf + * and being longjmp-d to. + */ +void new_thread_handler(void) +{ + int (*fn)(void *), n; + void *arg; + + if (current->thread.prev_sched != NULL) + schedule_tail(current->thread.prev_sched); + current->thread.prev_sched = NULL; + + fn = current->thread.request.u.thread.proc; + arg = current->thread.request.u.thread.arg; + + /* + * callback returns only if the kernel thread execs a process + */ + n = fn(arg); + userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); +} + +/* Called magically, see new_thread_handler above */ +void fork_handler(void) +{ + force_flush_all(); + + schedule_tail(current->thread.prev_sched); + + /* + * XXX: if interrupt_end() calls schedule, this call to + * arch_switch_to isn't needed. We could want to apply this to + * improve performance. -bb + */ + arch_switch_to(current); + + current->thread.prev_sched = NULL; + + userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); +} + +int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) +{ + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; + void (*handler)(void); + int ret = 0; + + p->thread = (struct thread_struct) INIT_THREAD; + + if (!args->fn) { + memcpy(&p->thread.regs.regs, current_pt_regs(), + sizeof(p->thread.regs.regs)); + PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0); + if (sp != 0) + REGS_SP(p->thread.regs.regs.gp) = sp; + + handler = fork_handler; + + arch_copy_thread(¤t->thread.arch, &p->thread.arch); + } else { + get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); + p->thread.request.u.thread.proc = args->fn; + p->thread.request.u.thread.arg = args->fn_arg; + handler = new_thread_handler; + } + + new_thread(task_stack_page(p), &p->thread.switch_buf, handler); + + if (!args->fn) { + clear_flushed_tls(p); + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) + ret = arch_set_tls(p, tls); + } + + return ret; +} + +void initial_thread_cb(void (*proc)(void *), void *arg) +{ + int save_kmalloc_ok = kmalloc_ok; + + kmalloc_ok = 0; + initial_thread_cb_skas(proc, arg); + kmalloc_ok = save_kmalloc_ok; +} + +void um_idle_sleep(void) +{ + if (time_travel_mode != TT_MODE_OFF) + time_travel_sleep(); + else + os_idle_sleep(); +} + +void arch_cpu_idle(void) +{ + cpu_tasks[current_thread_info()->cpu].pid = os_getpid(); + um_idle_sleep(); + raw_local_irq_enable(); +} + +int __cant_sleep(void) { + return in_atomic() || irqs_disabled() || in_interrupt(); + /* Is in_interrupt() really needed? */ +} + +int user_context(unsigned long sp) +{ + unsigned long stack; + + stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); + return stack != (unsigned long) current_thread_info(); +} + +extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; + +void do_uml_exitcalls(void) +{ + exitcall_t *call; + + call = &__uml_exitcall_end; + while (--call >= &__uml_exitcall_begin) + (*call)(); +} + +char *uml_strdup(const char *string) +{ + return kstrdup(string, GFP_KERNEL); +} +EXPORT_SYMBOL(uml_strdup); + +int copy_to_user_proc(void __user *to, void *from, int size) +{ + return copy_to_user(to, from, size); +} + +int copy_from_user_proc(void *to, void __user *from, int size) +{ + return copy_from_user(to, from, size); +} + +int clear_user_proc(void __user *buf, int size) +{ + return clear_user(buf, size); +} + +static atomic_t using_sysemu = ATOMIC_INIT(0); +int sysemu_supported; + +void set_using_sysemu(int value) +{ + if (value > sysemu_supported) + return; + atomic_set(&using_sysemu, value); +} + +int get_using_sysemu(void) +{ + return atomic_read(&using_sysemu); +} + +static int sysemu_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", get_using_sysemu()); + return 0; +} + +static int sysemu_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, sysemu_proc_show, NULL); +} + +static ssize_t sysemu_proc_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char tmp[2]; + + if (copy_from_user(tmp, buf, 1)) + return -EFAULT; + + if (tmp[0] >= '0' && tmp[0] <= '2') + set_using_sysemu(tmp[0] - '0'); + /* We use the first char, but pretend to write everything */ + return count; +} + +static const struct proc_ops sysemu_proc_ops = { + .proc_open = sysemu_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = sysemu_proc_write, +}; + +int __init make_proc_sysemu(void) +{ + struct proc_dir_entry *ent; + if (!sysemu_supported) + return 0; + + ent = proc_create("sysemu", 0600, NULL, &sysemu_proc_ops); + + if (ent == NULL) + { + printk(KERN_WARNING "Failed to register /proc/sysemu\n"); + return 0; + } + + return 0; +} + +late_initcall(make_proc_sysemu); + +int singlestepping(void * t) +{ + struct task_struct *task = t ? t : current; + + if (!test_thread_flag(TIF_SINGLESTEP)) + return 0; + + if (task->thread.singlestep_syscall) + return 1; + + return 2; +} + +/* + * Only x86 and x86_64 have an arch_align_stack(). + * All other arches have "#define arch_align_stack(x) (x)" + * in their asm/exec.h + * As this is included in UML from asm-um/system-generic.h, + * we can use it to behave as the subarch does. + */ +#ifndef arch_align_stack +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= prandom_u32_max(8192); + return sp & ~0xf; +} +#endif + +unsigned long __get_wchan(struct task_struct *p) +{ + unsigned long stack_page, sp, ip; + bool seen_sched = 0; + + stack_page = (unsigned long) task_stack_page(p); + /* Bail if the process has no kernel stack for some reason */ + if (stack_page == 0) + return 0; + + sp = p->thread.switch_buf->JB_SP; + /* + * Bail if the stack pointer is below the bottom of the kernel + * stack for some reason + */ + if (sp < stack_page) + return 0; + + while (sp < stack_page + THREAD_SIZE) { + ip = *((unsigned long *) sp); + if (in_sched_functions(ip)) + /* Ignore everything until we're above the scheduler */ + seen_sched = 1; + else if (kernel_text_address(ip) && seen_sched) + return ip; + + sp += sizeof(unsigned long); + } + + return 0; +} + +int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu) +{ + int cpu = current_thread_info()->cpu; + + return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu); +} + diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c new file mode 100644 index 000000000..5154b27de --- /dev/null +++ b/arch/um/kernel/ptrace.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/audit.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <asm/ptrace-abi.h> + +void user_enable_single_step(struct task_struct *child) +{ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + child->thread.singlestep_syscall = 0; + +#ifdef SUBARCH_SET_SINGLESTEPPING + SUBARCH_SET_SINGLESTEPPING(child, 1); +#endif +} + +void user_disable_single_step(struct task_struct *child) +{ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child->thread.singlestep_syscall = 0; + +#ifdef SUBARCH_SET_SINGLESTEPPING + SUBARCH_SET_SINGLESTEPPING(child, 0); +#endif +} + +/* + * Called by kernel/ptrace.c when detaching.. + */ +void ptrace_disable(struct task_struct *child) +{ + user_disable_single_step(child); +} + +extern int peek_user(struct task_struct * child, long addr, long data); +extern int poke_user(struct task_struct * child, long addr, long data); + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int i, ret; + unsigned long __user *p = (void __user *)data; + void __user *vp = p; + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: + ret = peek_user(child, addr, data); + break; + + /* write the word at location addr in the USER area */ + case PTRACE_POKEUSR: + ret = poke_user(child, addr, data); + break; + + case PTRACE_SYSEMU: + case PTRACE_SYSEMU_SINGLESTEP: + ret = -EIO; + break; + +#ifdef PTRACE_GETREGS + case PTRACE_GETREGS: { /* Get all gp regs from the child. */ + if (!access_ok(p, MAX_REG_OFFSET)) { + ret = -EIO; + break; + } + for ( i = 0; i < MAX_REG_OFFSET; i += sizeof(long) ) { + __put_user(getreg(child, i), p); + p++; + } + ret = 0; + break; + } +#endif +#ifdef PTRACE_SETREGS + case PTRACE_SETREGS: { /* Set all gp regs in the child. */ + unsigned long tmp = 0; + if (!access_ok(p, MAX_REG_OFFSET)) { + ret = -EIO; + break; + } + for ( i = 0; i < MAX_REG_OFFSET; i += sizeof(long) ) { + __get_user(tmp, p); + putreg(child, i, tmp); + p++; + } + ret = 0; + break; + } +#endif + case PTRACE_GET_THREAD_AREA: + ret = ptrace_get_thread_area(child, addr, vp); + break; + + case PTRACE_SET_THREAD_AREA: + ret = ptrace_set_thread_area(child, addr, vp); + break; + + default: + ret = ptrace_request(child, request, addr, data); + if (ret == -EIO) + ret = subarch_ptrace(child, request, addr, data); + break; + } + + return ret; +} + +static void send_sigtrap(struct uml_pt_regs *regs, int error_code) +{ + /* Send us the fake SIGTRAP */ + force_sig_fault(SIGTRAP, TRAP_BRKPT, + /* User-mode eip? */ + UPT_IS_USER(regs) ? (void __user *) UPT_IP(regs) : NULL); +} + +/* + * XXX Check TIF_SINGLESTEP for singlestepping check and + * PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check + */ +int syscall_trace_enter(struct pt_regs *regs) +{ + audit_syscall_entry(UPT_SYSCALL_NR(®s->regs), + UPT_SYSCALL_ARG1(®s->regs), + UPT_SYSCALL_ARG2(®s->regs), + UPT_SYSCALL_ARG3(®s->regs), + UPT_SYSCALL_ARG4(®s->regs)); + + if (!test_thread_flag(TIF_SYSCALL_TRACE)) + return 0; + + return ptrace_report_syscall_entry(regs); +} + +void syscall_trace_leave(struct pt_regs *regs) +{ + int ptraced = current->ptrace; + + audit_syscall_exit(regs); + + /* Fake a debug trap */ + if (test_thread_flag(TIF_SINGLESTEP)) + send_sigtrap(®s->regs, 0); + + if (!test_thread_flag(TIF_SYSCALL_TRACE)) + return; + + ptrace_report_syscall_exit(regs, 0); + /* force do_signal() --> is_syscall() */ + if (ptraced & PT_PTRACED) + set_thread_flag(TIF_SIGPENDING); +} diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c new file mode 100644 index 000000000..48c0610d5 --- /dev/null +++ b/arch/um/kernel/reboot.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/oom.h> +#include <kern_util.h> +#include <os.h> +#include <skas.h> + +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +static void kill_off_processes(void) +{ + struct task_struct *p; + int pid; + + read_lock(&tasklist_lock); + for_each_process(p) { + struct task_struct *t; + + t = find_lock_task_mm(p); + if (!t) + continue; + pid = t->mm->context.id.u.pid; + task_unlock(t); + os_kill_ptraced_process(pid, 1); + } + read_unlock(&tasklist_lock); +} + +void uml_cleanup(void) +{ + kmalloc_ok = 0; + do_uml_exitcalls(); + kill_off_processes(); +} + +void machine_restart(char * __unused) +{ + uml_cleanup(); + reboot_skas(); +} + +void machine_power_off(void) +{ + uml_cleanup(); + halt_skas(); +} + +void machine_halt(void) +{ + machine_power_off(); +} diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c new file mode 100644 index 000000000..5085a50c3 --- /dev/null +++ b/arch/um/kernel/sigio.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{linux.intel,addtoit}.com) + */ + +#include <linux/interrupt.h> +#include <irq_kern.h> +#include <os.h> +#include <sigio.h> + +/* Protected by sigio_lock() called from write_sigio_workaround */ +static int sigio_irq_fd = -1; + +static irqreturn_t sigio_interrupt(int irq, void *data) +{ + char c; + + os_read_file(sigio_irq_fd, &c, sizeof(c)); + return IRQ_HANDLED; +} + +int write_sigio_irq(int fd) +{ + int err; + + err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, + 0, "write sigio", NULL); + if (err < 0) { + printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " + "err = %d\n", err); + return -1; + } + sigio_irq_fd = fd; + return 0; +} + +/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ +static DEFINE_MUTEX(sigio_mutex); + +void sigio_lock(void) +{ + mutex_lock(&sigio_mutex); +} + +void sigio_unlock(void) +{ + mutex_unlock(&sigio_mutex); +} diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c new file mode 100644 index 000000000..ae4658f57 --- /dev/null +++ b/arch/um/kernel/signal.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <linux/ftrace.h> +#include <asm/siginfo.h> +#include <asm/signal.h> +#include <asm/unistd.h> +#include <frame_kern.h> +#include <kern_util.h> +#include <os.h> + +EXPORT_SYMBOL(block_signals); +EXPORT_SYMBOL(unblock_signals); + +void block_signals_trace(void) +{ + block_signals(); + if (current_thread_info()) + trace_hardirqs_off(); +} + +void unblock_signals_trace(void) +{ + if (current_thread_info()) + trace_hardirqs_on(); + unblock_signals(); +} + +void um_trace_signals_on(void) +{ + if (current_thread_info()) + trace_hardirqs_on(); +} + +void um_trace_signals_off(void) +{ + if (current_thread_info()) + trace_hardirqs_off(); +} + +/* + * OK, we're invoking a handler + */ +static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) +{ + sigset_t *oldset = sigmask_to_save(); + int singlestep = 0; + unsigned long sp; + int err; + + if (test_thread_flag(TIF_SINGLESTEP) && (current->ptrace & PT_PTRACED)) + singlestep = 1; + + /* Did we come from a system call? */ + if (PT_REGS_SYSCALL_NR(regs) >= 0) { + /* If so, check system call restarting.. */ + switch (PT_REGS_SYSCALL_RET(regs)) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + PT_REGS_SYSCALL_RET(regs) = -EINTR; + break; + + case -ERESTARTSYS: + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { + PT_REGS_SYSCALL_RET(regs) = -EINTR; + break; + } + fallthrough; + case -ERESTARTNOINTR: + PT_REGS_RESTART_SYSCALL(regs); + PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs); + break; + } + } + + sp = PT_REGS_SP(regs); + if ((ksig->ka.sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0)) + sp = current->sas_ss_sp + current->sas_ss_size; + +#ifdef CONFIG_ARCH_HAS_SC_SIGNALS + if (!(ksig->ka.sa.sa_flags & SA_SIGINFO)) + err = setup_signal_stack_sc(sp, ksig, regs, oldset); + else +#endif + err = setup_signal_stack_si(sp, ksig, regs, oldset); + + signal_setup_done(err, ksig, singlestep); +} + +void do_signal(struct pt_regs *regs) +{ + struct ksignal ksig; + int handled_sig = 0; + + while (get_signal(&ksig)) { + handled_sig = 1; + /* Whee! Actually deliver the signal. */ + handle_signal(&ksig, regs); + } + + /* Did we come from a system call? */ + if (!handled_sig && (PT_REGS_SYSCALL_NR(regs) >= 0)) { + /* Restart the system call - no handlers present */ + switch (PT_REGS_SYSCALL_RET(regs)) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs); + PT_REGS_RESTART_SYSCALL(regs); + break; + case -ERESTART_RESTARTBLOCK: + PT_REGS_ORIG_SYSCALL(regs) = __NR_restart_syscall; + PT_REGS_RESTART_SYSCALL(regs); + break; + } + } + + /* + * This closes a way to execute a system call on the host. If + * you set a breakpoint on a system call instruction and singlestep + * from it, the tracing thread used to PTRACE_SINGLESTEP the process + * rather than PTRACE_SYSCALL it, allowing the system call to execute + * on the host. The tracing thread will check this flag and + * PTRACE_SYSCALL if necessary. + */ + if (test_thread_flag(TIF_SINGLESTEP)) + current->thread.singlestep_syscall = + is_syscall(PT_REGS_IP(¤t->thread.regs)); + + /* + * if there's no signal to deliver, we just put the saved sigmask + * back + */ + if (!handled_sig) + restore_saved_sigmask(); +} diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile new file mode 100644 index 000000000..f3d494a4f --- /dev/null +++ b/arch/um/kernel/skas/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# + +obj-y := clone.o mmu.o process.o syscall.o uaccess.o + +# clone.o is in the stub, so it can't be built with profiling +# GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> +# disable it + +CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) +UNPROFILE_OBJS := clone.o + +KCOV_INSTRUMENT := n + +include arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c new file mode 100644 index 000000000..ff5061f29 --- /dev/null +++ b/arch/um/kernel/skas/clone.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <signal.h> +#include <sched.h> +#include <asm/unistd.h> +#include <sys/time.h> +#include <as-layout.h> +#include <ptrace_user.h> +#include <stub-data.h> +#include <sysdep/stub.h> + +/* + * This is in a separate file because it needs to be compiled with any + * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled + * + * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize + * on some systems. + */ + +void __attribute__ ((__section__ (".__syscall_stub"))) +stub_clone_handler(void) +{ + struct stub_data *data = get_stub_page(); + long err; + + err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, + (unsigned long)data + UM_KERN_PAGE_SIZE / 2); + if (err) { + data->parent_err = err; + goto done; + } + + err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + if (err) { + data->child_err = err; + goto done; + } + + remap_stack_and_trap(); + + done: + trap_myself(); +} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c new file mode 100644 index 000000000..125df465e --- /dev/null +++ b/arch/um/kernel/skas/mmu.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> + +#include <asm/pgalloc.h> +#include <asm/sections.h> +#include <as-layout.h> +#include <os.h> +#include <skas.h> + +int init_new_context(struct task_struct *task, struct mm_struct *mm) +{ + struct mm_context *from_mm = NULL; + struct mm_context *to_mm = &mm->context; + unsigned long stack = 0; + int ret = -ENOMEM; + + stack = get_zeroed_page(GFP_KERNEL); + if (stack == 0) + goto out; + + to_mm->id.stack = stack; + if (current->mm != NULL && current->mm != &init_mm) + from_mm = ¤t->mm->context; + + block_signals_trace(); + if (from_mm) + to_mm->id.u.pid = copy_context_skas0(stack, + from_mm->id.u.pid); + else to_mm->id.u.pid = start_userspace(stack); + unblock_signals_trace(); + + if (to_mm->id.u.pid < 0) { + ret = to_mm->id.u.pid; + goto out_free; + } + + ret = init_new_ldt(to_mm, from_mm); + if (ret < 0) { + printk(KERN_ERR "init_new_context_skas - init_ldt" + " failed, errno = %d\n", ret); + goto out_free; + } + + return 0; + + out_free: + if (to_mm->id.stack != 0) + free_page(to_mm->id.stack); + out: + return ret; +} + +void destroy_context(struct mm_struct *mm) +{ + struct mm_context *mmu = &mm->context; + + /* + * If init_new_context wasn't called, this will be + * zero, resulting in a kill(0), which will result in the + * whole UML suddenly dying. Also, cover negative and + * 1 cases, since they shouldn't happen either. + */ + if (mmu->id.u.pid < 2) { + printk(KERN_ERR "corrupt mm_context - pid = %d\n", + mmu->id.u.pid); + return; + } + os_kill_ptraced_process(mmu->id.u.pid, 1); + + free_page(mmu->id.stack); + free_ldt(mmu); +} diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c new file mode 100644 index 000000000..f2ac134c9 --- /dev/null +++ b/arch/um/kernel/skas/process.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/task.h> + +#include <as-layout.h> +#include <kern.h> +#include <os.h> +#include <skas.h> + +extern void start_kernel(void); + +static int __init start_kernel_proc(void *unused) +{ + int pid; + + block_signals_trace(); + pid = os_getpid(); + + cpu_tasks[0].pid = pid; + cpu_tasks[0].task = current; + + start_kernel(); + return 0; +} + +extern int userspace_pid[]; + +extern char cpu0_irqstack[]; + +int __init start_uml(void) +{ + stack_protections((unsigned long) &cpu0_irqstack); + set_sigstack(cpu0_irqstack, THREAD_SIZE); + + init_new_thread_signals(); + + init_task.thread.request.u.thread.proc = start_kernel_proc; + init_task.thread.request.u.thread.arg = NULL; + return start_idle_thread(task_stack_page(&init_task), + &init_task.thread.switch_buf); +} + +unsigned long current_stub_stack(void) +{ + if (current->mm == NULL) + return 0; + + return current->mm->context.id.stack; +} diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c new file mode 100644 index 000000000..9ee19e566 --- /dev/null +++ b/arch/um/kernel/skas/syscall.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/kernel.h> +#include <linux/ptrace.h> +#include <linux/seccomp.h> +#include <kern_util.h> +#include <sysdep/ptrace.h> +#include <sysdep/ptrace_user.h> +#include <sysdep/syscalls.h> +#include <linux/time-internal.h> +#include <asm/unistd.h> + +void handle_syscall(struct uml_pt_regs *r) +{ + struct pt_regs *regs = container_of(r, struct pt_regs, regs); + int syscall; + + /* + * If we have infinite CPU resources, then make every syscall also a + * preemption point, since we don't have any other preemption in this + * case, and kernel threads would basically never run until userspace + * went to sleep, even if said userspace interacts with the kernel in + * various ways. + */ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) + schedule(); + + /* Initialize the syscall number and default return value. */ + UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); + PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); + + if (syscall_trace_enter(regs)) + goto out; + + /* Do the seccomp check after ptrace; failures should be fast. */ + if (secure_computing() == -1) + goto out; + + syscall = UPT_SYSCALL_NR(r); + if (syscall >= 0 && syscall < __NR_syscalls) + PT_REGS_SET_SYSCALL_RETURN(regs, + EXECUTE_SYSCALL(syscall, regs)); + +out: + syscall_trace_leave(regs); +} diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c new file mode 100644 index 000000000..aaee96f07 --- /dev/null +++ b/arch/um/kernel/skas/uaccess.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/err.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <asm/current.h> +#include <asm/page.h> +#include <kern_util.h> +#include <asm/futex.h> +#include <os.h> + +pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + if (mm == NULL) + return NULL; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return NULL; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return NULL; + + return pte_offset_kernel(pmd, addr); +} + +static pte_t *maybe_map(unsigned long virt, int is_write) +{ + pte_t *pte = virt_to_pte(current->mm, virt); + int err, dummy_code; + + if ((pte == NULL) || !pte_present(*pte) || + (is_write && !pte_write(*pte))) { + err = handle_page_fault(virt, 0, is_write, 1, &dummy_code); + if (err) + return NULL; + pte = virt_to_pte(current->mm, virt); + } + if (!pte_present(*pte)) + pte = NULL; + + return pte; +} + +static int do_op_one_page(unsigned long addr, int len, int is_write, + int (*op)(unsigned long addr, int len, void *arg), void *arg) +{ + struct page *page; + pte_t *pte; + int n; + + pte = maybe_map(addr, is_write); + if (pte == NULL) + return -1; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + addr = (unsigned long) page_address(page) + + (addr & ~PAGE_MASK); +#else + addr = (unsigned long) kmap_atomic(page) + + (addr & ~PAGE_MASK); +#endif + n = (*op)(addr, len, arg); + +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic((void *)addr); +#endif + + return n; +} + +static long buffer_op(unsigned long addr, int len, int is_write, + int (*op)(unsigned long, int, void *), void *arg) +{ + long size, remain, n; + + size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len); + remain = len; + + n = do_op_one_page(addr, size, is_write, op, arg); + if (n != 0) { + remain = (n < 0 ? remain : 0); + goto out; + } + + addr += size; + remain -= size; + if (remain == 0) + goto out; + + while (addr < ((addr + remain) & PAGE_MASK)) { + n = do_op_one_page(addr, PAGE_SIZE, is_write, op, arg); + if (n != 0) { + remain = (n < 0 ? remain : 0); + goto out; + } + + addr += PAGE_SIZE; + remain -= PAGE_SIZE; + } + if (remain == 0) + goto out; + + n = do_op_one_page(addr, remain, is_write, op, arg); + if (n != 0) { + remain = (n < 0 ? remain : 0); + goto out; + } + + return 0; + out: + return remain; +} + +static int copy_chunk_from_user(unsigned long from, int len, void *arg) +{ + unsigned long *to_ptr = arg, to = *to_ptr; + + memcpy((void *) to, (void *) from, len); + *to_ptr += len; + return 0; +} + +unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to); +} +EXPORT_SYMBOL(raw_copy_from_user); + +static int copy_chunk_to_user(unsigned long to, int len, void *arg) +{ + unsigned long *from_ptr = arg, from = *from_ptr; + + memcpy((void *) to, (void *) from, len); + *from_ptr += len; + return 0; +} + +unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from); +} +EXPORT_SYMBOL(raw_copy_to_user); + +static int strncpy_chunk_from_user(unsigned long from, int len, void *arg) +{ + char **to_ptr = arg, *to = *to_ptr; + int n; + + strncpy(to, (void *) from, len); + n = strnlen(to, len); + *to_ptr += n; + + if (n < len) + return 1; + return 0; +} + +long strncpy_from_user(char *dst, const char __user *src, long count) +{ + long n; + char *ptr = dst; + + if (!access_ok(src, 1)) + return -EFAULT; + n = buffer_op((unsigned long) src, count, 0, strncpy_chunk_from_user, + &ptr); + if (n != 0) + return -EFAULT; + return strnlen(dst, count); +} +EXPORT_SYMBOL(strncpy_from_user); + +static int clear_chunk(unsigned long addr, int len, void *unused) +{ + memset((void *) addr, 0, len); + return 0; +} + +unsigned long __clear_user(void __user *mem, unsigned long len) +{ + return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL); +} +EXPORT_SYMBOL(__clear_user); + +static int strnlen_chunk(unsigned long str, int len, void *arg) +{ + int *len_ptr = arg, n; + + n = strnlen((void *) str, len); + *len_ptr += n; + + if (n < len) + return 1; + return 0; +} + +long strnlen_user(const char __user *str, long len) +{ + int count = 0, n; + + if (!access_ok(str, 1)) + return -EFAULT; + n = buffer_op((unsigned long) str, len, 0, strnlen_chunk, &count); + if (n == 0) + return count + 1; + return 0; +} +EXPORT_SYMBOL(strnlen_user); + +/** + * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant + * argument and comparison of the previous + * futex value with another constant. + * + * @encoded_op: encoded operation to execute + * @uaddr: pointer to user space address + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + * -ENOSYS - Operation not supported + */ + +int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) +{ + int oldval, ret; + struct page *page; + unsigned long addr = (unsigned long) uaddr; + pte_t *pte; + + ret = -EFAULT; + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + preempt_disable(); + pte = maybe_map(addr, 1); + if (pte == NULL) + goto out_inuser; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + addr = (unsigned long) page_address(page) + + (((unsigned long) addr) & ~PAGE_MASK); +#else + addr = (unsigned long) kmap_atomic(page) + + ((unsigned long) addr & ~PAGE_MASK); +#endif + uaddr = (u32 *) addr; + oldval = *uaddr; + + ret = 0; + + switch (op) { + case FUTEX_OP_SET: + *uaddr = oparg; + break; + case FUTEX_OP_ADD: + *uaddr += oparg; + break; + case FUTEX_OP_OR: + *uaddr |= oparg; + break; + case FUTEX_OP_ANDN: + *uaddr &= ~oparg; + break; + case FUTEX_OP_XOR: + *uaddr ^= oparg; + break; + default: + ret = -ENOSYS; + } +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic((void *)addr); +#endif + +out_inuser: + preempt_enable(); + + if (ret == 0) + *oval = oldval; + + return ret; +} +EXPORT_SYMBOL(arch_futex_atomic_op_inuser); + +/** + * futex_atomic_cmpxchg_inatomic() - Compare and exchange the content of the + * uaddr with newval if the current value is + * oldval. + * @uval: pointer to store content of @uaddr + * @uaddr: pointer to user space address + * @oldval: old value + * @newval: new value to store to @uaddr + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + */ + +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + struct page *page; + pte_t *pte; + int ret = -EFAULT; + + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + + preempt_disable(); + pte = maybe_map((unsigned long) uaddr, 1); + if (pte == NULL) + goto out_inatomic; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + uaddr = page_address(page) + (((unsigned long) uaddr) & ~PAGE_MASK); +#else + uaddr = kmap_atomic(page) + ((unsigned long) uaddr & ~PAGE_MASK); +#endif + + *uval = *uaddr; + + ret = cmpxchg(uaddr, oldval, newval); + +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic(uaddr); +#endif + ret = 0; + +out_inatomic: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(futex_atomic_cmpxchg_inatomic); diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c new file mode 100644 index 000000000..fd3b61b3d --- /dev/null +++ b/arch/um/kernel/stacktrace.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2013 Richard Weinberger <richard@nod.at> + * Copyright (C) 2014 Google Inc., Author: Daniel Walter <dwalter@google.com> + */ + +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/stacktrace.h> + +void dump_trace(struct task_struct *tsk, + const struct stacktrace_ops *ops, + void *data) +{ + int reliable = 0; + unsigned long *sp, bp, addr; + struct pt_regs *segv_regs = tsk->thread.segv_regs; + struct stack_frame *frame; + + bp = get_frame_pointer(tsk, segv_regs); + sp = get_stack_pointer(tsk, segv_regs); + + frame = (struct stack_frame *)bp; + while (((long) sp & (THREAD_SIZE-1)) != 0) { + addr = READ_ONCE_NOCHECK(*sp); + if (__kernel_text_address(addr)) { + reliable = 0; + if ((unsigned long) sp == bp + sizeof(long)) { + frame = frame ? frame->next_frame : NULL; + bp = (unsigned long)frame; + reliable = 1; + } + ops->address(data, addr, reliable); + } + sp++; + } +} + +static void save_addr(void *data, unsigned long address, int reliable) +{ + struct stack_trace *trace = data; + + if (!reliable) + return; + if (trace->nr_entries >= trace->max_entries) + return; + + trace->entries[trace->nr_entries++] = address; +} + +static const struct stacktrace_ops dump_ops = { + .address = save_addr +}; + +static void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace) +{ + dump_trace(tsk, &dump_ops, trace); +} + +void save_stack_trace(struct stack_trace *trace) +{ + __save_stack_trace(current, trace); +} +EXPORT_SYMBOL_GPL(save_stack_trace); + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + __save_stack_trace(tsk, trace); +} +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c new file mode 100644 index 000000000..746715379 --- /dev/null +++ b/arch/um/kernel/sysrq.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2013 Richard Weinberger <richrd@nod.at> + */ + +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/sched/task_stack.h> + +#include <asm/sysrq.h> +#include <asm/stacktrace.h> +#include <os.h> + +static void _print_addr(void *data, unsigned long address, int reliable) +{ + const char *loglvl = data; + + printk("%s [<%08lx>] %s%pS\n", loglvl, address, reliable ? "" : "? ", + (void *)address); +} + +static const struct stacktrace_ops stackops = { + .address = _print_addr +}; + +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) +{ + struct pt_regs *segv_regs = current->thread.segv_regs; + int i; + + if (!segv_regs && os_is_signal_stack()) { + pr_err("Received SIGSEGV in SIGSEGV handler," + " aborting stack trace!\n"); + return; + } + + if (!stack) + stack = get_stack_pointer(task, segv_regs); + + printk("%sStack:\n", loglvl); + for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) { + if (kstack_end(stack)) + break; + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + pr_cont("\n"); + pr_cont(" %08lx", READ_ONCE_NOCHECK(*stack)); + stack++; + } + + printk("%sCall Trace:\n", loglvl); + dump_trace(current, &stackops, (void *)loglvl); +} diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c new file mode 100644 index 000000000..fddd1dec2 --- /dev/null +++ b/arch/um/kernel/time.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk}) + * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) + * Copyright (C) 2012-2014 Cisco Systems + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2019 Intel Corporation + */ + +#include <linux/clockchips.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/jiffies.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/threads.h> +#include <asm/irq.h> +#include <asm/param.h> +#include <kern_util.h> +#include <os.h> +#include <linux/time-internal.h> +#include <linux/um_timetravel.h> +#include <shared/init.h> + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +enum time_travel_mode time_travel_mode; +EXPORT_SYMBOL_GPL(time_travel_mode); + +static bool time_travel_start_set; +static unsigned long long time_travel_start; +static unsigned long long time_travel_time; +static LIST_HEAD(time_travel_events); +static LIST_HEAD(time_travel_irqs); +static unsigned long long time_travel_timer_interval; +static unsigned long long time_travel_next_event; +static struct time_travel_event time_travel_timer_event; +static int time_travel_ext_fd = -1; +static unsigned int time_travel_ext_waiting; +static bool time_travel_ext_prev_request_valid; +static unsigned long long time_travel_ext_prev_request; +static bool time_travel_ext_free_until_valid; +static unsigned long long time_travel_ext_free_until; + +static void time_travel_set_time(unsigned long long ns) +{ + if (unlikely(ns < time_travel_time)) + panic("time-travel: time goes backwards %lld -> %lld\n", + time_travel_time, ns); + else if (unlikely(ns >= S64_MAX)) + panic("The system was going to sleep forever, aborting"); + + time_travel_time = ns; +} + +enum time_travel_message_handling { + TTMH_IDLE, + TTMH_POLL, + TTMH_READ, +}; + +static void time_travel_handle_message(struct um_timetravel_msg *msg, + enum time_travel_message_handling mode) +{ + struct um_timetravel_msg resp = { + .op = UM_TIMETRAVEL_ACK, + }; + int ret; + + /* + * We can't unlock here, but interrupt signals with a timetravel_handler + * (see um_request_irq_tt) get to the timetravel_handler anyway. + */ + if (mode != TTMH_READ) { + BUG_ON(mode == TTMH_IDLE && !irqs_disabled()); + + while (os_poll(1, &time_travel_ext_fd) != 0) { + /* nothing */ + } + } + + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + + if (ret == 0) + panic("time-travel external link is broken\n"); + if (ret != sizeof(*msg)) + panic("invalid time-travel message - %d bytes\n", ret); + + switch (msg->op) { + default: + WARN_ONCE(1, "time-travel: unexpected message %lld\n", + (unsigned long long)msg->op); + break; + case UM_TIMETRAVEL_ACK: + return; + case UM_TIMETRAVEL_RUN: + time_travel_set_time(msg->time); + break; + case UM_TIMETRAVEL_FREE_UNTIL: + time_travel_ext_free_until_valid = true; + time_travel_ext_free_until = msg->time; + break; + } + + resp.seq = msg->seq; + os_write_file(time_travel_ext_fd, &resp, sizeof(resp)); +} + +static u64 time_travel_ext_req(u32 op, u64 time) +{ + static int seq; + int mseq = ++seq; + struct um_timetravel_msg msg = { + .op = op, + .time = time, + .seq = mseq, + }; + + /* + * We need to block even the timetravel handlers of SIGIO here and + * only restore their use when we got the ACK - otherwise we may + * (will) get interrupted by that, try to queue the IRQ for future + * processing and thus send another request while we're still waiting + * for an ACK, but the peer doesn't know we got interrupted and will + * send the ACKs in the same order as the message, but we'd need to + * see them in the opposite order ... + * + * This wouldn't matter *too* much, but some ACKs carry the + * current time (for UM_TIMETRAVEL_GET) and getting another + * ACK without a time would confuse us a lot! + * + * The sequence number assignment that happens here lets us + * debug such message handling issues more easily. + */ + block_signals_hard(); + os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + + while (msg.op != UM_TIMETRAVEL_ACK) + time_travel_handle_message(&msg, TTMH_READ); + + if (msg.seq != mseq) + panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n", + msg.op, msg.seq, mseq, msg.time); + + if (op == UM_TIMETRAVEL_GET) + time_travel_set_time(msg.time); + unblock_signals_hard(); + + return msg.time; +} + +void __time_travel_wait_readable(int fd) +{ + int fds[2] = { fd, time_travel_ext_fd }; + int ret; + + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + while ((ret = os_poll(2, fds))) { + struct um_timetravel_msg msg; + + if (ret == 1) + time_travel_handle_message(&msg, TTMH_READ); + } +} +EXPORT_SYMBOL_GPL(__time_travel_wait_readable); + +static void time_travel_ext_update_request(unsigned long long time) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + /* asked for exactly this time previously */ + if (time_travel_ext_prev_request_valid && + time == time_travel_ext_prev_request) + return; + + /* + * if we're running and are allowed to run past the request + * then we don't need to update it either + */ + if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && + time < time_travel_ext_free_until) + return; + + time_travel_ext_prev_request = time; + time_travel_ext_prev_request_valid = true; + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); +} + +void __time_travel_propagate_time(void) +{ + static unsigned long long last_propagated; + + if (last_propagated == time_travel_time) + return; + + time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time); + last_propagated = time_travel_time; +} +EXPORT_SYMBOL_GPL(__time_travel_propagate_time); + +/* returns true if we must do a wait to the simtime device */ +static bool time_travel_ext_request(unsigned long long time) +{ + /* + * If we received an external sync point ("free until") then we + * don't have to request/wait for anything until then, unless + * we're already waiting. + */ + if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && + time < time_travel_ext_free_until) + return false; + + time_travel_ext_update_request(time); + return true; +} + +static void time_travel_ext_wait(bool idle) +{ + struct um_timetravel_msg msg = { + .op = UM_TIMETRAVEL_ACK, + }; + + time_travel_ext_prev_request_valid = false; + time_travel_ext_free_until_valid = false; + time_travel_ext_waiting++; + + time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); + + /* + * Here we are deep in the idle loop, so we have to break out of the + * kernel abstraction in a sense and implement this in terms of the + * UML system waiting on the VQ interrupt while sleeping, when we get + * the signal it'll call time_travel_ext_vq_notify_done() completing the + * call. + */ + while (msg.op != UM_TIMETRAVEL_RUN) + time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL); + + time_travel_ext_waiting--; + + /* we might request more stuff while polling - reset when we run */ + time_travel_ext_prev_request_valid = false; +} + +static void time_travel_ext_get_time(void) +{ + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); +} + +static void __time_travel_update_time(unsigned long long ns, bool idle) +{ + if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns)) + time_travel_ext_wait(idle); + else + time_travel_set_time(ns); +} + +static struct time_travel_event *time_travel_first_event(void) +{ + return list_first_entry_or_null(&time_travel_events, + struct time_travel_event, + list); +} + +static void __time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + struct time_travel_event *tmp; + bool inserted = false; + unsigned long flags; + + if (e->pending) + return; + + e->pending = true; + e->time = time; + + local_irq_save(flags); + list_for_each_entry(tmp, &time_travel_events, list) { + /* + * Add the new entry before one with higher time, + * or if they're equal and both on stack, because + * in that case we need to unwind the stack in the + * right order, and the later event (timer sleep + * or such) must be dequeued first. + */ + if ((tmp->time > e->time) || + (tmp->time == e->time && tmp->onstack && e->onstack)) { + list_add_tail(&e->list, &tmp->list); + inserted = true; + break; + } + } + + if (!inserted) + list_add_tail(&e->list, &time_travel_events); + + tmp = time_travel_first_event(); + time_travel_ext_update_request(tmp->time); + time_travel_next_event = tmp->time; + local_irq_restore(flags); +} + +static void time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + if (WARN_ON(!e->fn)) + return; + + __time_travel_add_event(e, time); +} + +void time_travel_add_event_rel(struct time_travel_event *e, + unsigned long long delay_ns) +{ + time_travel_add_event(e, time_travel_time + delay_ns); +} + +void time_travel_periodic_timer(struct time_travel_event *e) +{ + time_travel_add_event(&time_travel_timer_event, + time_travel_time + time_travel_timer_interval); + deliver_alarm(); +} + +void deliver_time_travel_irqs(void) +{ + struct time_travel_event *e; + unsigned long flags; + + /* + * Don't do anything for most cases. Note that because here we have + * to disable IRQs (and re-enable later) we'll actually recurse at + * the end of the function, so this is strictly necessary. + */ + if (likely(list_empty(&time_travel_irqs))) + return; + + local_irq_save(flags); + irq_enter(); + while ((e = list_first_entry_or_null(&time_travel_irqs, + struct time_travel_event, + list))) { + list_del(&e->list); + e->pending = false; + e->fn(e); + } + irq_exit(); + local_irq_restore(flags); +} + +static void time_travel_deliver_event(struct time_travel_event *e) +{ + if (e == &time_travel_timer_event) { + /* + * deliver_alarm() does the irq_enter/irq_exit + * by itself, so must handle it specially here + */ + e->fn(e); + } else if (irqs_disabled()) { + list_add_tail(&e->list, &time_travel_irqs); + /* + * set pending again, it was set to false when the + * event was deleted from the original list, but + * now it's still pending until we deliver the IRQ. + */ + e->pending = true; + } else { + unsigned long flags; + + local_irq_save(flags); + irq_enter(); + e->fn(e); + irq_exit(); + local_irq_restore(flags); + } +} + +bool time_travel_del_event(struct time_travel_event *e) +{ + unsigned long flags; + + if (!e->pending) + return false; + local_irq_save(flags); + list_del(&e->list); + e->pending = false; + local_irq_restore(flags); + return true; +} + +static void time_travel_update_time(unsigned long long next, bool idle) +{ + struct time_travel_event ne = { + .onstack = true, + }; + struct time_travel_event *e; + bool finished = idle; + + /* add it without a handler - we deal with that specifically below */ + __time_travel_add_event(&ne, next); + + do { + e = time_travel_first_event(); + + BUG_ON(!e); + __time_travel_update_time(e->time, idle); + + /* new events may have been inserted while we were waiting */ + if (e == time_travel_first_event()) { + BUG_ON(!time_travel_del_event(e)); + BUG_ON(time_travel_time != e->time); + + if (e == &ne) { + finished = true; + } else { + if (e->onstack) + panic("On-stack event dequeued outside of the stack! time=%lld, event time=%lld, event=%pS\n", + time_travel_time, e->time, e); + time_travel_deliver_event(e); + } + } + + e = time_travel_first_event(); + if (e) + time_travel_ext_update_request(e->time); + } while (ne.pending && !finished); + + time_travel_del_event(&ne); +} + +void time_travel_ndelay(unsigned long nsec) +{ + time_travel_update_time(time_travel_time + nsec, false); +} +EXPORT_SYMBOL(time_travel_ndelay); + +void time_travel_add_irq_event(struct time_travel_event *e) +{ + BUG_ON(time_travel_mode != TT_MODE_EXTERNAL); + + time_travel_ext_get_time(); + /* + * We could model interrupt latency here, for now just + * don't have any latency at all and request the exact + * same time (again) to run the interrupt... + */ + time_travel_add_event(e, time_travel_time); +} +EXPORT_SYMBOL_GPL(time_travel_add_irq_event); + +static void time_travel_oneshot_timer(struct time_travel_event *e) +{ + deliver_alarm(); +} + +void time_travel_sleep(void) +{ + /* + * Wait "forever" (using S64_MAX because there are some potential + * wrapping issues, especially with the current TT_MODE_EXTERNAL + * controller application. + */ + unsigned long long next = S64_MAX; + + if (time_travel_mode == TT_MODE_BASIC) + os_timer_disable(); + + time_travel_update_time(next, true); + + if (time_travel_mode == TT_MODE_BASIC && + time_travel_timer_event.pending) { + if (time_travel_timer_event.fn == time_travel_periodic_timer) { + /* + * This is somewhat wrong - we should get the first + * one sooner like the os_timer_one_shot() below... + */ + os_timer_set_interval(time_travel_timer_interval); + } else { + os_timer_one_shot(time_travel_timer_event.time - next); + } + } +} + +static void time_travel_handle_real_alarm(void) +{ + time_travel_set_time(time_travel_next_event); + + time_travel_del_event(&time_travel_timer_event); + + if (time_travel_timer_event.fn == time_travel_periodic_timer) + time_travel_add_event(&time_travel_timer_event, + time_travel_time + + time_travel_timer_interval); +} + +static void time_travel_set_interval(unsigned long long interval) +{ + time_travel_timer_interval = interval; +} + +static int time_travel_connect_external(const char *socket) +{ + const char *sep; + unsigned long long id = (unsigned long long)-1; + int rc; + + if ((sep = strchr(socket, ':'))) { + char buf[25] = {}; + if (sep - socket > sizeof(buf) - 1) + goto invalid_number; + + memcpy(buf, socket, sep - socket); + if (kstrtoull(buf, 0, &id)) { +invalid_number: + panic("time-travel: invalid external ID in string '%s'\n", + socket); + return -EINVAL; + } + + socket = sep + 1; + } + + rc = os_connect_socket(socket); + if (rc < 0) { + panic("time-travel: failed to connect to external socket %s\n", + socket); + return rc; + } + + time_travel_ext_fd = rc; + + time_travel_ext_req(UM_TIMETRAVEL_START, id); + + return 1; +} + +static void time_travel_set_start(void) +{ + if (time_travel_start_set) + return; + + switch (time_travel_mode) { + case TT_MODE_EXTERNAL: + time_travel_start = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1); + /* controller gave us the *current* time, so adjust by that */ + time_travel_ext_get_time(); + time_travel_start -= time_travel_time; + break; + case TT_MODE_INFCPU: + case TT_MODE_BASIC: + if (!time_travel_start_set) + time_travel_start = os_persistent_clock_emulation(); + break; + case TT_MODE_OFF: + /* we just read the host clock with os_persistent_clock_emulation() */ + break; + } + + time_travel_start_set = true; +} +#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ +#define time_travel_start_set 0 +#define time_travel_start 0 +#define time_travel_time 0 +#define time_travel_ext_waiting 0 + +static inline void time_travel_update_time(unsigned long long ns, bool retearly) +{ +} + +static inline void time_travel_handle_real_alarm(void) +{ +} + +static void time_travel_set_interval(unsigned long long interval) +{ +} + +static inline void time_travel_set_start(void) +{ +} + +/* fail link if this actually gets used */ +extern u64 time_travel_ext_req(u32 op, u64 time); + +/* these are empty macros so the struct/fn need not exist */ +#define time_travel_add_event(e, time) do { } while (0) +/* externally not usable - redefine here so we can */ +#undef time_travel_del_event +#define time_travel_del_event(e) do { } while (0) +#endif + +void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +{ + unsigned long flags; + + /* + * In basic time-travel mode we still get real interrupts + * (signals) but since we don't read time from the OS, we + * must update the simulated time here to the expiry when + * we get a signal. + * This is not the case in inf-cpu mode, since there we + * never get any real signals from the OS. + */ + if (time_travel_mode == TT_MODE_BASIC) + time_travel_handle_real_alarm(); + + local_irq_save(flags); + do_IRQ(TIMER_IRQ, regs); + local_irq_restore(flags); +} + +static int itimer_shutdown(struct clock_event_device *evt) +{ + if (time_travel_mode != TT_MODE_OFF) + time_travel_del_event(&time_travel_timer_event); + + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + os_timer_disable(); + + return 0; +} + +static int itimer_set_periodic(struct clock_event_device *evt) +{ + unsigned long long interval = NSEC_PER_SEC / HZ; + + if (time_travel_mode != TT_MODE_OFF) { + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_periodic_timer); + time_travel_set_interval(interval); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + interval); + } + + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + os_timer_set_interval(interval); + + return 0; +} + +static int itimer_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + delta += 1; + + if (time_travel_mode != TT_MODE_OFF) { + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_oneshot_timer); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + delta); + } + + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) + return os_timer_one_shot(delta); + + return 0; +} + +static int itimer_one_shot(struct clock_event_device *evt) +{ + return itimer_next_event(0, evt); +} + +static struct clock_event_device timer_clockevent = { + .name = "posix-timer", + .rating = 250, + .cpumask = cpu_possible_mask, + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_shutdown = itimer_shutdown, + .set_state_periodic = itimer_set_periodic, + .set_state_oneshot = itimer_one_shot, + .set_next_event = itimer_next_event, + .shift = 0, + .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, + .min_delta_ns = TIMER_MIN_DELTA, + .min_delta_ticks = TIMER_MIN_DELTA, // microsecond resolution should be enough for anyone, same as 640K RAM + .irq = 0, + .mult = 1, +}; + +static irqreturn_t um_timer(int irq, void *dev) +{ + if (get_current()->mm != NULL) + { + /* userspace - relay signal, results in correct userspace timers */ + os_alarm_process(get_current()->mm->context.id.u.pid); + } + + (*timer_clockevent.event_handler)(&timer_clockevent); + + return IRQ_HANDLED; +} + +static u64 timer_read(struct clocksource *cs) +{ + if (time_travel_mode != TT_MODE_OFF) { + /* + * We make reading the timer cost a bit so that we don't get + * stuck in loops that expect time to move more than the + * exact requested sleep amount, e.g. python's socket server, + * see https://bugs.python.org/issue37026. + * + * However, don't do that when we're in interrupt or such as + * then we might recurse into our own processing, and get to + * even more waiting, and that's not good - it messes up the + * "what do I do next" and onstack event we use to know when + * to return from time_travel_update_time(). + */ + if (!irqs_disabled() && !in_interrupt() && !in_softirq() && + !time_travel_ext_waiting) + time_travel_update_time(time_travel_time + + TIMER_MULTIPLIER, + false); + return time_travel_time / TIMER_MULTIPLIER; + } + + return os_nsecs() / TIMER_MULTIPLIER; +} + +static struct clocksource timer_clocksource = { + .name = "timer", + .rating = 300, + .read = timer_read, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static void __init um_timer_setup(void) +{ + int err; + + err = request_irq(TIMER_IRQ, um_timer, IRQF_TIMER, "hr timer", NULL); + if (err != 0) + printk(KERN_ERR "register_timer : request_irq failed - " + "errno = %d\n", -err); + + err = os_timer_create(); + if (err != 0) { + printk(KERN_ERR "creation of timer failed - errno = %d\n", -err); + return; + } + + err = clocksource_register_hz(&timer_clocksource, NSEC_PER_SEC/TIMER_MULTIPLIER); + if (err) { + printk(KERN_ERR "clocksource_register_hz returned %d\n", err); + return; + } + clockevents_register_device(&timer_clockevent); +} + +void read_persistent_clock64(struct timespec64 *ts) +{ + long long nsecs; + + time_travel_set_start(); + + if (time_travel_mode != TT_MODE_OFF) + nsecs = time_travel_start + time_travel_time; + else + nsecs = os_persistent_clock_emulation(); + + set_normalized_timespec64(ts, nsecs / NSEC_PER_SEC, + nsecs % NSEC_PER_SEC); +} + +void __init time_init(void) +{ + timer_set_signal_handler(); + late_time_init = um_timer_setup; +} + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +unsigned long calibrate_delay_is_known(void) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) + return 1; + return 0; +} + +int setup_time_travel(char *str) +{ + if (strcmp(str, "=inf-cpu") == 0) { + time_travel_mode = TT_MODE_INFCPU; + timer_clockevent.name = "time-travel-timer-infcpu"; + timer_clocksource.name = "time-travel-clock"; + return 1; + } + + if (strncmp(str, "=ext:", 5) == 0) { + time_travel_mode = TT_MODE_EXTERNAL; + timer_clockevent.name = "time-travel-timer-external"; + timer_clocksource.name = "time-travel-clock-external"; + return time_travel_connect_external(str + 5); + } + + if (!*str) { + time_travel_mode = TT_MODE_BASIC; + timer_clockevent.name = "time-travel-timer"; + timer_clocksource.name = "time-travel-clock"; + return 1; + } + + return -EINVAL; +} + +__setup("time-travel", setup_time_travel); +__uml_help(setup_time_travel, +"time-travel\n" +"This option just enables basic time travel mode, in which the clock/timers\n" +"inside the UML instance skip forward when there's nothing to do, rather than\n" +"waiting for real time to elapse. However, instance CPU speed is limited by\n" +"the real CPU speed, so e.g. a 10ms timer will always fire after ~10ms wall\n" +"clock (but quicker when there's nothing to do).\n" +"\n" +"time-travel=inf-cpu\n" +"This enables time travel mode with infinite processing power, in which there\n" +"are no wall clock timers, and any CPU processing happens - as seen from the\n" +"guest - instantly. This can be useful for accurate simulation regardless of\n" +"debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n" +"easily lead to getting stuck (e.g. if anything in the system busy loops).\n" +"\n" +"time-travel=ext:[ID:]/path/to/socket\n" +"This enables time travel mode similar to =inf-cpu, except the system will\n" +"use the given socket to coordinate with a central scheduler, in order to\n" +"have more than one system simultaneously be on simulated time. The virtio\n" +"driver code in UML knows about this so you can also simulate networks and\n" +"devices using it, assuming the device has the right capabilities.\n" +"The optional ID is a 64-bit integer that's sent to the central scheduler.\n"); + +int setup_time_travel_start(char *str) +{ + int err; + + err = kstrtoull(str, 0, &time_travel_start); + if (err) + return err; + + time_travel_start_set = 1; + return 1; +} + +__setup("time-travel-start", setup_time_travel_start); +__uml_help(setup_time_travel_start, +"time-travel-start=<seconds>\n" +"Configure the UML instance's wall clock to start at this value rather than\n" +"the host's wall clock at the time of UML boot.\n"); +#endif diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c new file mode 100644 index 000000000..ad449173a --- /dev/null +++ b/arch/um/kernel/tlb.c @@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched/signal.h> + +#include <asm/tlbflush.h> +#include <as-layout.h> +#include <mem_user.h> +#include <os.h> +#include <skas.h> +#include <kern_util.h> + +struct host_vm_change { + struct host_vm_op { + enum { NONE, MMAP, MUNMAP, MPROTECT } type; + union { + struct { + unsigned long addr; + unsigned long len; + unsigned int prot; + int fd; + __u64 offset; + } mmap; + struct { + unsigned long addr; + unsigned long len; + } munmap; + struct { + unsigned long addr; + unsigned long len; + unsigned int prot; + } mprotect; + } u; + } ops[1]; + int userspace; + int index; + struct mm_struct *mm; + void *data; + int force; +}; + +#define INIT_HVC(mm, force, userspace) \ + ((struct host_vm_change) \ + { .ops = { { .type = NONE } }, \ + .mm = mm, \ + .data = NULL, \ + .userspace = userspace, \ + .index = 0, \ + .force = force }) + +static void report_enomem(void) +{ + printk(KERN_ERR "UML ran out of memory on the host side! " + "This can happen due to a memory limitation or " + "vm.max_map_count has been reached.\n"); +} + +static int do_ops(struct host_vm_change *hvc, int end, + int finished) +{ + struct host_vm_op *op; + int i, ret = 0; + + for (i = 0; i < end && !ret; i++) { + op = &hvc->ops[i]; + switch (op->type) { + case MMAP: + if (hvc->userspace) + ret = map(&hvc->mm->context.id, op->u.mmap.addr, + op->u.mmap.len, op->u.mmap.prot, + op->u.mmap.fd, + op->u.mmap.offset, finished, + &hvc->data); + else + map_memory(op->u.mmap.addr, op->u.mmap.offset, + op->u.mmap.len, 1, 1, 1); + break; + case MUNMAP: + if (hvc->userspace) + ret = unmap(&hvc->mm->context.id, + op->u.munmap.addr, + op->u.munmap.len, finished, + &hvc->data); + else + ret = os_unmap_memory( + (void *) op->u.munmap.addr, + op->u.munmap.len); + + break; + case MPROTECT: + if (hvc->userspace) + ret = protect(&hvc->mm->context.id, + op->u.mprotect.addr, + op->u.mprotect.len, + op->u.mprotect.prot, + finished, &hvc->data); + else + ret = os_protect_memory( + (void *) op->u.mprotect.addr, + op->u.mprotect.len, + 1, 1, 1); + break; + default: + printk(KERN_ERR "Unknown op type %d in do_ops\n", + op->type); + BUG(); + break; + } + } + + if (ret == -ENOMEM) + report_enomem(); + + return ret; +} + +static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, + unsigned int prot, struct host_vm_change *hvc) +{ + __u64 offset; + struct host_vm_op *last; + int fd = -1, ret = 0; + + if (hvc->userspace) + fd = phys_mapping(phys, &offset); + else + offset = phys; + if (hvc->index != 0) { + last = &hvc->ops[hvc->index - 1]; + if ((last->type == MMAP) && + (last->u.mmap.addr + last->u.mmap.len == virt) && + (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) && + (last->u.mmap.offset + last->u.mmap.len == offset)) { + last->u.mmap.len += len; + return 0; + } + } + + if (hvc->index == ARRAY_SIZE(hvc->ops)) { + ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); + hvc->index = 0; + } + + hvc->ops[hvc->index++] = ((struct host_vm_op) + { .type = MMAP, + .u = { .mmap = { .addr = virt, + .len = len, + .prot = prot, + .fd = fd, + .offset = offset } + } }); + return ret; +} + +static int add_munmap(unsigned long addr, unsigned long len, + struct host_vm_change *hvc) +{ + struct host_vm_op *last; + int ret = 0; + + if (hvc->index != 0) { + last = &hvc->ops[hvc->index - 1]; + if ((last->type == MUNMAP) && + (last->u.munmap.addr + last->u.mmap.len == addr)) { + last->u.munmap.len += len; + return 0; + } + } + + if (hvc->index == ARRAY_SIZE(hvc->ops)) { + ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); + hvc->index = 0; + } + + hvc->ops[hvc->index++] = ((struct host_vm_op) + { .type = MUNMAP, + .u = { .munmap = { .addr = addr, + .len = len } } }); + return ret; +} + +static int add_mprotect(unsigned long addr, unsigned long len, + unsigned int prot, struct host_vm_change *hvc) +{ + struct host_vm_op *last; + int ret = 0; + + if (hvc->index != 0) { + last = &hvc->ops[hvc->index - 1]; + if ((last->type == MPROTECT) && + (last->u.mprotect.addr + last->u.mprotect.len == addr) && + (last->u.mprotect.prot == prot)) { + last->u.mprotect.len += len; + return 0; + } + } + + if (hvc->index == ARRAY_SIZE(hvc->ops)) { + ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); + hvc->index = 0; + } + + hvc->ops[hvc->index++] = ((struct host_vm_op) + { .type = MPROTECT, + .u = { .mprotect = { .addr = addr, + .len = len, + .prot = prot } } }); + return ret; +} + +#define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1)) + +static inline int update_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, + struct host_vm_change *hvc) +{ + pte_t *pte; + int r, w, x, prot, ret = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + r = pte_read(*pte); + w = pte_write(*pte); + x = pte_exec(*pte); + if (!pte_young(*pte)) { + r = 0; + w = 0; + } else if (!pte_dirty(*pte)) + w = 0; + + prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | + (x ? UM_PROT_EXEC : 0)); + if (hvc->force || pte_newpage(*pte)) { + if (pte_present(*pte)) { + if (pte_newpage(*pte)) + ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, + PAGE_SIZE, prot, hvc); + } else + ret = add_munmap(addr, PAGE_SIZE, hvc); + } else if (pte_newprot(*pte)) + ret = add_mprotect(addr, PAGE_SIZE, prot, hvc); + *pte = pte_mkuptodate(*pte); + } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); + return ret; +} + +static inline int update_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, + struct host_vm_change *hvc) +{ + pmd_t *pmd; + unsigned long next; + int ret = 0; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) { + if (hvc->force || pmd_newpage(*pmd)) { + ret = add_munmap(addr, next - addr, hvc); + pmd_mkuptodate(*pmd); + } + } + else ret = update_pte_range(pmd, addr, next, hvc); + } while (pmd++, addr = next, ((addr < end) && !ret)); + return ret; +} + +static inline int update_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, + struct host_vm_change *hvc) +{ + pud_t *pud; + unsigned long next; + int ret = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_present(*pud)) { + if (hvc->force || pud_newpage(*pud)) { + ret = add_munmap(addr, next - addr, hvc); + pud_mkuptodate(*pud); + } + } + else ret = update_pmd_range(pud, addr, next, hvc); + } while (pud++, addr = next, ((addr < end) && !ret)); + return ret; +} + +static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct host_vm_change *hvc) +{ + p4d_t *p4d; + unsigned long next; + int ret = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (!p4d_present(*p4d)) { + if (hvc->force || p4d_newpage(*p4d)) { + ret = add_munmap(addr, next - addr, hvc); + p4d_mkuptodate(*p4d); + } + } else + ret = update_pud_range(p4d, addr, next, hvc); + } while (p4d++, addr = next, ((addr < end) && !ret)); + return ret; +} + +void fix_range_common(struct mm_struct *mm, unsigned long start_addr, + unsigned long end_addr, int force) +{ + pgd_t *pgd; + struct host_vm_change hvc; + unsigned long addr = start_addr, next; + int ret = 0, userspace = 1; + + hvc = INIT_HVC(mm, force, userspace); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end_addr); + if (!pgd_present(*pgd)) { + if (force || pgd_newpage(*pgd)) { + ret = add_munmap(addr, next - addr, &hvc); + pgd_mkuptodate(*pgd); + } + } else + ret = update_p4d_range(pgd, addr, next, &hvc); + } while (pgd++, addr = next, ((addr < end_addr) && !ret)); + + if (!ret) + ret = do_ops(&hvc, hvc.index, 1); + + /* This is not an else because ret is modified above */ + if (ret) { + struct mm_id *mm_idp = ¤t->mm->context.id; + + printk(KERN_ERR "fix_range_common: failed, killing current " + "process: %d\n", task_tgid_vnr(current)); + mm_idp->kill = 1; + } +} + +static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) +{ + struct mm_struct *mm; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long addr, last; + int updated = 0, err = 0, force = 0, userspace = 0; + struct host_vm_change hvc; + + mm = &init_mm; + hvc = INIT_HVC(mm, force, userspace); + for (addr = start; addr < end;) { + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) { + last = ADD_ROUND(addr, PGDIR_SIZE); + if (last > end) + last = end; + if (pgd_newpage(*pgd)) { + updated = 1; + err = add_munmap(addr, last - addr, &hvc); + if (err < 0) + panic("munmap failed, errno = %d\n", + -err); + } + addr = last; + continue; + } + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) { + last = ADD_ROUND(addr, P4D_SIZE); + if (last > end) + last = end; + if (p4d_newpage(*p4d)) { + updated = 1; + err = add_munmap(addr, last - addr, &hvc); + if (err < 0) + panic("munmap failed, errno = %d\n", + -err); + } + addr = last; + continue; + } + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) { + last = ADD_ROUND(addr, PUD_SIZE); + if (last > end) + last = end; + if (pud_newpage(*pud)) { + updated = 1; + err = add_munmap(addr, last - addr, &hvc); + if (err < 0) + panic("munmap failed, errno = %d\n", + -err); + } + addr = last; + continue; + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + last = ADD_ROUND(addr, PMD_SIZE); + if (last > end) + last = end; + if (pmd_newpage(*pmd)) { + updated = 1; + err = add_munmap(addr, last - addr, &hvc); + if (err < 0) + panic("munmap failed, errno = %d\n", + -err); + } + addr = last; + continue; + } + + pte = pte_offset_kernel(pmd, addr); + if (!pte_present(*pte) || pte_newpage(*pte)) { + updated = 1; + err = add_munmap(addr, PAGE_SIZE, &hvc); + if (err < 0) + panic("munmap failed, errno = %d\n", + -err); + if (pte_present(*pte)) + err = add_mmap(addr, pte_val(*pte) & PAGE_MASK, + PAGE_SIZE, 0, &hvc); + } + else if (pte_newprot(*pte)) { + updated = 1; + err = add_mprotect(addr, PAGE_SIZE, 0, &hvc); + } + addr += PAGE_SIZE; + } + if (!err) + err = do_ops(&hvc, hvc.index, 1); + + if (err < 0) + panic("flush_tlb_kernel failed, errno = %d\n", err); + return updated; +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + struct mm_struct *mm = vma->vm_mm; + void *flush = NULL; + int r, w, x, prot, err = 0; + struct mm_id *mm_id; + + address &= PAGE_MASK; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto kill; + + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + goto kill; + + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + goto kill; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + goto kill; + + pte = pte_offset_kernel(pmd, address); + + r = pte_read(*pte); + w = pte_write(*pte); + x = pte_exec(*pte); + if (!pte_young(*pte)) { + r = 0; + w = 0; + } else if (!pte_dirty(*pte)) { + w = 0; + } + + mm_id = &mm->context.id; + prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | + (x ? UM_PROT_EXEC : 0)); + if (pte_newpage(*pte)) { + if (pte_present(*pte)) { + unsigned long long offset; + int fd; + + fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); + err = map(mm_id, address, PAGE_SIZE, prot, fd, offset, + 1, &flush); + } + else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush); + } + else if (pte_newprot(*pte)) + err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush); + + if (err) { + if (err == -ENOMEM) + report_enomem(); + + goto kill; + } + + *pte = pte_mkuptodate(*pte); + + return; + +kill: + printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); + force_sig(SIGKILL); +} + +void flush_tlb_all(void) +{ + /* + * Don't bother flushing if this address space is about to be + * destroyed. + */ + if (atomic_read(¤t->mm->mm_users) == 0) + return; + + flush_tlb_mm(current->mm); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + flush_tlb_kernel_range_common(start, end); +} + +void flush_tlb_kernel_vm(void) +{ + flush_tlb_kernel_range_common(start_vm, end_vm); +} + +void __flush_tlb_one(unsigned long addr) +{ + flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE); +} + +static void fix_range(struct mm_struct *mm, unsigned long start_addr, + unsigned long end_addr, int force) +{ + /* + * Don't bother flushing if this address space is about to be + * destroyed. + */ + if (atomic_read(&mm->mm_users) == 0) + return; + + fix_range_common(mm, start_addr, end_addr, force); +} + +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + if (vma->vm_mm == NULL) + flush_tlb_kernel_range_common(start, end); + else fix_range(vma->vm_mm, start, end, 0); +} +EXPORT_SYMBOL(flush_tlb_range); + +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + fix_range(mm, start, end, 0); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + for_each_vma(vmi, vma) + fix_range(mm, vma->vm_start, vma->vm_end, 0); +} + +void force_flush_all(void) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + for_each_vma(vmi, vma) + fix_range(mm, vma->vm_start, vma->vm_end, 1); +} diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c new file mode 100644 index 000000000..6d8ae86ae --- /dev/null +++ b/arch/um/kernel/trap.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/hardirq.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/sched/debug.h> +#include <asm/current.h> +#include <asm/tlbflush.h> +#include <arch.h> +#include <as-layout.h> +#include <kern_util.h> +#include <os.h> +#include <skas.h> + +/* + * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by + * segv(). + */ +int handle_page_fault(unsigned long address, unsigned long ip, + int is_write, int is_user, int *code_out) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + pmd_t *pmd; + pte_t *pte; + int err = -EFAULT; + unsigned int flags = FAULT_FLAG_DEFAULT; + + *code_out = SEGV_MAPERR; + + /* + * If the fault was with pagefaults disabled, don't take the fault, just + * fail. + */ + if (faulthandler_disabled()) + goto out_nosemaphore; + + if (is_user) + flags |= FAULT_FLAG_USER; +retry: + mmap_read_lock(mm); + vma = find_vma(mm, address); + if (!vma) + goto out; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + if (is_user && !ARCH_IS_STACKGROW(address)) + goto out; + vma = expand_stack(mm, address); + if (!vma) + goto out_nosemaphore; + +good_area: + *code_out = SEGV_ACCERR; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out; + flags |= FAULT_FLAG_WRITE; + } else { + /* Don't require VM_READ|VM_EXEC for write faults! */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out; + } + + do { + vm_fault_t fault; + + fault = handle_mm_fault(vma, address, flags, NULL); + + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + goto out_nosemaphore; + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) { + goto out_of_memory; + } else if (fault & VM_FAULT_SIGSEGV) { + goto out; + } else if (fault & VM_FAULT_SIGBUS) { + err = -EACCES; + goto out; + } + BUG(); + } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + + goto retry; + } + + pmd = pmd_off(mm, address); + pte = pte_offset_kernel(pmd, address); + } while (!pte_present(*pte)); + err = 0; + /* + * The below warning was added in place of + * pte_mkyoung(); if (is_write) pte_mkdirty(); + * If it's triggered, we'd see normally a hang here (a clean pte is + * marked read-only to emulate the dirty bit). + * However, the generic code can mark a PTE writable but clean on a + * concurrent read fault, triggering this harmlessly. So comment it out. + */ +#if 0 + WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); +#endif + flush_tlb_page(vma, address); +out: + mmap_read_unlock(mm); +out_nosemaphore: + return err; + +out_of_memory: + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + mmap_read_unlock(mm); + if (!is_user) + goto out_nosemaphore; + pagefault_out_of_memory(); + return 0; +} + +static void show_segv_info(struct uml_pt_regs *regs) +{ + struct task_struct *tsk = current; + struct faultinfo *fi = UPT_FAULTINFO(regs); + + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi), + (void *)UPT_IP(regs), (void *)UPT_SP(regs), + fi->error_code); + + print_vma_addr(KERN_CONT " in ", UPT_IP(regs)); + printk(KERN_CONT "\n"); +} + +static void bad_segv(struct faultinfo fi, unsigned long ip) +{ + current->thread.arch.faultinfo = fi; + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi)); +} + +void fatal_sigsegv(void) +{ + force_fatal_sig(SIGSEGV); + do_signal(¤t->thread.regs); + /* + * This is to tell gcc that we're not returning - do_signal + * can, in general, return, but in this case, it's not, since + * we just got a fatal SIGSEGV queued. + */ + os_dump_core(); +} + +/** + * segv_handler() - the SIGSEGV handler + * @sig: the signal number + * @unused_si: the signal info struct; unused in this handler + * @regs: the ptrace register information + * + * The handler first extracts the faultinfo from the UML ptrace regs struct. + * If the userfault did not happen in an UML userspace process, bad_segv is called. + * Otherwise the signal did happen in a cloned userspace process, handle it. + */ +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +{ + struct faultinfo * fi = UPT_FAULTINFO(regs); + + if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) { + show_segv_info(regs); + bad_segv(*fi, UPT_IP(regs)); + return; + } + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); +} + +/* + * We give a *copy* of the faultinfo in the regs to segv. + * This must be done, since nesting SEGVs could overwrite + * the info in the regs. A pointer to the info then would + * give us bad data! + */ +unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, + struct uml_pt_regs *regs) +{ + jmp_buf *catcher; + int si_code; + int err; + int is_write = FAULT_WRITE(fi); + unsigned long address = FAULT_ADDRESS(fi); + + if (!is_user && regs) + current->thread.segv_regs = container_of(regs, struct pt_regs, regs); + + if (!is_user && (address >= start_vm) && (address < end_vm)) { + flush_tlb_kernel_vm(); + goto out; + } + else if (current->mm == NULL) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with no mm"); + } + else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx", + address, ip); + } + + if (SEGV_IS_FIXABLE(&fi)) + err = handle_page_fault(address, ip, is_write, is_user, + &si_code); + else { + err = -EFAULT; + /* + * A thread accessed NULL, we get a fault, but CR2 is invalid. + * This code is used in __do_copy_from_user() of TT mode. + * XXX tt mode is gone, so maybe this isn't needed any more + */ + address = 0; + } + + catcher = current->thread.fault_catcher; + if (!err) + goto out; + else if (catcher != NULL) { + current->thread.fault_addr = (void *) address; + UML_LONGJMP(catcher, 1); + } + else if (current->thread.fault_addr != NULL) + panic("fault_addr set but no fault catcher"); + else if (!is_user && arch_fixup(ip, regs)) + goto out; + + if (!is_user) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Kernel mode fault at addr 0x%lx, ip 0x%lx", + address, ip); + } + + show_segv_info(regs); + + if (err == -EACCES) { + current->thread.arch.faultinfo = fi; + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); + } else { + BUG_ON(err != -EFAULT); + current->thread.arch.faultinfo = fi; + force_sig_fault(SIGSEGV, si_code, (void __user *) address); + } + +out: + if (regs) + current->thread.segv_regs = NULL; + + return 0; +} + +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +{ + int code, err; + if (!UPT_IS_USER(regs)) { + if (sig == SIGBUS) + printk(KERN_ERR "Bus error - the host /dev/shm or /tmp " + "mount likely just ran out of space\n"); + panic("Kernel mode signal %d", sig); + } + + arch_examine_signal(sig, regs); + + /* Is the signal layout for the signal known? + * Signal data must be scrubbed to prevent information leaks. + */ + code = si->si_code; + err = si->si_errno; + if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) { + struct faultinfo *fi = UPT_FAULTINFO(regs); + current->thread.arch.faultinfo = *fi; + force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi)); + } else { + printk(KERN_ERR "Attempted to relay unknown signal %d (si_code = %d) with errno %d\n", + sig, code, err); + force_sig(sig); + } +} + +void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs) +{ + if (current->thread.fault_catcher != NULL) + UML_LONGJMP(current->thread.fault_catcher, 1); + else + relay_signal(sig, si, regs); +} + +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +{ + do_IRQ(WINCH_IRQ, regs); +} diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c new file mode 100644 index 000000000..334c91191 --- /dev/null +++ b/arch/um/kernel/um_arch.c @@ -0,0 +1,532 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/panic_notifier.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/utsname.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/kmsg_dump.h> +#include <linux/suspend.h> +#include <linux/random.h> + +#include <asm/processor.h> +#include <asm/cpufeature.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <as-layout.h> +#include <arch.h> +#include <init.h> +#include <kern.h> +#include <kern_util.h> +#include <mem_user.h> +#include <os.h> + +#include "um_arch.h" + +#define DEFAULT_COMMAND_LINE_ROOT "root=98:0" +#define DEFAULT_COMMAND_LINE_CONSOLE "console=tty0" + +/* Changed in add_arg and setup_arch, which run before SMP is started */ +static char __initdata command_line[COMMAND_LINE_SIZE] = { 0 }; + +static void __init add_arg(char *arg) +{ + if (strlen(command_line) + strlen(arg) + 1 > COMMAND_LINE_SIZE) { + os_warn("add_arg: Too many command line arguments!\n"); + exit(1); + } + if (strlen(command_line) > 0) + strcat(command_line, " "); + strcat(command_line, arg); +} + +/* + * These fields are initialized at boot time and not changed. + * XXX This structure is used only in the non-SMP case. Maybe this + * should be moved to smp.c. + */ +struct cpuinfo_um boot_cpu_data = { + .loops_per_jiffy = 0, + .ipi_pipe = { -1, -1 }, + .cache_alignment = L1_CACHE_BYTES, + .x86_capability = { 0 } +}; + +EXPORT_SYMBOL(boot_cpu_data); + +union thread_union cpu0_irqstack + __section(".data..init_irqstack") = + { .thread_info = INIT_THREAD_INFO(init_task) }; + +/* Changed in setup_arch, which is called in early boot */ +static char host_info[(__NEW_UTS_LEN + 1) * 5]; + +static int show_cpuinfo(struct seq_file *m, void *v) +{ + int i = 0; + + seq_printf(m, "processor\t: %d\n", i); + seq_printf(m, "vendor_id\t: User Mode Linux\n"); + seq_printf(m, "model name\t: UML\n"); + seq_printf(m, "mode\t\t: skas\n"); + seq_printf(m, "host\t\t: %s\n", host_info); + seq_printf(m, "fpu\t\t: %s\n", cpu_has(&boot_cpu_data, X86_FEATURE_FPU) ? "yes" : "no"); + seq_printf(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\n"); + seq_printf(m, "cache_alignment\t: %d\n", boot_cpu_data.cache_alignment); + seq_printf(m, "bogomips\t: %lu.%02lu\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100); + + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + return *pos < nr_cpu_ids ? cpu_data + *pos : NULL; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_start(m, pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +const struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; + +/* Set in linux_main */ +unsigned long uml_physmem; +EXPORT_SYMBOL(uml_physmem); + +unsigned long uml_reserved; /* Also modified in mem_init */ +unsigned long start_vm; +unsigned long end_vm; + +/* Set in uml_ncpus_setup */ +int ncpus = 1; + +/* Set in early boot */ +static int have_root __initdata; +static int have_console __initdata; + +/* Set in uml_mem_setup and modified in linux_main */ +long long physmem_size = 64 * 1024 * 1024; +EXPORT_SYMBOL(physmem_size); + +static const char *usage_string = +"User Mode Linux v%s\n" +" available at http://user-mode-linux.sourceforge.net/\n\n"; + +static int __init uml_version_setup(char *line, int *add) +{ + /* Explicitly use printf() to show version in stdout */ + printf("%s\n", init_utsname()->release); + exit(0); + + return 0; +} + +__uml_setup("--version", uml_version_setup, +"--version\n" +" Prints the version number of the kernel.\n\n" +); + +static int __init uml_root_setup(char *line, int *add) +{ + have_root = 1; + return 0; +} + +__uml_setup("root=", uml_root_setup, +"root=<file containing the root fs>\n" +" This is actually used by the generic kernel in exactly the same\n" +" way as in any other kernel. If you configure a number of block\n" +" devices and want to boot off something other than ubd0, you \n" +" would use something like:\n" +" root=/dev/ubd5\n\n" +); + +static int __init no_skas_debug_setup(char *line, int *add) +{ + os_warn("'debug' is not necessary to gdb UML in skas mode - run\n"); + os_warn("'gdb linux'\n"); + + return 0; +} + +__uml_setup("debug", no_skas_debug_setup, +"debug\n" +" this flag is not needed to run gdb on UML in skas mode\n\n" +); + +static int __init uml_console_setup(char *line, int *add) +{ + have_console = 1; + return 0; +} + +__uml_setup("console=", uml_console_setup, +"console=<preferred console>\n" +" Specify the preferred console output driver\n\n" +); + +static int __init Usage(char *line, int *add) +{ + const char **p; + + printf(usage_string, init_utsname()->release); + p = &__uml_help_start; + /* Explicitly use printf() to show help in stdout */ + while (p < &__uml_help_end) { + printf("%s", *p); + p++; + } + exit(0); + return 0; +} + +__uml_setup("--help", Usage, +"--help\n" +" Prints this message.\n\n" +); + +static void __init uml_checksetup(char *line, int *add) +{ + struct uml_param *p; + + p = &__uml_setup_start; + while (p < &__uml_setup_end) { + size_t n; + + n = strlen(p->str); + if (!strncmp(line, p->str, n) && p->setup_func(line + n, add)) + return; + p++; + } +} + +static void __init uml_postsetup(void) +{ + initcall_t *p; + + p = &__uml_postsetup_start; + while (p < &__uml_postsetup_end) { + (*p)(); + p++; + } + return; +} + +static int panic_exit(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(1); + bust_spinlocks(0); + uml_exitcode = 1; + os_dump_core(); + + return NOTIFY_DONE; +} + +static struct notifier_block panic_exit_notifier = { + .notifier_call = panic_exit, + .priority = INT_MAX - 1, /* run as 2nd notifier, won't return */ +}; + +void uml_finishsetup(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &panic_exit_notifier); + + uml_postsetup(); + + new_thread_handler(); +} + +/* Set during early boot */ +unsigned long stub_start; +unsigned long task_size; +EXPORT_SYMBOL(task_size); + +unsigned long host_task_size; + +unsigned long brk_start; +unsigned long end_iomem; +EXPORT_SYMBOL(end_iomem); + +#define MIN_VMALLOC (32 * 1024 * 1024) + +static void parse_host_cpu_flags(char *line) +{ + int i; + for (i = 0; i < 32*NCAPINTS; i++) { + if ((x86_cap_flags[i] != NULL) && strstr(line, x86_cap_flags[i])) + set_cpu_cap(&boot_cpu_data, i); + } +} +static void parse_cache_line(char *line) +{ + long res; + char *to_parse = strstr(line, ":"); + if (to_parse) { + to_parse++; + while (*to_parse != 0 && isspace(*to_parse)) { + to_parse++; + } + if (kstrtoul(to_parse, 10, &res) == 0 && is_power_of_2(res)) + boot_cpu_data.cache_alignment = res; + else + boot_cpu_data.cache_alignment = L1_CACHE_BYTES; + } +} + +int __init linux_main(int argc, char **argv) +{ + unsigned long avail, diff; + unsigned long virtmem_size, max_physmem; + unsigned long stack; + unsigned int i; + int add; + + for (i = 1; i < argc; i++) { + if ((i == 1) && (argv[i][0] == ' ')) + continue; + add = 1; + uml_checksetup(argv[i], &add); + if (add) + add_arg(argv[i]); + } + if (have_root == 0) + add_arg(DEFAULT_COMMAND_LINE_ROOT); + + if (have_console == 0) + add_arg(DEFAULT_COMMAND_LINE_CONSOLE); + + host_task_size = os_get_top_address(); + /* reserve two pages for the stubs */ + host_task_size -= 2 * PAGE_SIZE; + stub_start = host_task_size; + + /* + * TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps + * out + */ + task_size = host_task_size & PGDIR_MASK; + + /* OS sanity checks that need to happen before the kernel runs */ + os_early_checks(); + + get_host_cpu_features(parse_host_cpu_flags, parse_cache_line); + + brk_start = (unsigned long) sbrk(0); + + /* + * Increase physical memory size for exec-shield users + * so they actually get what they asked for. This should + * add zero for non-exec shield users + */ + + diff = UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end); + if (diff > 1024 * 1024) { + os_info("Adding %ld bytes to physical memory to account for " + "exec-shield gap\n", diff); + physmem_size += UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end); + } + + uml_physmem = (unsigned long) __binary_start & PAGE_MASK; + + /* Reserve up to 4M after the current brk */ + uml_reserved = ROUND_4M(brk_start) + (1 << 22); + + setup_machinename(init_utsname()->machine); + + highmem = 0; + iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; + max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; + + /* + * Zones have to begin on a 1 << MAX_ORDER page boundary, + * so this makes sure that's true for highmem + */ + max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1); + if (physmem_size + iomem_size > max_physmem) { + highmem = physmem_size + iomem_size - max_physmem; + physmem_size -= highmem; + } + + high_physmem = uml_physmem + physmem_size; + end_iomem = high_physmem + iomem_size; + high_memory = (void *) end_iomem; + + start_vm = VMALLOC_START; + + virtmem_size = physmem_size; + stack = (unsigned long) argv; + stack &= ~(1024 * 1024 - 1); + avail = stack - start_vm; + if (physmem_size > avail) + virtmem_size = avail; + end_vm = start_vm + virtmem_size; + + if (virtmem_size < physmem_size) + os_info("Kernel virtual memory size shrunk to %lu bytes\n", + virtmem_size); + + os_flush_stdout(); + + return start_uml(); +} + +int __init __weak read_initrd(void) +{ + return 0; +} + +void __init setup_arch(char **cmdline_p) +{ + u8 rng_seed[32]; + + stack_protections((unsigned long) &init_thread_info); + setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); + mem_total_pages(physmem_size, iomem_size, highmem); + uml_dtb_init(); + read_initrd(); + + paging_init(); + strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + setup_hostinfo(host_info, sizeof host_info); + + if (os_getrandom(rng_seed, sizeof(rng_seed), 0) == sizeof(rng_seed)) { + add_bootloader_randomness(rng_seed, sizeof(rng_seed)); + memzero_explicit(rng_seed, sizeof(rng_seed)); + } +} + +void __init arch_cpu_finalize_init(void) +{ + arch_check_bugs(); + os_check_bugs(); +} + +void apply_ibt_endbr(s32 *start, s32 *end) +{ +} + +void apply_retpolines(s32 *start, s32 *end) +{ +} + +void apply_returns(s32 *start, s32 *end) +{ +} + +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +{ +} + +void *text_poke(void *addr, const void *opcode, size_t len) +{ + /* + * In UML, the only reference to this function is in + * apply_relocate_add(), which shouldn't ever actually call this + * because UML doesn't have live patching. + */ + WARN_ON(1); + + return memcpy(addr, opcode, len); +} + +void text_poke_sync(void) +{ +} + +void uml_pm_wake(void) +{ + pm_system_wakeup(); +} + +#ifdef CONFIG_PM_SLEEP +static int um_suspend_valid(suspend_state_t state) +{ + return state == PM_SUSPEND_MEM; +} + +static int um_suspend_prepare(void) +{ + um_irqs_suspend(); + return 0; +} + +static int um_suspend_enter(suspend_state_t state) +{ + if (WARN_ON(state != PM_SUSPEND_MEM)) + return -EINVAL; + + /* + * This is identical to the idle sleep, but we've just + * (during suspend) turned off all interrupt sources + * except for the ones we want, so now we can only wake + * up on something we actually want to wake up on. All + * timing has also been suspended. + */ + um_idle_sleep(); + return 0; +} + +static void um_suspend_finish(void) +{ + um_irqs_resume(); +} + +const struct platform_suspend_ops um_suspend_ops = { + .valid = um_suspend_valid, + .prepare = um_suspend_prepare, + .enter = um_suspend_enter, + .finish = um_suspend_finish, +}; + +static int init_pm_wake_signal(void) +{ + /* + * In external time-travel mode we can't use signals to wake up + * since that would mess with the scheduling. We'll have to do + * some additional work to support wakeup on virtio devices or + * similar, perhaps implementing a fake RTC controller that can + * trigger wakeup (and request the appropriate scheduling from + * the external scheduler when going to suspend.) + */ + if (time_travel_mode != TT_MODE_EXTERNAL) + register_pm_wake_signal(); + + suspend_set_ops(&um_suspend_ops); + + return 0; +} + +late_initcall(init_pm_wake_signal); +#endif diff --git a/arch/um/kernel/um_arch.h b/arch/um/kernel/um_arch.h new file mode 100644 index 000000000..1e07fb7ee --- /dev/null +++ b/arch/um/kernel/um_arch.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __UML_ARCH_H__ +#define __UML_ARCH_H__ + +extern void * __init uml_load_file(const char *filename, unsigned long long *size); + +#ifdef CONFIG_OF +extern void __init uml_dtb_init(void); +#else +static inline void uml_dtb_init(void) { } +#endif + +#endif diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c new file mode 100644 index 000000000..72bc60ade --- /dev/null +++ b/arch/um/kernel/umid.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#include <asm/errno.h> +#include <init.h> +#include <kern.h> +#include <os.h> + +/* Changed by set_umid_arg */ +static int umid_inited; + +static int __init set_umid_arg(char *name, int *add) +{ + int err; + + if (umid_inited) { + os_warn("umid already set\n"); + return 0; + } + + *add = 0; + err = set_umid(name); + if (err == -EEXIST) + os_warn("umid '%s' already in use\n", name); + else if (!err) + umid_inited = 1; + + return 0; +} + +__uml_setup("umid=", set_umid_arg, +"umid=<name>\n" +" This is used to assign a unique identity to this UML machine and\n" +" is used for naming the pid file and management console socket.\n\n" +); + diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S new file mode 100644 index 000000000..71a59b8ad --- /dev/null +++ b/arch/um/kernel/uml.lds.S @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/vmlinux.lds.h> +#include <asm/page.h> + +OUTPUT_FORMAT(ELF_FORMAT) +OUTPUT_ARCH(ELF_ARCH) +ENTRY(_start) +jiffies = jiffies_64; + +VERSION { + { + local: *; + }; +} + +SECTIONS +{ + /* This must contain the right address - not quite the default ELF one.*/ + PROVIDE (__executable_start = START); + /* Static binaries stick stuff here, like the sigreturn trampoline, + * invisibly to objdump. So, just make __binary_start equal to the very + * beginning of the executable, and if there are unmapped pages after this, + * they are forever unusable. + */ + __binary_start = START; + + . = START + SIZEOF_HEADERS; + . = ALIGN(PAGE_SIZE); + + _text = .; + INIT_TEXT_SECTION(0) + + .text : + { + _stext = .; + TEXT_TEXT + SCHED_TEXT + CPUIDLE_TEXT + LOCK_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT + *(.fixup) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.gnu.linkonce.t*) + } + + . = ALIGN(PAGE_SIZE); + .syscall_stub : { + __syscall_stub_start = .; + *(.__syscall_stub*) + __syscall_stub_end = .; + } + + /* + * These are needed even in a static link, even if they wind up being empty. + * Newer glibc needs these __rel{,a}_iplt_{start,end} symbols. + */ + .rel.plt : { + *(.rel.plt) + PROVIDE_HIDDEN(__rel_iplt_start = .); + *(.rel.iplt) + PROVIDE_HIDDEN(__rel_iplt_end = .); + } + .rela.plt : { + *(.rela.plt) + PROVIDE_HIDDEN(__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN(__rela_iplt_end = .); + } + + #include <asm/common.lds.S> + + __init_begin = .; + init.data : { INIT_DATA } + __init_end = .; + + .data : + { + INIT_TASK_DATA(KERNEL_STACK_SIZE) + . = ALIGN(KERNEL_STACK_SIZE); + *(.data..init_irqstack) + DATA_DATA + *(.gnu.linkonce.d*) + CONSTRUCTORS + } + .data1 : { *(.data1) } + .ctors : + { + *(.ctors) + } + .dtors : + { + *(.dtors) + } + + .got : { *(.got.plt) *(.got) } + .eh_frame : { KEEP (*(.eh_frame)) } + .dynamic : { *(.dynamic) } + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : { *(.sdata) } + _edata = .; + PROVIDE (edata = .); + . = ALIGN(PAGE_SIZE); + __bss_start = .; + PROVIDE(_bss_start = .); + SBSS(0) + BSS(0) + __bss_stop = .; + _end = .; + PROVIDE (end = .); + + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + + DISCARDS +} diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S new file mode 100644 index 000000000..53d719c04 --- /dev/null +++ b/arch/um/kernel/vmlinux.lds.S @@ -0,0 +1,8 @@ +#define RUNTIME_DISCARD_EXIT +KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER); + +#ifdef CONFIG_LD_SCRIPT_STATIC +#include "uml.lds.S" +#else +#include "dyn.lds.S" +#endif |