diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
commit | 76cb841cb886eef6b3bee341a2266c76578724ad (patch) | |
tree | f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/s390/mm | |
parent | Initial commit. (diff) | |
download | linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip |
Adding upstream version 4.19.249.upstream/4.19.249upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/s390/mm')
-rw-r--r-- | arch/s390/mm/Makefile | 13 | ||||
-rw-r--r-- | arch/s390/mm/cmm.c | 487 | ||||
-rw-r--r-- | arch/s390/mm/dump_pagetables.c | 249 | ||||
-rw-r--r-- | arch/s390/mm/extmem.c | 763 | ||||
-rw-r--r-- | arch/s390/mm/fault.c | 837 | ||||
-rw-r--r-- | arch/s390/mm/gmap.c | 2645 | ||||
-rw-r--r-- | arch/s390/mm/gup.c | 310 | ||||
-rw-r--r-- | arch/s390/mm/hugetlbpage.c | 370 | ||||
-rw-r--r-- | arch/s390/mm/init.c | 251 | ||||
-rw-r--r-- | arch/s390/mm/maccess.c | 213 | ||||
-rw-r--r-- | arch/s390/mm/mem_detect.c | 62 | ||||
-rw-r--r-- | arch/s390/mm/mmap.c | 206 | ||||
-rw-r--r-- | arch/s390/mm/page-states.c | 281 | ||||
-rw-r--r-- | arch/s390/mm/pageattr.c | 386 | ||||
-rw-r--r-- | arch/s390/mm/pgalloc.c | 681 | ||||
-rw-r--r-- | arch/s390/mm/pgtable.c | 1126 | ||||
-rw-r--r-- | arch/s390/mm/vmem.c | 443 |
17 files changed, 9323 insertions, 0 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile new file mode 100644 index 000000000..33fe41850 --- /dev/null +++ b/arch/s390/mm/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux s390-specific parts of the memory manager. +# + +obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o +obj-y += page-states.o gup.o pageattr.o mem_detect.o +obj-y += pgtable.o pgalloc.o + +obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PGSTE) += gmap.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c new file mode 100644 index 000000000..a51c892f1 --- /dev/null +++ b/arch/s390/mm/cmm.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Collaborative memory management interface. + * + * Copyright IBM Corp 2003, 2010 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, + * + */ + +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/sysctl.h> +#include <linux/ctype.h> +#include <linux/swap.h> +#include <linux/kthread.h> +#include <linux/oom.h> +#include <linux/suspend.h> +#include <linux/uaccess.h> + +#include <asm/pgalloc.h> +#include <asm/diag.h> + +#ifdef CONFIG_CMM_IUCV +static char *cmm_default_sender = "VMRMSVM"; +#endif +static char *sender; +module_param(sender, charp, 0400); +MODULE_PARM_DESC(sender, + "Guest name that may send SMSG messages (default VMRMSVM)"); + +#include "../../../drivers/s390/net/smsgiucv.h" + +#define CMM_NR_PAGES ((PAGE_SIZE / sizeof(unsigned long)) - 2) + +struct cmm_page_array { + struct cmm_page_array *next; + unsigned long index; + unsigned long pages[CMM_NR_PAGES]; +}; + +static long cmm_pages; +static long cmm_timed_pages; +static volatile long cmm_pages_target; +static volatile long cmm_timed_pages_target; +static long cmm_timeout_pages; +static long cmm_timeout_seconds; +static int cmm_suspended; + +static struct cmm_page_array *cmm_page_list; +static struct cmm_page_array *cmm_timed_page_list; +static DEFINE_SPINLOCK(cmm_lock); + +static struct task_struct *cmm_thread_ptr; +static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait); + +static void cmm_timer_fn(struct timer_list *); +static void cmm_set_timer(void); +static DEFINE_TIMER(cmm_timer, cmm_timer_fn); + +static long cmm_alloc_pages(long nr, long *counter, + struct cmm_page_array **list) +{ + struct cmm_page_array *pa, *npa; + unsigned long addr; + + while (nr) { + addr = __get_free_page(GFP_NOIO); + if (!addr) + break; + spin_lock(&cmm_lock); + pa = *list; + if (!pa || pa->index >= CMM_NR_PAGES) { + /* Need a new page for the page list. */ + spin_unlock(&cmm_lock); + npa = (struct cmm_page_array *) + __get_free_page(GFP_NOIO); + if (!npa) { + free_page(addr); + break; + } + spin_lock(&cmm_lock); + pa = *list; + if (!pa || pa->index >= CMM_NR_PAGES) { + npa->next = pa; + npa->index = 0; + pa = npa; + *list = pa; + } else + free_page((unsigned long) npa); + } + diag10_range(addr >> PAGE_SHIFT, 1); + pa->pages[pa->index++] = addr; + (*counter)++; + spin_unlock(&cmm_lock); + nr--; + } + return nr; +} + +static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list) +{ + struct cmm_page_array *pa; + unsigned long addr; + + spin_lock(&cmm_lock); + pa = *list; + while (nr) { + if (!pa || pa->index <= 0) + break; + addr = pa->pages[--pa->index]; + if (pa->index == 0) { + pa = pa->next; + free_page((unsigned long) *list); + *list = pa; + } + free_page(addr); + (*counter)--; + nr--; + } + spin_unlock(&cmm_lock); + return nr; +} + +static int cmm_oom_notify(struct notifier_block *self, + unsigned long dummy, void *parm) +{ + unsigned long *freed = parm; + long nr = 256; + + nr = cmm_free_pages(nr, &cmm_timed_pages, &cmm_timed_page_list); + if (nr > 0) + nr = cmm_free_pages(nr, &cmm_pages, &cmm_page_list); + cmm_pages_target = cmm_pages; + cmm_timed_pages_target = cmm_timed_pages; + *freed += 256 - nr; + return NOTIFY_OK; +} + +static struct notifier_block cmm_oom_nb = { + .notifier_call = cmm_oom_notify, +}; + +static int cmm_thread(void *dummy) +{ + int rc; + + while (1) { + rc = wait_event_interruptible(cmm_thread_wait, + (!cmm_suspended && (cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target)) || + kthread_should_stop()); + if (kthread_should_stop() || rc == -ERESTARTSYS) { + cmm_pages_target = cmm_pages; + cmm_timed_pages_target = cmm_timed_pages; + break; + } + if (cmm_pages_target > cmm_pages) { + if (cmm_alloc_pages(1, &cmm_pages, &cmm_page_list)) + cmm_pages_target = cmm_pages; + } else if (cmm_pages_target < cmm_pages) { + cmm_free_pages(1, &cmm_pages, &cmm_page_list); + } + if (cmm_timed_pages_target > cmm_timed_pages) { + if (cmm_alloc_pages(1, &cmm_timed_pages, + &cmm_timed_page_list)) + cmm_timed_pages_target = cmm_timed_pages; + } else if (cmm_timed_pages_target < cmm_timed_pages) { + cmm_free_pages(1, &cmm_timed_pages, + &cmm_timed_page_list); + } + if (cmm_timed_pages > 0 && !timer_pending(&cmm_timer)) + cmm_set_timer(); + } + return 0; +} + +static void cmm_kick_thread(void) +{ + wake_up(&cmm_thread_wait); +} + +static void cmm_set_timer(void) +{ + if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) { + if (timer_pending(&cmm_timer)) + del_timer(&cmm_timer); + return; + } + mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); +} + +static void cmm_timer_fn(struct timer_list *unused) +{ + long nr; + + nr = cmm_timed_pages_target - cmm_timeout_pages; + if (nr < 0) + cmm_timed_pages_target = 0; + else + cmm_timed_pages_target = nr; + cmm_kick_thread(); + cmm_set_timer(); +} + +static void cmm_set_pages(long nr) +{ + cmm_pages_target = nr; + cmm_kick_thread(); +} + +static long cmm_get_pages(void) +{ + return cmm_pages; +} + +static void cmm_add_timed_pages(long nr) +{ + cmm_timed_pages_target += nr; + cmm_kick_thread(); +} + +static long cmm_get_timed_pages(void) +{ + return cmm_timed_pages; +} + +static void cmm_set_timeout(long nr, long seconds) +{ + cmm_timeout_pages = nr; + cmm_timeout_seconds = seconds; + cmm_set_timer(); +} + +static int cmm_skip_blanks(char *cp, char **endp) +{ + char *str; + + for (str = cp; *str == ' ' || *str == '\t'; str++) + ; + *endp = str; + return str != cp; +} + +static int cmm_pages_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long nr = cmm_get_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + cmm_set_pages(nr); + return 0; +} + +static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + long nr = cmm_get_timed_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + cmm_add_timed_pages(nr); + return 0; +} + +static int cmm_timeout_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char buf[64], *p; + long nr, seconds; + unsigned int len; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = min(*lenp, sizeof(buf)); + if (copy_from_user(buf, buffer, len)) + return -EFAULT; + buf[len - 1] = '\0'; + cmm_skip_blanks(buf, &p); + nr = simple_strtoul(p, &p, 0); + cmm_skip_blanks(p, &p); + seconds = simple_strtoul(p, &p, 0); + cmm_set_timeout(nr, seconds); + *ppos += *lenp; + } else { + len = sprintf(buf, "%ld %ld\n", + cmm_timeout_pages, cmm_timeout_seconds); + if (len > *lenp) + len = *lenp; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table cmm_table[] = { + { + .procname = "cmm_pages", + .mode = 0644, + .proc_handler = cmm_pages_handler, + }, + { + .procname = "cmm_timed_pages", + .mode = 0644, + .proc_handler = cmm_timed_pages_handler, + }, + { + .procname = "cmm_timeout", + .mode = 0644, + .proc_handler = cmm_timeout_handler, + }, + { } +}; + +static struct ctl_table cmm_dir_table[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = cmm_table, + }, + { } +}; + +#ifdef CONFIG_CMM_IUCV +#define SMSG_PREFIX "CMM" +static void cmm_smsg_target(const char *from, char *msg) +{ + long nr, seconds; + + if (strlen(sender) > 0 && strcmp(from, sender) != 0) + return; + if (!cmm_skip_blanks(msg + strlen(SMSG_PREFIX), &msg)) + return; + if (strncmp(msg, "SHRINK", 6) == 0) { + if (!cmm_skip_blanks(msg + 6, &msg)) + return; + nr = simple_strtoul(msg, &msg, 0); + cmm_skip_blanks(msg, &msg); + if (*msg == '\0') + cmm_set_pages(nr); + } else if (strncmp(msg, "RELEASE", 7) == 0) { + if (!cmm_skip_blanks(msg + 7, &msg)) + return; + nr = simple_strtoul(msg, &msg, 0); + cmm_skip_blanks(msg, &msg); + if (*msg == '\0') + cmm_add_timed_pages(nr); + } else if (strncmp(msg, "REUSE", 5) == 0) { + if (!cmm_skip_blanks(msg + 5, &msg)) + return; + nr = simple_strtoul(msg, &msg, 0); + if (!cmm_skip_blanks(msg, &msg)) + return; + seconds = simple_strtoul(msg, &msg, 0); + cmm_skip_blanks(msg, &msg); + if (*msg == '\0') + cmm_set_timeout(nr, seconds); + } +} +#endif + +static struct ctl_table_header *cmm_sysctl_header; + +static int cmm_suspend(void) +{ + cmm_suspended = 1; + cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); + cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); + return 0; +} + +static int cmm_resume(void) +{ + cmm_suspended = 0; + cmm_kick_thread(); + return 0; +} + +static int cmm_power_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + switch (event) { + case PM_POST_HIBERNATION: + return cmm_resume(); + case PM_HIBERNATION_PREPARE: + return cmm_suspend(); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block cmm_power_notifier = { + .notifier_call = cmm_power_event, +}; + +static int __init cmm_init(void) +{ + int rc = -ENOMEM; + + cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + if (!cmm_sysctl_header) + goto out_sysctl; +#ifdef CONFIG_CMM_IUCV + /* convert sender to uppercase characters */ + if (sender) { + int len = strlen(sender); + while (len--) + sender[len] = toupper(sender[len]); + } else { + sender = cmm_default_sender; + } + + rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); + if (rc < 0) + goto out_smsg; +#endif + rc = register_oom_notifier(&cmm_oom_nb); + if (rc < 0) + goto out_oom_notify; + rc = register_pm_notifier(&cmm_power_notifier); + if (rc) + goto out_pm; + cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); + if (!IS_ERR(cmm_thread_ptr)) + return 0; + + rc = PTR_ERR(cmm_thread_ptr); + unregister_pm_notifier(&cmm_power_notifier); +out_pm: + unregister_oom_notifier(&cmm_oom_nb); +out_oom_notify: +#ifdef CONFIG_CMM_IUCV + smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); +out_smsg: +#endif + unregister_sysctl_table(cmm_sysctl_header); +out_sysctl: + del_timer_sync(&cmm_timer); + return rc; +} +module_init(cmm_init); + +static void __exit cmm_exit(void) +{ + unregister_sysctl_table(cmm_sysctl_header); +#ifdef CONFIG_CMM_IUCV + smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); +#endif + unregister_pm_notifier(&cmm_power_notifier); + unregister_oom_notifier(&cmm_oom_nb); + kthread_stop(cmm_thread_ptr); + del_timer_sync(&cmm_timer); + cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); + cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); +} +module_exit(cmm_exit); + +MODULE_LICENSE("GPL"); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c new file mode 100644 index 000000000..7cdea2ec5 --- /dev/null +++ b/arch/s390/mm/dump_pagetables.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <asm/sections.h> +#include <asm/pgtable.h> + +static unsigned long max_addr; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +enum address_markers_idx { + IDENTITY_NR = 0, + KERNEL_START_NR, + KERNEL_END_NR, + VMEMMAP_NR, + VMALLOC_NR, + MODULES_NR, +}; + +static struct addr_marker address_markers[] = { + [IDENTITY_NR] = {0, "Identity Mapping"}, + [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, + [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, + [VMEMMAP_NR] = {0, "vmemmap Area"}, + [VMALLOC_NR] = {0, "vmalloc Area"}, + [MODULES_NR] = {0, "Modules Area"}, + { -1, NULL } +}; + +struct pg_state { + int level; + unsigned int current_prot; + unsigned long start_address; + unsigned long current_address; + const struct addr_marker *marker; +}; + +static void print_prot(struct seq_file *m, unsigned int pr, int level) +{ + static const char * const level_name[] = + { "ASCE", "PGD", "PUD", "PMD", "PTE" }; + + seq_printf(m, "%s ", level_name[level]); + if (pr & _PAGE_INVALID) { + seq_printf(m, "I\n"); + return; + } + seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); +} + +static void note_page(struct seq_file *m, struct pg_state *st, + unsigned int new_prot, int level) +{ + static const char units[] = "KMGTPE"; + int width = sizeof(unsigned long) * 2; + const char *unit = units; + unsigned int prot, cur; + unsigned long delta; + + /* + * If we have a "break" in the series, we need to flush the state + * that we have now. "break" is either changing perms, levels or + * address space marker. + */ + prot = new_prot; + cur = st->current_prot; + + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->level = level; + st->marker = address_markers; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } else if (prot != cur || level != st->level || + st->current_address >= st->marker[1].start_address) { + /* Print the actual finished series */ + seq_printf(m, "0x%0*lx-0x%0*lx", + width, st->start_address, + width, st->current_address); + delta = (st->current_address - st->start_address) >> 10; + while (!(delta & 0x3ff) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + print_prot(m, st->current_prot, st->level); + if (st->current_address >= st->marker[1].start_address) { + st->marker++; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + st->start_address = st->current_address; + st->current_prot = new_prot; + st->level = level; + } +} + +/* + * The actual page table walker functions. In order to keep the + * implementation of print_prot() short, we only check and pass + * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region, + * segment or page table entry is invalid or read-only. + * After all it's just a hint that the current level being walked + * contains an invalid or read-only entry. + */ +static void walk_pte_level(struct seq_file *m, struct pg_state *st, + pmd_t *pmd, unsigned long addr) +{ + unsigned int prot; + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { + st->current_address = addr; + pte = pte_offset_kernel(pmd, addr); + prot = pte_val(*pte) & + (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); + note_page(m, st, prot, 4); + addr += PAGE_SIZE; + } +} + +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, + pud_t *pud, unsigned long addr) +{ + unsigned int prot; + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) { + st->current_address = addr; + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + if (pmd_large(*pmd)) { + prot = pmd_val(*pmd) & + (_SEGMENT_ENTRY_PROTECT | + _SEGMENT_ENTRY_NOEXEC); + note_page(m, st, prot, 3); + } else + walk_pte_level(m, st, pmd, addr); + } else + note_page(m, st, _PAGE_INVALID, 3); + addr += PMD_SIZE; + } +} + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, + p4d_t *p4d, unsigned long addr) +{ + unsigned int prot; + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) { + st->current_address = addr; + pud = pud_offset(p4d, addr); + if (!pud_none(*pud)) + if (pud_large(*pud)) { + prot = pud_val(*pud) & + (_REGION_ENTRY_PROTECT | + _REGION_ENTRY_NOEXEC); + note_page(m, st, prot, 2); + } else + walk_pmd_level(m, st, pud, addr); + else + note_page(m, st, _PAGE_INVALID, 2); + addr += PUD_SIZE; + } +} + +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, + pgd_t *pgd, unsigned long addr) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++) { + st->current_address = addr; + p4d = p4d_offset(pgd, addr); + if (!p4d_none(*p4d)) + walk_pud_level(m, st, p4d, addr); + else + note_page(m, st, _PAGE_INVALID, 2); + addr += P4D_SIZE; + } +} + +static void walk_pgd_level(struct seq_file *m) +{ + unsigned long addr = 0; + struct pg_state st; + pgd_t *pgd; + int i; + + memset(&st, 0, sizeof(st)); + for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { + st.current_address = addr; + pgd = pgd_offset_k(addr); + if (!pgd_none(*pgd)) + walk_p4d_level(m, &st, pgd, addr); + else + note_page(m, &st, _PAGE_INVALID, 1); + addr += PGDIR_SIZE; + cond_resched(); + } + /* Flush out the last page */ + st.current_address = max_addr; + note_page(m, &st, 0, 0); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + walk_pgd_level(m); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int pt_dump_init(void) +{ + /* + * Figure out the maximum virtual address being accessible with the + * kernel ASCE. We need this to keep the page table walker functions + * from accessing non-existent entries. + */ + max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = 1UL << (max_addr * 11 + 31); + address_markers[MODULES_NR].start_address = MODULES_VADDR; + address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; + address_markers[VMALLOC_NR].start_address = VMALLOC_START; + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + return 0; +} +device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c new file mode 100644 index 000000000..84111a43e --- /dev/null +++ b/arch/s390/mm/extmem.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author(s)......: Carsten Otte <cotte@de.ibm.com> + * Rob M van der Heij <rvdheij@nl.ibm.com> + * Steven Shultz <shultzss@us.ibm.com> + * Bugreports.to..: <Linux390@de.ibm.com> + * Copyright IBM Corp. 2002, 2004 + */ + +#define KMSG_COMPONENT "extmem" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/bootmem.h> +#include <linux/ctype.h> +#include <linux/ioport.h> +#include <asm/diag.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/ebcdic.h> +#include <asm/errno.h> +#include <asm/extmem.h> +#include <asm/cpcmd.h> +#include <asm/setup.h> + +#define DCSS_LOADSHR 0x00 +#define DCSS_LOADNSR 0x04 +#define DCSS_PURGESEG 0x08 +#define DCSS_FINDSEG 0x0c +#define DCSS_LOADNOLY 0x10 +#define DCSS_SEGEXT 0x18 +#define DCSS_LOADSHRX 0x20 +#define DCSS_LOADNSRX 0x24 +#define DCSS_FINDSEGX 0x2c +#define DCSS_SEGEXTX 0x38 +#define DCSS_FINDSEGA 0x0c + +struct qrange { + unsigned long start; /* last byte type */ + unsigned long end; /* last byte reserved */ +}; + +struct qout64 { + unsigned long segstart; + unsigned long segend; + int segcnt; + int segrcnt; + struct qrange range[6]; +}; + +struct qrange_old { + unsigned int start; /* last byte type */ + unsigned int end; /* last byte reserved */ +}; + +/* output area format for the Diag x'64' old subcode x'18' */ +struct qout64_old { + int segstart; + int segend; + int segcnt; + int segrcnt; + struct qrange_old range[6]; +}; + +struct qin64 { + char qopcode; + char rsrv1[3]; + char qrcode; + char rsrv2[3]; + char qname[8]; + unsigned int qoutptr; + short int qoutlen; +}; + +struct dcss_segment { + struct list_head list; + char dcss_name[8]; + char res_name[16]; + unsigned long start_addr; + unsigned long end; + atomic_t ref_count; + int do_nonshared; + unsigned int vm_segtype; + struct qrange range[6]; + int segcnt; + struct resource *res; +}; + +static DEFINE_MUTEX(dcss_lock); +static LIST_HEAD(dcss_list); +static char *segtype_string[] = { "SW", "EW", "SR", "ER", "SN", "EN", "SC", + "EW/EN-MIXED" }; +static int loadshr_scode, loadnsr_scode; +static int segext_scode, purgeseg_scode; +static int scode_set; + +/* set correct Diag x'64' subcodes. */ +static int +dcss_set_subcodes(void) +{ + char *name = kmalloc(8, GFP_KERNEL | GFP_DMA); + unsigned long rx, ry; + int rc; + + if (name == NULL) + return -ENOMEM; + + rx = (unsigned long) name; + ry = DCSS_FINDSEGX; + + strcpy(name, "dummy"); + diag_stat_inc(DIAG_STAT_X064); + asm volatile( + " diag %0,%1,0x64\n" + "0: ipm %2\n" + " srl %2,28\n" + " j 2f\n" + "1: la %2,3\n" + "2:\n" + EX_TABLE(0b, 1b) + : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc", "memory"); + + kfree(name); + /* Diag x'64' new subcodes are supported, set to new subcodes */ + if (rc != 3) { + loadshr_scode = DCSS_LOADSHRX; + loadnsr_scode = DCSS_LOADNSRX; + purgeseg_scode = DCSS_PURGESEG; + segext_scode = DCSS_SEGEXTX; + return 0; + } + /* Diag x'64' new subcodes are not supported, set to old subcodes */ + loadshr_scode = DCSS_LOADNOLY; + loadnsr_scode = DCSS_LOADNSR; + purgeseg_scode = DCSS_PURGESEG; + segext_scode = DCSS_SEGEXT; + return 0; +} + +/* + * Create the 8 bytes, ebcdic VM segment name from + * an ascii name. + */ +static void +dcss_mkname(char *name, char *dcss_name) +{ + int i; + + for (i = 0; i < 8; i++) { + if (name[i] == '\0') + break; + dcss_name[i] = toupper(name[i]); + } + for (; i < 8; i++) + dcss_name[i] = ' '; + ASCEBC(dcss_name, 8); +} + + +/* + * search all segments in dcss_list, and return the one + * namend *name. If not found, return NULL. + */ +static struct dcss_segment * +segment_by_name (char *name) +{ + char dcss_name[9]; + struct list_head *l; + struct dcss_segment *tmp, *retval = NULL; + + BUG_ON(!mutex_is_locked(&dcss_lock)); + dcss_mkname (name, dcss_name); + list_for_each (l, &dcss_list) { + tmp = list_entry (l, struct dcss_segment, list); + if (memcmp(tmp->dcss_name, dcss_name, 8) == 0) { + retval = tmp; + break; + } + } + return retval; +} + + +/* + * Perform a function on a dcss segment. + */ +static inline int +dcss_diag(int *func, void *parameter, + unsigned long *ret1, unsigned long *ret2) +{ + unsigned long rx, ry; + int rc; + + if (scode_set == 0) { + rc = dcss_set_subcodes(); + if (rc < 0) + return rc; + scode_set = 1; + } + rx = (unsigned long) parameter; + ry = (unsigned long) *func; + + /* 64-bit Diag x'64' new subcode, keep in 64-bit addressing mode */ + diag_stat_inc(DIAG_STAT_X064); + if (*func > DCSS_SEGEXT) + asm volatile( + " diag %0,%1,0x64\n" + " ipm %2\n" + " srl %2,28\n" + : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + /* 31-bit Diag x'64' old subcode, switch to 31-bit addressing mode */ + else + asm volatile( + " sam31\n" + " diag %0,%1,0x64\n" + " sam64\n" + " ipm %2\n" + " srl %2,28\n" + : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc"); + *ret1 = rx; + *ret2 = ry; + return rc; +} + +static inline int +dcss_diag_translate_rc (int vm_rc) { + if (vm_rc == 44) + return -ENOENT; + return -EIO; +} + + +/* do a diag to get info about a segment. + * fills start_address, end and vm_segtype fields + */ +static int +query_segment_type (struct dcss_segment *seg) +{ + unsigned long dummy, vmrc; + int diag_cc, rc, i; + struct qout64 *qout; + struct qin64 *qin; + + qin = kmalloc(sizeof(*qin), GFP_KERNEL | GFP_DMA); + qout = kmalloc(sizeof(*qout), GFP_KERNEL | GFP_DMA); + if ((qin == NULL) || (qout == NULL)) { + rc = -ENOMEM; + goto out_free; + } + + /* initialize diag input parameters */ + qin->qopcode = DCSS_FINDSEGA; + qin->qoutptr = (unsigned long) qout; + qin->qoutlen = sizeof(struct qout64); + memcpy (qin->qname, seg->dcss_name, 8); + + diag_cc = dcss_diag(&segext_scode, qin, &dummy, &vmrc); + + if (diag_cc < 0) { + rc = diag_cc; + goto out_free; + } + if (diag_cc > 1) { + pr_warn("Querying a DCSS type failed with rc=%ld\n", vmrc); + rc = dcss_diag_translate_rc (vmrc); + goto out_free; + } + + /* Only old format of output area of Diagnose x'64' is supported, + copy data for the new format. */ + if (segext_scode == DCSS_SEGEXT) { + struct qout64_old *qout_old; + qout_old = kzalloc(sizeof(*qout_old), GFP_KERNEL | GFP_DMA); + if (qout_old == NULL) { + rc = -ENOMEM; + goto out_free; + } + memcpy(qout_old, qout, sizeof(struct qout64_old)); + qout->segstart = (unsigned long) qout_old->segstart; + qout->segend = (unsigned long) qout_old->segend; + qout->segcnt = qout_old->segcnt; + qout->segrcnt = qout_old->segrcnt; + + if (qout->segcnt > 6) + qout->segrcnt = 6; + for (i = 0; i < qout->segrcnt; i++) { + qout->range[i].start = + (unsigned long) qout_old->range[i].start; + qout->range[i].end = + (unsigned long) qout_old->range[i].end; + } + kfree(qout_old); + } + if (qout->segcnt > 6) { + rc = -EOPNOTSUPP; + goto out_free; + } + + if (qout->segcnt == 1) { + seg->vm_segtype = qout->range[0].start & 0xff; + } else { + /* multi-part segment. only one type supported here: + - all parts are contiguous + - all parts are either EW or EN type + - maximum 6 parts allowed */ + unsigned long start = qout->segstart >> PAGE_SHIFT; + for (i=0; i<qout->segcnt; i++) { + if (((qout->range[i].start & 0xff) != SEG_TYPE_EW) && + ((qout->range[i].start & 0xff) != SEG_TYPE_EN)) { + rc = -EOPNOTSUPP; + goto out_free; + } + if (start != qout->range[i].start >> PAGE_SHIFT) { + rc = -EOPNOTSUPP; + goto out_free; + } + start = (qout->range[i].end >> PAGE_SHIFT) + 1; + } + seg->vm_segtype = SEG_TYPE_EWEN; + } + + /* analyze diag output and update seg */ + seg->start_addr = qout->segstart; + seg->end = qout->segend; + + memcpy (seg->range, qout->range, 6*sizeof(struct qrange)); + seg->segcnt = qout->segcnt; + + rc = 0; + + out_free: + kfree(qin); + kfree(qout); + return rc; +} + +/* + * get info about a segment + * possible return values: + * -ENOSYS : we are not running on VM + * -EIO : could not perform query diagnose + * -ENOENT : no such segment + * -EOPNOTSUPP: multi-part segment cannot be used with linux + * -ENOMEM : out of memory + * 0 .. 6 : type of segment as defined in include/asm-s390/extmem.h + */ +int +segment_type (char* name) +{ + int rc; + struct dcss_segment seg; + + if (!MACHINE_IS_VM) + return -ENOSYS; + + dcss_mkname(name, seg.dcss_name); + rc = query_segment_type (&seg); + if (rc < 0) + return rc; + return seg.vm_segtype; +} + +/* + * check if segment collides with other segments that are currently loaded + * returns 1 if this is the case, 0 if no collision was found + */ +static int +segment_overlaps_others (struct dcss_segment *seg) +{ + struct list_head *l; + struct dcss_segment *tmp; + + BUG_ON(!mutex_is_locked(&dcss_lock)); + list_for_each(l, &dcss_list) { + tmp = list_entry(l, struct dcss_segment, list); + if ((tmp->start_addr >> 20) > (seg->end >> 20)) + continue; + if ((tmp->end >> 20) < (seg->start_addr >> 20)) + continue; + if (seg == tmp) + continue; + return 1; + } + return 0; +} + +/* + * real segment loading function, called from segment_load + */ +static int +__segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) +{ + unsigned long start_addr, end_addr, dummy; + struct dcss_segment *seg; + int rc, diag_cc; + + start_addr = end_addr = 0; + seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); + if (seg == NULL) { + rc = -ENOMEM; + goto out; + } + dcss_mkname (name, seg->dcss_name); + rc = query_segment_type (seg); + if (rc < 0) + goto out_free; + + if (loadshr_scode == DCSS_LOADSHRX) { + if (segment_overlaps_others(seg)) { + rc = -EBUSY; + goto out_free; + } + } + + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + + if (rc) + goto out_free; + + seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); + if (seg->res == NULL) { + rc = -ENOMEM; + goto out_shared; + } + seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + seg->res->start = seg->start_addr; + seg->res->end = seg->end; + memcpy(&seg->res_name, seg->dcss_name, 8); + EBCASC(seg->res_name, 8); + seg->res_name[8] = '\0'; + strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); + seg->res->name = seg->res_name; + rc = seg->vm_segtype; + if (rc == SEG_TYPE_SC || + ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + seg->res->flags |= IORESOURCE_READONLY; + if (request_resource(&iomem_resource, seg->res)) { + rc = -EBUSY; + kfree(seg->res); + goto out_shared; + } + + if (do_nonshared) + diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, + &start_addr, &end_addr); + else + diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name, + &start_addr, &end_addr); + if (diag_cc < 0) { + dcss_diag(&purgeseg_scode, seg->dcss_name, + &dummy, &dummy); + rc = diag_cc; + goto out_resource; + } + if (diag_cc > 1) { + pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); + rc = dcss_diag_translate_rc(end_addr); + dcss_diag(&purgeseg_scode, seg->dcss_name, + &dummy, &dummy); + goto out_resource; + } + seg->start_addr = start_addr; + seg->end = end_addr; + seg->do_nonshared = do_nonshared; + atomic_set(&seg->ref_count, 1); + list_add(&seg->list, &dcss_list); + *addr = seg->start_addr; + *end = seg->end; + if (do_nonshared) + pr_info("DCSS %s of range %p to %p and type %s loaded as " + "exclusive-writable\n", name, (void*) seg->start_addr, + (void*) seg->end, segtype_string[seg->vm_segtype]); + else { + pr_info("DCSS %s of range %p to %p and type %s loaded in " + "shared access mode\n", name, (void*) seg->start_addr, + (void*) seg->end, segtype_string[seg->vm_segtype]); + } + goto out; + out_resource: + release_resource(seg->res); + kfree(seg->res); + out_shared: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + out_free: + kfree(seg); + out: + return rc; +} + +/* + * this function loads a DCSS segment + * name : name of the DCSS + * do_nonshared : 0 indicates that the dcss should be shared with other linux images + * 1 indicates that the dcss should be exclusive for this linux image + * addr : will be filled with start address of the segment + * end : will be filled with end address of the segment + * return values: + * -ENOSYS : we are not running on VM + * -EIO : could not perform query or load diagnose + * -ENOENT : no such segment + * -EOPNOTSUPP: multi-part segment cannot be used with linux + * -ENOSPC : segment cannot be used (overlaps with storage) + * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -ERANGE : segment cannot be used (exceeds kernel mapping range) + * -EPERM : segment is currently loaded with incompatible permissions + * -ENOMEM : out of memory + * 0 .. 6 : type of segment as defined in include/asm-s390/extmem.h + */ +int +segment_load (char *name, int do_nonshared, unsigned long *addr, + unsigned long *end) +{ + struct dcss_segment *seg; + int rc; + + if (!MACHINE_IS_VM) + return -ENOSYS; + + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + if (seg == NULL) + rc = __segment_load (name, do_nonshared, addr, end); + else { + if (do_nonshared == seg->do_nonshared) { + atomic_inc(&seg->ref_count); + *addr = seg->start_addr; + *end = seg->end; + rc = seg->vm_segtype; + } else { + *addr = *end = 0; + rc = -EPERM; + } + } + mutex_unlock(&dcss_lock); + return rc; +} + +/* + * this function modifies the shared state of a DCSS segment. note that + * name : name of the DCSS + * do_nonshared : 0 indicates that the dcss should be shared with other linux images + * 1 indicates that the dcss should be exclusive for this linux image + * return values: + * -EIO : could not perform load diagnose (segment gone!) + * -ENOENT : no such segment (segment gone!) + * -EAGAIN : segment is in use by other exploiters, try later + * -EINVAL : no segment with the given name is currently loaded - name invalid + * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * 0 : operation succeeded + */ +int +segment_modify_shared (char *name, int do_nonshared) +{ + struct dcss_segment *seg; + unsigned long start_addr, end_addr, dummy; + int rc, diag_cc; + + start_addr = end_addr = 0; + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + if (seg == NULL) { + rc = -EINVAL; + goto out_unlock; + } + if (do_nonshared == seg->do_nonshared) { + pr_info("DCSS %s is already in the requested access " + "mode\n", name); + rc = 0; + goto out_unlock; + } + if (atomic_read (&seg->ref_count) != 1) { + pr_warn("DCSS %s is in use and cannot be reloaded\n", name); + rc = -EAGAIN; + goto out_unlock; + } + release_resource(seg->res); + if (do_nonshared) + seg->res->flags &= ~IORESOURCE_READONLY; + else + if (seg->vm_segtype == SEG_TYPE_SR || + seg->vm_segtype == SEG_TYPE_ER) + seg->res->flags |= IORESOURCE_READONLY; + + if (request_resource(&iomem_resource, seg->res)) { + pr_warn("DCSS %s overlaps with used memory resources and cannot be reloaded\n", + name); + rc = -EBUSY; + kfree(seg->res); + goto out_del_mem; + } + + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + if (do_nonshared) + diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, + &start_addr, &end_addr); + else + diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name, + &start_addr, &end_addr); + if (diag_cc < 0) { + rc = diag_cc; + goto out_del_res; + } + if (diag_cc > 1) { + pr_warn("Reloading DCSS %s failed with rc=%ld\n", + name, end_addr); + rc = dcss_diag_translate_rc(end_addr); + goto out_del_res; + } + seg->start_addr = start_addr; + seg->end = end_addr; + seg->do_nonshared = do_nonshared; + rc = 0; + goto out_unlock; + out_del_res: + release_resource(seg->res); + kfree(seg->res); + out_del_mem: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + list_del(&seg->list); + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + kfree(seg); + out_unlock: + mutex_unlock(&dcss_lock); + return rc; +} + +/* + * Decrease the use count of a DCSS segment and remove + * it from the address space if nobody is using it + * any longer. + */ +void +segment_unload(char *name) +{ + unsigned long dummy; + struct dcss_segment *seg; + + if (!MACHINE_IS_VM) + return; + + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + if (seg == NULL) { + pr_err("Unloading unknown DCSS %s failed\n", name); + goto out_unlock; + } + if (atomic_dec_return(&seg->ref_count) != 0) + goto out_unlock; + release_resource(seg->res); + kfree(seg->res); + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + list_del(&seg->list); + dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); + kfree(seg); +out_unlock: + mutex_unlock(&dcss_lock); +} + +/* + * save segment content permanently + */ +void +segment_save(char *name) +{ + struct dcss_segment *seg; + char cmd1[160]; + char cmd2[80]; + int i, response; + + if (!MACHINE_IS_VM) + return; + + mutex_lock(&dcss_lock); + seg = segment_by_name (name); + + if (seg == NULL) { + pr_err("Saving unknown DCSS %s failed\n", name); + goto out; + } + + sprintf(cmd1, "DEFSEG %s", name); + for (i=0; i<seg->segcnt; i++) { + sprintf(cmd1+strlen(cmd1), " %lX-%lX %s", + seg->range[i].start >> PAGE_SHIFT, + seg->range[i].end >> PAGE_SHIFT, + segtype_string[seg->range[i].start & 0xff]); + } + sprintf(cmd2, "SAVESEG %s", name); + response = 0; + cpcmd(cmd1, NULL, 0, &response); + if (response) { + pr_err("Saving a DCSS failed with DEFSEG response code " + "%i\n", response); + goto out; + } + cpcmd(cmd2, NULL, 0, &response); + if (response) { + pr_err("Saving a DCSS failed with SAVESEG response code " + "%i\n", response); + goto out; + } +out: + mutex_unlock(&dcss_lock); +} + +/* + * print appropriate error message for segment_load()/segment_type() + * return code + */ +void segment_warning(int rc, char *seg_name) +{ + switch (rc) { + case -ENOENT: + pr_err("DCSS %s cannot be loaded or queried\n", seg_name); + break; + case -ENOSYS: + pr_err("DCSS %s cannot be loaded or queried without " + "z/VM\n", seg_name); + break; + case -EIO: + pr_err("Loading or querying DCSS %s resulted in a " + "hardware error\n", seg_name); + break; + case -EOPNOTSUPP: + pr_err("DCSS %s has multiple page ranges and cannot be " + "loaded or queried\n", seg_name); + break; + case -ENOSPC: + pr_err("DCSS %s overlaps with used storage and cannot " + "be loaded\n", seg_name); + break; + case -EBUSY: + pr_err("%s needs used memory resources and cannot be " + "loaded or queried\n", seg_name); + break; + case -EPERM: + pr_err("DCSS %s is already loaded in a different access " + "mode\n", seg_name); + break; + case -ENOMEM: + pr_err("There is not enough memory to load or query " + "DCSS %s\n", seg_name); + break; + case -ERANGE: + pr_err("DCSS %s exceeds the kernel mapping range (%lu) " + "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + break; + default: + break; + } +} + +EXPORT_SYMBOL(segment_load); +EXPORT_SYMBOL(segment_unload); +EXPORT_SYMBOL(segment_save); +EXPORT_SYMBOL(segment_type); +EXPORT_SYMBOL(segment_modify_shared); +EXPORT_SYMBOL(segment_warning); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c new file mode 100644 index 000000000..a6e3c7022 --- /dev/null +++ b/arch/s390/mm/fault.c @@ -0,0 +1,837 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Hartmut Penner (hp@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) + * + * Derived from "arch/i386/mm/fault.c" + * Copyright (C) 1995 Linus Torvalds + */ + +#include <linux/kernel_stat.h> +#include <linux/perf_event.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/sched/debug.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/smp.h> +#include <linux/kdebug.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/extable.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/hugetlb.h> +#include <asm/asm-offsets.h> +#include <asm/diag.h> +#include <asm/pgtable.h> +#include <asm/gmap.h> +#include <asm/irq.h> +#include <asm/mmu_context.h> +#include <asm/facility.h> +#include "../kernel/entry.h" + +#define __FAIL_ADDR_MASK -4096L +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000ULL + +#define VM_FAULT_BADCONTEXT 0x010000 +#define VM_FAULT_BADMAP 0x020000 +#define VM_FAULT_BADACCESS 0x040000 +#define VM_FAULT_SIGNAL 0x080000 +#define VM_FAULT_PFAULT 0x100000 + +enum fault_type { + KERNEL_FAULT, + USER_FAULT, + VDSO_FAULT, + GMAP_FAULT, +}; + +static unsigned long store_indication __read_mostly; + +static int __init fault_init(void) +{ + if (test_facility(75)) + store_indication = 0xc00; + return 0; +} +early_initcall(fault_init); + +static inline int notify_page_fault(struct pt_regs *regs) +{ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (kprobes_built_in() && !user_mode(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + return ret; +} + + +/* + * Unlock any spinlocks which will prevent us from getting the + * message out. + */ +void bust_spinlocks(int yes) +{ + if (yes) { + oops_in_progress = 1; + } else { + int loglevel_save = console_loglevel; + console_unblank(); + oops_in_progress = 0; + /* + * OK, the message is on the console. Now we call printk() + * without oops_in_progress set so that printk will give klogd + * a poke. Hold onto your hats... + */ + console_loglevel = 15; + printk(" "); + console_loglevel = loglevel_save; + } +} + +/* + * Find out which address space caused the exception. + */ +static inline enum fault_type get_fault_type(struct pt_regs *regs) +{ + unsigned long trans_exc_code; + + trans_exc_code = regs->int_parm_long & 3; + if (likely(trans_exc_code == 0)) { + /* primary space exception */ + if (IS_ENABLED(CONFIG_PGSTE) && + test_pt_regs_flag(regs, PIF_GUEST_FAULT)) + return GMAP_FAULT; + if (current->thread.mm_segment == USER_DS) + return USER_FAULT; + return KERNEL_FAULT; + } + if (trans_exc_code == 2) { + /* secondary space exception */ + if (current->thread.mm_segment & 1) { + if (current->thread.mm_segment == USER_DS_SACF) + return USER_FAULT; + return KERNEL_FAULT; + } + return VDSO_FAULT; + } + if (trans_exc_code == 1) { + /* access register mode, not used in the kernel */ + return USER_FAULT; + } + /* home space exception -> access via kernel ASCE */ + return KERNEL_FAULT; +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long asce, unsigned long address) +{ + unsigned long *table = __va(asce & _ASCE_ORIGIN); + + pr_alert("AS:%016lx ", asce); + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("R1:%016lx ", *table); + if (*table & _REGION_ENTRY_INVALID) + goto out; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* fallthrough */ + case _ASCE_TYPE_REGION2: + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("R2:%016lx ", *table); + if (*table & _REGION_ENTRY_INVALID) + goto out; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* fallthrough */ + case _ASCE_TYPE_REGION3: + table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("R3:%016lx ", *table); + if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + goto out; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* fallthrough */ + case _ASCE_TYPE_SEGMENT: + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("S:%016lx ", *table); + if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + goto out; + table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + } + table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; + if (bad_address(table)) + goto bad; + pr_cont("P:%016lx ", *table); +out: + pr_cont("\n"); + return; +bad: + pr_cont("BAD\n"); +} + +static void dump_fault_info(struct pt_regs *regs) +{ + unsigned long asce; + + pr_alert("Failing address: %016lx TEID: %016lx\n", + regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + pr_alert("Fault in "); + switch (regs->int_parm_long & 3) { + case 3: + pr_cont("home space "); + break; + case 2: + pr_cont("secondary space "); + break; + case 1: + pr_cont("access register "); + break; + case 0: + pr_cont("primary space "); + break; + } + pr_cont("mode while using "); + switch (get_fault_type(regs)) { + case USER_FAULT: + asce = S390_lowcore.user_asce; + pr_cont("user "); + break; + case VDSO_FAULT: + asce = S390_lowcore.vdso_asce; + pr_cont("vdso "); + break; + case GMAP_FAULT: + asce = ((struct gmap *) S390_lowcore.gmap)->asce; + pr_cont("gmap "); + break; + case KERNEL_FAULT: + asce = S390_lowcore.kernel_asce; + pr_cont("kernel "); + break; + } + pr_cont("ASCE.\n"); + dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); +} + +int show_unhandled_signals = 1; + +void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) +{ + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) + return; + if (!unhandled_signal(current, signr)) + return; + if (!printk_ratelimit()) + return; + printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); + print_vma_addr(KERN_CONT "in ", regs->psw.addr); + printk(KERN_CONT "\n"); + if (is_mm_fault) + dump_fault_info(regs); + show_regs(regs); +} + +/* + * Send SIGSEGV to task. This is an external routine + * to keep the stack usage of do_page_fault small. + */ +static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +{ + report_user_fault(regs, SIGSEGV, 1); + force_sig_fault(SIGSEGV, si_code, + (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK), + current); +} + +static noinline void do_no_context(struct pt_regs *regs) +{ + const struct exception_table_entry *fixup; + + /* Are we prepared to handle this kernel fault? */ + fixup = search_exception_tables(regs->psw.addr); + if (fixup) { + regs->psw.addr = extable_fixup(fixup); + return; + } + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + if (get_fault_type(regs) == KERNEL_FAULT) + printk(KERN_ALERT "Unable to handle kernel pointer dereference" + " in virtual kernel address space\n"); + else + printk(KERN_ALERT "Unable to handle kernel paging request" + " in virtual user address space\n"); + dump_fault_info(regs); + die(regs, "Oops"); + do_exit(SIGKILL); +} + +static noinline void do_low_address(struct pt_regs *regs) +{ + /* Low-address protection hit in kernel mode means + NULL pointer write access in kernel mode. */ + if (regs->psw.mask & PSW_MASK_PSTATE) { + /* Low-address protection hit in user mode 'cannot happen'. */ + die (regs, "Low-address protection"); + do_exit(SIGKILL); + } + + do_no_context(regs); +} + +static noinline void do_sigbus(struct pt_regs *regs) +{ + /* + * Send a sigbus, regardless of whether we were in kernel + * or user mode. + */ + force_sig_fault(SIGBUS, BUS_ADRERR, + (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK), + current); +} + +static noinline int signal_return(struct pt_regs *regs) +{ + u16 instruction; + int rc; + + rc = __get_user(instruction, (u16 __user *) regs->psw.addr); + if (rc) + return rc; + if (instruction == 0x0a77) { + set_pt_regs_flag(regs, PIF_SYSCALL); + regs->int_code = 0x00040077; + return 0; + } else if (instruction == 0x0aad) { + set_pt_regs_flag(regs, PIF_SYSCALL); + regs->int_code = 0x000400ad; + return 0; + } + return -EACCES; +} + +static noinline void do_fault_error(struct pt_regs *regs, int access, + vm_fault_t fault) +{ + int si_code; + + switch (fault) { + case VM_FAULT_BADACCESS: + if (access == VM_EXEC && signal_return(regs) == 0) + break; + case VM_FAULT_BADMAP: + /* Bad memory access. Check if it is kernel or user space. */ + if (user_mode(regs)) { + /* User mode accesses just cause a SIGSEGV */ + si_code = (fault == VM_FAULT_BADMAP) ? + SEGV_MAPERR : SEGV_ACCERR; + do_sigsegv(regs, si_code); + break; + } + case VM_FAULT_BADCONTEXT: + case VM_FAULT_PFAULT: + do_no_context(regs); + break; + case VM_FAULT_SIGNAL: + if (!user_mode(regs)) + do_no_context(regs); + break; + default: /* fault & VM_FAULT_ERROR */ + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + do_no_context(regs); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) + do_no_context(regs); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & VM_FAULT_SIGBUS) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) + do_no_context(regs); + else + do_sigbus(regs); + } else + BUG(); + break; + } +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + * + * interruption code (int_code): + * 04 Protection -> Write-Protection (suprression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) + */ +static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +{ + struct gmap *gmap; + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + enum fault_type type; + unsigned long trans_exc_code; + unsigned long address; + unsigned int flags; + vm_fault_t fault; + + tsk = current; + /* + * The instruction that caused the program check has + * been nullified. Don't signal single step via SIGTRAP. + */ + clear_pt_regs_flag(regs, PIF_PER_TRAP); + + if (notify_page_fault(regs)) + return 0; + + mm = tsk->mm; + trans_exc_code = regs->int_parm_long; + + /* + * Verify that the fault happened in user space, that + * we are not in an interrupt and that there is a + * user context. + */ + fault = VM_FAULT_BADCONTEXT; + type = get_fault_type(regs); + switch (type) { + case KERNEL_FAULT: + goto out; + case VDSO_FAULT: + fault = VM_FAULT_BADMAP; + goto out; + case USER_FAULT: + case GMAP_FAULT: + if (faulthandler_disabled() || !mm) + goto out; + break; + } + + address = trans_exc_code & __FAIL_ADDR_MASK; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + flags |= FAULT_FLAG_WRITE; + down_read(&mm->mmap_sem); + + gmap = NULL; + if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { + gmap = (struct gmap *) S390_lowcore.gmap; + current->thread.gmap_addr = address; + current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_int_code = regs->int_code & 0xffff; + address = __gmap_translate(gmap, address); + if (address == -EFAULT) { + fault = VM_FAULT_BADMAP; + goto out_up; + } + if (gmap->pfault_enabled) + flags |= FAULT_FLAG_RETRY_NOWAIT; + } + +retry: + fault = VM_FAULT_BADMAP; + vma = find_vma(mm, address); + if (!vma) + goto out_up; + + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; + if (expand_stack(vma, address)) + goto out_up; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + fault = VM_FAULT_BADACCESS; + if (unlikely(!(vma->vm_flags & access))) + goto out_up; + + if (is_vm_hugetlb_page(vma)) + address &= HPAGE_MASK; + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(vma, address, flags); + /* No reason to continue if interrupted by SIGKILL. */ + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + fault = VM_FAULT_SIGNAL; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out_up; + goto out; + } + if (unlikely(fault & VM_FAULT_ERROR)) + goto out_up; + + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* FAULT_FLAG_RETRY_NOWAIT has been set, + * mmap_sem has not been released */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; + } + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~(FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT); + flags |= FAULT_FLAG_TRIED; + down_read(&mm->mmap_sem); + goto retry; + } + } + if (IS_ENABLED(CONFIG_PGSTE) && gmap) { + address = __gmap_link(gmap, current->thread.gmap_addr, + address); + if (address == -EFAULT) { + fault = VM_FAULT_BADMAP; + goto out_up; + } + if (address == -ENOMEM) { + fault = VM_FAULT_OOM; + goto out_up; + } + } + fault = 0; +out_up: + up_read(&mm->mmap_sem); +out: + return fault; +} + +void do_protection_exception(struct pt_regs *regs) +{ + unsigned long trans_exc_code; + int access; + vm_fault_t fault; + + trans_exc_code = regs->int_parm_long; + /* + * Protection exceptions are suppressing, decrement psw address. + * The exception to this rule are aborted transactions, for these + * the PSW already points to the correct location. + */ + if (!(regs->int_code & 0x200)) + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + /* + * Check for low-address protection. This needs to be treated + * as a special case because the translation exception code + * field is not guaranteed to contain valid data in this case. + */ + if (unlikely(!(trans_exc_code & 4))) { + do_low_address(regs); + return; + } + if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { + regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | + (regs->psw.addr & PAGE_MASK); + access = VM_EXEC; + fault = VM_FAULT_BADACCESS; + } else { + access = VM_WRITE; + fault = do_exception(regs, access); + } + if (unlikely(fault)) + do_fault_error(regs, access, fault); +} +NOKPROBE_SYMBOL(do_protection_exception); + +void do_dat_exception(struct pt_regs *regs) +{ + int access; + vm_fault_t fault; + + access = VM_READ | VM_EXEC | VM_WRITE; + fault = do_exception(regs, access); + if (unlikely(fault)) + do_fault_error(regs, access, fault); +} +NOKPROBE_SYMBOL(do_dat_exception); + +#ifdef CONFIG_PFAULT +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} + +__setup("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +} __attribute__ ((packed, aligned(8))); + +int pfault_init(void) +{ + struct pfault_refbk refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1ULL << 48, + .refcmpmk = 1ULL << 48, + .reserved = __PF_RES_FIELD }; + int rc; + + if (pfault_disable) + return -1; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %1,%0,0x258\n" + "0: j 2f\n" + "1: la %0,8\n" + "2:\n" + EX_TABLE(0b,1b) + : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc"); + return rc; +} + +void pfault_fini(void) +{ + struct pfault_refbk refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, + }; + + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %0,0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : : "a" (&refbk), "m" (refbk) : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. */ + if (tsk->state == TASK_RUNNING) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. */ + tsk->thread.pfault_wait = 0; + } else { + /* Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + set_preempt_need_resched(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); + +#endif /* CONFIG_PFAULT */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c new file mode 100644 index 000000000..65ccb9d79 --- /dev/null +++ b/arch/s390/mm/gmap.c @@ -0,0 +1,2645 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016, 2018 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.vnet.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/ksm.h> +#include <linux/mman.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/gmap.h> +#include <asm/tlb.h> + +#define GMAP_SHADOW_FAKE_TABLE 1ULL + +/** + * gmap_alloc - allocate and initialize a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum address of the gmap address space + * + * Returns a guest address space structure. + */ +static struct gmap *gmap_alloc(unsigned long limit) +{ + struct gmap *gmap; + struct page *page; + unsigned long *table; + unsigned long etype, atype; + + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; + atype = _ASCE_TYPE_SEGMENT; + etype = _SEGMENT_ENTRY_EMPTY; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; + atype = _ASCE_TYPE_REGION3; + etype = _REGION3_ENTRY_EMPTY; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; + atype = _ASCE_TYPE_REGION2; + etype = _REGION2_ENTRY_EMPTY; + } else { + limit = -1UL; + atype = _ASCE_TYPE_REGION1; + etype = _REGION1_ENTRY_EMPTY; + } + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); + if (!gmap) + goto out; + INIT_LIST_HEAD(&gmap->crst_list); + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->pt_list); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); + spin_lock_init(&gmap->guest_table_lock); + spin_lock_init(&gmap->shadow_lock); + atomic_set(&gmap->ref_count, 1); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!page) + goto out_free; + page->index = 0; + list_add(&page->lru, &gmap->crst_list); + table = (unsigned long *) page_to_phys(page); + crst_table_init(table, etype); + gmap->table = table; + gmap->asce = atype | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | __pa(table); + gmap->asce_end = limit; + return gmap; + +out_free: + kfree(gmap); +out: + return NULL; +} + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + unsigned long gmap_asce; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + if (list_is_singular(&mm->context.gmap_list)) + gmap_asce = gmap->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(mm->context.gmap_asce, gmap_asce); + spin_unlock(&mm->context.lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); + +static void gmap_flush_tlb(struct gmap *gmap) +{ + if (MACHINE_HAS_IDTE) + __tlb_flush_idte(gmap->asce); + else + __tlb_flush_global(); +} + +static void gmap_radix_tree_free(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + radix_tree_delete(root, index); + } + } while (nr > 0); +} + +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + +/** + * gmap_free - free a guest address space + * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. + */ +static void gmap_free(struct gmap *gmap) +{ + struct page *page, *next; + + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); + /* Free all segment & region tables. */ + list_for_each_entry_safe(page, next, &gmap->crst_list, lru) + __free_pages(page, CRST_ALLOC_ORDER); + gmap_radix_tree_free(&gmap->guest_to_host); + gmap_radix_tree_free(&gmap->host_to_guest); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + /* Free all page tables. */ + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) + page_table_free_pgste(page); + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + + kfree(gmap); +} + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + atomic_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (atomic_dec_return(&gmap->ref_count) == 0) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + struct gmap *sg, *next; + unsigned long gmap_asce; + + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.lock); + list_del_rcu(&gmap->list); + if (list_empty(&gmap->mm->context.gmap_list)) + gmap_asce = 0; + else if (list_is_singular(&gmap->mm->context.gmap_list)) + gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, + struct gmap, list)->asce; + else + gmap_asce = -1UL; + WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); + spin_unlock(&gmap->mm->context.lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); + +/** + * gmap_enable - switch primary space to the guest address space + * @gmap: pointer to the guest address space structure + */ +void gmap_enable(struct gmap *gmap) +{ + S390_lowcore.gmap = (unsigned long) gmap; +} +EXPORT_SYMBOL_GPL(gmap_enable); + +/** + * gmap_disable - switch back to the standard primary address space + * @gmap: pointer to the guest address space structure + */ +void gmap_disable(struct gmap *gmap) +{ + S390_lowcore.gmap = 0UL; +} +EXPORT_SYMBOL_GPL(gmap_disable); + +/** + * gmap_get_enabled - get a pointer to the currently enabled gmap + * + * Returns a pointer to the currently enabled gmap. 0 if none is enabled. + */ +struct gmap *gmap_get_enabled(void) +{ + return (struct gmap *) S390_lowcore.gmap; +} +EXPORT_SYMBOL_GPL(gmap_get_enabled); + +/* + * gmap_alloc_table is assumed to be called with mmap_sem held + */ +static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, + unsigned long init, unsigned long gaddr) +{ + struct page *page; + unsigned long *new; + + /* since we dont free the gmap table until gmap_free we can unlock */ + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + new = (unsigned long *) page_to_phys(page); + crst_table_init(new, init); + spin_lock(&gmap->guest_table_lock); + if (*table & _REGION_ENTRY_INVALID) { + list_add(&page->lru, &gmap->crst_list); + *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + (*table & _REGION_ENTRY_TYPE_MASK); + page->index = gaddr; + page = NULL; + } + spin_unlock(&gmap->guest_table_lock); + if (page) + __free_pages(page, CRST_ALLOC_ORDER); + return 0; +} + +/** + * __gmap_segment_gaddr - find virtual address from segment pointer + * @entry: pointer to a segment table entry in the guest address space + * + * Returns the virtual address in the guest address space for the segment + */ +static unsigned long __gmap_segment_gaddr(unsigned long *entry) +{ + struct page *page; + unsigned long offset, mask; + + offset = (unsigned long) entry / sizeof(unsigned long); + offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + page = virt_to_page((void *)((unsigned long) entry & mask)); + return page->index + offset; +} + +/** + * __gmap_unlink_by_vmaddr - unlink a single segment via a host address + * @gmap: pointer to the guest address space structure + * @vmaddr: address in the host process address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) +{ + unsigned long *entry; + int flush = 0; + + BUG_ON(gmap_is_shadow(gmap)); + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); + if (entry) { + flush = (*entry != _SEGMENT_ENTRY_EMPTY); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + return flush; +} + +/** + * __gmap_unmap_by_gaddr - unmap a single segment via a guest address + * @gmap: pointer to the guest address space structure + * @gaddr: address in the guest address space + * + * Returns 1 if a TLB flush is required + */ +static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; +} + +/** + * gmap_unmap_segment - unmap segment from the guest address space + * @gmap: pointer to the guest address space structure + * @to: address in the guest address space + * @len: length of the memory area to unmap + * + * Returns 0 if the unmap succeeded, -EINVAL if not. + */ +int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || to + len < to) + return -EINVAL; + + flush = 0; + down_write(&gmap->mm->mmap_sem); + for (off = 0; off < len; off += PMD_SIZE) + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + up_write(&gmap->mm->mmap_sem); + if (flush) + gmap_flush_tlb(gmap); + return 0; +} +EXPORT_SYMBOL_GPL(gmap_unmap_segment); + +/** + * gmap_map_segment - map a segment to the guest address space + * @gmap: pointer to the guest address space structure + * @from: source address in the parent address space + * @to: target address in the guest address space + * @len: length of the memory area to map + * + * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. + */ +int gmap_map_segment(struct gmap *gmap, unsigned long from, + unsigned long to, unsigned long len) +{ + unsigned long off; + int flush; + + BUG_ON(gmap_is_shadow(gmap)); + if ((from | to | len) & (PMD_SIZE - 1)) + return -EINVAL; + if (len == 0 || from + len < from || to + len < to || + from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) + return -EINVAL; + + flush = 0; + down_write(&gmap->mm->mmap_sem); + for (off = 0; off < len; off += PMD_SIZE) { + /* Remove old translation */ + flush |= __gmap_unmap_by_gaddr(gmap, to + off); + /* Store new translation */ + if (radix_tree_insert(&gmap->guest_to_host, + (to + off) >> PMD_SHIFT, + (void *) from + off)) + break; + } + up_write(&gmap->mm->mmap_sem); + if (flush) + gmap_flush_tlb(gmap); + if (off >= len) + return 0; + gmap_unmap_segment(gmap, to, len); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gmap_map_segment); + +/** + * __gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + * + * Note: Can also be called for shadow gmaps. + */ +unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ + return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_translate - translate a guest address to a user space address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + */ +unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long rc; + + down_read(&gmap->mm->mmap_sem); + rc = __gmap_translate(gmap, gaddr); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_translate); + +/** + * gmap_unlink - disconnect a page table from the gmap shadow tables + * @gmap: pointer to guest mapping meta data structure + * @table: pointer to the host page table + * @vmaddr: vm address associated with the host page table + */ +void gmap_unlink(struct mm_struct *mm, unsigned long *table, + unsigned long vmaddr) +{ + struct gmap *gmap; + int flush; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); + if (flush) + gmap_flush_tlb(gmap); + } + rcu_read_unlock(); +} + +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + +/** + * gmap_link - set up shadow page tables to connect a host to a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @vmaddr: vm address + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + */ +int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) +{ + struct mm_struct *mm; + unsigned long *table; + spinlock_t *ptl; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + u64 unprot; + int rc; + + BUG_ON(gmap_is_shadow(gmap)); + /* Create higher level tables in the gmap page table */ + table = gmap->table; + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, + gaddr & _REGION1_MASK)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, + gaddr & _REGION2_MASK)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if ((*table & _REGION_ENTRY_INVALID) && + gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, + gaddr & _REGION3_MASK)) + return -ENOMEM; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + } + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + /* Walk the parent mm page table */ + mm = gmap->mm; + pgd = pgd_offset(mm, vmaddr); + VM_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vmaddr); + VM_BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, vmaddr); + VM_BUG_ON(pud_none(*pud)); + /* large puds cannot yet be handled */ + if (pud_large(*pud)) + return -EFAULT; + pmd = pmd_offset(pud, vmaddr); + VM_BUG_ON(pmd_none(*pmd)); + /* Are we allowed to use huge pages? */ + if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + return -EFAULT; + /* Link gmap segment table entry location to page table. */ + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + ptl = pmd_lock(mm, pmd); + spin_lock(&gmap->guest_table_lock); + if (*table == _SEGMENT_ENTRY_EMPTY) { + rc = radix_tree_insert(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT, table); + if (!rc) { + if (pmd_large(*pmd)) { + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC; + } else + *table = pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS; + } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); + } + spin_unlock(&gmap->guest_table_lock); + spin_unlock(ptl); + radix_tree_preload_end(); + return rc; +} + +/** + * gmap_fault - resolve a fault on a guest address + * @gmap: pointer to guest mapping meta data structure + * @gaddr: guest address + * @fault_flags: flags to pass down to handle_mm_fault() + * + * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT + * if the vm address is already mapped to a different guest segment. + */ +int gmap_fault(struct gmap *gmap, unsigned long gaddr, + unsigned int fault_flags) +{ + unsigned long vmaddr; + int rc; + bool unlocked; + + down_read(&gmap->mm->mmap_sem); + +retry: + unlocked = false; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + goto out_up; + } + if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + &unlocked)) { + rc = -EFAULT; + goto out_up; + } + /* + * In the case that fixup_user_fault unlocked the mmap_sem during + * faultin redo __gmap_translate to not race with a map/unmap_segment. + */ + if (unlocked) + goto retry; + + rc = __gmap_link(gmap, gaddr, vmaddr); +out_up: + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_fault); + +/* + * this function is assumed to be called with mmap_sem held + */ +void __gmap_zap(struct gmap *gmap, unsigned long gaddr) +{ + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (vmaddr) { + vmaddr |= gaddr & ~PMD_MASK; + /* Get pointer to the page table entry */ + ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); + if (likely(ptep)) { + ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); + pte_unmap_unlock(ptep, ptl); + } + } +} +EXPORT_SYMBOL_GPL(__gmap_zap); + +void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) +{ + unsigned long gaddr, vmaddr, size; + struct vm_area_struct *vma; + + down_read(&gmap->mm->mmap_sem); + for (gaddr = from; gaddr < to; + gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { + /* Find the vm address for the guest address */ + vmaddr = (unsigned long) + radix_tree_lookup(&gmap->guest_to_host, + gaddr >> PMD_SHIFT); + if (!vmaddr) + continue; + vmaddr |= gaddr & ~PMD_MASK; + /* Find vma in the parent mm */ + vma = find_vma(gmap->mm, vmaddr); + if (!vma) + continue; + /* + * We do not discard pages that are backed by + * hugetlbfs, so we don't have to refault them. + */ + if (is_vm_hugetlb_page(vma)) + continue; + size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); + zap_page_range(vma, vmaddr, size); + } + up_read(&gmap->mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(gmap_discard); + +static LIST_HEAD(gmap_notifier_list); +static DEFINE_SPINLOCK(gmap_notifier_lock); + +/** + * gmap_register_pte_notifier - register a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_register_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_add_rcu(&nb->list, &gmap_notifier_list); + spin_unlock(&gmap_notifier_lock); +} +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); + +/** + * gmap_unregister_pte_notifier - remove a pte invalidation callback + * @nb: pointer to the gmap notifier block + */ +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) +{ + spin_lock(&gmap_notifier_lock); + list_del_rcu(&nb->list); + spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); + +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + +/** + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @level: page table level to stop at + * + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. + */ +static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr, int level) +{ + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table; + + if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) + return NULL; + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + table = gmap->table; + switch (gmap->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; + if (level == 4) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION2: + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; + if (level == 3) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION3: + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; + if (level == 2) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_SEGMENT: + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; + } + return table; +} + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + BUG_ON(gmap_is_shadow(gmap)); + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) + return NULL; + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr, int prot) +{ + struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; + bool unlocked = false; + + BUG_ON(gmap_is_shadow(gmap)); + fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_sem, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); +} + +/** + * gmap_pte_op_end - release the page table lock + * @ptl: pointer to the spinlock pointer + */ +static void gmap_pte_op_end(spinlock_t *ptl) +{ + if (ptl) + spin_unlock(ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + + if (!pmdp || pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); +} + +/* + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_sem in read and + * guest_table_lock held. + */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (prot == PROT_NONE && !pmd_i) { + pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; + pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (bits & GMAP_NOTIFY_MPROT) + pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @pmdp: pointer to the pmd associated with the pte + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed. + * + * Expected to be called with sg->mm->mmap_sem in read + */ +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int rc; + pte_t *ptep; + spinlock_t *ptl = NULL; + unsigned long pbits = 0; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); + gmap_pte_op_end(ptl); + return rc; +} + +/* + * gmap_protect_range - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * + * Called with sg->mm->mmap_sem in read. + */ +static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot, unsigned long bits) +{ + unsigned long vmaddr, dist; + pmd_t *pmdp; + int rc; + + BUG_ON(gmap_is_shadow(gmap)); + while (len) { + rc = -EAGAIN; + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (pmdp) { + if (!pmd_large(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + len -= PAGE_SIZE; + gaddr += PAGE_SIZE; + } + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); + len = len < dist ? 0 : len - dist; + gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; + } + } + gmap_pmd_op_end(gmap, pmdp); + } + if (rc) { + if (rc == -EINVAL) + return rc; + + /* -EAGAIN, fixup of userspace mm and gmap */ + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); + if (rc) + return rc; + } + } + return 0; +} + +/** + * gmap_mprotect_notify - change access rights for a range of ptes and + * call the notifier if any pte changes again + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if for each page in the given range a gmap mapping exists, + * the new access rights could be set and the notifier could be armed. + * If the gmap mapping is missing for one or more pages -EFAULT is + * returned. If no memory could be allocated -ENOMEM is returned. + * This function establishes missing page table entries. + */ +int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot) +{ + int rc; + + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) + return -EINVAL; + if (!MACHINE_HAS_ESOP && prot == PROT_READ) + return -EINVAL; + down_read(&gmap->mm->mmap_sem); + rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_mprotect_notify); + +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. -EINVAL if called on a gmap + * shadow. + * + * Called with gmap->mm->mmap_sem in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + if (gmap_is_shadow(gmap)) + return -EINVAL; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *) address; + pte_val(*ptep) |= _PAGE_YOUNG; + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); + if (rc) + break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + void __rcu **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. + */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); + if (rc) + return rc; + continue; + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. + */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " .insn rrf,0xb98e0000,%0,%1,0,0" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long sto, *ste, *pgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + unsigned long *pgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) + continue; + pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e, *sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); + gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); + sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + unsigned long *sgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) + continue; + sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e, *r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); + gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); + r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + unsigned long *r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e, *r2t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) + return; + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); + gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); + r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce, *r2t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) + continue; + r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, CRST_ALLOC_ORDER); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +static void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); + table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg; + + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce != asce || sg->edat_level != edat_level || + sg->removed) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + atomic_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} +EXPORT_SYMBOL_GPL(gmap_shadow_valid); + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + BUG_ON(parent->mm->context.allow_gmap_hpage_1m); + BUG_ON(gmap_is_shadow(parent)); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + atomic_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + down_read(&parent->mm->mmap_sem); + rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, + PROT_READ, GMAP_NOTIFY_SHADOW); + up_read(&parent->mm->mmap_sem); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} +EXPORT_SYMBOL_GPL(gmap_shadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its decendents. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r2t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + page->index = r2t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r2t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r2t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r3t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + page->index = r3t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_r3t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_r3t(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_sgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); + /* Allocate a shadow segment table */ + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + page->index = sgt & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_sgt = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; + list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { + gmap_unshadow_sgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, CRST_ALLOC_ORDER); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +/** + * gmap_shadow_lookup_pgtable - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection, + int *fake) +{ + unsigned long *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_sem in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) +{ + unsigned long raddr, origin; + unsigned long *s_pgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); + /* Allocate a shadow page table */ + page = page_table_alloc_pgste(sg->mm); + if (!page) + return -ENOMEM; + page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = (unsigned long *) page_to_phys(page); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; + } + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; + list_add(&page->lru, &sg->pt_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != + (unsigned long) s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { + gmap_unshadow_pgt(sg, raddr); + } + spin_unlock(&sg->guest_table_lock); + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(page); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pte: pte in parent gmap address space to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr, paddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int prot; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + paddr = pte_val(pte) & PAGE_MASK; + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + break; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(ptl); + spin_unlock(&sg->guest_table_lock); + } + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); + if (rc) + break; + } + kfree(rmap); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/** + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long gaddr) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long start, end, bits, raddr; + + BUG_ON(!gmap_is_shadow(sg)); + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} + +/** + * ptep_notify - call all invalidation callbacks for a specific pte. + * @mm: pointer to the process mm_struct + * @addr: virtual address in the process address space + * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call + * + * This function is assumed to be called with the page table lock held + * for the pte to notify. + */ +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) +{ + unsigned long offset, gaddr = 0; + unsigned long *table; + struct gmap *gmap, *sg, *next; + + offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + table = radix_tree_lookup(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (table) + gaddr = __gmap_segment_gaddr(table) + offset; + spin_unlock(&gmap->guest_table_lock); + if (!table) + continue; + + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, gaddr); + spin_unlock(&gmap->shadow_lock); + } + if (bits & PGSTE_IN_BIT) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(ptep_notify); + +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *pmdp = new; +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (pmdp) { + gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (purge) + __pmdp_csp(pmdp); + pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_csp - csp all affected guest pmd entries + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 1); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_csp); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_large(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + spin_unlock(ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + +static inline void thp_split_mm(struct mm_struct *mm) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct vm_area_struct *vma; + unsigned long addr; + + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (addr = vma->vm_start; + addr < vma->vm_end; + addr += PAGE_SIZE) + follow_page(vma, addr, FOLL_SPLIT); + vma->vm_flags &= ~VM_HUGEPAGE; + vma->vm_flags |= VM_NOHUGEPAGE; + } + mm->def_flags |= VM_NOHUGEPAGE; +#endif +} + +/* + * Remove all empty zero pages from the mapping for lazy refaulting + * - This must be called after mm->context.has_pgste is set, to avoid + * future creation of zero pages + * - This must be called after THP was enabled + */ +static int __zap_zero_pages(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + unsigned long addr; + + for (addr = start; addr != end; addr += PAGE_SIZE) { + pte_t *ptep; + spinlock_t *ptl; + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (is_zero_pfn(pte_pfn(*ptep))) + ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_unmap_unlock(ptep, ptl); + } + return 0; +} + +static inline void zap_zero_pages(struct mm_struct *mm) +{ + struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); +} + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct mm_struct *mm = current->mm; + + /* Do we have pgstes? if yes, we are done */ + if (mm_has_pgste(mm)) + return 0; + /* Fail if the page tables are 2K */ + if (!mm_alloc_pgste(mm)) + return -EINVAL; + down_write(&mm->mmap_sem); + mm->context.has_pgste = 1; + /* split thp mappings and disable thp for future mappings */ + thp_split_mm(mm); + zap_zero_pages(mm); + up_write(&mm->mmap_sem); + return 0; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); + +/* + * Enable storage key handling from now on and initialize the storage + * keys with the default key. + */ +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + /* Clear storage key */ + ptep_zap_key(walk->mm, addr, pte); + return 0; +} + +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + struct page *page = pmd_page(*pmd); + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. + */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE - 1; + __storage_key_init_range(start, end); + set_bit(PG_arch_1, &page->flags); + return 0; +} + +int s390_enable_skey(void) +{ + struct mm_walk walk = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + }; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc = 0; + + down_write(&mm->mmap_sem); + if (mm_uses_skeys(mm)) + goto out_up; + + mm->context.uses_skeys = 1; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_madvise(vma, vma->vm_start, vma->vm_end, + MADV_UNMERGEABLE, &vma->vm_flags)) { + mm->context.uses_skeys = 0; + rc = -ENOMEM; + goto out_up; + } + } + mm->def_flags &= ~VM_MERGEABLE; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); + +out_up: + up_write(&mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_skey); + +/* + * Reset CMMA state, make all pages stable again. + */ +static int __s390_reset_cmma(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + ptep_zap_unused(walk->mm, addr, pte, 1); + return 0; +} + +void s390_reset_cmma(struct mm_struct *mm) +{ + struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; + + down_write(&mm->mmap_sem); + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); + up_write(&mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(s390_reset_cmma); diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c new file mode 100644 index 000000000..5389bf5bc --- /dev/null +++ b/arch/s390/mm/gup.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Lockless get_user_pages_fast for s390 + * + * Copyright IBM Corp. 2010 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/vmstat.h> +#include <linux/pagemap.h> +#include <linux/rwsem.h> +#include <asm/pgtable.h> + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page; + unsigned long mask; + pte_t *ptep, pte; + + mask = (write ? _PAGE_PROTECT : 0) | _PAGE_INVALID | _PAGE_SPECIAL; + + ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr); + do { + pte = *ptep; + barrier(); + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_protnone(pte)) + return 0; + if ((pte_val(pte) & mask) != 0) + return 0; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + head = compound_head(page); + if (unlikely(WARN_ON_ONCE(page_ref_count(head) < 0) + || !page_cache_get_speculative(head))) + return 0; + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + put_page(head); + return 0; + } + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + + return 1; +} + +static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page; + unsigned long mask; + int refs; + + mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; + if ((pmd_val(pmd) & mask) != 0) + return 0; + VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); + + refs = 0; + head = pmd_page(pmd); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (unlikely(WARN_ON_ONCE(page_ref_count(head) < 0) + || !page_cache_add_speculative(head, refs))) { + *nr -= refs; + return 0; + } + + if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + return 1; +} + + +static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp, pmd; + + pmdp = (pmd_t *) pudp; + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pmdp = (pmd_t *) pud_deref(pud); + pmdp += pmd_index(addr); + do { + pmd = *pmdp; + barrier(); + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_protnone(pmd)) + return 0; + if (!gup_huge_pmd(pmdp, pmd, addr, next, + write, pages, nr)) + return 0; + } else if (!gup_pte_range(pmdp, pmd, addr, next, + write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page; + unsigned long mask; + int refs; + + mask = (write ? _REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID; + if ((pud_val(pud) & mask) != 0) + return 0; + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); + + refs = 0; + head = pud_page(pud); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (unlikely(WARN_ON_ONCE(page_ref_count(head) < 0) + || !page_cache_add_speculative(head, refs))) { + *nr -= refs; + return 0; + } + + if (unlikely(pud_val(pud) != pud_val(*pudp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + return 1; +} + +static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp, pud; + + pudp = (pud_t *) p4dp; + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) + pudp = (pud_t *) p4d_deref(p4d); + pudp += pud_index(addr); + do { + pud = *pudp; + barrier(); + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (unlikely(pud_large(pud))) { + if (!gup_huge_pud(pudp, pud, addr, next, write, pages, + nr)) + return 0; + } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages, + nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp, p4d; + + p4dp = (p4d_t *) pgdp; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) + p4dp = (p4d_t *) pgd_deref(pgd); + p4dp += p4d_index(addr); + do { + p4d = *p4dp; + barrier(); + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + if (!gup_pud_range(p4dp, p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next, flags; + pgd_t *pgdp, pgd; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if ((end <= start) || (end > mm->context.asce_limit)) + return 0; + /* + * local_irq_save() doesn't prevent pagetable teardown, but does + * prevent the pagetables from being freed on s390. + * + * So long as we atomically load page table pointers versus teardown, + * we can follow the address down to the the page and take a ref on it. + */ + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd = *pgdp; + barrier(); + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_p4d_range(pgdp, pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + int nr, ret; + + might_sleep(); + start &= PAGE_MASK; + /* + * The FAST_GUP case requires FOLL_WRITE even for pure reads, + * because get_user_pages() may need to cause an early COW in + * order to avoid confusing the normal COW routines. So only + * targets that are already writable are safe to do by just + * looking at the page tables. + */ + nr = __get_user_pages_fast(start, nr_pages, 1, pages); + if (nr == nr_pages) + return nr; + + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + ret = get_user_pages_unlocked(start, nr_pages - nr, pages, + write ? FOLL_WRITE : 0); + /* Have to be a bit careful with return values */ + if (nr > 0) + ret = (ret < 0) ? nr : ret + nr; + return ret; +} diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c new file mode 100644 index 000000000..ff8234bca --- /dev/null +++ b/arch/s390/mm/hugetlbpage.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IBM System z Huge TLB Page Support for Kernel. + * + * Copyright IBM Corp. 2007,2020 + * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com> + */ + +#define KMSG_COMPONENT "hugetlb" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/security.h> + +/* + * If the bit selected by single-bit bitmask "a" is set within "x", move + * it to the position indicated by single-bit bitmask "b". + */ +#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b)) + +static inline unsigned long __pte_to_rste(pte_t pte) +{ + unsigned long rste; + + /* + * Convert encoding pte bits pmd / pud bits + * lIR.uswrdy.p dy..R...I...wr + * empty 010.000000.0 -> 00..0...1...00 + * prot-none, clean, old 111.000000.1 -> 00..1...1...00 + * prot-none, clean, young 111.000001.1 -> 01..1...1...00 + * prot-none, dirty, old 111.000010.1 -> 10..1...1...00 + * prot-none, dirty, young 111.000011.1 -> 11..1...1...00 + * read-only, clean, old 111.000100.1 -> 00..1...1...01 + * read-only, clean, young 101.000101.1 -> 01..1...0...01 + * read-only, dirty, old 111.000110.1 -> 10..1...1...01 + * read-only, dirty, young 101.000111.1 -> 11..1...0...01 + * read-write, clean, old 111.001100.1 -> 00..1...1...11 + * read-write, clean, young 101.001101.1 -> 01..1...0...11 + * read-write, dirty, old 110.001110.1 -> 10..0...1...11 + * read-write, dirty, young 100.001111.1 -> 11..0...0...11 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (pte_present(pte)) { + rste = pte_val(pte) & PAGE_MASK; + rste |= move_set_bit(pte_val(pte), _PAGE_READ, + _SEGMENT_ENTRY_READ); + rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, + _SEGMENT_ENTRY_WRITE); + rste |= move_set_bit(pte_val(pte), _PAGE_INVALID, + _SEGMENT_ENTRY_INVALID); + rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT, + _SEGMENT_ENTRY_PROTECT); + rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY, + _SEGMENT_ENTRY_DIRTY); + rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG, + _SEGMENT_ENTRY_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY, + _SEGMENT_ENTRY_SOFT_DIRTY); +#endif + rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, + _SEGMENT_ENTRY_NOEXEC); + } else + rste = _SEGMENT_ENTRY_EMPTY; + return rste; +} + +static inline pte_t __rste_to_pte(unsigned long rste) +{ + int present; + pte_t pte; + + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + present = pud_present(__pud(rste)); + else + present = pmd_present(__pmd(rste)); + + /* + * Convert encoding pmd / pud bits pte bits + * dy..R...I...wr lIR.uswrdy.p + * empty 00..0...1...00 -> 010.000000.0 + * prot-none, clean, old 00..1...1...00 -> 111.000000.1 + * prot-none, clean, young 01..1...1...00 -> 111.000001.1 + * prot-none, dirty, old 10..1...1...00 -> 111.000010.1 + * prot-none, dirty, young 11..1...1...00 -> 111.000011.1 + * read-only, clean, old 00..1...1...01 -> 111.000100.1 + * read-only, clean, young 01..1...0...01 -> 101.000101.1 + * read-only, dirty, old 10..1...1...01 -> 111.000110.1 + * read-only, dirty, young 11..1...0...01 -> 101.000111.1 + * read-write, clean, old 00..1...1...11 -> 111.001100.1 + * read-write, clean, young 01..1...0...11 -> 101.001101.1 + * read-write, dirty, old 10..0...1...11 -> 110.001110.1 + * read-write, dirty, young 11..0...0...11 -> 100.001111.1 + * HW-bits: R read-only, I invalid + * SW-bits: p present, y young, d dirty, r read, w write, s special, + * u unused, l large + */ + if (present) { + pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT; + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ, + _PAGE_READ); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, + _PAGE_WRITE); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, + _PAGE_INVALID); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, + _PAGE_PROTECT); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, + _PAGE_DIRTY); + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, + _PAGE_YOUNG); +#ifdef CONFIG_MEM_SOFT_DIRTY + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, + _PAGE_SOFT_DIRTY); +#endif + pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, + _PAGE_NOEXEC); + } else + pte_val(pte) = _PAGE_INVALID; + return pte; +} + +static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) +{ + struct page *page; + unsigned long size, paddr; + + if (!mm_uses_skeys(mm) || + rste & _SEGMENT_ENTRY_INVALID) + return; + + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + page = pud_page(__pud(rste)); + size = PUD_SIZE; + paddr = rste & PUD_MASK; + } else { + page = pmd_page(__pmd(rste)); + size = PMD_SIZE; + paddr = rste & PMD_MASK; + } + + if (!test_and_set_bit(PG_arch_1, &page->flags)) + __storage_key_init_range(paddr, paddr + size - 1); +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + unsigned long rste; + + rste = __pte_to_rste(pte); + if (!MACHINE_HAS_NX) + rste &= ~_SEGMENT_ENTRY_NOEXEC; + + /* Set correct table type for 2G hugepages */ + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + if (likely(pte_present(pte))) + rste |= _REGION3_ENTRY_LARGE; + rste |= _REGION_ENTRY_TYPE_R3; + } else if (likely(pte_present(pte))) + rste |= _SEGMENT_ENTRY_LARGE; + + clear_huge_pte_skeys(mm, rste); + pte_val(*ptep) = rste; +} + +pte_t huge_ptep_get(pte_t *ptep) +{ + return __rste_to_pte(pte_val(*ptep)); +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = huge_ptep_get(ptep); + pmd_t *pmdp = (pmd_t *) ptep; + pud_t *pudp = (pud_t *) ptep; + + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY)); + else + pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + return pte; +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp = NULL; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (p4dp) { + pudp = pud_alloc(mm, p4dp, addr); + if (pudp) { + if (sz == PUD_SIZE) + return (pte_t *) pudp; + else if (sz == PMD_SIZE) + pmdp = pmd_alloc(mm, pudp, addr); + } + } + return (pte_t *) pmdp; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp = NULL; + + pgdp = pgd_offset(mm, addr); + if (pgd_present(*pgdp)) { + p4dp = p4d_offset(pgdp, addr); + if (p4d_present(*p4dp)) { + pudp = pud_offset(p4dp, addr); + if (pud_present(*pudp)) { + if (pud_large(*pudp)) + return (pte_t *) pudp; + pmdp = pmd_offset(pudp, addr); + } + } + } + return (pte_t *) pmdp; +} + +int pmd_huge(pmd_t pmd) +{ + return pmd_large(pmd); +} + +int pud_huge(pud_t pud) +{ + return pud_large(pud); +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int flags) +{ + if (flags & FOLL_GET) + return NULL; + + return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +} + +static __init int setup_hugepagesz(char *opt) +{ + unsigned long size; + char *string = opt; + + size = memparse(opt, &opt); + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) { + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) { + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else { + hugetlb_bad_size(); + pr_err("hugepagesz= specifies an unsupported page size %s\n", + string); + return 0; + } + return 1; +} +__setup("hugepagesz=", setup_hugepagesz); + +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = current->mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} + +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr0, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + unsigned long addr; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + goto check_asce_limit; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + if (mm->get_unmapped_area == arch_get_unmapped_area) + addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + addr = hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + +check_asce_limit: + if (addr + len > current->mm->context.asce_limit && + addr + len <= TASK_SIZE) { + rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + return addr; +} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c new file mode 100644 index 000000000..379a925d9 --- /dev/null +++ b/arch/s390/mm/init.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999 + * Author(s): Hartmut Penner (hp@de.ibm.com) + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1995 Linus Torvalds + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/memory.h> +#include <linux/pfn.h> +#include <linux/poison.h> +#include <linux/initrd.h> +#include <linux/export.h> +#include <linux/cma.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <asm/processor.h> +#include <linux/uaccess.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/dma.h> +#include <asm/lowcore.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/ctl_reg.h> +#include <asm/sclp.h> +#include <asm/set_memory.h> + +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); + +unsigned long empty_zero_page, zero_page_mask; +EXPORT_SYMBOL(empty_zero_page); +EXPORT_SYMBOL(zero_page_mask); + +static void __init setup_zero_pages(void) +{ + unsigned int order; + struct page *page; + int i; + + /* Latest machines require a mapping granularity of 512KB */ + order = 7; + + /* Limit number of empty zero pages for small memory sizes */ + while (order > 2 && (totalram_pages >> 10) < (1UL << order)) + order--; + + empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!empty_zero_page) + panic("Out of memory in setup_zero_pages"); + + page = virt_to_page((void *) empty_zero_page); + split_page(page, order); + for (i = 1 << order; i > 0; i--) { + mark_page_reserved(page); + page++; + } + + zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; +} + +/* + * paging_init() sets up the page tables + */ +void __init paging_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + unsigned long pgd_type, asce_bits; + psw_t psw; + + init_mm.pgd = swapper_pg_dir; + if (VMALLOC_END > _REGION2_SIZE) { + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + pgd_type = _REGION2_ENTRY_EMPTY; + } else { + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + pgd_type = _REGION3_ENTRY_EMPTY; + } + init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; + S390_lowcore.kernel_asce = init_mm.context.asce; + S390_lowcore.user_asce = S390_lowcore.kernel_asce; + crst_table_init((unsigned long *) init_mm.pgd, pgd_type); + vmem_map_init(); + + /* enable virtual mapping in kernel mode */ + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.kernel_asce, 7, 7); + __ctl_load(S390_lowcore.kernel_asce, 13, 13); + psw.mask = __extract_psw(); + psw_bits(psw).dat = 1; + psw_bits(psw).as = PSW_BITS_AS_HOME; + __load_psw_mask(psw.mask); + + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + free_area_init_nodes(max_zone_pfns); +} + +void mark_rodata_ro(void) +{ + unsigned long size = __end_ro_after_init - __start_ro_after_init; + + set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); +} + +void __init mem_init(void) +{ + cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); + cpumask_set_cpu(0, mm_cpumask(&init_mm)); + + set_max_mapnr(max_low_pfn); + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + + /* Setup guest page hinting */ + cmma_init(); + + /* this will put all low memory onto the freelists */ + free_all_bootmem(); + setup_zero_pages(); /* Setup zeroed pages. */ + + cmma_init_nodat(); + + mem_init_print_info(NULL); +} + +void free_initmem(void) +{ + __set_memory((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, + SET_MEMORY_RW | SET_MEMORY_NX); + free_initmem_default(POISON_FREE_INITMEM); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void __init free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); +} +#endif + +unsigned long memory_block_size_bytes(void) +{ + /* + * Make sure the memory block size is always greater + * or equal than the memory increment size. + */ + return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); +} + +#ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + unsigned long start, end; + + mem_data = data; + start = cma_get_base(cma); + end = start + cma_get_size(cma); + if (end < mem_data->start) + return 0; + if (start >= mem_data->end) + return 0; + return -EBUSY; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long size_pages = PFN_DOWN(size); + int rc; + + rc = vmem_add_mapping(start, size); + if (rc) + return rc; + + rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); + if (rc) + vmem_remove_mapping(start, size); + return rc; +} + +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + vmem_remove_mapping(start, size); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c new file mode 100644 index 000000000..7be064758 --- /dev/null +++ b/arch/s390/mm/maccess.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Access kernel memory without faulting -- s390 specific implementation. + * + * Copyright IBM Corp. 2009, 2015 + * + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, + * + */ + +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/gfp.h> +#include <linux/cpu.h> +#include <asm/ctl_reg.h> +#include <asm/io.h> + +static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) +{ + unsigned long aligned, offset, count; + char tmp[8]; + + aligned = (unsigned long) dst & ~7UL; + offset = (unsigned long) dst & 7UL; + size = min(8UL - offset, size); + count = size - 1; + asm volatile( + " bras 1,0f\n" + " mvc 0(1,%4),0(%5)\n" + "0: mvc 0(8,%3),0(%0)\n" + " ex %1,0(1)\n" + " lg %1,0(%3)\n" + " lra %0,0(%0)\n" + " sturg %1,%0\n" + : "+&a" (aligned), "+&a" (count), "=m" (tmp) + : "a" (&tmp), "a" (&tmp[offset]), "a" (src) + : "cc", "memory", "1"); + return size; +} + +/* + * s390_kernel_write - write to kernel memory bypassing DAT + * @dst: destination address + * @src: source address + * @size: number of bytes to copy + * + * This function writes to kernel memory bypassing DAT and possible page table + * write protection. It writes to the destination using the sturg instruction. + * Therefore we have a read-modify-write sequence: the function reads eight + * bytes from destination at an eight byte boundary, modifies the bytes + * requested and writes the result back in a loop. + * + * Note: this means that this function may not be called concurrently on + * several cpus with overlapping words, since this may potentially + * cause data corruption. + */ +void notrace s390_kernel_write(void *dst, const void *src, size_t size) +{ + long copied; + + while (size) { + copied = s390_kernel_write_odd(dst, src, size); + dst += copied; + src += copied; + size -= copied; + } +} + +static int __memcpy_real(void *dest, void *src, size_t count) +{ + register unsigned long _dest asm("2") = (unsigned long) dest; + register unsigned long _len1 asm("3") = (unsigned long) count; + register unsigned long _src asm("4") = (unsigned long) src; + register unsigned long _len2 asm("5") = (unsigned long) count; + int rc = -EFAULT; + + asm volatile ( + "0: mvcle %1,%2,0x0\n" + "1: jo 0b\n" + " lhi %0,0x0\n" + "2:\n" + EX_TABLE(1b,2b) + : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1), + "+d" (_len2), "=m" (*((long *) dest)) + : "m" (*((long *) src)) + : "cc", "memory"); + return rc; +} + +/* + * Copy memory in real mode (kernel to kernel) + */ +int memcpy_real(void *dest, void *src, size_t count) +{ + int irqs_disabled, rc; + unsigned long flags; + + if (!count) + return 0; + flags = __arch_local_irq_stnsm(0xf8UL); + irqs_disabled = arch_irqs_disabled_flags(flags); + if (!irqs_disabled) + trace_hardirqs_off(); + rc = __memcpy_real(dest, src, count); + if (!irqs_disabled) + trace_hardirqs_on(); + __arch_local_irq_ssm(flags); + return rc; +} + +/* + * Copy memory in absolute mode (kernel to kernel) + */ +void memcpy_absolute(void *dest, void *src, size_t count) +{ + unsigned long cr0, flags, prefix; + + flags = arch_local_irq_save(); + __ctl_store(cr0, 0, 0); + __ctl_clear_bit(0, 28); /* disable lowcore protection */ + prefix = store_prefix(); + if (prefix) { + local_mcck_disable(); + set_prefix(0); + memcpy(dest, src, count); + set_prefix(prefix); + local_mcck_enable(); + } else { + memcpy(dest, src, count); + } + __ctl_load(cr0, 0, 0); + arch_local_irq_restore(flags); +} + +/* + * Copy memory from kernel (real) to user (virtual) + */ +int copy_to_user_real(void __user *dest, void *src, unsigned long count) +{ + int offs = 0, size, rc; + char *buf; + + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + return -ENOMEM; + rc = -EFAULT; + while (offs < count) { + size = min(PAGE_SIZE, count - offs); + if (memcpy_real(buf, src + offs, size)) + goto out; + if (copy_to_user(dest + offs, buf, size)) + goto out; + offs += size; + } + rc = 0; +out: + free_page((unsigned long) buf); + return rc; +} + +/* + * Check if physical address is within prefix or zero page + */ +static int is_swapped(unsigned long addr) +{ + unsigned long lc; + int cpu; + + if (addr < sizeof(struct lowcore)) + return 1; + for_each_online_cpu(cpu) { + lc = (unsigned long) lowcore_ptr[cpu]; + if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) + continue; + return 1; + } + return 0; +} + +/* + * Convert a physical pointer for /dev/mem access + * + * For swapped prefix pages a new buffer is returned that contains a copy of + * the absolute memory. The buffer size is maximum one page large. + */ +void *xlate_dev_mem_ptr(phys_addr_t addr) +{ + void *bounce = (void *) addr; + unsigned long size; + + get_online_cpus(); + preempt_disable(); + if (is_swapped(addr)) { + size = PAGE_SIZE - (addr & ~PAGE_MASK); + bounce = (void *) __get_free_page(GFP_ATOMIC); + if (bounce) + memcpy_absolute(bounce, (void *) addr, size); + } + preempt_enable(); + put_online_cpus(); + return bounce; +} + +/* + * Free converted buffer for /dev/mem access (if necessary) + */ +void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf) +{ + if ((void *) addr != buf) + free_page((unsigned long) buf); +} diff --git a/arch/s390/mm/mem_detect.c b/arch/s390/mm/mem_detect.c new file mode 100644 index 000000000..21f6c82c8 --- /dev/null +++ b/arch/s390/mm/mem_detect.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2008, 2009 + * + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <asm/ipl.h> +#include <asm/sclp.h> +#include <asm/setup.h> + +#define CHUNK_READ_WRITE 0 +#define CHUNK_READ_ONLY 1 + +static inline void memblock_physmem_add(phys_addr_t start, phys_addr_t size) +{ + memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n", + start, start + size - 1); + memblock_add_range(&memblock.memory, start, size, 0, 0); + memblock_add_range(&memblock.physmem, start, size, 0, 0); +} + +void __init detect_memory_memblock(void) +{ + unsigned long memsize, rnmax, rzm, addr, size; + int type; + + rzm = sclp.rzm; + rnmax = sclp.rnmax; + memsize = rzm * rnmax; + if (!rzm) + rzm = 1UL << 17; + max_physmem_end = memsize; + addr = 0; + /* keep memblock lists close to the kernel */ + memblock_set_bottom_up(true); + do { + size = 0; + /* assume lowcore is writable */ + type = addr ? tprot(addr) : CHUNK_READ_WRITE; + do { + size += rzm; + if (max_physmem_end && addr + size >= max_physmem_end) + break; + } while (type == tprot(addr + size)); + if (type == CHUNK_READ_WRITE || type == CHUNK_READ_ONLY) { + if (max_physmem_end && (addr + size > max_physmem_end)) + size = max_physmem_end - addr; + memblock_physmem_add(addr, size); + } + addr += size; + } while (addr < max_physmem_end); + memblock_set_bottom_up(false); + if (!max_physmem_end) + max_physmem_end = memblock_end_of_DRAM(); + memblock_dump_all(); +} diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c new file mode 100644 index 000000000..0a7627cdb --- /dev/null +++ b/arch/s390/mm/mmap.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * flexible mmap layout support + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * Started by Ingo Molnar <mingo@elte.hu> + */ + +#include <linux/elf-randomize.h> +#include <linux/personality.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> +#include <linux/random.h> +#include <linux/compat.h> +#include <linux/security.h> +#include <asm/pgalloc.h> +#include <asm/elf.h> + +static unsigned long stack_maxrandom_size(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + if (current->personality & ADDR_NO_RANDOMIZE) + return 0; + return STACK_RND_MASK << PAGE_SHIFT; +} + +/* + * Top of mmap area (just below the process stack). + * + * Leave at least a ~32 MB hole. + */ +#define MIN_GAP (32*1024*1024) +#define MAX_GAP (STACK_TOP/6*5) + +static inline int mmap_is_legacy(struct rlimit *rlim_stack) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + if (rlim_stack->rlim_cur == RLIM_INFINITY) + return 1; + return sysctl_legacy_va_layout; +} + +unsigned long arch_mmap_rnd(void) +{ + return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; +} + +static unsigned long mmap_base_legacy(unsigned long rnd) +{ + return TASK_UNMAPPED_BASE + rnd; +} + +static inline unsigned long mmap_base(unsigned long rnd, + struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + gap &= PAGE_MASK; + return STACK_TOP - stack_maxrandom_size() - rnd - gap; +} + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + int rc; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + if (filp || (flags & MAP_SHARED)) + info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; + else + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (addr & ~PAGE_MASK) + return addr; + +check_asce_limit: + if (addr + len > current->mm->context.asce_limit && + addr + len <= TASK_SIZE) { + rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + + return addr; +} + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + int rc; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + goto check_asce_limit; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) + goto check_asce_limit; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base; + if (filp || (flags & MAP_SHARED)) + info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; + else + info.align_mask = 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + if (addr & ~PAGE_MASK) + return addr; + } + +check_asce_limit: + if (addr + len > current->mm->context.asce_limit && + addr + len <= TASK_SIZE) { + rc = crst_table_upgrade(mm, addr + len); + if (rc) + return (unsigned long) rc; + } + + return addr; +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + /* + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = mmap_base_legacy(random_factor); + mm->get_unmapped_area = arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + } +} diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c new file mode 100644 index 000000000..dc3cede7f --- /dev/null +++ b/arch/s390/mm/page-states.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2008 + * + * Guest page hinting for unused pages. + * + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/memblock.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <asm/facility.h> +#include <asm/page-states.h> + +static int cmma_flag = 1; + +static int __init cmma(char *str) +{ + char *parm; + + parm = strstrip(str); + if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) { + cmma_flag = 1; + return 1; + } + cmma_flag = 0; + if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0) + return 1; + return 0; +} +__setup("cmma=", cmma); + +static inline int cmma_test_essa(void) +{ + register unsigned long tmp asm("0") = 0; + register int rc asm("1"); + + /* test ESSA_GET_STATE */ + asm volatile( + " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=&d" (rc), "+&d" (tmp) + : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); + return rc; +} + +void __init cmma_init(void) +{ + if (!cmma_flag) + return; + if (cmma_test_essa()) { + cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; +} + +static inline unsigned char get_page_state(struct page *page) +{ + unsigned char state; + + asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (state) + : "a" (page_to_phys(page)), + "i" (ESSA_GET_STATE)); + return state & 0x3f; +} + +static inline void set_page_unused(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_UNUSED)); +} + +static inline void set_page_stable_dat(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE)); +} + +static inline void set_page_stable_nodat(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE_NODAT)); +} + +static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || pmd_large(*pmd)) + continue; + page = virt_to_page(pmd_val(*pmd)); + set_bit(PG_arch_1, &page->flags); + } while (pmd++, addr = next, addr != end); +} + +static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pud_t *pud; + int i; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) || pud_large(*pud)) + continue; + if (!pud_folded(*pud)) { + page = virt_to_page(pud_val(*pud)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pmd(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) + continue; + if (!p4d_folded(*p4d)) { + page = virt_to_page(p4d_val(*p4d)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pud(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +static void mark_kernel_pgd(void) +{ + unsigned long addr, next; + struct page *page; + pgd_t *pgd; + int i; + + addr = 0; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, MODULES_END); + if (pgd_none(*pgd)) + continue; + if (!pgd_folded(*pgd)) { + page = virt_to_page(pgd_val(*pgd)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_p4d(pgd, addr, next); + } while (pgd++, addr = next, addr != MODULES_END); +} + +void __init cmma_init_nodat(void) +{ + struct memblock_region *reg; + struct page *page; + unsigned long start, end, ix; + + if (cmma_flag < 2) + return; + /* Mark pages used in kernel page tables */ + mark_kernel_pgd(); + + /* Set all kernel pages not used for page tables to stable/no-dat */ + for_each_memblock(memory, reg) { + start = memblock_region_memory_base_pfn(reg); + end = memblock_region_memory_end_pfn(reg); + page = pfn_to_page(start); + for (ix = start; ix < end; ix++, page++) { + if (__test_and_clear_bit(PG_arch_1, &page->flags)) + continue; /* skip page table pages */ + if (!list_empty(&page->lru)) + continue; /* skip free pages */ + set_page_stable_nodat(page, 0); + } + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_unused(page, order); +} + +void arch_alloc_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + if (cmma_flag < 2) + set_page_stable_dat(page, order); + else + set_page_stable_nodat(page, order); +} + +void arch_set_page_dat(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_stable_dat(page, order); +} + +void arch_set_page_nodat(struct page *page, int order) +{ + if (cmma_flag < 2) + return; + set_page_stable_nodat(page, order); +} + +int arch_test_page_nodat(struct page *page) +{ + unsigned char state; + + if (cmma_flag < 2) + return 0; + state = get_page_state(page); + return !!(state & 0x20); +} + +void arch_set_page_states(int make_stable) +{ + unsigned long flags, order, t; + struct list_head *l; + struct page *page; + struct zone *zone; + + if (!cmma_flag) + return; + if (make_stable) + drain_local_pages(NULL); + for_each_populated_zone(zone) { + spin_lock_irqsave(&zone->lock, flags); + for_each_migratetype_order(order, t) { + list_for_each(l, &zone->free_area[order].free_list[t]) { + page = list_entry(l, struct page, lru); + if (make_stable) + set_page_stable_dat(page, order); + else + set_page_unused(page, order); + } + } + spin_unlock_irqrestore(&zone->lock, flags); + } +} diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c new file mode 100644 index 000000000..f8c6faab4 --- /dev/null +++ b/arch/s390/mm/pageattr.c @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2011 + * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> + */ +#include <linux/hugetlb.h> +#include <linux/mm.h> +#include <asm/cacheflush.h> +#include <asm/facility.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/page.h> +#include <asm/set_memory.h> + +static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) +{ + asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" + : [addr] "+a" (addr) : [skey] "d" (skey)); + return addr; +} + +void __storage_key_init_range(unsigned long start, unsigned long end) +{ + unsigned long boundary, size; + + while (start < end) { + if (MACHINE_HAS_EDAT1) { + /* set storage keys for a 1MB frame */ + size = 1UL << 20; + boundary = (start + size) & ~(size - 1); + if (boundary <= end) { + do { + start = sske_frame(start, PAGE_DEFAULT_KEY); + } while (start < boundary); + continue; + } + } + page_set_storage_key(start, PAGE_DEFAULT_KEY, 1); + start += PAGE_SIZE; + } +} + +#ifdef CONFIG_PROC_FS +atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; + +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2); + seq_printf(m, "DirectMap1M: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10); + seq_printf(m, "DirectMap2G: %8lu kB\n", + atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21); +} +#endif /* CONFIG_PROC_FS */ + +static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, + unsigned long dtt) +{ + unsigned long table, mask; + + mask = 0; + if (MACHINE_HAS_EDAT2) { + switch (dtt) { + case CRDTE_DTT_REGION3: + mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1); + break; + case CRDTE_DTT_SEGMENT: + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + break; + case CRDTE_DTT_PAGE: + mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); + break; + } + table = (unsigned long)old & mask; + crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); + } else if (MACHINE_HAS_IDTE) { + cspg(old, *old, new); + } else { + csp((unsigned int *)old + 1, *old, new); + } +} + +static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, + unsigned long flags) +{ + pte_t *ptep, new; + + ptep = pte_offset(pmdp, addr); + do { + new = *ptep; + if (pte_none(new)) + return -EINVAL; + if (flags & SET_MEMORY_RO) + new = pte_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pte_mkwrite(pte_mkdirty(new)); + if (flags & SET_MEMORY_NX) + pte_val(new) |= _PAGE_NOEXEC; + else if (flags & SET_MEMORY_X) + pte_val(new) &= ~_PAGE_NOEXEC; + pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); + ptep++; + addr += PAGE_SIZE; + cond_resched(); + } while (addr < end); + return 0; +} + +static int split_pmd_page(pmd_t *pmdp, unsigned long addr) +{ + unsigned long pte_addr, prot; + pte_t *pt_dir, *ptep; + pmd_t new; + int i, ro, nx; + + pt_dir = vmem_pte_alloc(); + if (!pt_dir) + return -ENOMEM; + pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT; + ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT); + nx = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_NOEXEC); + prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL); + if (!nx) + prot &= ~_PAGE_NOEXEC; + ptep = pt_dir; + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_val(*ptep) = pte_addr | prot; + pte_addr += PAGE_SIZE; + ptep++; + } + pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY; + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); + update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); + update_page_count(PG_DIRECT_MAP_1M, -1); + return 0; +} + +static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, + unsigned long flags) +{ + pmd_t new = *pmdp; + + if (flags & SET_MEMORY_RO) + new = pmd_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pmd_mkwrite(pmd_mkdirty(new)); + if (flags & SET_MEMORY_NX) + pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC; + else if (flags & SET_MEMORY_X) + pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC; + pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); +} + +static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + pmd_t *pmdp; + int rc = 0; + + pmdp = pmd_offset(pudp, addr); + do { + if (pmd_none(*pmdp)) + return -EINVAL; + next = pmd_addr_end(addr, end); + if (pmd_large(*pmdp)) { + if (addr & ~PMD_MASK || addr + PMD_SIZE > next) { + rc = split_pmd_page(pmdp, addr); + if (rc) + return rc; + continue; + } + modify_pmd_page(pmdp, addr, flags); + } else { + rc = walk_pte_level(pmdp, addr, next, flags); + if (rc) + return rc; + } + pmdp++; + addr = next; + cond_resched(); + } while (addr < end); + return rc; +} + +static int split_pud_page(pud_t *pudp, unsigned long addr) +{ + unsigned long pmd_addr, prot; + pmd_t *pm_dir, *pmdp; + pud_t new; + int i, ro, nx; + + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pm_dir) + return -ENOMEM; + pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; + ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT); + nx = !!(pud_val(*pudp) & _REGION_ENTRY_NOEXEC); + prot = pgprot_val(ro ? SEGMENT_KERNEL_RO : SEGMENT_KERNEL); + if (!nx) + prot &= ~_SEGMENT_ENTRY_NOEXEC; + pmdp = pm_dir; + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_val(*pmdp) = pmd_addr | prot; + pmd_addr += PMD_SIZE; + pmdp++; + } + pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY; + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); + update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); + update_page_count(PG_DIRECT_MAP_2G, -1); + return 0; +} + +static void modify_pud_page(pud_t *pudp, unsigned long addr, + unsigned long flags) +{ + pud_t new = *pudp; + + if (flags & SET_MEMORY_RO) + new = pud_wrprotect(new); + else if (flags & SET_MEMORY_RW) + new = pud_mkwrite(pud_mkdirty(new)); + if (flags & SET_MEMORY_NX) + pud_val(new) |= _REGION_ENTRY_NOEXEC; + else if (flags & SET_MEMORY_X) + pud_val(new) &= ~_REGION_ENTRY_NOEXEC; + pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); +} + +static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + pud_t *pudp; + int rc = 0; + + pudp = pud_offset(p4d, addr); + do { + if (pud_none(*pudp)) + return -EINVAL; + next = pud_addr_end(addr, end); + if (pud_large(*pudp)) { + if (addr & ~PUD_MASK || addr + PUD_SIZE > next) { + rc = split_pud_page(pudp, addr); + if (rc) + break; + continue; + } + modify_pud_page(pudp, addr, flags); + } else { + rc = walk_pmd_level(pudp, addr, next, flags); + } + pudp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + p4d_t *p4dp; + int rc = 0; + + p4dp = p4d_offset(pgd, addr); + do { + if (p4d_none(*p4dp)) + return -EINVAL; + next = p4d_addr_end(addr, end); + rc = walk_pud_level(p4dp, addr, next, flags); + p4dp++; + addr = next; + cond_resched(); + } while (addr < end && !rc); + return rc; +} + +static DEFINE_MUTEX(cpa_mutex); + +static int change_page_attr(unsigned long addr, unsigned long end, + unsigned long flags) +{ + unsigned long next; + int rc = -EINVAL; + pgd_t *pgdp; + + if (addr == end) + return 0; + if (end >= MODULES_END) + return -EINVAL; + mutex_lock(&cpa_mutex); + pgdp = pgd_offset_k(addr); + do { + if (pgd_none(*pgdp)) + break; + next = pgd_addr_end(addr, end); + rc = walk_p4d_level(pgdp, addr, next, flags); + if (rc) + break; + cond_resched(); + } while (pgdp++, addr = next, addr < end && !rc); + mutex_unlock(&cpa_mutex); + return rc; +} + +int __set_memory(unsigned long addr, int numpages, unsigned long flags) +{ + if (!MACHINE_HAS_NX) + flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); + if (!flags) + return 0; + addr &= PAGE_MASK; + return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static void ipte_range(pte_t *pte, unsigned long address, int nr) +{ + int i; + + if (test_facility(13)) { + __ptep_ipte_range(address, nr - 1, pte, IPTE_GLOBAL); + return; + } + for (i = 0; i < nr; i++) { + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); + address += PAGE_SIZE; + pte++; + } +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long address; + int nr, i, j; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (i = 0; i < numpages;) { + address = page_to_phys(page + i); + pgd = pgd_offset_k(address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); + pte = pte_offset_kernel(pmd, address); + nr = (unsigned long)pte >> ilog2(sizeof(long)); + nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); + nr = min(numpages - i, nr); + if (enable) { + for (j = 0; j < nr; j++) { + pte_val(*pte) &= ~_PAGE_INVALID; + address += PAGE_SIZE; + pte++; + } + } else { + ipte_range(pte, address, nr); + } + i += nr; + } +} + +#ifdef CONFIG_HIBERNATION +bool kernel_page_present(struct page *page) +{ + unsigned long addr; + int cc; + + addr = page_to_phys(page); + asm volatile( + " lra %1,0(%1)\n" + " ipm %0\n" + " srl %0,28" + : "=d" (cc), "+a" (addr) : : "cc"); + return cc == 0; +} +#endif /* CONFIG_HIBERNATION */ + +#endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c new file mode 100644 index 000000000..3f3c13a4d --- /dev/null +++ b/arch/s390/mm/pgalloc.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Page table allocation functions + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> +#include <asm/gmap.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +#ifdef CONFIG_PGSTE + +static int page_table_allocate_pgste_min = 0; +static int page_table_allocate_pgste_max = 1; +int page_table_allocate_pgste = 0; +EXPORT_SYMBOL(page_table_allocate_pgste); + +static struct ctl_table page_table_sysctl[] = { + { + .procname = "allocate_pgste", + .data = &page_table_allocate_pgste, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec_minmax, + .extra1 = &page_table_allocate_pgste_min, + .extra2 = &page_table_allocate_pgste_max, + }, + { } +}; + +static struct ctl_table page_table_sysctl_dir[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = page_table_sysctl, + }, + { } +}; + +static int __init page_table_register_sysctl(void) +{ + return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; +} +__initcall(page_table_register_sysctl); + +#endif /* CONFIG_PGSTE */ + +unsigned long *crst_table_alloc(struct mm_struct *mm) +{ + struct page *page = alloc_pages(GFP_KERNEL, 2); + + if (!page) + return NULL; + arch_set_page_dat(page, 2); + return (unsigned long *) page_to_phys(page); +} + +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + free_pages((unsigned long) table, 2); +} + +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + + /* we must change all active ASCEs to avoid the creation of new TLBs */ + if (current->active_mm == mm) { + S390_lowcore.user_asce = mm->context.asce; + if (current->thread.mm_segment == USER_DS) { + __ctl_load(S390_lowcore.user_asce, 1, 1); + /* Mark user-ASCE present in CR1 */ + clear_cpu_flag(CIF_ASCE_PRIMARY); + } + if (current->thread.mm_segment == USER_DS_SACF) { + __ctl_load(S390_lowcore.user_asce, 7, 7); + /* enable_sacf_uaccess does all or nothing */ + WARN_ON(!test_cpu_flag(CIF_ASCE_SECONDARY)); + } + } + __tlb_flush_local(); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long end) +{ + unsigned long *table, *pgd; + int rc, notify; + + /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ + VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); + rc = 0; + notify = 0; + while (mm->context.asce_limit < end) { + table = crst_table_alloc(mm); + if (!table) { + rc = -ENOMEM; + break; + } + spin_lock_bh(&mm->page_table_lock); + pgd = (unsigned long *) mm->pgd; + if (mm->context.asce_limit == _REGION2_SIZE) { + crst_table_init(table, _REGION2_ENTRY_EMPTY); + p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); + mm->pgd = (pgd_t *) table; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } else { + crst_table_init(table, _REGION1_ENTRY_EMPTY); + pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); + mm->pgd = (pgd_t *) table; + mm->context.asce_limit = -PAGE_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; + } + notify = 1; + spin_unlock_bh(&mm->page_table_lock); + } + if (notify) + on_each_cpu(__crst_table_upgrade, mm, 0); + return rc; +} + +void crst_table_downgrade(struct mm_struct *mm) +{ + pgd_t *pgd; + + /* downgrade should only happen from 3 to 2 levels (compat only) */ + VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); + + if (current->active_mm == mm) { + clear_user_asce(); + __tlb_flush_mm(mm); + } + + pgd = mm->pgd; + mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); + mm->context.asce_limit = _REGION3_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; + crst_table_free(mm, (unsigned long *) pgd); + + if (current->active_mm == mm) + set_user_asce(mm); +} + +static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +{ + unsigned int old, new; + + do { + old = atomic_read(v); + new = old ^ bits; + } while (atomic_cmpxchg(v, old, new) != old); + return new; +} + +#ifdef CONFIG_PGSTE + +struct page *page_table_alloc_pgste(struct mm_struct *mm) +{ + struct page *page; + u64 *table; + + page = alloc_page(GFP_KERNEL); + if (page) { + table = (u64 *)page_to_phys(page); + memset64(table, _PAGE_INVALID, PTRS_PER_PTE); + memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + } + return page; +} + +void page_table_free_pgste(struct page *page) +{ + __free_page(page); +} + +#endif /* CONFIG_PGSTE */ + +/* + * page table entry allocation/free routines. + */ +unsigned long *page_table_alloc(struct mm_struct *mm) +{ + unsigned long *table; + struct page *page; + unsigned int mask, bit; + + /* Try to get a fragment of a 4K page as a 2K page table */ + if (!mm_alloc_pgste(mm)) { + table = NULL; + spin_lock_bh(&mm->context.lock); + if (!list_empty(&mm->context.pgtable_list)) { + page = list_first_entry(&mm->context.pgtable_list, + struct page, lru); + mask = atomic_read(&page->_refcount) >> 24; + mask = (mask | (mask >> 4)) & 3; + if (mask != 3) { + table = (unsigned long *) page_to_phys(page); + bit = mask & 1; /* =1 -> second 2K */ + if (bit) + table += PTRS_PER_PTE; + atomic_xor_bits(&page->_refcount, + 1U << (bit + 24)); + list_del(&page->lru); + } + } + spin_unlock_bh(&mm->context.lock); + if (table) + return table; + } + /* Allocate a fresh page */ + page = alloc_page(GFP_KERNEL); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + arch_set_page_dat(page, 0); + /* Initialize page table */ + table = (unsigned long *) page_to_phys(page); + if (mm_alloc_pgste(mm)) { + /* Return 4K page table with PGSTEs */ + atomic_xor_bits(&page->_refcount, 3 << 24); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); + } else { + /* Return the first 2K fragment of the page */ + atomic_xor_bits(&page->_refcount, 1 << 24); + memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); + spin_lock_bh(&mm->context.lock); + list_add(&page->lru, &mm->context.pgtable_list); + spin_unlock_bh(&mm->context.lock); + } + return table; +} + +void page_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct page *page; + unsigned int bit, mask; + + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (!mm_alloc_pgste(mm)) { + /* Free 2K page table fragment of a 4K page */ + bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.lock); + mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask >>= 24; + if (mask & 3) + list_add(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.lock); + mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask >>= 24; + if (mask != 0) + return; + } else { + atomic_xor_bits(&page->_refcount, 3U << 24); + } + + pgtable_page_dtor(page); + __free_page(page); +} + +void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, + unsigned long vmaddr) +{ + struct mm_struct *mm; + struct page *page; + unsigned int bit, mask; + + mm = tlb->mm; + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (mm_alloc_pgste(mm)) { + gmap_unlink(mm, table, vmaddr); + table = (unsigned long *) (__pa(table) | 3); + tlb_remove_table(tlb, table); + return; + } + bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.lock); + mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask >>= 24; + if (mask & 3) + list_add_tail(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.lock); + table = (unsigned long *) (__pa(table) | (1U << bit)); + tlb_remove_table(tlb, table); +} + +static void __tlb_remove_table(void *_table) +{ + unsigned int mask = (unsigned long) _table & 3; + void *table = (void *)((unsigned long) _table ^ mask); + struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + + switch (mask) { + case 0: /* pmd, pud, or p4d */ + free_pages((unsigned long) table, 2); + break; + case 1: /* lower 2K of a 4K page table */ + case 2: /* higher 2K of a 4K page table */ + mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask >>= 24; + if (mask != 0) + break; + /* fallthrough */ + case 3: /* 4K page table with pgstes */ + if (mask & 3) + atomic_xor_bits(&page->_refcount, 3 << 24); + pgtable_page_dtor(page); + __free_page(page); + break; + } +} + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely + * on IRQ disabling. See the comment near struct mmu_table_batch. + */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + __tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + struct mmu_table_batch *batch; + int i; + + batch = container_of(head, struct mmu_table_batch, rcu); + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + tlb->mm->context.flush_mm = 1; + if (*batch == NULL) { + *batch = (struct mmu_table_batch *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + __tlb_flush_mm_lazy(tlb->mm); + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_flush_mmu(tlb); +} + +/* + * Base infrastructure required to generate basic asces, region, segment, + * and page tables that do not make use of enhanced features like EDAT1. + */ + +static struct kmem_cache *base_pgt_cache; + +static unsigned long base_pgt_alloc(void) +{ + u64 *table; + + table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); + if (table) + memset64(table, _PAGE_INVALID, PTRS_PER_PTE); + return (unsigned long) table; +} + +static void base_pgt_free(unsigned long table) +{ + kmem_cache_free(base_pgt_cache, (void *) table); +} + +static unsigned long base_crst_alloc(unsigned long val) +{ + unsigned long table; + + table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + if (table) + crst_table_init((unsigned long *)table, val); + return table; +} + +static void base_crst_free(unsigned long table) +{ + free_pages(table, CRST_ALLOC_ORDER); +} + +#define BASE_ADDR_END_FUNC(NAME, SIZE) \ +static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \ + unsigned long end) \ +{ \ + unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \ + \ + return (next - 1) < (end - 1) ? next : end; \ +} + +BASE_ADDR_END_FUNC(page, _PAGE_SIZE) +BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE) +BASE_ADDR_END_FUNC(region3, _REGION3_SIZE) +BASE_ADDR_END_FUNC(region2, _REGION2_SIZE) +BASE_ADDR_END_FUNC(region1, _REGION1_SIZE) + +static inline unsigned long base_lra(unsigned long address) +{ + unsigned long real; + + asm volatile( + " lra %0,0(%1)\n" + : "=d" (real) : "a" (address) : "cc"); + return real; +} + +static int base_page_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *pte, next; + + if (!alloc) + return 0; + pte = (unsigned long *) origin; + pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; + do { + next = base_page_addr_end(addr, end); + *pte = base_lra(addr); + } while (pte++, addr = next, addr < end); + return 0; +} + +static int base_segment_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *ste, next, table; + int rc; + + ste = (unsigned long *) origin; + ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; + do { + next = base_segment_addr_end(addr, end); + if (*ste & _SEGMENT_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_pgt_alloc(); + if (!table) + return -ENOMEM; + *ste = table | _SEGMENT_ENTRY; + } + table = *ste & _SEGMENT_ENTRY_ORIGIN; + rc = base_page_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_pgt_free(table); + cond_resched(); + } while (ste++, addr = next, addr < end); + return 0; +} + +static int base_region3_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rtte, next, table; + int rc; + + rtte = (unsigned long *) origin; + rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; + do { + next = base_region3_addr_end(addr, end); + if (*rtte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rtte = table | _REGION3_ENTRY; + } + table = *rtte & _REGION_ENTRY_ORIGIN; + rc = base_segment_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rtte++, addr = next, addr < end); + return 0; +} + +static int base_region2_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rste, next, table; + int rc; + + rste = (unsigned long *) origin; + rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; + do { + next = base_region2_addr_end(addr, end); + if (*rste & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rste = table | _REGION2_ENTRY; + } + table = *rste & _REGION_ENTRY_ORIGIN; + rc = base_region3_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rste++, addr = next, addr < end); + return 0; +} + +static int base_region1_walk(unsigned long origin, unsigned long addr, + unsigned long end, int alloc) +{ + unsigned long *rfte, next, table; + int rc; + + rfte = (unsigned long *) origin; + rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; + do { + next = base_region1_addr_end(addr, end); + if (*rfte & _REGION_ENTRY_INVALID) { + if (!alloc) + continue; + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return -ENOMEM; + *rfte = table | _REGION1_ENTRY; + } + table = *rfte & _REGION_ENTRY_ORIGIN; + rc = base_region2_walk(table, addr, next, alloc); + if (rc) + return rc; + if (!alloc) + base_crst_free(table); + } while (rfte++, addr = next, addr < end); + return 0; +} + +/** + * base_asce_free - free asce and tables returned from base_asce_alloc() + * @asce: asce to be freed + * + * Frees all region, segment, and page tables that were allocated with a + * corresponding base_asce_alloc() call. + */ +void base_asce_free(unsigned long asce) +{ + unsigned long table = asce & _ASCE_ORIGIN; + + if (!asce) + return; + switch (asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_SEGMENT: + base_segment_walk(table, 0, _REGION3_SIZE, 0); + break; + case _ASCE_TYPE_REGION3: + base_region3_walk(table, 0, _REGION2_SIZE, 0); + break; + case _ASCE_TYPE_REGION2: + base_region2_walk(table, 0, _REGION1_SIZE, 0); + break; + case _ASCE_TYPE_REGION1: + base_region1_walk(table, 0, -_PAGE_SIZE, 0); + break; + } + base_crst_free(table); +} + +static int base_pgt_cache_init(void) +{ + static DEFINE_MUTEX(base_pgt_cache_mutex); + unsigned long sz = _PAGE_TABLE_SIZE; + + if (base_pgt_cache) + return 0; + mutex_lock(&base_pgt_cache_mutex); + if (!base_pgt_cache) + base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL); + mutex_unlock(&base_pgt_cache_mutex); + return base_pgt_cache ? 0 : -ENOMEM; +} + +/** + * base_asce_alloc - create kernel mapping without enhanced DAT features + * @addr: virtual start address of kernel mapping + * @num_pages: number of consecutive pages + * + * Generate an asce, including all required region, segment and page tables, + * that can be used to access the virtual kernel mapping. The difference is + * that the returned asce does not make use of any enhanced DAT features like + * e.g. large pages. This is required for some I/O functions that pass an + * asce, like e.g. some service call requests. + * + * Note: the returned asce may NEVER be attached to any cpu. It may only be + * used for I/O requests. tlb entries that might result because the + * asce was attached to a cpu won't be cleared. + */ +unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) +{ + unsigned long asce, table, end; + int rc; + + if (base_pgt_cache_init()) + return 0; + end = addr + num_pages * PAGE_SIZE; + if (end <= _REGION3_SIZE) { + table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_segment_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION2_SIZE) { + table = base_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region3_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + } else if (end <= _REGION1_SIZE) { + table = base_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region2_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + } else { + table = base_crst_alloc(_REGION1_ENTRY_EMPTY); + if (!table) + return 0; + rc = base_region1_walk(table, addr, end, 1); + asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + } + if (rc) { + base_asce_free(asce); + asce = 0; + } + return asce; +} diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c new file mode 100644 index 000000000..9f3903089 --- /dev/null +++ b/arch/s390/mm/pgtable.c @@ -0,0 +1,1126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2007, 2011 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/rcupdate.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/sysctl.h> +#include <linux/ksm.h> +#include <linux/mman.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/page-states.h> + +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); + } +} + +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int nodat) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL || nodat) + opt |= IPTE_NODAT; + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + } +} + +static inline pte_t ptep_flush_direct(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) +{ + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + ptep_ipte_local(mm, addr, ptep, nodat); + else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; +} + +static inline pte_t ptep_flush_lazy(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + int nodat) +{ + pte_t old; + + old = *ptep; + if (unlikely(pte_val(old) & _PAGE_INVALID)) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + pte_val(*ptep) |= _PAGE_INVALID; + mm->context.flush_mm = 1; + } else + ptep_ipte_global(mm, addr, ptep, nodat); + atomic_dec(&mm->context.flush_count); + return old; +} + +static inline pgste_t pgste_get_lock(pte_t *ptep) +{ + unsigned long new = 0; +#ifdef CONFIG_PGSTE + unsigned long old; + + asm( + " lg %0,%2\n" + "0: lgr %1,%0\n" + " nihh %0,0xff7f\n" /* clear PCL bit in old */ + " oihh %1,0x0080\n" /* set PCL bit in new */ + " csg %0,%1,%2\n" + " jl 0b\n" + : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) + : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); +#endif + return __pgste(new); +} + +static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + asm( + " nihh %1,0xff7f\n" /* clear PCL bit */ + " stg %1,%0\n" + : "=Q" (ptep[PTRS_PER_PTE]) + : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) + : "cc", "memory"); +#endif +} + +static inline pgste_t pgste_get(pte_t *ptep) +{ + unsigned long pgste = 0; +#ifdef CONFIG_PGSTE + pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); +#endif + return __pgste(pgste); +} + +static inline void pgste_set(pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; +#endif +} + +static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, + struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + unsigned long address, bits, skey; + + if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) + return pgste; + address = pte_val(pte) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(address); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + /* Transfer page changed & referenced bit to guest bits in pgste */ + pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */ + /* Copy page access key and fetch protection bit to pgste */ + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; +#endif + return pgste; + +} + +static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, + struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + unsigned long address; + unsigned long nkey; + + if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) + return; + VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); + address = pte_val(entry) & PAGE_MASK; + /* + * Set page access key and fetch protection bit from pgste. + * The guest C/R information is still in the PGSTE, set real + * key C/R to 0. + */ + nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + page_set_storage_key(address, nkey, 0); +#endif +} + +static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) +{ +#ifdef CONFIG_PGSTE + if ((pte_val(entry) & _PAGE_PRESENT) && + (pte_val(entry) & _PAGE_WRITE) && + !(pte_val(entry) & _PAGE_INVALID)) { + if (!MACHINE_HAS_ESOP) { + /* + * Without enhanced suppression-on-protection force + * the dirty bit on for all writable ptes. + */ + pte_val(entry) |= _PAGE_DIRTY; + pte_val(entry) &= ~_PAGE_PROTECT; + } + if (!(pte_val(entry) & _PAGE_PROTECT)) + /* This pte allows write access, set user-dirty */ + pgste_val(pgste) |= PGSTE_UC_BIT; + } +#endif + *ptep = entry; + return pgste; +} + +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) +{ +#ifdef CONFIG_PGSTE + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify(mm, addr, ptep, bits); + } +#endif + return pgste; +} + +static inline pgste_t ptep_xchg_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pgste_t pgste = __pgste(0); + + if (mm_has_pgste(mm)) { + pgste = pgste_get_lock(ptep); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); + } + return pgste; +} + +static inline pte_t ptep_xchg_commit(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pgste_t pgste, pte_t old, pte_t new) +{ + if (mm_has_pgste(mm)) { + if (pte_val(old) & _PAGE_INVALID) + pgste_set_key(ptep, pgste, new, mm); + if (pte_val(new) & _PAGE_INVALID) { + pgste = pgste_update_all(old, pgste, mm); + if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == + _PGSTE_GPS_USAGE_UNUSED) + pte_val(old) |= _PAGE_UNUSED; + } + pgste = pgste_set_pte(ptep, pgste, new); + pgste_set_unlock(ptep, pgste); + } else { + *ptep = new; + } + return old; +} + +pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) +{ + pgste_t pgste; + pte_t old; + int nodat; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(ptep_xchg_direct); + +pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t new) +{ + pgste_t pgste; + pte_t old; + int nodat; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + preempt_enable(); + return old; +} +EXPORT_SYMBOL(ptep_xchg_lazy); + +pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pgste_t pgste; + pte_t old; + int nodat; + + preempt_disable(); + pgste = ptep_xchg_start(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); + if (mm_has_pgste(mm)) { + pgste = pgste_update_all(old, pgste, mm); + pgste_set(ptep, pgste); + } + return old; +} +EXPORT_SYMBOL(ptep_modify_prot_start); + +void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + pgste_t pgste; + + if (!MACHINE_HAS_NX) + pte_val(pte) &= ~_PAGE_NOEXEC; + if (mm_has_pgste(mm)) { + pgste = pgste_get(ptep); + pgste_set_key(ptep, pgste, pte, mm); + pgste = pgste_set_pte(ptep, pgste, pte); + pgste_set_unlock(ptep, pgste); + } else { + *ptep = pte; + } + preempt_enable(); +} +EXPORT_SYMBOL(ptep_modify_prot_commit); + +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_local(mm, addr); +} + +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) { + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else if (MACHINE_HAS_IDTE) { + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_idte_global(mm, addr); + } else { + __pmdp_csp(pmdp); + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) + gmap_pmdp_csp(mm, addr); + } +} + +static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pmdp_idte_local(mm, addr, pmdp); + else + pmdp_idte_global(mm, addr, pmdp); + atomic_dec(&mm->context.flush_count); + return old; +} + +static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old; + + old = *pmdp; + if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(&mm->context.cpu_attach_mask, + cpumask_of(smp_processor_id()))) { + pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; + mm->context.flush_mm = 1; + if (mm_has_pgste(mm)) + gmap_pmdp_invalidate(mm, addr); + } else { + pmdp_idte_global(mm, addr, pmdp); + } + atomic_dec(&mm->context.flush_count); + return old; +} + +#ifdef CONFIG_PGSTE +static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + return pmd; +} +#endif + +pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) +{ + pmd_t old; + + preempt_disable(); + old = pmdp_flush_direct(mm, addr, pmdp); + *pmdp = new; + preempt_enable(); + return old; +} +EXPORT_SYMBOL(pmdp_xchg_direct); + +pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t new) +{ + pmd_t old; + + preempt_disable(); + old = pmdp_flush_lazy(mm, addr, pmdp); + *pmdp = new; + preempt_enable(); + return old; +} +EXPORT_SYMBOL(pmdp_xchg_lazy); + +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); +} + +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); + else + /* + * Invalid bit position is the same for pmd and pud, so we can + * re-use _pmd_csp() here + */ + __pmdp_csp((pmd_t *) pudp); +} + +static inline pud_t pudp_flush_direct(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old; + + old = *pudp; + if (pud_val(old) & _REGION_ENTRY_INVALID) + return old; + atomic_inc(&mm->context.flush_count); + if (MACHINE_HAS_TLB_LC && + cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + pudp_idte_local(mm, addr, pudp); + else + pudp_idte_global(mm, addr, pudp); + atomic_dec(&mm->context.flush_count); + return old; +} + +pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t new) +{ + pud_t old; + + preempt_disable(); + old = pudp_flush_direct(mm, addr, pudp); + *pudp = new; + preempt_enable(); + return old; +} +EXPORT_SYMBOL(pudp_xchg_direct); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + struct list_head *lh; + pgtable_t pgtable; + pte_t *ptep; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + pte_val(*ptep) = _PAGE_INVALID; + ptep++; + pte_val(*ptep) = _PAGE_INVALID; + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_PGSTE +void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) +{ + pgste_t pgste; + + /* the mm_has_pgste() check is done in set_pte_at() */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~_PGSTE_GPS_ZERO; + pgste_set_key(ptep, pgste, entry, mm); + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pgste_t pgste; + + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste_val(pgste) |= PGSTE_IN_BIT; + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot, unsigned long bit) +{ + pte_t entry; + pgste_t pgste; + int pte_i, pte_p, nodat; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; + } + /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep, nodat); + pgste = pgste_update_all(entry, pgste, mm); + pte_val(entry) |= _PAGE_INVALID; + } + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep, nodat); + pte_val(entry) &= ~_PAGE_INVALID; + pte_val(entry) |= _PAGE_PROTECT; + } + pgste_val(pgste) |= bit; + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; +} + +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, pte_t pte) +{ + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !((pte_val(spte) & _PAGE_PROTECT) && + !(pte_val(pte) & _PAGE_PROTECT))) { + pgste_val(spgste) |= PGSTE_VSIE_BIT; + tpgste = pgste_get_lock(tptep); + pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + rc = 1; + } + pgste_set_unlock(sptep, spgste); + return rc; +} + +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) +{ + pgste_t pgste; + int nodat; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); +} + +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +{ + if (!non_swap_entry(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (is_migration_entry(entry)) { + struct page *page = migration_entry_to_page(entry); + + dec_mm_counter(mm, mm_counter(page)); + } + free_swap_and_cache(entry); +} + +void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int reset) +{ + unsigned long pgstev; + pgste_t pgste; + pte_t pte; + + /* Zap unused and logically-zero pages */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + pte = *ptep; + if (!reset && pte_swap(pte) && + ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || + (pgstev & _PGSTE_GPS_ZERO))) { + ptep_zap_swap_entry(mm, pte_to_swp_entry(pte)); + pte_clear(mm, addr, ptep); + } + if (reset) + pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + unsigned long ptev; + pgste_t pgste; + + /* Clear storage key ACC and F, but set R/C */ + preempt_disable(); + pgste = pgste_get_lock(ptep); + pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT); + pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; + ptev = pte_val(*ptep); + if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); + pgste_set_unlock(ptep, pgste); + preempt_enable(); +} + +/* + * Test and reset if a guest page is dirty + */ +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pgste_t pgste; + pte_t pte; + bool dirty; + int nodat; + + pgste = pgste_get_lock(ptep); + dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); + pgste_val(pgste) &= ~PGSTE_UC_BIT; + pte = *ptep; + if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { + pgste = pgste_pte_notify(mm, addr, ptep, pgste); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); + if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) + pte_val(pte) |= _PAGE_PROTECT; + else + pte_val(pte) |= _PAGE_INVALID; + *ptep = pte; + } + pgste_set_unlock(ptep, pgste); + return dirty; +} +EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); + +int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, bool nq) +{ + unsigned long keyul, paddr; + spinlock_t *ptl; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; + + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + /* + * Huge pmds need quiescing operations, they are + * always mapped. + */ + page_set_storage_key(paddr, key, 1); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + + new = old = pgste_get_lock(ptep); + pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | + PGSTE_ACC_BITS | PGSTE_FP_BIT); + keyul = (unsigned long) key; + pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; + pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + unsigned long bits, skey; + + paddr = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(paddr); + bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); + skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); + /* Set storage key ACC and FP */ + page_set_storage_key(paddr, skey, !nq); + /* Merge host changed & referenced into pgste */ + pgste_val(new) |= bits << 52; + } + /* changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & + (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(set_guest_storage_key); + +/** + * Conditionally set a guest storage key (handling csske). + * oldkey will be updated when either mr or mc is set and a pointer is given. + * + * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest + * storage key was updated and -EFAULT on access errors. + */ +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc) +{ + unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; + int rc; + + /* we can drop the pgste lock between getting and setting the key */ + if (mr | mc) { + rc = get_guest_storage_key(current->mm, addr, &tmp); + if (rc) + return rc; + if (oldkey) + *oldkey = tmp; + if (!mr) + mask |= _PAGE_REFERENCED; + if (!mc) + mask |= _PAGE_CHANGED; + if (!((tmp ^ key) & mask)) + return 0; + } + rc = set_guest_storage_key(current->mm, addr, key, nq); + return rc < 0 ? rc : 1; +} +EXPORT_SYMBOL(cond_set_guest_storage_key); + +/** + * Reset a guest reference bit (rrbe), returning the reference and changed bit. + * + * Returns < 0 in case of error, otherwise the cc to be reported to the guest. + */ +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) +{ + spinlock_t *ptl; + unsigned long paddr; + pgste_t old, new; + pmd_t *pmdp; + pte_t *ptep; + int cc = 0; + + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + cc = page_reset_referenced(paddr); + spin_unlock(ptl); + return cc; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + + new = old = pgste_get_lock(ptep); + /* Reset guest reference bit only */ + pgste_val(new) &= ~PGSTE_GR_BIT; + + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + paddr = pte_val(*ptep) & PAGE_MASK; + cc = page_reset_referenced(paddr); + /* Merge real referenced bit into host-set */ + pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; + } + /* Reflect guest's logical view, not physical */ + cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; + /* Changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return cc; +} +EXPORT_SYMBOL(reset_guest_reference_bit); + +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key) +{ + unsigned long paddr; + spinlock_t *ptl; + pgste_t pgste; + pmd_t *pmdp; + pte_t *ptep; + + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + /* Not yet mapped memory has a zero key */ + spin_unlock(ptl); + *key = 0; + return 0; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + *key = page_get_storage_key(paddr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + + pgste = pgste_get_lock(ptep); + *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + paddr = pte_val(*ptep) & PAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) + *key = page_get_storage_key(paddr); + /* Reflect guest's logical view, not physical */ + *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(get_guest_storage_key); + +/** + * pgste_perform_essa - perform ESSA actions on the PGSTE. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @orc: the specific action to perform, see the ESSA_SET_* macros. + * @oldpte: the PTE will be saved there if the pointer is not NULL. + * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. + * + * Return: 1 if the page is to be added to the CBRL, otherwise 0, + * or < 0 in case of error. -EINVAL is returned for invalid values + * of orc, -EFAULT for invalid addresses. + */ +int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste) +{ + struct vm_area_struct *vma; + unsigned long pgstev; + spinlock_t *ptl; + pgste_t pgste; + pte_t *ptep; + int res = 0; + + WARN_ON_ONCE(orc > ESSA_MAX); + if (unlikely(orc > ESSA_MAX)) + return -EINVAL; + + vma = find_vma(mm, hva); + if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + pgste = pgste_get_lock(ptep); + pgstev = pgste_val(pgste); + if (oldpte) + *oldpte = pte_val(*ptep); + if (oldpgste) + *oldpgste = pgstev; + + switch (orc) { + case ESSA_GET_STATE: + break; + case ESSA_SET_STABLE: + pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); + pgstev |= _PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_UNUSED: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_UNUSED; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + if (pte_val(*ptep) & _PAGE_INVALID) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; + break; + } + if (pgstev & _PGSTE_GPS_ZERO) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + break; + } + if (!(pgstev & PGSTE_GC_BIT)) { + pgstev |= _PGSTE_GPS_USAGE_VOLATILE; + res = 1; + break; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. + */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE; + } + break; + case ESSA_SET_STABLE_NODAT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; + break; + default: + /* we should never get here! */ + break; + } + /* If we are discarding a page, set it to logical zero */ + if (res) + pgstev |= _PGSTE_GPS_ZERO; + + pgste_val(pgste) = pgstev; + pgste_set_unlock(ptep, pgste); + pte_unmap_unlock(ptep, ptl); + return res; +} +EXPORT_SYMBOL(pgste_perform_essa); + +/** + * set_pgste_bits - set specific PGSTE bits. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @bits: a bitmask representing the bits that will be touched + * @value: the values of the bits to be written. Only the bits in the mask + * will be written. + * + * Return: 0 on success, < 0 in case of error. + */ +int set_pgste_bits(struct mm_struct *mm, unsigned long hva, + unsigned long bits, unsigned long value) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pgste_t new; + pte_t *ptep; + + vma = find_vma(mm, hva); + if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + new = pgste_get_lock(ptep); + + pgste_val(new) &= ~bits; + pgste_val(new) |= value & bits; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(set_pgste_bits); + +/** + * get_pgste - get the current PGSTE for the given address. + * @mm: the memory context. It must have PGSTEs, no check is performed here! + * @hva: the host virtual address of the page whose PGSTE is to be processed + * @pgstep: will be written with the current PGSTE for the given address. + * + * Return: 0 on success, < 0 in case of error. + */ +int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pte_t *ptep; + + vma = find_vma(mm, hva); + if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) + return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + *pgstep = pgste_val(pgste_get(ptep)); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(get_pgste); +#endif diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c new file mode 100644 index 000000000..db55561c5 --- /dev/null +++ b/arch/s390/mm/vmem.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2006 + * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> + */ + +#include <linux/bootmem.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> +#include <linux/memblock.h> +#include <asm/cacheflush.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/setup.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/set_memory.h> + +static DEFINE_MUTEX(vmem_mutex); + +struct memory_segment { + struct list_head list; + unsigned long start; + unsigned long size; +}; + +static LIST_HEAD(mem_segs); + +static void __ref *vmem_alloc_pages(unsigned int order) +{ + unsigned long size = PAGE_SIZE << order; + + if (slab_is_available()) + return (void *)__get_free_pages(GFP_KERNEL, order); + return (void *) memblock_alloc(size, size); +} + +void *vmem_crst_alloc(unsigned long val) +{ + unsigned long *table; + + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; +} + +pte_t __ref *vmem_pte_alloc(void) +{ + unsigned long size = PTRS_PER_PTE * sizeof(pte_t); + pte_t *pte; + + if (slab_is_available()) + pte = (pte_t *) page_table_alloc(&init_mm); + else + pte = (pte_t *) memblock_alloc(size, size); + if (!pte) + return NULL; + memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + return pte; +} + +/* + * Add a physical memory range to the 1:1 mapping. + */ +static int vmem_add_mem(unsigned long start, unsigned long size) +{ + unsigned long pgt_prot, sgt_prot, r3_prot; + unsigned long pages4k, pages1m, pages2g; + unsigned long end = start + size; + unsigned long address = start; + pgd_t *pg_dir; + p4d_t *p4_dir; + pud_t *pu_dir; + pmd_t *pm_dir; + pte_t *pt_dir; + int ret = -ENOMEM; + + pgt_prot = pgprot_val(PAGE_KERNEL); + sgt_prot = pgprot_val(SEGMENT_KERNEL); + r3_prot = pgprot_val(REGION3_KERNEL); + if (!MACHINE_HAS_NX) { + pgt_prot &= ~_PAGE_NOEXEC; + sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; + r3_prot &= ~_REGION_ENTRY_NOEXEC; + } + pages4k = pages1m = pages2g = 0; + while (address < end) { + pg_dir = pgd_offset_k(address); + if (pgd_none(*pg_dir)) { + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4_dir) + goto out; + pgd_populate(&init_mm, pg_dir, p4_dir); + } + p4_dir = p4d_offset(pg_dir, address); + if (p4d_none(*p4_dir)) { + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pu_dir) + goto out; + p4d_populate(&init_mm, p4_dir, pu_dir); + } + pu_dir = pud_offset(p4_dir, address); + if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && + !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && + !debug_pagealloc_enabled()) { + pud_val(*pu_dir) = address | r3_prot; + address += PUD_SIZE; + pages2g++; + continue; + } + if (pud_none(*pu_dir)) { + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pm_dir) + goto out; + pud_populate(&init_mm, pu_dir, pm_dir); + } + pm_dir = pmd_offset(pu_dir, address); + if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && + !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && + !debug_pagealloc_enabled()) { + pmd_val(*pm_dir) = address | sgt_prot; + address += PMD_SIZE; + pages1m++; + continue; + } + if (pmd_none(*pm_dir)) { + pt_dir = vmem_pte_alloc(); + if (!pt_dir) + goto out; + pmd_populate(&init_mm, pm_dir, pt_dir); + } + + pt_dir = pte_offset_kernel(pm_dir, address); + pte_val(*pt_dir) = address | pgt_prot; + address += PAGE_SIZE; + pages4k++; + } + ret = 0; +out: + update_page_count(PG_DIRECT_MAP_4K, pages4k); + update_page_count(PG_DIRECT_MAP_1M, pages1m); + update_page_count(PG_DIRECT_MAP_2G, pages2g); + return ret; +} + +/* + * Remove a physical memory range from the 1:1 mapping. + * Currently only invalidates page table entries. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) +{ + unsigned long pages4k, pages1m, pages2g; + unsigned long end = start + size; + unsigned long address = start; + pgd_t *pg_dir; + p4d_t *p4_dir; + pud_t *pu_dir; + pmd_t *pm_dir; + pte_t *pt_dir; + + pages4k = pages1m = pages2g = 0; + while (address < end) { + pg_dir = pgd_offset_k(address); + if (pgd_none(*pg_dir)) { + address += PGDIR_SIZE; + continue; + } + p4_dir = p4d_offset(pg_dir, address); + if (p4d_none(*p4_dir)) { + address += P4D_SIZE; + continue; + } + pu_dir = pud_offset(p4_dir, address); + if (pud_none(*pu_dir)) { + address += PUD_SIZE; + continue; + } + if (pud_large(*pu_dir)) { + pud_clear(pu_dir); + address += PUD_SIZE; + pages2g++; + continue; + } + pm_dir = pmd_offset(pu_dir, address); + if (pmd_none(*pm_dir)) { + address += PMD_SIZE; + continue; + } + if (pmd_large(*pm_dir)) { + pmd_clear(pm_dir); + address += PMD_SIZE; + pages1m++; + continue; + } + pt_dir = pte_offset_kernel(pm_dir, address); + pte_clear(&init_mm, address, pt_dir); + address += PAGE_SIZE; + pages4k++; + } + flush_tlb_kernel_range(start, end); + update_page_count(PG_DIRECT_MAP_4K, -pages4k); + update_page_count(PG_DIRECT_MAP_1M, -pages1m); + update_page_count(PG_DIRECT_MAP_2G, -pages2g); +} + +/* + * Add a backed mem_map array to the virtual mem_map array. + */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long pgt_prot, sgt_prot; + unsigned long address = start; + pgd_t *pg_dir; + p4d_t *p4_dir; + pud_t *pu_dir; + pmd_t *pm_dir; + pte_t *pt_dir; + int ret = -ENOMEM; + + pgt_prot = pgprot_val(PAGE_KERNEL); + sgt_prot = pgprot_val(SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) { + pgt_prot &= ~_PAGE_NOEXEC; + sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; + } + for (address = start; address < end;) { + pg_dir = pgd_offset_k(address); + if (pgd_none(*pg_dir)) { + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4_dir) + goto out; + pgd_populate(&init_mm, pg_dir, p4_dir); + } + + p4_dir = p4d_offset(pg_dir, address); + if (p4d_none(*p4_dir)) { + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pu_dir) + goto out; + p4d_populate(&init_mm, p4_dir, pu_dir); + } + + pu_dir = pud_offset(p4_dir, address); + if (pud_none(*pu_dir)) { + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pm_dir) + goto out; + pud_populate(&init_mm, pu_dir, pm_dir); + } + + pm_dir = pmd_offset(pu_dir, address); + if (pmd_none(*pm_dir)) { + /* Use 1MB frames for vmemmap if available. We always + * use large frames even if they are only partially + * used. + * Otherwise we would have also page tables since + * vmemmap_populate gets called for each section + * separately. */ + if (MACHINE_HAS_EDAT1) { + void *new_page; + + new_page = vmemmap_alloc_block(PMD_SIZE, node); + if (!new_page) + goto out; + pmd_val(*pm_dir) = __pa(new_page) | sgt_prot; + address = (address + PMD_SIZE) & PMD_MASK; + continue; + } + pt_dir = vmem_pte_alloc(); + if (!pt_dir) + goto out; + pmd_populate(&init_mm, pm_dir, pt_dir); + } else if (pmd_large(*pm_dir)) { + address = (address + PMD_SIZE) & PMD_MASK; + continue; + } + + pt_dir = pte_offset_kernel(pm_dir, address); + if (pte_none(*pt_dir)) { + void *new_page; + + new_page = vmemmap_alloc_block(PAGE_SIZE, node); + if (!new_page) + goto out; + pte_val(*pt_dir) = __pa(new_page) | pgt_prot; + } + address += PAGE_SIZE; + } + ret = 0; +out: + return ret; +} + +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ +} + +/* + * Add memory segment to the segment list if it doesn't overlap with + * an already present segment. + */ +static int insert_memory_segment(struct memory_segment *seg) +{ + struct memory_segment *tmp; + + if (seg->start + seg->size > VMEM_MAX_PHYS || + seg->start + seg->size < seg->start) + return -ERANGE; + + list_for_each_entry(tmp, &mem_segs, list) { + if (seg->start >= tmp->start + tmp->size) + continue; + if (seg->start + seg->size <= tmp->start) + continue; + return -ENOSPC; + } + list_add(&seg->list, &mem_segs); + return 0; +} + +/* + * Remove memory segment from the segment list. + */ +static void remove_memory_segment(struct memory_segment *seg) +{ + list_del(&seg->list); +} + +static void __remove_shared_memory(struct memory_segment *seg) +{ + remove_memory_segment(seg); + vmem_remove_range(seg->start, seg->size); +} + +int vmem_remove_mapping(unsigned long start, unsigned long size) +{ + struct memory_segment *seg; + int ret; + + mutex_lock(&vmem_mutex); + + ret = -ENOENT; + list_for_each_entry(seg, &mem_segs, list) { + if (seg->start == start && seg->size == size) + break; + } + + if (seg->start != start || seg->size != size) + goto out; + + ret = 0; + __remove_shared_memory(seg); + kfree(seg); +out: + mutex_unlock(&vmem_mutex); + return ret; +} + +int vmem_add_mapping(unsigned long start, unsigned long size) +{ + struct memory_segment *seg; + int ret; + + mutex_lock(&vmem_mutex); + ret = -ENOMEM; + seg = kzalloc(sizeof(*seg), GFP_KERNEL); + if (!seg) + goto out; + seg->start = start; + seg->size = size; + + ret = insert_memory_segment(seg); + if (ret) + goto out_free; + + ret = vmem_add_mem(start, size); + if (ret) + goto out_remove; + goto out; + +out_remove: + __remove_shared_memory(seg); +out_free: + kfree(seg); +out: + mutex_unlock(&vmem_mutex); + return ret; +} + +/* + * map whole physical memory to virtual memory (identity mapping) + * we reserve enough space in the vmalloc area for vmemmap to hotplug + * additional memory segments. + */ +void __init vmem_map_init(void) +{ + struct memblock_region *reg; + + for_each_memblock(memory, reg) + vmem_add_mem(reg->base, reg->size); + __set_memory((unsigned long)_stext, + (unsigned long)(_etext - _stext) >> PAGE_SHIFT, + SET_MEMORY_RO | SET_MEMORY_X); + __set_memory((unsigned long)_etext, + (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, + SET_MEMORY_RO); + __set_memory((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, + SET_MEMORY_RO | SET_MEMORY_X); + pr_info("Write protected kernel read-only data: %luk\n", + (unsigned long)(__end_rodata - _stext) >> 10); +} + +/* + * Convert memblock.memory to a memory segment list so there is a single + * list that contains all memory segments. + */ +static int __init vmem_convert_memory_chunk(void) +{ + struct memblock_region *reg; + struct memory_segment *seg; + + mutex_lock(&vmem_mutex); + for_each_memblock(memory, reg) { + seg = kzalloc(sizeof(*seg), GFP_KERNEL); + if (!seg) + panic("Out of memory...\n"); + seg->start = reg->base; + seg->size = reg->size; + insert_memory_segment(seg); + } + mutex_unlock(&vmem_mutex); + return 0; +} + +core_initcall(vmem_convert_memory_chunk); |