author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-06 01:02:30 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-06 01:02:30 +0000
commit    76cb841cb886eef6b3bee341a2266c76578724ad (patch)
tree      f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/s390/mm
parent    Initial commit. (diff)
download  linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz
          linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip

Adding upstream version 4.19.249. (upstream/4.19.249, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile            13
-rw-r--r--  arch/s390/mm/cmm.c              487
-rw-r--r--  arch/s390/mm/dump_pagetables.c  249
-rw-r--r--  arch/s390/mm/extmem.c           763
-rw-r--r--  arch/s390/mm/fault.c            837
-rw-r--r--  arch/s390/mm/gmap.c            2645
-rw-r--r--  arch/s390/mm/gup.c              310
-rw-r--r--  arch/s390/mm/hugetlbpage.c      370
-rw-r--r--  arch/s390/mm/init.c             251
-rw-r--r--  arch/s390/mm/maccess.c          213
-rw-r--r--  arch/s390/mm/mem_detect.c        62
-rw-r--r--  arch/s390/mm/mmap.c             206
-rw-r--r--  arch/s390/mm/page-states.c      281
-rw-r--r--  arch/s390/mm/pageattr.c         386
-rw-r--r--  arch/s390/mm/pgalloc.c          681
-rw-r--r--  arch/s390/mm/pgtable.c         1126
-rw-r--r--  arch/s390/mm/vmem.c             443
17 files changed, 9323 insertions, 0 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
new file mode 100644
index 000000000..33fe41850
--- /dev/null
+++ b/arch/s390/mm/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux s390-specific parts of the memory manager.
+#
+
+obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
+obj-y += page-states.o gup.o pageattr.o mem_detect.o
+obj-y += pgtable.o pgalloc.o
+
+obj-$(CONFIG_CMM) += cmm.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o
+obj-$(CONFIG_PGSTE) += gmap.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
new file mode 100644
index 000000000..a51c892f1
--- /dev/null
+++ b/arch/s390/mm/cmm.c
@@ -0,0 +1,487 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Collaborative memory management interface.
+ *
+ * Copyright IBM Corp 2003, 2010
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>,
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/swap.h>
+#include <linux/kthread.h>
+#include <linux/oom.h>
+#include <linux/suspend.h>
+#include <linux/uaccess.h>
+
+#include <asm/pgalloc.h>
+#include <asm/diag.h>
+
+#ifdef CONFIG_CMM_IUCV
+static char *cmm_default_sender = "VMRMSVM";
+#endif
+static char *sender;
+module_param(sender, charp, 0400);
+MODULE_PARM_DESC(sender,
+ "Guest name that may send SMSG messages (default VMRMSVM)");
+
+#include "../../../drivers/s390/net/smsgiucv.h"
+
+#define CMM_NR_PAGES ((PAGE_SIZE / sizeof(unsigned long)) - 2)
+
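+/*
+ * Each cmm_page_array occupies exactly one page: the first two
+ * unsigned longs hold the 'next' pointer and 'index', the remaining
+ * slots store the addresses of the pages taken from Linux - hence
+ * the "- 2" in CMM_NR_PAGES above.
+ */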
+struct cmm_page_array {
+ struct cmm_page_array *next;
+ unsigned long index;
+ unsigned long pages[CMM_NR_PAGES];
+};
+
+static long cmm_pages;
+static long cmm_timed_pages;
+static volatile long cmm_pages_target;
+static volatile long cmm_timed_pages_target;
+static long cmm_timeout_pages;
+static long cmm_timeout_seconds;
+static int cmm_suspended;
+
+static struct cmm_page_array *cmm_page_list;
+static struct cmm_page_array *cmm_timed_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
+
+static struct task_struct *cmm_thread_ptr;
+static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait);
+
+static void cmm_timer_fn(struct timer_list *);
+static void cmm_set_timer(void);
+static DEFINE_TIMER(cmm_timer, cmm_timer_fn);
+
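+/*
+ * Allocate nr pages on behalf of the hypervisor: each page is taken
+ * from the Linux page allocator, released to the host with diag 0x10
+ * and recorded in the given page list. Returns the number of pages
+ * that could not be allocated.
+ */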
+static long cmm_alloc_pages(long nr, long *counter,
+ struct cmm_page_array **list)
+{
+ struct cmm_page_array *pa, *npa;
+ unsigned long addr;
+
+ while (nr) {
+ addr = __get_free_page(GFP_NOIO);
+ if (!addr)
+ break;
+ spin_lock(&cmm_lock);
+ pa = *list;
+ if (!pa || pa->index >= CMM_NR_PAGES) {
+ /* Need a new page for the page list. */
+ spin_unlock(&cmm_lock);
+ npa = (struct cmm_page_array *)
+ __get_free_page(GFP_NOIO);
+ if (!npa) {
+ free_page(addr);
+ break;
+ }
+ spin_lock(&cmm_lock);
+ pa = *list;
+ if (!pa || pa->index >= CMM_NR_PAGES) {
+ npa->next = pa;
+ npa->index = 0;
+ pa = npa;
+ *list = pa;
+ } else
+ free_page((unsigned long) npa);
+ }
+ diag10_range(addr >> PAGE_SHIFT, 1);
+ pa->pages[pa->index++] = addr;
+ (*counter)++;
+ spin_unlock(&cmm_lock);
+ nr--;
+ }
+ return nr;
+}
+
+static long cmm_free_pages(long nr, long *counter, struct cmm_page_array **list)
+{
+ struct cmm_page_array *pa;
+ unsigned long addr;
+
+ spin_lock(&cmm_lock);
+ pa = *list;
+ while (nr) {
+ if (!pa || pa->index <= 0)
+ break;
+ addr = pa->pages[--pa->index];
+ if (pa->index == 0) {
+ pa = pa->next;
+ free_page((unsigned long) *list);
+ *list = pa;
+ }
+ free_page(addr);
+ (*counter)--;
+ nr--;
+ }
+ spin_unlock(&cmm_lock);
+ return nr;
+}
+
+static int cmm_oom_notify(struct notifier_block *self,
+ unsigned long dummy, void *parm)
+{
+ unsigned long *freed = parm;
+ long nr = 256;
+
+ nr = cmm_free_pages(nr, &cmm_timed_pages, &cmm_timed_page_list);
+ if (nr > 0)
+ nr = cmm_free_pages(nr, &cmm_pages, &cmm_page_list);
+ cmm_pages_target = cmm_pages;
+ cmm_timed_pages_target = cmm_timed_pages;
+ *freed += 256 - nr;
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cmm_oom_nb = {
+ .notifier_call = cmm_oom_notify,
+};
+
+static int cmm_thread(void *dummy)
+{
+ int rc;
+
+ while (1) {
+ rc = wait_event_interruptible(cmm_thread_wait,
+ (!cmm_suspended && (cmm_pages != cmm_pages_target ||
+ cmm_timed_pages != cmm_timed_pages_target)) ||
+ kthread_should_stop());
+ if (kthread_should_stop() || rc == -ERESTARTSYS) {
+ cmm_pages_target = cmm_pages;
+ cmm_timed_pages_target = cmm_timed_pages;
+ break;
+ }
+ if (cmm_pages_target > cmm_pages) {
+ if (cmm_alloc_pages(1, &cmm_pages, &cmm_page_list))
+ cmm_pages_target = cmm_pages;
+ } else if (cmm_pages_target < cmm_pages) {
+ cmm_free_pages(1, &cmm_pages, &cmm_page_list);
+ }
+ if (cmm_timed_pages_target > cmm_timed_pages) {
+ if (cmm_alloc_pages(1, &cmm_timed_pages,
+ &cmm_timed_page_list))
+ cmm_timed_pages_target = cmm_timed_pages;
+ } else if (cmm_timed_pages_target < cmm_timed_pages) {
+ cmm_free_pages(1, &cmm_timed_pages,
+ &cmm_timed_page_list);
+ }
+ if (cmm_timed_pages > 0 && !timer_pending(&cmm_timer))
+ cmm_set_timer();
+ }
+ return 0;
+}
+
+static void cmm_kick_thread(void)
+{
+ wake_up(&cmm_thread_wait);
+}
+
+static void cmm_set_timer(void)
+{
+ if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) {
+ if (timer_pending(&cmm_timer))
+ del_timer(&cmm_timer);
+ return;
+ }
+ mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ);
+}
+
+static void cmm_timer_fn(struct timer_list *unused)
+{
+ long nr;
+
+ nr = cmm_timed_pages_target - cmm_timeout_pages;
+ if (nr < 0)
+ cmm_timed_pages_target = 0;
+ else
+ cmm_timed_pages_target = nr;
+ cmm_kick_thread();
+ cmm_set_timer();
+}
+
+static void cmm_set_pages(long nr)
+{
+ cmm_pages_target = nr;
+ cmm_kick_thread();
+}
+
+static long cmm_get_pages(void)
+{
+ return cmm_pages;
+}
+
+static void cmm_add_timed_pages(long nr)
+{
+ cmm_timed_pages_target += nr;
+ cmm_kick_thread();
+}
+
+static long cmm_get_timed_pages(void)
+{
+ return cmm_timed_pages;
+}
+
+static void cmm_set_timeout(long nr, long seconds)
+{
+ cmm_timeout_pages = nr;
+ cmm_timeout_seconds = seconds;
+ cmm_set_timer();
+}
+
+static int cmm_skip_blanks(char *cp, char **endp)
+{
+ char *str;
+
+ for (str = cp; *str == ' ' || *str == '\t'; str++)
+ ;
+ *endp = str;
+ return str != cp;
+}
+
+static int cmm_pages_handler(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ long nr = cmm_get_pages();
+ struct ctl_table ctl_entry = {
+ .procname = ctl->procname,
+ .data = &nr,
+ .maxlen = sizeof(long),
+ };
+ int rc;
+
+ rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
+ if (rc < 0 || !write)
+ return rc;
+
+ cmm_set_pages(nr);
+ return 0;
+}
+
+static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ long nr = cmm_get_timed_pages();
+ struct ctl_table ctl_entry = {
+ .procname = ctl->procname,
+ .data = &nr,
+ .maxlen = sizeof(long),
+ };
+ int rc;
+
+ rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
+ if (rc < 0 || !write)
+ return rc;
+
+ cmm_add_timed_pages(nr);
+ return 0;
+}
+
+static int cmm_timeout_handler(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char buf[64], *p;
+ long nr, seconds;
+ unsigned int len;
+
+ if (!*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ len = min(*lenp, sizeof(buf));
+ if (copy_from_user(buf, buffer, len))
+ return -EFAULT;
+ buf[len - 1] = '\0';
+ cmm_skip_blanks(buf, &p);
+ nr = simple_strtoul(p, &p, 0);
+ cmm_skip_blanks(p, &p);
+ seconds = simple_strtoul(p, &p, 0);
+ cmm_set_timeout(nr, seconds);
+ *ppos += *lenp;
+ } else {
+ len = sprintf(buf, "%ld %ld\n",
+ cmm_timeout_pages, cmm_timeout_seconds);
+ if (len > *lenp)
+ len = *lenp;
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+static struct ctl_table cmm_table[] = {
+ {
+ .procname = "cmm_pages",
+ .mode = 0644,
+ .proc_handler = cmm_pages_handler,
+ },
+ {
+ .procname = "cmm_timed_pages",
+ .mode = 0644,
+ .proc_handler = cmm_timed_pages_handler,
+ },
+ {
+ .procname = "cmm_timeout",
+ .mode = 0644,
+ .proc_handler = cmm_timeout_handler,
+ },
+ { }
+};
+
+static struct ctl_table cmm_dir_table[] = {
+ {
+ .procname = "vm",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = cmm_table,
+ },
+ { }
+};
+
+#ifdef CONFIG_CMM_IUCV
+#define SMSG_PREFIX "CMM"
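+/*
+ * Handle special messages of the form "CMM SHRINK <pages>",
+ * "CMM RELEASE <pages>" and "CMM REUSE <pages> <seconds>" sent by
+ * the monitoring guest (VMRMSVM unless overridden via the 'sender'
+ * module parameter).
+ */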
+static void cmm_smsg_target(const char *from, char *msg)
+{
+ long nr, seconds;
+
+ if (strlen(sender) > 0 && strcmp(from, sender) != 0)
+ return;
+ if (!cmm_skip_blanks(msg + strlen(SMSG_PREFIX), &msg))
+ return;
+ if (strncmp(msg, "SHRINK", 6) == 0) {
+ if (!cmm_skip_blanks(msg + 6, &msg))
+ return;
+ nr = simple_strtoul(msg, &msg, 0);
+ cmm_skip_blanks(msg, &msg);
+ if (*msg == '\0')
+ cmm_set_pages(nr);
+ } else if (strncmp(msg, "RELEASE", 7) == 0) {
+ if (!cmm_skip_blanks(msg + 7, &msg))
+ return;
+ nr = simple_strtoul(msg, &msg, 0);
+ cmm_skip_blanks(msg, &msg);
+ if (*msg == '\0')
+ cmm_add_timed_pages(nr);
+ } else if (strncmp(msg, "REUSE", 5) == 0) {
+ if (!cmm_skip_blanks(msg + 5, &msg))
+ return;
+ nr = simple_strtoul(msg, &msg, 0);
+ if (!cmm_skip_blanks(msg, &msg))
+ return;
+ seconds = simple_strtoul(msg, &msg, 0);
+ cmm_skip_blanks(msg, &msg);
+ if (*msg == '\0')
+ cmm_set_timeout(nr, seconds);
+ }
+}
+#endif
+
+static struct ctl_table_header *cmm_sysctl_header;
+
+static int cmm_suspend(void)
+{
+ cmm_suspended = 1;
+ cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
+ cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
+ return 0;
+}
+
+static int cmm_resume(void)
+{
+ cmm_suspended = 0;
+ cmm_kick_thread();
+ return 0;
+}
+
+static int cmm_power_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ switch (event) {
+ case PM_POST_HIBERNATION:
+ return cmm_resume();
+ case PM_HIBERNATION_PREPARE:
+ return cmm_suspend();
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block cmm_power_notifier = {
+ .notifier_call = cmm_power_event,
+};
+
+static int __init cmm_init(void)
+{
+ int rc = -ENOMEM;
+
+ cmm_sysctl_header = register_sysctl_table(cmm_dir_table);
+ if (!cmm_sysctl_header)
+ goto out_sysctl;
+#ifdef CONFIG_CMM_IUCV
+ /* convert sender to uppercase characters */
+ if (sender) {
+ int len = strlen(sender);
+ while (len--)
+ sender[len] = toupper(sender[len]);
+ } else {
+ sender = cmm_default_sender;
+ }
+
+ rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
+ if (rc < 0)
+ goto out_smsg;
+#endif
+ rc = register_oom_notifier(&cmm_oom_nb);
+ if (rc < 0)
+ goto out_oom_notify;
+ rc = register_pm_notifier(&cmm_power_notifier);
+ if (rc)
+ goto out_pm;
+ cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+ if (!IS_ERR(cmm_thread_ptr))
+ return 0;
+
+ rc = PTR_ERR(cmm_thread_ptr);
+ unregister_pm_notifier(&cmm_power_notifier);
+out_pm:
+ unregister_oom_notifier(&cmm_oom_nb);
+out_oom_notify:
+#ifdef CONFIG_CMM_IUCV
+ smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
+out_smsg:
+#endif
+ unregister_sysctl_table(cmm_sysctl_header);
+out_sysctl:
+ del_timer_sync(&cmm_timer);
+ return rc;
+}
+module_init(cmm_init);
+
+static void __exit cmm_exit(void)
+{
+ unregister_sysctl_table(cmm_sysctl_header);
+#ifdef CONFIG_CMM_IUCV
+ smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
+#endif
+ unregister_pm_notifier(&cmm_power_notifier);
+ unregister_oom_notifier(&cmm_oom_nb);
+ kthread_stop(cmm_thread_ptr);
+ del_timer_sync(&cmm_timer);
+ cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
+ cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
+}
+module_exit(cmm_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
new file mode 100644
index 000000000..7cdea2ec5
--- /dev/null
+++ b/arch/s390/mm/dump_pagetables.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+
+static unsigned long max_addr;
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+enum address_markers_idx {
+ IDENTITY_NR = 0,
+ KERNEL_START_NR,
+ KERNEL_END_NR,
+ VMEMMAP_NR,
+ VMALLOC_NR,
+ MODULES_NR,
+};
+
+static struct addr_marker address_markers[] = {
+ [IDENTITY_NR] = {0, "Identity Mapping"},
+ [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"},
+ [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"},
+ [VMEMMAP_NR] = {0, "vmemmap Area"},
+ [VMALLOC_NR] = {0, "vmalloc Area"},
+ [MODULES_NR] = {0, "Modules Area"},
+ { -1, NULL }
+};
+
+struct pg_state {
+ int level;
+ unsigned int current_prot;
+ unsigned long start_address;
+ unsigned long current_address;
+ const struct addr_marker *marker;
+};
+
+static void print_prot(struct seq_file *m, unsigned int pr, int level)
+{
+ static const char * const level_name[] =
+ { "ASCE", "PGD", "PUD", "PMD", "PTE" };
+
+ seq_printf(m, "%s ", level_name[level]);
+ if (pr & _PAGE_INVALID) {
+ seq_printf(m, "I\n");
+ return;
+ }
+ seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
+ seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
+}
+
+static void note_page(struct seq_file *m, struct pg_state *st,
+ unsigned int new_prot, int level)
+{
+ static const char units[] = "KMGTPE";
+ int width = sizeof(unsigned long) * 2;
+ const char *unit = units;
+ unsigned int prot, cur;
+ unsigned long delta;
+
+ /*
+ * If we have a "break" in the series, we need to flush the state
+ * that we have now. "break" is either changing perms, levels or
+ * address space marker.
+ */
+ prot = new_prot;
+ cur = st->current_prot;
+
+ if (!st->level) {
+ /* First entry */
+ st->current_prot = new_prot;
+ st->level = level;
+ st->marker = address_markers;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ } else if (prot != cur || level != st->level ||
+ st->current_address >= st->marker[1].start_address) {
+ /* Print the actual finished series */
+ seq_printf(m, "0x%0*lx-0x%0*lx",
+ width, st->start_address,
+ width, st->current_address);
+ delta = (st->current_address - st->start_address) >> 10;
+ while (!(delta & 0x3ff) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(m, "%9lu%c ", delta, *unit);
+ print_prot(m, st->current_prot, st->level);
+ if (st->current_address >= st->marker[1].start_address) {
+ st->marker++;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ }
+ st->start_address = st->current_address;
+ st->current_prot = new_prot;
+ st->level = level;
+ }
+}
+
+/*
+ * The actual page table walker functions. In order to keep the
+ * implementation of print_prot() short, we only check and pass
+ * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region,
+ * segment or page table entry is invalid or read-only.
+ * After all it's just a hint that the current level being walked
+ * contains an invalid or read-only entry.
+ */
+static void walk_pte_level(struct seq_file *m, struct pg_state *st,
+ pmd_t *pmd, unsigned long addr)
+{
+ unsigned int prot;
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) {
+ st->current_address = addr;
+ pte = pte_offset_kernel(pmd, addr);
+ prot = pte_val(*pte) &
+ (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
+ note_page(m, st, prot, 4);
+ addr += PAGE_SIZE;
+ }
+}
+
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
+ pud_t *pud, unsigned long addr)
+{
+ unsigned int prot;
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) {
+ st->current_address = addr;
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_none(*pmd)) {
+ if (pmd_large(*pmd)) {
+ prot = pmd_val(*pmd) &
+ (_SEGMENT_ENTRY_PROTECT |
+ _SEGMENT_ENTRY_NOEXEC);
+ note_page(m, st, prot, 3);
+ } else
+ walk_pte_level(m, st, pmd, addr);
+ } else
+ note_page(m, st, _PAGE_INVALID, 3);
+ addr += PMD_SIZE;
+ }
+}
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st,
+ p4d_t *p4d, unsigned long addr)
+{
+ unsigned int prot;
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) {
+ st->current_address = addr;
+ pud = pud_offset(p4d, addr);
+ if (!pud_none(*pud))
+ if (pud_large(*pud)) {
+ prot = pud_val(*pud) &
+ (_REGION_ENTRY_PROTECT |
+ _REGION_ENTRY_NOEXEC);
+ note_page(m, st, prot, 2);
+ } else
+ walk_pmd_level(m, st, pud, addr);
+ else
+ note_page(m, st, _PAGE_INVALID, 2);
+ addr += PUD_SIZE;
+ }
+}
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st,
+ pgd_t *pgd, unsigned long addr)
+{
+ p4d_t *p4d;
+ int i;
+
+ for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++) {
+ st->current_address = addr;
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_none(*p4d))
+ walk_pud_level(m, st, p4d, addr);
+ else
+ note_page(m, st, _PAGE_INVALID, 2);
+ addr += P4D_SIZE;
+ }
+}
+
+static void walk_pgd_level(struct seq_file *m)
+{
+ unsigned long addr = 0;
+ struct pg_state st;
+ pgd_t *pgd;
+ int i;
+
+ memset(&st, 0, sizeof(st));
+ for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) {
+ st.current_address = addr;
+ pgd = pgd_offset_k(addr);
+ if (!pgd_none(*pgd))
+ walk_p4d_level(m, &st, pgd, addr);
+ else
+ note_page(m, &st, _PAGE_INVALID, 1);
+ addr += PGDIR_SIZE;
+ cond_resched();
+ }
+ /* Flush out the last page */
+ st.current_address = max_addr;
+ note_page(m, &st, 0, 0);
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ walk_pgd_level(m);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int pt_dump_init(void)
+{
+ /*
+ * Figure out the maximum virtual address being accessible with the
+ * kernel ASCE. We need this to keep the page table walker functions
+ * from accessing non-existent entries.
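+	 * The DAT table type in the ASCE (0..3 for segment, region-3,
+	 * region-2 and region-1 tables) limits the addressable space to
+	 * 2^(type * 11 + 31) bytes, i.e. 2 GB, 4 TB, 8 PB or 16 EB.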
+ */
+ max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
+ max_addr = 1UL << (max_addr * 11 + 31);
+ address_markers[MODULES_NR].start_address = MODULES_VADDR;
+ address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
+ address_markers[VMALLOC_NR].start_address = VMALLOC_START;
+ debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+ return 0;
+}
+device_initcall(pt_dump_init);
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
new file mode 100644
index 000000000..84111a43e
--- /dev/null
+++ b/arch/s390/mm/extmem.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author(s)......: Carsten Otte <cotte@de.ibm.com>
+ * Rob M van der Heij <rvdheij@nl.ibm.com>
+ * Steven Shultz <shultzss@us.ibm.com>
+ * Bugreports.to..: <Linux390@de.ibm.com>
+ * Copyright IBM Corp. 2002, 2004
+ */
+
+#define KMSG_COMPONENT "extmem"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/bootmem.h>
+#include <linux/ctype.h>
+#include <linux/ioport.h>
+#include <asm/diag.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/ebcdic.h>
+#include <asm/errno.h>
+#include <asm/extmem.h>
+#include <asm/cpcmd.h>
+#include <asm/setup.h>
+
+#define DCSS_LOADSHR 0x00
+#define DCSS_LOADNSR 0x04
+#define DCSS_PURGESEG 0x08
+#define DCSS_FINDSEG 0x0c
+#define DCSS_LOADNOLY 0x10
+#define DCSS_SEGEXT 0x18
+#define DCSS_LOADSHRX 0x20
+#define DCSS_LOADNSRX 0x24
+#define DCSS_FINDSEGX 0x2c
+#define DCSS_SEGEXTX 0x38
+#define DCSS_FINDSEGA 0x0c
+
+struct qrange {
+ unsigned long start; /* last byte type */
+ unsigned long end; /* last byte reserved */
+};
+
+struct qout64 {
+ unsigned long segstart;
+ unsigned long segend;
+ int segcnt;
+ int segrcnt;
+ struct qrange range[6];
+};
+
+struct qrange_old {
+ unsigned int start; /* last byte type */
+ unsigned int end; /* last byte reserved */
+};
+
+/* output area format for the Diag x'64' old subcode x'18' */
+struct qout64_old {
+ int segstart;
+ int segend;
+ int segcnt;
+ int segrcnt;
+ struct qrange_old range[6];
+};
+
+struct qin64 {
+ char qopcode;
+ char rsrv1[3];
+ char qrcode;
+ char rsrv2[3];
+ char qname[8];
+ unsigned int qoutptr;
+ short int qoutlen;
+};
+
+struct dcss_segment {
+ struct list_head list;
+ char dcss_name[8];
+ char res_name[16];
+ unsigned long start_addr;
+ unsigned long end;
+ atomic_t ref_count;
+ int do_nonshared;
+ unsigned int vm_segtype;
+ struct qrange range[6];
+ int segcnt;
+ struct resource *res;
+};
+
+static DEFINE_MUTEX(dcss_lock);
+static LIST_HEAD(dcss_list);
+static char *segtype_string[] = { "SW", "EW", "SR", "ER", "SN", "EN", "SC",
+ "EW/EN-MIXED" };
+static int loadshr_scode, loadnsr_scode;
+static int segext_scode, purgeseg_scode;
+static int scode_set;
+
+/* set correct Diag x'64' subcodes. */
+static int
+dcss_set_subcodes(void)
+{
+ char *name = kmalloc(8, GFP_KERNEL | GFP_DMA);
+ unsigned long rx, ry;
+ int rc;
+
+ if (name == NULL)
+ return -ENOMEM;
+
+ rx = (unsigned long) name;
+ ry = DCSS_FINDSEGX;
+
+ strcpy(name, "dummy");
+ diag_stat_inc(DIAG_STAT_X064);
+ asm volatile(
+ " diag %0,%1,0x64\n"
+ "0: ipm %2\n"
+ " srl %2,28\n"
+ " j 2f\n"
+ "1: la %2,3\n"
+ "2:\n"
+ EX_TABLE(0b, 1b)
+ : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc", "memory");
+
+ kfree(name);
+ /* Diag x'64' new subcodes are supported, set to new subcodes */
+ if (rc != 3) {
+ loadshr_scode = DCSS_LOADSHRX;
+ loadnsr_scode = DCSS_LOADNSRX;
+ purgeseg_scode = DCSS_PURGESEG;
+ segext_scode = DCSS_SEGEXTX;
+ return 0;
+ }
+ /* Diag x'64' new subcodes are not supported, set to old subcodes */
+ loadshr_scode = DCSS_LOADNOLY;
+ loadnsr_scode = DCSS_LOADNSR;
+ purgeseg_scode = DCSS_PURGESEG;
+ segext_scode = DCSS_SEGEXT;
+ return 0;
+}
+
+/*
+ * Create the 8-byte, EBCDIC VM segment name from
+ * an ASCII name.
+ */
+static void
+dcss_mkname(char *name, char *dcss_name)
+{
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ if (name[i] == '\0')
+ break;
+ dcss_name[i] = toupper(name[i]);
+ }
+ for (; i < 8; i++)
+ dcss_name[i] = ' ';
+ ASCEBC(dcss_name, 8);
+}
+
+
+/*
+ * search all segments in dcss_list, and return the one
+ * named *name. If not found, return NULL.
+ */
+static struct dcss_segment *
+segment_by_name (char *name)
+{
+ char dcss_name[9];
+ struct list_head *l;
+ struct dcss_segment *tmp, *retval = NULL;
+
+ BUG_ON(!mutex_is_locked(&dcss_lock));
+ dcss_mkname (name, dcss_name);
+ list_for_each (l, &dcss_list) {
+ tmp = list_entry (l, struct dcss_segment, list);
+ if (memcmp(tmp->dcss_name, dcss_name, 8) == 0) {
+ retval = tmp;
+ break;
+ }
+ }
+ return retval;
+}
+
+
+/*
+ * Perform a function on a dcss segment.
+ */
+static inline int
+dcss_diag(int *func, void *parameter,
+ unsigned long *ret1, unsigned long *ret2)
+{
+ unsigned long rx, ry;
+ int rc;
+
+ if (scode_set == 0) {
+ rc = dcss_set_subcodes();
+ if (rc < 0)
+ return rc;
+ scode_set = 1;
+ }
+ rx = (unsigned long) parameter;
+ ry = (unsigned long) *func;
+
+ /* 64-bit Diag x'64' new subcode, keep in 64-bit addressing mode */
+ diag_stat_inc(DIAG_STAT_X064);
+ if (*func > DCSS_SEGEXT)
+ asm volatile(
+ " diag %0,%1,0x64\n"
+ " ipm %2\n"
+ " srl %2,28\n"
+ : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
+ /* 31-bit Diag x'64' old subcode, switch to 31-bit addressing mode */
+ else
+ asm volatile(
+ " sam31\n"
+ " diag %0,%1,0x64\n"
+ " sam64\n"
+ " ipm %2\n"
+ " srl %2,28\n"
+ : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
+ *ret1 = rx;
+ *ret2 = ry;
+ return rc;
+}
+
+static inline int
+dcss_diag_translate_rc (int vm_rc) {
+ if (vm_rc == 44)
+ return -ENOENT;
+ return -EIO;
+}
+
+
+/* do a diag to get info about a segment.
+ * fills start_address, end and vm_segtype fields
+ */
+static int
+query_segment_type (struct dcss_segment *seg)
+{
+ unsigned long dummy, vmrc;
+ int diag_cc, rc, i;
+ struct qout64 *qout;
+ struct qin64 *qin;
+
+ qin = kmalloc(sizeof(*qin), GFP_KERNEL | GFP_DMA);
+ qout = kmalloc(sizeof(*qout), GFP_KERNEL | GFP_DMA);
+ if ((qin == NULL) || (qout == NULL)) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ /* initialize diag input parameters */
+ qin->qopcode = DCSS_FINDSEGA;
+ qin->qoutptr = (unsigned long) qout;
+ qin->qoutlen = sizeof(struct qout64);
+ memcpy (qin->qname, seg->dcss_name, 8);
+
+ diag_cc = dcss_diag(&segext_scode, qin, &dummy, &vmrc);
+
+ if (diag_cc < 0) {
+ rc = diag_cc;
+ goto out_free;
+ }
+ if (diag_cc > 1) {
+ pr_warn("Querying a DCSS type failed with rc=%ld\n", vmrc);
+ rc = dcss_diag_translate_rc (vmrc);
+ goto out_free;
+ }
+
+	/* If only the old format of the Diag x'64' output area is
+	   supported, convert the data to the new format. */
+ if (segext_scode == DCSS_SEGEXT) {
+ struct qout64_old *qout_old;
+ qout_old = kzalloc(sizeof(*qout_old), GFP_KERNEL | GFP_DMA);
+ if (qout_old == NULL) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+ memcpy(qout_old, qout, sizeof(struct qout64_old));
+ qout->segstart = (unsigned long) qout_old->segstart;
+ qout->segend = (unsigned long) qout_old->segend;
+ qout->segcnt = qout_old->segcnt;
+ qout->segrcnt = qout_old->segrcnt;
+
+ if (qout->segcnt > 6)
+ qout->segrcnt = 6;
+ for (i = 0; i < qout->segrcnt; i++) {
+ qout->range[i].start =
+ (unsigned long) qout_old->range[i].start;
+ qout->range[i].end =
+ (unsigned long) qout_old->range[i].end;
+ }
+ kfree(qout_old);
+ }
+ if (qout->segcnt > 6) {
+ rc = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ if (qout->segcnt == 1) {
+ seg->vm_segtype = qout->range[0].start & 0xff;
+ } else {
+ /* multi-part segment. only one type supported here:
+ - all parts are contiguous
+ - all parts are either EW or EN type
+ - maximum 6 parts allowed */
+ unsigned long start = qout->segstart >> PAGE_SHIFT;
+ for (i=0; i<qout->segcnt; i++) {
+ if (((qout->range[i].start & 0xff) != SEG_TYPE_EW) &&
+ ((qout->range[i].start & 0xff) != SEG_TYPE_EN)) {
+ rc = -EOPNOTSUPP;
+ goto out_free;
+ }
+ if (start != qout->range[i].start >> PAGE_SHIFT) {
+ rc = -EOPNOTSUPP;
+ goto out_free;
+ }
+ start = (qout->range[i].end >> PAGE_SHIFT) + 1;
+ }
+ seg->vm_segtype = SEG_TYPE_EWEN;
+ }
+
+ /* analyze diag output and update seg */
+ seg->start_addr = qout->segstart;
+ seg->end = qout->segend;
+
+ memcpy (seg->range, qout->range, 6*sizeof(struct qrange));
+ seg->segcnt = qout->segcnt;
+
+ rc = 0;
+
+ out_free:
+ kfree(qin);
+ kfree(qout);
+ return rc;
+}
+
+/*
+ * get info about a segment
+ * possible return values:
+ * -ENOSYS : we are not running on VM
+ * -EIO : could not perform query diagnose
+ * -ENOENT : no such segment
+ * -EOPNOTSUPP: multi-part segment cannot be used with linux
+ * -ENOMEM : out of memory
+ * 0 .. 6 : type of segment as defined in include/asm-s390/extmem.h
+ */
+int
+segment_type (char* name)
+{
+ int rc;
+ struct dcss_segment seg;
+
+ if (!MACHINE_IS_VM)
+ return -ENOSYS;
+
+ dcss_mkname(name, seg.dcss_name);
+ rc = query_segment_type (&seg);
+ if (rc < 0)
+ return rc;
+ return seg.vm_segtype;
+}
+
+/*
+ * check if segment collides with other segments that are currently loaded
+ * returns 1 if this is the case, 0 if no collision was found
+ */
+static int
+segment_overlaps_others (struct dcss_segment *seg)
+{
+ struct list_head *l;
+ struct dcss_segment *tmp;
+
+ BUG_ON(!mutex_is_locked(&dcss_lock));
+ list_for_each(l, &dcss_list) {
+ tmp = list_entry(l, struct dcss_segment, list);
+ if ((tmp->start_addr >> 20) > (seg->end >> 20))
+ continue;
+ if ((tmp->end >> 20) < (seg->start_addr >> 20))
+ continue;
+ if (seg == tmp)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * real segment loading function, called from segment_load
+ */
+static int
+__segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end)
+{
+ unsigned long start_addr, end_addr, dummy;
+ struct dcss_segment *seg;
+ int rc, diag_cc;
+
+ start_addr = end_addr = 0;
+ seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA);
+ if (seg == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ dcss_mkname (name, seg->dcss_name);
+ rc = query_segment_type (seg);
+ if (rc < 0)
+ goto out_free;
+
+ if (loadshr_scode == DCSS_LOADSHRX) {
+ if (segment_overlaps_others(seg)) {
+ rc = -EBUSY;
+ goto out_free;
+ }
+ }
+
+ rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+
+ if (rc)
+ goto out_free;
+
+ seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+ if (seg->res == NULL) {
+ rc = -ENOMEM;
+ goto out_shared;
+ }
+ seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ seg->res->start = seg->start_addr;
+ seg->res->end = seg->end;
+ memcpy(&seg->res_name, seg->dcss_name, 8);
+ EBCASC(seg->res_name, 8);
+ seg->res_name[8] = '\0';
+ strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name));
+ seg->res->name = seg->res_name;
+ rc = seg->vm_segtype;
+ if (rc == SEG_TYPE_SC ||
+ ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared))
+ seg->res->flags |= IORESOURCE_READONLY;
+ if (request_resource(&iomem_resource, seg->res)) {
+ rc = -EBUSY;
+ kfree(seg->res);
+ goto out_shared;
+ }
+
+ if (do_nonshared)
+ diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
+ else
+ diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
+ if (diag_cc < 0) {
+ dcss_diag(&purgeseg_scode, seg->dcss_name,
+ &dummy, &dummy);
+ rc = diag_cc;
+ goto out_resource;
+ }
+ if (diag_cc > 1) {
+ pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr);
+ rc = dcss_diag_translate_rc(end_addr);
+ dcss_diag(&purgeseg_scode, seg->dcss_name,
+ &dummy, &dummy);
+ goto out_resource;
+ }
+ seg->start_addr = start_addr;
+ seg->end = end_addr;
+ seg->do_nonshared = do_nonshared;
+ atomic_set(&seg->ref_count, 1);
+ list_add(&seg->list, &dcss_list);
+ *addr = seg->start_addr;
+ *end = seg->end;
+ if (do_nonshared)
+ pr_info("DCSS %s of range %p to %p and type %s loaded as "
+ "exclusive-writable\n", name, (void*) seg->start_addr,
+ (void*) seg->end, segtype_string[seg->vm_segtype]);
+ else {
+ pr_info("DCSS %s of range %p to %p and type %s loaded in "
+ "shared access mode\n", name, (void*) seg->start_addr,
+ (void*) seg->end, segtype_string[seg->vm_segtype]);
+ }
+ goto out;
+ out_resource:
+ release_resource(seg->res);
+ kfree(seg->res);
+ out_shared:
+ vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+ out_free:
+ kfree(seg);
+ out:
+ return rc;
+}
+
+/*
+ * this function loads a DCSS segment
+ * name : name of the DCSS
+ * do_nonshared : 0 indicates that the dcss should be shared with other linux images
+ * 1 indicates that the dcss should be exclusive for this linux image
+ * addr : will be filled with start address of the segment
+ * end : will be filled with end address of the segment
+ * return values:
+ * -ENOSYS : we are not running on VM
+ * -EIO : could not perform query or load diagnose
+ * -ENOENT : no such segment
+ * -EOPNOTSUPP: multi-part segment cannot be used with linux
+ * -ENOSPC : segment cannot be used (overlaps with storage)
+ * -EBUSY : segment can temporarily not be used (overlaps with dcss)
+ * -ERANGE : segment cannot be used (exceeds kernel mapping range)
+ * -EPERM : segment is currently loaded with incompatible permissions
+ * -ENOMEM : out of memory
+ * 0 .. 6 : type of segment as defined in include/asm-s390/extmem.h
+ */
+int
+segment_load (char *name, int do_nonshared, unsigned long *addr,
+ unsigned long *end)
+{
+ struct dcss_segment *seg;
+ int rc;
+
+ if (!MACHINE_IS_VM)
+ return -ENOSYS;
+
+ mutex_lock(&dcss_lock);
+ seg = segment_by_name (name);
+ if (seg == NULL)
+ rc = __segment_load (name, do_nonshared, addr, end);
+ else {
+ if (do_nonshared == seg->do_nonshared) {
+ atomic_inc(&seg->ref_count);
+ *addr = seg->start_addr;
+ *end = seg->end;
+ rc = seg->vm_segtype;
+ } else {
+ *addr = *end = 0;
+ rc = -EPERM;
+ }
+ }
+ mutex_unlock(&dcss_lock);
+ return rc;
+}
+
+/*
+ * this function modifies the shared state of a DCSS segment.
+ * name : name of the DCSS
+ * do_nonshared : 0 indicates that the dcss should be shared with other linux images
+ * 1 indicates that the dcss should be exclusive for this linux image
+ * return values:
+ * -EIO : could not perform load diagnose (segment gone!)
+ * -ENOENT : no such segment (segment gone!)
+ * -EAGAIN : segment is in use by other exploiters, try later
+ * -EINVAL : no segment with the given name is currently loaded - name invalid
+ * -EBUSY : segment can temporarily not be used (overlaps with dcss)
+ * 0 : operation succeeded
+ */
+int
+segment_modify_shared (char *name, int do_nonshared)
+{
+ struct dcss_segment *seg;
+ unsigned long start_addr, end_addr, dummy;
+ int rc, diag_cc;
+
+ start_addr = end_addr = 0;
+ mutex_lock(&dcss_lock);
+ seg = segment_by_name (name);
+ if (seg == NULL) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ if (do_nonshared == seg->do_nonshared) {
+ pr_info("DCSS %s is already in the requested access "
+ "mode\n", name);
+ rc = 0;
+ goto out_unlock;
+ }
+ if (atomic_read (&seg->ref_count) != 1) {
+ pr_warn("DCSS %s is in use and cannot be reloaded\n", name);
+ rc = -EAGAIN;
+ goto out_unlock;
+ }
+ release_resource(seg->res);
+ if (do_nonshared)
+ seg->res->flags &= ~IORESOURCE_READONLY;
+ else
+ if (seg->vm_segtype == SEG_TYPE_SR ||
+ seg->vm_segtype == SEG_TYPE_ER)
+ seg->res->flags |= IORESOURCE_READONLY;
+
+ if (request_resource(&iomem_resource, seg->res)) {
+ pr_warn("DCSS %s overlaps with used memory resources and cannot be reloaded\n",
+ name);
+ rc = -EBUSY;
+ kfree(seg->res);
+ goto out_del_mem;
+ }
+
+ dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+ if (do_nonshared)
+ diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
+ else
+ diag_cc = dcss_diag(&loadshr_scode, seg->dcss_name,
+ &start_addr, &end_addr);
+ if (diag_cc < 0) {
+ rc = diag_cc;
+ goto out_del_res;
+ }
+ if (diag_cc > 1) {
+ pr_warn("Reloading DCSS %s failed with rc=%ld\n",
+ name, end_addr);
+ rc = dcss_diag_translate_rc(end_addr);
+ goto out_del_res;
+ }
+ seg->start_addr = start_addr;
+ seg->end = end_addr;
+ seg->do_nonshared = do_nonshared;
+ rc = 0;
+ goto out_unlock;
+ out_del_res:
+ release_resource(seg->res);
+ kfree(seg->res);
+ out_del_mem:
+ vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+ list_del(&seg->list);
+ dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+ kfree(seg);
+ out_unlock:
+ mutex_unlock(&dcss_lock);
+ return rc;
+}
+
+/*
+ * Decrease the use count of a DCSS segment and remove
+ * it from the address space if nobody is using it
+ * any longer.
+ */
+void
+segment_unload(char *name)
+{
+ unsigned long dummy;
+ struct dcss_segment *seg;
+
+ if (!MACHINE_IS_VM)
+ return;
+
+ mutex_lock(&dcss_lock);
+ seg = segment_by_name (name);
+ if (seg == NULL) {
+ pr_err("Unloading unknown DCSS %s failed\n", name);
+ goto out_unlock;
+ }
+ if (atomic_dec_return(&seg->ref_count) != 0)
+ goto out_unlock;
+ release_resource(seg->res);
+ kfree(seg->res);
+ vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+ list_del(&seg->list);
+ dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+ kfree(seg);
+out_unlock:
+ mutex_unlock(&dcss_lock);
+}
+
+/*
+ * save segment content permanently
+ */
+void
+segment_save(char *name)
+{
+ struct dcss_segment *seg;
+ char cmd1[160];
+ char cmd2[80];
+ int i, response;
+
+ if (!MACHINE_IS_VM)
+ return;
+
+ mutex_lock(&dcss_lock);
+ seg = segment_by_name (name);
+
+ if (seg == NULL) {
+ pr_err("Saving unknown DCSS %s failed\n", name);
+ goto out;
+ }
+
+ sprintf(cmd1, "DEFSEG %s", name);
+ for (i=0; i<seg->segcnt; i++) {
+ sprintf(cmd1+strlen(cmd1), " %lX-%lX %s",
+ seg->range[i].start >> PAGE_SHIFT,
+ seg->range[i].end >> PAGE_SHIFT,
+ segtype_string[seg->range[i].start & 0xff]);
+ }
+ sprintf(cmd2, "SAVESEG %s", name);
+ response = 0;
+ cpcmd(cmd1, NULL, 0, &response);
+ if (response) {
+ pr_err("Saving a DCSS failed with DEFSEG response code "
+ "%i\n", response);
+ goto out;
+ }
+ cpcmd(cmd2, NULL, 0, &response);
+ if (response) {
+ pr_err("Saving a DCSS failed with SAVESEG response code "
+ "%i\n", response);
+ goto out;
+ }
+out:
+ mutex_unlock(&dcss_lock);
+}
+
+/*
+ * print appropriate error message for segment_load()/segment_type()
+ * return code
+ */
+void segment_warning(int rc, char *seg_name)
+{
+ switch (rc) {
+ case -ENOENT:
+ pr_err("DCSS %s cannot be loaded or queried\n", seg_name);
+ break;
+ case -ENOSYS:
+ pr_err("DCSS %s cannot be loaded or queried without "
+ "z/VM\n", seg_name);
+ break;
+ case -EIO:
+ pr_err("Loading or querying DCSS %s resulted in a "
+ "hardware error\n", seg_name);
+ break;
+ case -EOPNOTSUPP:
+ pr_err("DCSS %s has multiple page ranges and cannot be "
+ "loaded or queried\n", seg_name);
+ break;
+ case -ENOSPC:
+ pr_err("DCSS %s overlaps with used storage and cannot "
+ "be loaded\n", seg_name);
+ break;
+ case -EBUSY:
+ pr_err("%s needs used memory resources and cannot be "
+ "loaded or queried\n", seg_name);
+ break;
+ case -EPERM:
+ pr_err("DCSS %s is already loaded in a different access "
+ "mode\n", seg_name);
+ break;
+ case -ENOMEM:
+ pr_err("There is not enough memory to load or query "
+ "DCSS %s\n", seg_name);
+ break;
+ case -ERANGE:
+ pr_err("DCSS %s exceeds the kernel mapping range (%lu) "
+ "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS);
+ break;
+ default:
+ break;
+ }
+}
+
+EXPORT_SYMBOL(segment_load);
+EXPORT_SYMBOL(segment_unload);
+EXPORT_SYMBOL(segment_save);
+EXPORT_SYMBOL(segment_type);
+EXPORT_SYMBOL(segment_modify_shared);
+EXPORT_SYMBOL(segment_warning);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
new file mode 100644
index 000000000..a6e3c7022
--- /dev/null
+++ b/arch/s390/mm/fault.c
@@ -0,0 +1,837 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * S390 version
+ * Copyright IBM Corp. 1999
+ * Author(s): Hartmut Penner (hp@de.ibm.com)
+ * Ulrich Weigand (uweigand@de.ibm.com)
+ *
+ * Derived from "arch/i386/mm/fault.c"
+ * Copyright (C) 1995 Linus Torvalds
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/perf_event.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/smp.h>
+#include <linux/kdebug.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/extable.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/hugetlb.h>
+#include <asm/asm-offsets.h>
+#include <asm/diag.h>
+#include <asm/pgtable.h>
+#include <asm/gmap.h>
+#include <asm/irq.h>
+#include <asm/mmu_context.h>
+#include <asm/facility.h>
+#include "../kernel/entry.h"
+
+#define __FAIL_ADDR_MASK -4096L
+#define __SUBCODE_MASK 0x0600
+#define __PF_RES_FIELD 0x8000000000000000ULL
+
+#define VM_FAULT_BADCONTEXT 0x010000
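+/*
+ * Architecture private fault conditions, returned by do_exception()
+ * and evaluated in do_fault_error() in addition to the generic
+ * VM_FAULT_* error flags.
+ */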
+#define VM_FAULT_BADMAP 0x020000
+#define VM_FAULT_BADACCESS 0x040000
+#define VM_FAULT_SIGNAL 0x080000
+#define VM_FAULT_PFAULT 0x100000
+
+enum fault_type {
+ KERNEL_FAULT,
+ USER_FAULT,
+ VDSO_FAULT,
+ GMAP_FAULT,
+};
+
+static unsigned long store_indication __read_mostly;
+
+static int __init fault_init(void)
+{
+ if (test_facility(75))
+ store_indication = 0xc00;
+ return 0;
+}
+early_initcall(fault_init);
+
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+ int ret = 0;
+
+ /* kprobe_running() needs smp_processor_id() */
+ if (kprobes_built_in() && !user_mode(regs)) {
+ preempt_disable();
+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
+ ret = 1;
+ preempt_enable();
+ }
+ return ret;
+}
+
+
+/*
+ * Unlock any spinlocks which will prevent us from getting the
+ * message out.
+ */
+void bust_spinlocks(int yes)
+{
+ if (yes) {
+ oops_in_progress = 1;
+ } else {
+ int loglevel_save = console_loglevel;
+ console_unblank();
+ oops_in_progress = 0;
+ /*
+ * OK, the message is on the console. Now we call printk()
+ * without oops_in_progress set so that printk will give klogd
+ * a poke. Hold onto your hats...
+ */
+ console_loglevel = 15;
+ printk(" ");
+ console_loglevel = loglevel_save;
+ }
+}
+
+/*
+ * Find out which address space caused the exception.
+ */
+static inline enum fault_type get_fault_type(struct pt_regs *regs)
+{
+ unsigned long trans_exc_code;
+
+ trans_exc_code = regs->int_parm_long & 3;
+ if (likely(trans_exc_code == 0)) {
+ /* primary space exception */
+ if (IS_ENABLED(CONFIG_PGSTE) &&
+ test_pt_regs_flag(regs, PIF_GUEST_FAULT))
+ return GMAP_FAULT;
+ if (current->thread.mm_segment == USER_DS)
+ return USER_FAULT;
+ return KERNEL_FAULT;
+ }
+ if (trans_exc_code == 2) {
+ /* secondary space exception */
+ if (current->thread.mm_segment & 1) {
+ if (current->thread.mm_segment == USER_DS_SACF)
+ return USER_FAULT;
+ return KERNEL_FAULT;
+ }
+ return VDSO_FAULT;
+ }
+ if (trans_exc_code == 1) {
+ /* access register mode, not used in the kernel */
+ return USER_FAULT;
+ }
+ /* home space exception -> access via kernel ASCE */
+ return KERNEL_FAULT;
+}
+
+static int bad_address(void *p)
+{
+ unsigned long dummy;
+
+ return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void dump_pagetable(unsigned long asce, unsigned long address)
+{
+ unsigned long *table = __va(asce & _ASCE_ORIGIN);
+
+ pr_alert("AS:%016lx ", asce);
+ switch (asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
+ if (bad_address(table))
+ goto bad;
+ pr_cont("R1:%016lx ", *table);
+ if (*table & _REGION_ENTRY_INVALID)
+ goto out;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* fallthrough */
+ case _ASCE_TYPE_REGION2:
+ table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
+ if (bad_address(table))
+ goto bad;
+ pr_cont("R2:%016lx ", *table);
+ if (*table & _REGION_ENTRY_INVALID)
+ goto out;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* fallthrough */
+ case _ASCE_TYPE_REGION3:
+ table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
+ if (bad_address(table))
+ goto bad;
+ pr_cont("R3:%016lx ", *table);
+ if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
+ goto out;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* fallthrough */
+ case _ASCE_TYPE_SEGMENT:
+ table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
+ if (bad_address(table))
+ goto bad;
+ pr_cont("S:%016lx ", *table);
+ if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
+ goto out;
+ table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ }
+ table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
+ if (bad_address(table))
+ goto bad;
+ pr_cont("P:%016lx ", *table);
+out:
+ pr_cont("\n");
+ return;
+bad:
+ pr_cont("BAD\n");
+}
+
+static void dump_fault_info(struct pt_regs *regs)
+{
+ unsigned long asce;
+
+ pr_alert("Failing address: %016lx TEID: %016lx\n",
+ regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
+ pr_alert("Fault in ");
+ switch (regs->int_parm_long & 3) {
+ case 3:
+ pr_cont("home space ");
+ break;
+ case 2:
+ pr_cont("secondary space ");
+ break;
+ case 1:
+ pr_cont("access register ");
+ break;
+ case 0:
+ pr_cont("primary space ");
+ break;
+ }
+ pr_cont("mode while using ");
+ switch (get_fault_type(regs)) {
+ case USER_FAULT:
+ asce = S390_lowcore.user_asce;
+ pr_cont("user ");
+ break;
+ case VDSO_FAULT:
+ asce = S390_lowcore.vdso_asce;
+ pr_cont("vdso ");
+ break;
+ case GMAP_FAULT:
+ asce = ((struct gmap *) S390_lowcore.gmap)->asce;
+ pr_cont("gmap ");
+ break;
+ case KERNEL_FAULT:
+ asce = S390_lowcore.kernel_asce;
+ pr_cont("kernel ");
+ break;
+ }
+ pr_cont("ASCE.\n");
+ dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
+}
+
+int show_unhandled_signals = 1;
+
+void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
+{
+ if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
+ return;
+ if (!unhandled_signal(current, signr))
+ return;
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
+ regs->int_code & 0xffff, regs->int_code >> 17);
+ print_vma_addr(KERN_CONT "in ", regs->psw.addr);
+ printk(KERN_CONT "\n");
+ if (is_mm_fault)
+ dump_fault_info(regs);
+ show_regs(regs);
+}
+
+/*
+ * Send SIGSEGV to task. This is an external routine
+ * to keep the stack usage of do_page_fault small.
+ */
+static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
+{
+ report_user_fault(regs, SIGSEGV, 1);
+ force_sig_fault(SIGSEGV, si_code,
+ (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
+ current);
+}
+
+static noinline void do_no_context(struct pt_regs *regs)
+{
+ const struct exception_table_entry *fixup;
+
+ /* Are we prepared to handle this kernel fault? */
+ fixup = search_exception_tables(regs->psw.addr);
+ if (fixup) {
+ regs->psw.addr = extable_fixup(fixup);
+ return;
+ }
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+ if (get_fault_type(regs) == KERNEL_FAULT)
+ printk(KERN_ALERT "Unable to handle kernel pointer dereference"
+ " in virtual kernel address space\n");
+ else
+ printk(KERN_ALERT "Unable to handle kernel paging request"
+ " in virtual user address space\n");
+ dump_fault_info(regs);
+ die(regs, "Oops");
+ do_exit(SIGKILL);
+}
+
+static noinline void do_low_address(struct pt_regs *regs)
+{
+ /* Low-address protection hit in kernel mode means
+ NULL pointer write access in kernel mode. */
+ if (regs->psw.mask & PSW_MASK_PSTATE) {
+ /* Low-address protection hit in user mode 'cannot happen'. */
+ die (regs, "Low-address protection");
+ do_exit(SIGKILL);
+ }
+
+ do_no_context(regs);
+}
+
+static noinline void do_sigbus(struct pt_regs *regs)
+{
+ /*
+ * Send a sigbus, regardless of whether we were in kernel
+ * or user mode.
+ */
+ force_sig_fault(SIGBUS, BUS_ADRERR,
+ (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
+ current);
+}
+
+static noinline int signal_return(struct pt_regs *regs)
+{
+ u16 instruction;
+ int rc;
+
+ rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
+ if (rc)
+ return rc;
+ if (instruction == 0x0a77) {
+ set_pt_regs_flag(regs, PIF_SYSCALL);
+ regs->int_code = 0x00040077;
+ return 0;
+ } else if (instruction == 0x0aad) {
+ set_pt_regs_flag(regs, PIF_SYSCALL);
+ regs->int_code = 0x000400ad;
+ return 0;
+ }
+ return -EACCES;
+}
+
+static noinline void do_fault_error(struct pt_regs *regs, int access,
+ vm_fault_t fault)
+{
+ int si_code;
+
+ switch (fault) {
+ case VM_FAULT_BADACCESS:
+ if (access == VM_EXEC && signal_return(regs) == 0)
+ break;
+ case VM_FAULT_BADMAP:
+ /* Bad memory access. Check if it is kernel or user space. */
+ if (user_mode(regs)) {
+ /* User mode accesses just cause a SIGSEGV */
+ si_code = (fault == VM_FAULT_BADMAP) ?
+ SEGV_MAPERR : SEGV_ACCERR;
+ do_sigsegv(regs, si_code);
+ break;
+ }
+ case VM_FAULT_BADCONTEXT:
+ case VM_FAULT_PFAULT:
+ do_no_context(regs);
+ break;
+ case VM_FAULT_SIGNAL:
+ if (!user_mode(regs))
+ do_no_context(regs);
+ break;
+ default: /* fault & VM_FAULT_ERROR */
+ if (fault & VM_FAULT_OOM) {
+ if (!user_mode(regs))
+ do_no_context(regs);
+ else
+ pagefault_out_of_memory();
+ } else if (fault & VM_FAULT_SIGSEGV) {
+ /* Kernel mode? Handle exceptions or die */
+ if (!user_mode(regs))
+ do_no_context(regs);
+ else
+ do_sigsegv(regs, SEGV_MAPERR);
+ } else if (fault & VM_FAULT_SIGBUS) {
+ /* Kernel mode? Handle exceptions or die */
+ if (!user_mode(regs))
+ do_no_context(regs);
+ else
+ do_sigbus(regs);
+ } else
+ BUG();
+ break;
+ }
+}
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * interruption code (int_code):
+ * 04 Protection -> Write-Protection (suppression)
+ * 10 Segment translation -> Not present (nullification)
+ * 11 Page translation -> Not present (nullification)
+ * 3b Region third trans. -> Not present (nullification)
+ */
+static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
+{
+ struct gmap *gmap;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ enum fault_type type;
+ unsigned long trans_exc_code;
+ unsigned long address;
+ unsigned int flags;
+ vm_fault_t fault;
+
+ tsk = current;
+ /*
+ * The instruction that caused the program check has
+ * been nullified. Don't signal single step via SIGTRAP.
+ */
+ clear_pt_regs_flag(regs, PIF_PER_TRAP);
+
+ if (notify_page_fault(regs))
+ return 0;
+
+ mm = tsk->mm;
+ trans_exc_code = regs->int_parm_long;
+
+ /*
+ * Verify that the fault happened in user space, that
+ * we are not in an interrupt and that there is a
+ * user context.
+ */
+ fault = VM_FAULT_BADCONTEXT;
+ type = get_fault_type(regs);
+ switch (type) {
+ case KERNEL_FAULT:
+ goto out;
+ case VDSO_FAULT:
+ fault = VM_FAULT_BADMAP;
+ goto out;
+ case USER_FAULT:
+ case GMAP_FAULT:
+ if (faulthandler_disabled() || !mm)
+ goto out;
+ break;
+ }
+
+ address = trans_exc_code & __FAIL_ADDR_MASK;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
+ if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+ flags |= FAULT_FLAG_WRITE;
+ down_read(&mm->mmap_sem);
+
+ gmap = NULL;
+ if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
+ gmap = (struct gmap *) S390_lowcore.gmap;
+ current->thread.gmap_addr = address;
+ current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+ current->thread.gmap_int_code = regs->int_code & 0xffff;
+ address = __gmap_translate(gmap, address);
+ if (address == -EFAULT) {
+ fault = VM_FAULT_BADMAP;
+ goto out_up;
+ }
+ if (gmap->pfault_enabled)
+ flags |= FAULT_FLAG_RETRY_NOWAIT;
+ }
+
+retry:
+ fault = VM_FAULT_BADMAP;
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto out_up;
+
+ if (unlikely(vma->vm_start > address)) {
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto out_up;
+ if (expand_stack(vma, address))
+ goto out_up;
+ }
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+ fault = VM_FAULT_BADACCESS;
+ if (unlikely(!(vma->vm_flags & access)))
+ goto out_up;
+
+ if (is_vm_hugetlb_page(vma))
+ address &= HPAGE_MASK;
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+ fault = handle_mm_fault(vma, address, flags);
+ /* No reason to continue if interrupted by SIGKILL. */
+ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+ fault = VM_FAULT_SIGNAL;
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out_up;
+ goto out;
+ }
+ if (unlikely(fault & VM_FAULT_ERROR))
+ goto out_up;
+
+ /*
+ * Major/minor page fault accounting is only done on the
+ * initial attempt. If we go through a retry, it is extremely
+ * likely that the page will be found in page cache at that point.
+ */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (fault & VM_FAULT_MAJOR) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+ regs, address);
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+ regs, address);
+ }
+ if (fault & VM_FAULT_RETRY) {
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /* FAULT_FLAG_RETRY_NOWAIT has been set,
+ * mmap_sem has not been released */
+ current->thread.gmap_pfault = 1;
+ fault = VM_FAULT_PFAULT;
+ goto out_up;
+ }
+ /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+ * of starvation. */
+ flags &= ~(FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_RETRY_NOWAIT);
+ flags |= FAULT_FLAG_TRIED;
+ down_read(&mm->mmap_sem);
+ goto retry;
+ }
+ }
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
+ address = __gmap_link(gmap, current->thread.gmap_addr,
+ address);
+ if (address == -EFAULT) {
+ fault = VM_FAULT_BADMAP;
+ goto out_up;
+ }
+ if (address == -ENOMEM) {
+ fault = VM_FAULT_OOM;
+ goto out_up;
+ }
+ }
+ fault = 0;
+out_up:
+ up_read(&mm->mmap_sem);
+out:
+ return fault;
+}
+
+void do_protection_exception(struct pt_regs *regs)
+{
+ unsigned long trans_exc_code;
+ int access;
+ vm_fault_t fault;
+
+ trans_exc_code = regs->int_parm_long;
+ /*
+ * Protection exceptions are suppressing, decrement psw address.
+ * The exception to this rule are aborted transactions, for these
+ * the PSW already points to the correct location.
+ */
+ if (!(regs->int_code & 0x200))
+ regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
+ /*
+ * Check for low-address protection. This needs to be treated
+ * as a special case because the translation exception code
+ * field is not guaranteed to contain valid data in this case.
+ */
+ if (unlikely(!(trans_exc_code & 4))) {
+ do_low_address(regs);
+ return;
+ }
+ if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
+ regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
+ (regs->psw.addr & PAGE_MASK);
+ access = VM_EXEC;
+ fault = VM_FAULT_BADACCESS;
+ } else {
+ access = VM_WRITE;
+ fault = do_exception(regs, access);
+ }
+ if (unlikely(fault))
+ do_fault_error(regs, access, fault);
+}
+NOKPROBE_SYMBOL(do_protection_exception);
+
+void do_dat_exception(struct pt_regs *regs)
+{
+ int access;
+ vm_fault_t fault;
+
+ access = VM_READ | VM_EXEC | VM_WRITE;
+ fault = do_exception(regs, access);
+ if (unlikely(fault))
+ do_fault_error(regs, access, fault);
+}
+NOKPROBE_SYMBOL(do_dat_exception);
+
+#ifdef CONFIG_PFAULT
+/*
+ * 'pfault' pseudo page faults routines.
+ */
+static int pfault_disable;
+
+static int __init nopfault(char *str)
+{
+ pfault_disable = 1;
+ return 1;
+}
+
+__setup("nopfault", nopfault);
+
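+/*
+ * Parameter block for diag 0x258: reffcode 0 (pfault_init) enables
+ * the pseudo page fault handshake with the hypervisor, reffcode 1
+ * (pfault_fini) cancels it again.
+ */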
+struct pfault_refbk {
+ u16 refdiagc;
+ u16 reffcode;
+ u16 refdwlen;
+ u16 refversn;
+ u64 refgaddr;
+ u64 refselmk;
+ u64 refcmpmk;
+ u64 reserved;
+} __attribute__ ((packed, aligned(8)));
+
+int pfault_init(void)
+{
+ struct pfault_refbk refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 0,
+ .refdwlen = 5,
+ .refversn = 2,
+ .refgaddr = __LC_LPP,
+ .refselmk = 1ULL << 48,
+ .refcmpmk = 1ULL << 48,
+ .reserved = __PF_RES_FIELD };
+ int rc;
+
+ if (pfault_disable)
+ return -1;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %1,%0,0x258\n"
+ "0: j 2f\n"
+ "1: la %0,8\n"
+ "2:\n"
+ EX_TABLE(0b,1b)
+ : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
+ return rc;
+}
+
+void pfault_fini(void)
+{
+ struct pfault_refbk refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 1,
+ .refdwlen = 5,
+ .refversn = 2,
+ };
+
+ if (pfault_disable)
+ return;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %0,0,0x258\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b,0b)
+ : : "a" (&refbk), "m" (refbk) : "cc");
+}
+
+static DEFINE_SPINLOCK(pfault_lock);
+static LIST_HEAD(pfault_list);
+
+#define PF_COMPLETE 0x0080
+
+/*
+ * The mechanism of our pfault code: if Linux is running as guest, runs a user
+ * space process and the user space process accesses a page that the host has
+ * paged out we get a pfault interrupt.
+ *
+ * This allows us, within the guest, to schedule a different process. Without
+ * this mechanism the host would have to suspend the whole virtual cpu until
+ * the page has been paged in.
+ *
+ * So when we get such an interrupt then we set the state of the current task
+ * to uninterruptible and also set the need_resched flag. Both happens within
+ * interrupt context(!). If we later on want to return to user space we
+ * recognize the need_resched flag and then call schedule(). It's not very
+ * obvious how this works...
+ *
+ * Of course we have a lot of additional fun with the completion interrupt (->
+ * host signals that a page of a process has been paged in and the process can
+ * continue to run). This interrupt can arrive on any cpu and, since we have
+ * virtual cpus, actually appear before the interrupt that signals that a page
+ * is missing.
+ */
+static void pfault_interrupt(struct ext_code ext_code,
+ unsigned int param32, unsigned long param64)
+{
+ struct task_struct *tsk;
+ __u16 subcode;
+ pid_t pid;
+
+ /*
+ * Get the external interruption subcode & pfault initial/completion
+ * signal bit. VM stores this in the 'cpu address' field associated
+ * with the external interrupt.
+ */
+ subcode = ext_code.subcode;
+ if ((subcode & 0xff00) != __SUBCODE_MASK)
+ return;
+ inc_irq_stat(IRQEXT_PFL);
+ /* Get the token (= pid of the affected task). */
+ pid = param64 & LPP_PID_MASK;
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return;
+ spin_lock(&pfault_lock);
+ if (subcode & PF_COMPLETE) {
+ /* signal bit is set -> a page has been swapped in by VM */
+ if (tsk->thread.pfault_wait == 1) {
+ /* Initial interrupt was faster than the completion
+ * interrupt. pfault_wait is valid. Set pfault_wait
+ * back to zero and wake up the process. This can
+ * safely be done because the task is still sleeping
+ * and can't produce new pfaults. */
+ tsk->thread.pfault_wait = 0;
+ list_del(&tsk->thread.list);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ } else {
+ /* Completion interrupt was faster than initial
+ * interrupt. Set pfault_wait to -1 so the initial
+ * interrupt doesn't put the task to sleep.
+ * If the task is not running, ignore the completion
+ * interrupt since it must be a leftover of a PFAULT
+ * CANCEL operation which didn't remove all pending
+ * completion interrupts. */
+ if (tsk->state == TASK_RUNNING)
+ tsk->thread.pfault_wait = -1;
+ }
+ } else {
+ /* signal bit not set -> a real page is missing. */
+ if (WARN_ON_ONCE(tsk != current))
+ goto out;
+ if (tsk->thread.pfault_wait == 1) {
+ /* Already on the list with a reference: put to sleep */
+ goto block;
+ } else if (tsk->thread.pfault_wait == -1) {
+ /* Completion interrupt was faster than the initial
+ * interrupt (pfault_wait == -1). Set pfault_wait
+ * back to zero and exit. */
+ tsk->thread.pfault_wait = 0;
+ } else {
+ /* Initial interrupt arrived before completion
+ * interrupt. Let the task sleep.
+ * An extra task reference is needed since a different
+ * cpu may set the task state to TASK_RUNNING again
+ * before the scheduler is reached. */
+ get_task_struct(tsk);
+ tsk->thread.pfault_wait = 1;
+ list_add(&tsk->thread.list, &pfault_list);
+block:
+ /* Since this must be a userspace fault, there
+ * is no kernel task state to trample. Rely on the
+ * return to userspace schedule() to block. */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ set_tsk_need_resched(tsk);
+ set_preempt_need_resched();
+ }
+ }
+out:
+ spin_unlock(&pfault_lock);
+ put_task_struct(tsk);
+}
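
The pfault_wait handling above is easiest to read as a small state machine. The enum below is purely illustrative and not part of the patch; the names are invented here, the kernel itself works with the raw values 0, 1 and -1.

/* Illustrative sketch only, not part of the patch. */
enum pfault_wait_state {
	PFAULT_NONE      =  0,	/* no pseudo page fault pending */
	PFAULT_PENDING   =  1,	/* initial interrupt seen, the task sleeps
				 * until the completion interrupt wakes it */
	PFAULT_COMPLETED = -1,	/* completion interrupt arrived first; the
				 * late initial interrupt resets this to 0
				 * instead of putting the task to sleep */
};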
+
+static int pfault_cpu_dead(unsigned int cpu)
+{
+ struct thread_struct *thread, *next;
+ struct task_struct *tsk;
+
+ spin_lock_irq(&pfault_lock);
+ list_for_each_entry_safe(thread, next, &pfault_list, list) {
+ thread->pfault_wait = 0;
+ list_del(&thread->list);
+ tsk = container_of(thread, struct task_struct, thread);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ }
+ spin_unlock_irq(&pfault_lock);
+ return 0;
+}
+
+static int __init pfault_irq_init(void)
+{
+ int rc;
+
+ rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+ if (rc)
+ goto out_extint;
+ rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
+ if (rc)
+ goto out_pfault;
+ irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
+ cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
+ NULL, pfault_cpu_dead);
+ return 0;
+
+out_pfault:
+ unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+out_extint:
+ pfault_disable = 1;
+ return rc;
+}
+early_initcall(pfault_irq_init);
+
+#endif /* CONFIG_PFAULT */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
new file mode 100644
index 000000000..65ccb9d79
--- /dev/null
+++ b/arch/s390/mm/gmap.c
@@ -0,0 +1,2645 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM guest address space mapping code
+ *
+ * Copyright IBM Corp. 2007, 2016, 2018
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * David Hildenbrand <david@redhat.com>
+ * Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/gmap.h>
+#include <asm/tlb.h>
+
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
+/**
+ * gmap_alloc - allocate and initialize a guest address space
+ * @limit: maximum address of the gmap address space
+ *
+ * Returns a guest address space structure or NULL if out of memory.
+ */
+static struct gmap *gmap_alloc(unsigned long limit)
+{
+ struct gmap *gmap;
+ struct page *page;
+ unsigned long *table;
+ unsigned long etype, atype;
+
+ if (limit < _REGION3_SIZE) {
+ limit = _REGION3_SIZE - 1;
+ atype = _ASCE_TYPE_SEGMENT;
+ etype = _SEGMENT_ENTRY_EMPTY;
+ } else if (limit < _REGION2_SIZE) {
+ limit = _REGION2_SIZE - 1;
+ atype = _ASCE_TYPE_REGION3;
+ etype = _REGION3_ENTRY_EMPTY;
+ } else if (limit < _REGION1_SIZE) {
+ limit = _REGION1_SIZE - 1;
+ atype = _ASCE_TYPE_REGION2;
+ etype = _REGION2_ENTRY_EMPTY;
+ } else {
+ limit = -1UL;
+ atype = _ASCE_TYPE_REGION1;
+ etype = _REGION1_ENTRY_EMPTY;
+ }
+ gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
+ if (!gmap)
+ goto out;
+ INIT_LIST_HEAD(&gmap->crst_list);
+ INIT_LIST_HEAD(&gmap->children);
+ INIT_LIST_HEAD(&gmap->pt_list);
+ INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
+ INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
+ spin_lock_init(&gmap->guest_table_lock);
+ spin_lock_init(&gmap->shadow_lock);
+ atomic_set(&gmap->ref_count, 1);
+ page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (!page)
+ goto out_free;
+ page->index = 0;
+ list_add(&page->lru, &gmap->crst_list);
+ table = (unsigned long *) page_to_phys(page);
+ crst_table_init(table, etype);
+ gmap->table = table;
+ gmap->asce = atype | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | __pa(table);
+ gmap->asce_end = limit;
+ return gmap;
+
+out_free:
+ kfree(gmap);
+out:
+ return NULL;
+}
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+ struct gmap *gmap;
+ unsigned long gmap_asce;
+
+ gmap = gmap_alloc(limit);
+ if (!gmap)
+ return NULL;
+ gmap->mm = mm;
+ spin_lock(&mm->context.lock);
+ list_add_rcu(&gmap->list, &mm->context.gmap_list);
+ if (list_is_singular(&mm->context.gmap_list))
+ gmap_asce = gmap->asce;
+ else
+ gmap_asce = -1UL;
+ WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
+ spin_unlock(&mm->context.lock);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
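
As a rough usage sketch (not part of the patch): a KVM-like caller pairs gmap_create() with gmap_remove() and brackets guest execution with gmap_enable()/gmap_disable(). The helper names, the 16 GB limit and the minimal error handling below are invented for illustration.

#include <asm/gmap.h>

/* illustrative only */
static struct gmap *example_setup_guest_space(struct mm_struct *mm)
{
	struct gmap *g;

	/* hypothetical 16 GB guest limit; selects a region-3 table type */
	g = gmap_create(mm, 16UL << 30);
	if (!g)
		return NULL;
	gmap_enable(g);		/* make it the active gmap for this cpu */
	return g;
}

static void example_teardown_guest_space(struct gmap *g)
{
	gmap_disable(g);
	gmap_remove(g);		/* drops the reference taken at creation */
}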
+
+static void gmap_flush_tlb(struct gmap *gmap)
+{
+ if (MACHINE_HAS_IDTE)
+ __tlb_flush_idte(gmap->asce);
+ else
+ __tlb_flush_global();
+}
+
+static void gmap_radix_tree_free(struct radix_tree_root *root)
+{
+ struct radix_tree_iter iter;
+ unsigned long indices[16];
+ unsigned long index;
+ void __rcu **slot;
+ int i, nr;
+
+ /* A radix tree is freed by deleting all of its entries */
+ index = 0;
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == 16)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ radix_tree_delete(root, index);
+ }
+ } while (nr > 0);
+}
+
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ struct radix_tree_iter iter;
+ unsigned long indices[16];
+ unsigned long index;
+ void __rcu **slot;
+ int i, nr;
+
+ /* A radix tree is freed by deleting all of its entries */
+ index = 0;
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == 16)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ head = radix_tree_delete(root, index);
+ gmap_for_each_rmap_safe(rmap, rnext, head)
+ kfree(rmap);
+ }
+ } while (nr > 0);
+}
+
+/**
+ * gmap_free - free a guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
+ */
+static void gmap_free(struct gmap *gmap)
+{
+ struct page *page, *next;
+
+ /* Flush tlb of all gmaps (if not already done for shadows) */
+ if (!(gmap_is_shadow(gmap) && gmap->removed))
+ gmap_flush_tlb(gmap);
+ /* Free all segment & region tables. */
+ list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
+ __free_pages(page, CRST_ALLOC_ORDER);
+ gmap_radix_tree_free(&gmap->guest_to_host);
+ gmap_radix_tree_free(&gmap->host_to_guest);
+
+ /* Free additional data for a shadow gmap */
+ if (gmap_is_shadow(gmap)) {
+ /* Free all page tables. */
+ list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+ page_table_free_pgste(page);
+ gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+ /* Release reference to the parent */
+ gmap_put(gmap->parent);
+ }
+
+ kfree(gmap);
+}
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+ atomic_inc(&gmap->ref_count);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+ if (atomic_dec_return(&gmap->ref_count) == 0)
+ gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+ struct gmap *sg, *next;
+ unsigned long gmap_asce;
+
+ /* Remove all shadow gmaps linked to this gmap */
+ if (!list_empty(&gmap->children)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next, &gmap->children, list) {
+ list_del(&sg->list);
+ gmap_put(sg);
+ }
+ spin_unlock(&gmap->shadow_lock);
+ }
+	/* Remove gmap from the per-mm list */
+ spin_lock(&gmap->mm->context.lock);
+ list_del_rcu(&gmap->list);
+ if (list_empty(&gmap->mm->context.gmap_list))
+ gmap_asce = 0;
+ else if (list_is_singular(&gmap->mm->context.gmap_list))
+ gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
+ struct gmap, list)->asce;
+ else
+ gmap_asce = -1UL;
+ WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
+ spin_unlock(&gmap->mm->context.lock);
+ synchronize_rcu();
+ /* Put reference */
+ gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
+
+/**
+ * gmap_enable - switch primary space to the guest address space
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_enable(struct gmap *gmap)
+{
+ S390_lowcore.gmap = (unsigned long) gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_enable);
+
+/**
+ * gmap_disable - switch back to the standard primary address space
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_disable(struct gmap *gmap)
+{
+ S390_lowcore.gmap = 0UL;
+}
+EXPORT_SYMBOL_GPL(gmap_disable);
+
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+ return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
+/*
+ * gmap_alloc_table is assumed to be called with mmap_sem held
+ */
+static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
+ unsigned long init, unsigned long gaddr)
+{
+ struct page *page;
+ unsigned long *new;
+
+	/* since we don't free the gmap table until gmap_free() we can unlock */
+ page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ new = (unsigned long *) page_to_phys(page);
+ crst_table_init(new, init);
+ spin_lock(&gmap->guest_table_lock);
+ if (*table & _REGION_ENTRY_INVALID) {
+ list_add(&page->lru, &gmap->crst_list);
+ *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
+ (*table & _REGION_ENTRY_TYPE_MASK);
+ page->index = gaddr;
+ page = NULL;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ if (page)
+ __free_pages(page, CRST_ALLOC_ORDER);
+ return 0;
+}
+
+/**
+ * __gmap_segment_gaddr - find virtual address from segment pointer
+ * @entry: pointer to a segment table entry in the guest address space
+ *
+ * Returns the virtual address in the guest address space for the segment
+ */
+static unsigned long __gmap_segment_gaddr(unsigned long *entry)
+{
+ struct page *page;
+ unsigned long offset, mask;
+
+ offset = (unsigned long) entry / sizeof(unsigned long);
+ offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
+ mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
+ page = virt_to_page((void *)((unsigned long) entry & mask));
+ return page->index + offset;
+}
+
+/**
+ * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
+ * @gmap: pointer to the guest address space structure
+ * @vmaddr: address in the host process address space
+ *
+ * Returns 1 if a TLB flush is required
+ */
+static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
+{
+ unsigned long *entry;
+ int flush = 0;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ spin_lock(&gmap->guest_table_lock);
+ entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+ if (entry) {
+ flush = (*entry != _SEGMENT_ENTRY_EMPTY);
+ *entry = _SEGMENT_ENTRY_EMPTY;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ return flush;
+}
+
+/**
+ * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
+ * @gmap: pointer to the guest address space structure
+ * @gaddr: address in the guest address space
+ *
+ * Returns 1 if a TLB flush is required
+ */
+static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
+{
+ unsigned long vmaddr;
+
+ vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
+ gaddr >> PMD_SHIFT);
+ return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
+}
+
+/**
+ * gmap_unmap_segment - unmap segment from the guest address space
+ * @gmap: pointer to the guest address space structure
+ * @to: address in the guest address space
+ * @len: length of the memory area to unmap
+ *
+ * Returns 0 if the unmap succeeded, -EINVAL if not.
+ */
+int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
+{
+ unsigned long off;
+ int flush;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ if ((to | len) & (PMD_SIZE - 1))
+ return -EINVAL;
+ if (len == 0 || to + len < to)
+ return -EINVAL;
+
+ flush = 0;
+ down_write(&gmap->mm->mmap_sem);
+ for (off = 0; off < len; off += PMD_SIZE)
+ flush |= __gmap_unmap_by_gaddr(gmap, to + off);
+ up_write(&gmap->mm->mmap_sem);
+ if (flush)
+ gmap_flush_tlb(gmap);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_unmap_segment);
+
+/**
+ * gmap_map_segment - map a segment to the guest address space
+ * @gmap: pointer to the guest address space structure
+ * @from: source address in the parent address space
+ * @to: target address in the guest address space
+ * @len: length of the memory area to map
+ *
+ * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
+ */
+int gmap_map_segment(struct gmap *gmap, unsigned long from,
+ unsigned long to, unsigned long len)
+{
+ unsigned long off;
+ int flush;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ if ((from | to | len) & (PMD_SIZE - 1))
+ return -EINVAL;
+ if (len == 0 || from + len < from || to + len < to ||
+ from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
+ return -EINVAL;
+
+ flush = 0;
+ down_write(&gmap->mm->mmap_sem);
+ for (off = 0; off < len; off += PMD_SIZE) {
+ /* Remove old translation */
+ flush |= __gmap_unmap_by_gaddr(gmap, to + off);
+ /* Store new translation */
+ if (radix_tree_insert(&gmap->guest_to_host,
+ (to + off) >> PMD_SHIFT,
+ (void *) from + off))
+ break;
+ }
+ up_write(&gmap->mm->mmap_sem);
+ if (flush)
+ gmap_flush_tlb(gmap);
+ if (off >= len)
+ return 0;
+ gmap_unmap_segment(gmap, to, len);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(gmap_map_segment);
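
A hedged sketch of the map/unmap pair exported above; the guest target address 0 and the 16 MB length are made-up values and only have to be segment (1 MB) aligned.

/* illustrative only */
static int example_map_guest_ram(struct gmap *g, unsigned long host_start)
{
	int rc;

	rc = gmap_map_segment(g, host_start, 0x0UL, 16UL << 20);
	if (rc)
		return rc;			/* -EINVAL or -ENOMEM */
	/* ... guest runs, faults are resolved via gmap_fault() ... */
	return gmap_unmap_segment(g, 0x0UL, 16UL << 20);
}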
+
+/**
+ * __gmap_translate - translate a guest address to a user space address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ *
+ * Returns user space address which corresponds to the guest address or
+ * -EFAULT if no such mapping exists.
+ * This function does not establish potentially missing page table entries.
+ * The mmap_sem of the mm that belongs to the address space must be held
+ * when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
+{
+ unsigned long vmaddr;
+
+ vmaddr = (unsigned long)
+ radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+ /* Note: guest_to_host is empty for a shadow gmap */
+ return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
+}
+EXPORT_SYMBOL_GPL(__gmap_translate);
+
+/**
+ * gmap_translate - translate a guest address to a user space address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ *
+ * Returns user space address which corresponds to the guest address or
+ * -EFAULT if no such mapping exists.
+ * This function does not establish potentially missing page table entries.
+ */
+unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
+{
+ unsigned long rc;
+
+ down_read(&gmap->mm->mmap_sem);
+ rc = __gmap_translate(gmap, gaddr);
+ up_read(&gmap->mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_translate);
+
+/**
+ * gmap_unlink - disconnect a page table from the gmap shadow tables
+ * @mm: pointer to the parent mm_struct
+ * @table: pointer to the host page table
+ * @vmaddr: vm address associated with the host page table
+ */
+void gmap_unlink(struct mm_struct *mm, unsigned long *table,
+ unsigned long vmaddr)
+{
+ struct gmap *gmap;
+ int flush;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
+ if (flush)
+ gmap_flush_tlb(gmap);
+ }
+ rcu_read_unlock();
+}
+
+static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
+ unsigned long gaddr);
+
+/**
+ * gmap_link - set up shadow page tables to connect a host to a guest address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ * @vmaddr: vm address
+ *
+ * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
+ * if the vm address is already mapped to a different guest segment.
+ * The mmap_sem of the mm that belongs to the address space must be held
+ * when this function gets called.
+ */
+int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
+{
+ struct mm_struct *mm;
+ unsigned long *table;
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ u64 unprot;
+ int rc;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ /* Create higher level tables in the gmap page table */
+ table = gmap->table;
+ if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
+ table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
+ if ((*table & _REGION_ENTRY_INVALID) &&
+ gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
+ gaddr & _REGION1_MASK))
+ return -ENOMEM;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ }
+ if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
+ table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
+ if ((*table & _REGION_ENTRY_INVALID) &&
+ gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
+ gaddr & _REGION2_MASK))
+ return -ENOMEM;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ }
+ if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
+ table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
+ if ((*table & _REGION_ENTRY_INVALID) &&
+ gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
+ gaddr & _REGION3_MASK))
+ return -ENOMEM;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ }
+ table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
+ /* Walk the parent mm page table */
+ mm = gmap->mm;
+ pgd = pgd_offset(mm, vmaddr);
+ VM_BUG_ON(pgd_none(*pgd));
+ p4d = p4d_offset(pgd, vmaddr);
+ VM_BUG_ON(p4d_none(*p4d));
+ pud = pud_offset(p4d, vmaddr);
+ VM_BUG_ON(pud_none(*pud));
+ /* large puds cannot yet be handled */
+ if (pud_large(*pud))
+ return -EFAULT;
+ pmd = pmd_offset(pud, vmaddr);
+ VM_BUG_ON(pmd_none(*pmd));
+ /* Are we allowed to use huge pages? */
+ if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
+ return -EFAULT;
+ /* Link gmap segment table entry location to page table. */
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc)
+ return rc;
+ ptl = pmd_lock(mm, pmd);
+ spin_lock(&gmap->guest_table_lock);
+ if (*table == _SEGMENT_ENTRY_EMPTY) {
+ rc = radix_tree_insert(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT, table);
+ if (!rc) {
+ if (pmd_large(*pmd)) {
+ *table = (pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
+ | _SEGMENT_ENTRY_GMAP_UC;
+ } else
+ *table = pmd_val(*pmd) &
+ _SEGMENT_ENTRY_HARDWARE_BITS;
+ }
+ } else if (*table & _SEGMENT_ENTRY_PROTECT &&
+ !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
+ unprot = (u64)*table;
+ unprot &= ~_SEGMENT_ENTRY_PROTECT;
+ unprot |= _SEGMENT_ENTRY_GMAP_UC;
+ gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ spin_unlock(ptl);
+ radix_tree_preload_end();
+ return rc;
+}
+
+/**
+ * gmap_fault - resolve a fault on a guest address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ * @fault_flags: flags to pass down to handle_mm_fault()
+ *
+ * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
+ * if the guest address could not be resolved or the vm address is already
+ * mapped to a different guest segment.
+ */
+int gmap_fault(struct gmap *gmap, unsigned long gaddr,
+ unsigned int fault_flags)
+{
+ unsigned long vmaddr;
+ int rc;
+ bool unlocked;
+
+ down_read(&gmap->mm->mmap_sem);
+
+retry:
+ unlocked = false;
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
+ goto out_up;
+ }
+ if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+ &unlocked)) {
+ rc = -EFAULT;
+ goto out_up;
+ }
+ /*
+	 * In case fixup_user_fault() unlocked the mmap_sem during the
+	 * fault-in, redo __gmap_translate() to not race with a
+	 * map/unmap_segment.
+ */
+ if (unlocked)
+ goto retry;
+
+ rc = __gmap_link(gmap, gaddr, vmaddr);
+out_up:
+ up_read(&gmap->mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_fault);
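
A sketch of the pattern a hypervisor-style caller might use with the two exports above: try the cheap translation first and fall back to gmap_fault() to establish the mapping. The wrapper is hypothetical; FAULT_FLAG_WRITE is the flag used for write accesses.

#include <linux/err.h>
#include <linux/mm.h>
#include <asm/gmap.h>

/* illustrative only */
static int example_resolve_guest_write(struct gmap *g, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = gmap_translate(g, gaddr);
	if (!IS_ERR_VALUE(vmaddr))
		return 0;			/* already mapped */
	return gmap_fault(g, gaddr, FAULT_FLAG_WRITE);
}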
+
+/*
+ * this function is assumed to be called with mmap_sem held
+ */
+void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
+{
+ unsigned long vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ /* Find the vm address for the guest address */
+ vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
+ gaddr >> PMD_SHIFT);
+ if (vmaddr) {
+ vmaddr |= gaddr & ~PMD_MASK;
+ /* Get pointer to the page table entry */
+ ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
+ if (likely(ptep)) {
+ ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
+ pte_unmap_unlock(ptep, ptl);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(__gmap_zap);
+
+void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
+{
+ unsigned long gaddr, vmaddr, size;
+ struct vm_area_struct *vma;
+
+ down_read(&gmap->mm->mmap_sem);
+ for (gaddr = from; gaddr < to;
+ gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
+ /* Find the vm address for the guest address */
+ vmaddr = (unsigned long)
+ radix_tree_lookup(&gmap->guest_to_host,
+ gaddr >> PMD_SHIFT);
+ if (!vmaddr)
+ continue;
+ vmaddr |= gaddr & ~PMD_MASK;
+ /* Find vma in the parent mm */
+ vma = find_vma(gmap->mm, vmaddr);
+ if (!vma)
+ continue;
+ /*
+ * We do not discard pages that are backed by
+ * hugetlbfs, so we don't have to refault them.
+ */
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
+ zap_page_range(vma, vmaddr, size);
+ }
+ up_read(&gmap->mm->mmap_sem);
+}
+EXPORT_SYMBOL_GPL(gmap_discard);
+
+static LIST_HEAD(gmap_notifier_list);
+static DEFINE_SPINLOCK(gmap_notifier_lock);
+
+/**
+ * gmap_register_pte_notifier - register a pte invalidation callback
+ * @nb: pointer to the gmap notifier block
+ */
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
+{
+ spin_lock(&gmap_notifier_lock);
+ list_add_rcu(&nb->list, &gmap_notifier_list);
+ spin_unlock(&gmap_notifier_lock);
+}
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
+
+/**
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
+ * @nb: pointer to the gmap notifier block
+ */
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
+{
+ spin_lock(&gmap_notifier_lock);
+ list_del_rcu(&nb->list);
+ spin_unlock(&gmap_notifier_lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct gmap_notifier *nb;
+
+ list_for_each_entry(nb, &gmap_notifier_list, list)
+ nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+ unsigned long gaddr, int level)
+{
+ const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
+ unsigned long *table;
+
+ if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+ return NULL;
+ if (gmap_is_shadow(gmap) && gmap->removed)
+ return NULL;
+
+ if (asce_type != _ASCE_TYPE_REGION1 &&
+ gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
+ return NULL;
+
+ table = gmap->table;
+ switch (gmap->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
+ if (level == 4)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION2:
+ table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
+ if (level == 3)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION3:
+ table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
+ if (level == 2)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_SEGMENT:
+ table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
+ if (level == 1)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
+ }
+ return table;
+}
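
The @level argument selects where the walk stops. As a small illustration (the helper name is invented; gmap_table_walk() itself is static to this file), level 1 yields the segment table entry in the same way gmap_pte_op_walk() and gmap_unshadow_pgt() use it.

/* illustrative only */
static unsigned long *example_segment_entry(struct gmap *gmap,
					    unsigned long gaddr)
{
	/* stop at level 1: pointer into the segment table, or NULL */
	return gmap_table_walk(gmap, gaddr, 1);
}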
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ * and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+ spinlock_t **ptl)
+{
+ unsigned long *table;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ /* Walk the gmap page table, lock and get pte pointer */
+ table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+ if (!table || *table & _SEGMENT_ENTRY_INVALID)
+ return NULL;
+ return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+ unsigned long vmaddr, int prot)
+{
+ struct mm_struct *mm = gmap->mm;
+ unsigned int fault_flags;
+ bool unlocked = false;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+ if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ return -EFAULT;
+ if (unlocked)
+ /* lost mmap_sem, caller has to retry __gmap_translate */
+ return 0;
+ /* Connect the page tables */
+ return __gmap_link(gmap, gaddr, vmaddr);
+}
+
+/**
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+ if (ptl)
+ spin_unlock(ptl);
+}
+
+/**
+ * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
+ * and return the pmd pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ *
+ * Returns a pointer to the pmd for a guest address, or NULL
+ */
+static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
+{
+ pmd_t *pmdp;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ spin_lock(&gmap->guest_table_lock);
+ pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
+
+ if (!pmdp || pmd_none(*pmdp)) {
+ spin_unlock(&gmap->guest_table_lock);
+ return NULL;
+ }
+
+ /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
+ if (!pmd_large(*pmdp))
+ spin_unlock(&gmap->guest_table_lock);
+ return pmdp;
+}
+
+/**
+ * gmap_pmd_op_end - release the guest_table_lock if needed
+ * @gmap: pointer to the guest mapping meta data structure
+ * @pmdp: pointer to the pmd
+ */
+static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
+{
+ if (pmd_large(*pmdp))
+ spin_unlock(&gmap->guest_table_lock);
+}
+
+/*
+ * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @pmdp: pointer to the pmd to be protected
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: notification bits to set
+ *
+ * Returns:
+ * 0 if successfully protected
+ * -EAGAIN if a fixup is needed
+ * -EINVAL if unsupported notifier bits have been specified
+ *
+ * Expected to be called with gmap->mm->mmap_sem in read and
+ * guest_table_lock held.
+ */
+static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
+ pmd_t *pmdp, int prot, unsigned long bits)
+{
+ int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
+ int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
+ pmd_t new = *pmdp;
+
+ /* Fixup needed */
+ if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
+ return -EAGAIN;
+
+ if (prot == PROT_NONE && !pmd_i) {
+ pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
+ gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
+ }
+
+ if (prot == PROT_READ && !pmd_p) {
+ pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
+ pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
+ }
+
+ if (bits & GMAP_NOTIFY_MPROT)
+ pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
+
+ /* Shadow GMAP protection needs split PMDs */
+ if (bits & GMAP_NOTIFY_SHADOW)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * gmap_protect_pte - remove access rights to memory and set pgste bits
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @pmdp: pointer to the pmd associated with the pte
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: notification bits to set
+ *
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EAGAIN if a fixup is needed.
+ *
+ * Expected to be called with sg->mm->mmap_sem in read
+ */
+static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
+ pmd_t *pmdp, int prot, unsigned long bits)
+{
+ int rc;
+ pte_t *ptep;
+ spinlock_t *ptl = NULL;
+ unsigned long pbits = 0;
+
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
+ return -EAGAIN;
+
+ ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+
+ pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
+ pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
+ /* Protect and unlock. */
+ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
+ gmap_pte_op_end(ptl);
+ return rc;
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
+ *
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot, unsigned long bits)
+{
+ unsigned long vmaddr, dist;
+ pmd_t *pmdp;
+ int rc;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ while (len) {
+ rc = -EAGAIN;
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (pmdp) {
+ if (!pmd_large(*pmdp)) {
+ rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
+ bits);
+ if (!rc) {
+ len -= PAGE_SIZE;
+ gaddr += PAGE_SIZE;
+ }
+ } else {
+ rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
+ bits);
+ if (!rc) {
+ dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
+ len = len < dist ? 0 : len - dist;
+ gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
+ }
+ }
+ gmap_pmd_op_end(gmap, pmdp);
+ }
+ if (rc) {
+ if (rc == -EINVAL)
+ return rc;
+
+ /* -EAGAIN, fixup of userspace mm and gmap */
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ * call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot)
+{
+ int rc;
+
+ if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+ return -EINVAL;
+ if (!MACHINE_HAS_ESOP && prot == PROT_READ)
+ return -EINVAL;
+ down_read(&gmap->mm->mmap_sem);
+ rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
+ up_read(&gmap->mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
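
A sketch of how the notifier machinery above fits together: register a gmap_notifier and arm it on a page with gmap_mprotect_notify(). The callback name and body are invented for illustration.

#include <linux/mman.h>
#include <asm/gmap.h>

/* illustrative only */
static void example_invalidation_cb(struct gmap *gmap, unsigned long start,
				    unsigned long end)
{
	/* e.g. kick the vcpu that relies on [start, end] */
}

static struct gmap_notifier example_notifier = {
	.notifier_call = example_invalidation_cb,
};

static int example_arm_notification(struct gmap *g, unsigned long gaddr)
{
	gmap_register_pte_notifier(&example_notifier);
	/* write-protect one guest page; the next change fires the callback */
	return gmap_mprotect_notify(g, gaddr & PAGE_MASK, PAGE_SIZE, PROT_READ);
}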
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ * absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed. -EINVAL if called on a gmap
+ * shadow.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+ unsigned long address, vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+ int rc;
+
+ if (gmap_is_shadow(gmap))
+ return -EINVAL;
+
+ while (1) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ pte = *ptep;
+ if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+ address = pte_val(pte) & PAGE_MASK;
+ address += gaddr & ~PAGE_MASK;
+ *val = *(unsigned long *) address;
+ pte_val(*ptep) |= _PAGE_YOUNG;
+ /* Do *NOT* clear the _PAGE_INVALID bit! */
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ }
+ if (!rc)
+ break;
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
+ break;
+ }
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
+ if (rc)
+ break;
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_read_table);
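
A minimal sketch for gmap_read_table(), which expects mmap_sem to be held in read mode by the caller; the wrapper below is hypothetical.

#include <asm/gmap.h>

/* illustrative only */
static int example_peek_guest(struct gmap *g, unsigned long gaddr,
			      unsigned long *val)
{
	int rc;

	down_read(&g->mm->mmap_sem);
	rc = gmap_read_table(g, gaddr, val);
	up_read(&g->mm->mmap_sem);
	return rc;
}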
+
+/**
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
+ *
+ * Called with the sg->guest_table_lock
+ */
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+ struct gmap_rmap *rmap)
+{
+ void __rcu **slot;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+ if (slot) {
+ rmap->next = radix_tree_deref_slot_protected(slot,
+ &sg->guest_table_lock);
+ radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
+ } else {
+ rmap->next = NULL;
+ radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+ rmap);
+ }
+}
+
+/**
+ * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+ unsigned long paddr, unsigned long len)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ while (len) {
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = raddr;
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc) {
+ kfree(rmap);
+ return rc;
+ }
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (ptep) {
+ spin_lock(&sg->guest_table_lock);
+ rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
+ PGSTE_VSIE_BIT);
+ if (!rc)
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ }
+ radix_tree_preload_end();
+ if (rc) {
+ kfree(rmap);
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
+ if (rc)
+ return rc;
+ continue;
+ }
+ paddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
+
+#define _SHADOW_RMAP_MASK 0x7
+#define _SHADOW_RMAP_REGION1 0x5
+#define _SHADOW_RMAP_REGION2 0x4
+#define _SHADOW_RMAP_REGION3 0x3
+#define _SHADOW_RMAP_SEGMENT 0x2
+#define _SHADOW_RMAP_PGTABLE 0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+ asm volatile(
+ " .insn rrf,0xb98e0000,%0,%1,0,0"
+ : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+ if (!table || *table & _PAGE_INVALID)
+ return;
+ gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
+ ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *pgt)
+{
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
+ pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long sto, *ste, *pgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
+ sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
+ gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+ pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+ *ste = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *sgt)
+{
+ unsigned long *pgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
+ if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+ continue;
+ pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+ sgt[i] = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+ }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r3o, *r3e, *sgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
+ r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
+ gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+ sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+ *r3e = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, CRST_ALLOC_ORDER);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r3t)
+{
+ unsigned long *sgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
+ if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+ r3t[i] = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, CRST_ALLOC_ORDER);
+ }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r2o, *r2e, *r3t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
+ r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
+ gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+ r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+ *r2e = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, CRST_ALLOC_ORDER);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r2t)
+{
+ unsigned long *r3t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
+ if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+ r2t[i] = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, CRST_ALLOC_ORDER);
+ }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r1o, *r1e, *r2t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
+ r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
+ gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+ r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+ *r1e = _REGION1_ENTRY_EMPTY;
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, CRST_ALLOC_ORDER);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r1t)
+{
+ unsigned long asce, *r2t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+ for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
+ if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Clear entry and flush translation r1t -> r2t */
+ gmap_idte_one(asce, raddr);
+ r1t[i] = _REGION1_ENTRY_EMPTY;
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, CRST_ALLOC_ORDER);
+ }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ if (sg->removed)
+ return;
+ sg->removed = 1;
+ gmap_call_notifier(sg, 0, -1UL);
+ gmap_flush_tlb(sg);
+ table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+ switch (sg->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ __gmap_unshadow_r1t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION2:
+ __gmap_unshadow_r2t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION3:
+ __gmap_unshadow_r3t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_SEGMENT:
+ __gmap_unshadow_sgt(sg, 0, table);
+ break;
+ }
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg;
+
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+ sg->removed)
+ continue;
+ if (!sg->initialized)
+ return ERR_PTR(-EAGAIN);
+ atomic_inc(&sg->ref_count);
+ return sg;
+ }
+ return NULL;
+}
+
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ * given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties, the caller can continue using it. Returns 0 otherwise, the
+ * caller has to request a new shadow gmap in this case.
+ *
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+ if (sg->removed)
+ return 0;
+ return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg, *new;
+ unsigned long limit;
+ int rc;
+
+ BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
+ BUG_ON(gmap_is_shadow(parent));
+ spin_lock(&parent->shadow_lock);
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ spin_unlock(&parent->shadow_lock);
+ if (sg)
+ return sg;
+ /* Create a new shadow gmap */
+ limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+ if (asce & _ASCE_REAL_SPACE)
+ limit = -1UL;
+ new = gmap_alloc(limit);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ new->mm = parent->mm;
+ new->parent = gmap_get(parent);
+ new->orig_asce = asce;
+ new->edat_level = edat_level;
+ new->initialized = false;
+ spin_lock(&parent->shadow_lock);
+ /* Recheck if another CPU created the same shadow */
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ spin_unlock(&parent->shadow_lock);
+ gmap_free(new);
+ return sg;
+ }
+ if (asce & _ASCE_REAL_SPACE) {
+ /* only allow one real-space gmap shadow */
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce & _ASCE_REAL_SPACE) {
+ spin_lock(&sg->guest_table_lock);
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ break;
+ }
+ }
+ }
+ atomic_set(&new->ref_count, 2);
+ list_add(&new->list, &parent->children);
+ if (asce & _ASCE_REAL_SPACE) {
+ /* nothing to protect, return right away */
+ new->initialized = true;
+ spin_unlock(&parent->shadow_lock);
+ return new;
+ }
+ spin_unlock(&parent->shadow_lock);
+ /* protect after insertion, so it will get properly invalidated */
+ down_read(&parent->mm->mmap_sem);
+ rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+ ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
+ PROT_READ, GMAP_NOTIFY_SHADOW);
+ up_read(&parent->mm->mmap_sem);
+ spin_lock(&parent->shadow_lock);
+ new->initialized = true;
+ if (rc) {
+ list_del(&new->list);
+ gmap_free(new);
+ new = ERR_PTR(rc);
+ }
+ spin_unlock(&parent->shadow_lock);
+ return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
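
A hedged sketch of how a vsie-style caller might obtain a shadow gmap. The wrapper is invented; the later revalidation with gmap_shadow_valid() and the final gmap_put() are only hinted at in the comment.

#include <linux/err.h>
#include <asm/gmap.h>

/* illustrative only */
static struct gmap *example_get_shadow(struct gmap *parent, unsigned long asce,
				       int edat_level)
{
	struct gmap *sg;

	sg = gmap_shadow(parent, asce, edat_level);
	if (IS_ERR(sg))
		return sg;	/* -ENOMEM, -EAGAIN or -EFAULT */
	/*
	 * A cached sg is revalidated later with
	 * gmap_shadow_valid(sg, asce, edat_level) and released with
	 * gmap_put(sg) when it is no longer needed.
	 */
	return sg;
}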
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its descendants.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r2t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ page->index = r2t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r2t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r2t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r2t read-only in parent gmap page table */
+ raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
+ origin = r2t & _REGION_ENTRY_ORIGIN;
+ offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
+ len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 4);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r2t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r2t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, CRST_ALLOC_ORDER);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
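+/*
+ * Worked example for the protection range above (a sketch, assuming the
+ * usual z/Architecture meaning of the table-offset (TF) and table-length
+ * (TL) fields): a region entry with TF = 1 and TL = 3 marks the last three
+ * of the four 4 KB pages of the source table as valid, so
+ *
+ *	offset = 1 * PAGE_SIZE;			    skip the first page
+ *	len    = (3 + 1) * PAGE_SIZE - offset;	    protect the other three
+ *
+ * gmap_shadow_r3t() and gmap_shadow_sgt() below reuse the same calculation.
+ */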
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r3t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+	/* Allocate a shadow region third table */
+ page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ page->index = r3t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r3t = (unsigned long *) page_to_phys(page);
+	/* Install shadow region third table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r3t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r3t read-only in parent gmap page table */
+ raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
+ origin = r3t & _REGION_ENTRY_ORIGIN;
+ offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
+ len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 3);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r3t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r3t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, CRST_ALLOC_ORDER);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_sgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+ /* Allocate a shadow segment table */
+ page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ page->index = sgt & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_sgt = (unsigned long *) page_to_phys(page);
+	/* Install shadow segment table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= sgt & _REGION_ENTRY_PROTECT;
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make sgt read-only in parent gmap page table */
+ raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
+ origin = sgt & _REGION_ENTRY_ORIGIN;
+ offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
+ len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 2);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_sgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_sgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, CRST_ALLOC_ORDER);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_pgt_lookup - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow guest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ unsigned long *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+ /* Shadow page tables are full pages (pte+pgste) */
+ page = pfn_to_page(*table >> PAGE_SHIFT);
+ *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+ *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+ *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+ rc = 0;
+ } else {
+ rc = -EAGAIN;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
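+/*
+ * Minimal usage sketch (hypothetical caller, e.g. a vSIE fault handler),
+ * assuming sg and saddr are given and mmap_sem is held for reading:
+ *
+ *	unsigned long pgt;
+ *	int dat_protection, fake, rc;
+ *
+ *	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ *	if (rc == -EAGAIN) {
+ *		/* re-resolve pgt and fake from the guest DAT tables, then: */
+ *		rc = gmap_shadow_pgt(sg, saddr, pgt, fake);
+ *	}
+ */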
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake)
+{
+ unsigned long raddr, origin;
+ unsigned long *s_pgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+ /* Allocate a shadow page table */
+ page = page_table_alloc_pgste(sg->mm);
+ if (!page)
+ return -ENOMEM;
+ page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_pgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow page table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+ (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+ list_add(&page->lru, &sg->pt_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make pgt read-only in parent gmap page table (not the pgste) */
+ raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+ rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 1);
+ if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+ (unsigned long) s_pgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_pgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ page_table_free_pgste(page);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr, paddr;
+ spinlock_t *ptl;
+ pte_t *sptep, *tptep;
+ int prot;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+ while (1) {
+ paddr = pte_val(pte) & PAGE_MASK;
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
+ break;
+ }
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc)
+ break;
+ rc = -EAGAIN;
+ sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (sptep) {
+ spin_lock(&sg->guest_table_lock);
+ /* Get page table pointer */
+ tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+ if (!tptep) {
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ radix_tree_preload_end();
+ break;
+ }
+ rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+ if (rc > 0) {
+ /* Success and a new mapping */
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ rmap = NULL;
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ spin_unlock(&sg->guest_table_lock);
+ }
+ radix_tree_preload_end();
+ if (!rc)
+ break;
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ break;
+ }
+ kfree(rmap);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: affected host virtual address
+ * @gaddr: affected guest address in the parent gmap
+ *
+ * Called with sg->parent->shadow_lock held.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+ unsigned long gaddr)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ unsigned long start, end, bits, raddr;
+
+ BUG_ON(!gmap_is_shadow(sg));
+
+ spin_lock(&sg->guest_table_lock);
+ if (sg->removed) {
+ spin_unlock(&sg->guest_table_lock);
+ return;
+ }
+ /* Check for top level table */
+ start = sg->orig_asce & _ASCE_ORIGIN;
+ end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
+ if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+ gaddr < end) {
+ /* The complete shadow table has to go */
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ return;
+ }
+	/* Remove the page table tree from one specific entry */
+ head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+ gmap_for_each_rmap_safe(rmap, rnext, head) {
+ bits = rmap->raddr & _SHADOW_RMAP_MASK;
+ raddr = rmap->raddr ^ bits;
+ switch (bits) {
+ case _SHADOW_RMAP_REGION1:
+ gmap_unshadow_r2t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION2:
+ gmap_unshadow_r3t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION3:
+ gmap_unshadow_sgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_SEGMENT:
+ gmap_unshadow_pgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_PGTABLE:
+ gmap_unshadow_page(sg, raddr);
+ break;
+ }
+ kfree(rmap);
+ }
+ spin_unlock(&sg->guest_table_lock);
+}
+
+/**
+ * ptep_notify - call all invalidation callbacks for a specific pte.
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ * @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
+ *
+ * This function is assumed to be called with the page table lock held
+ * for the pte to notify.
+ */
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+ pte_t *pte, unsigned long bits)
+{
+ unsigned long offset, gaddr = 0;
+ unsigned long *table;
+ struct gmap *gmap, *sg, *next;
+
+ offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+ offset = offset * (PAGE_SIZE / sizeof(pte_t));
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ spin_lock(&gmap->guest_table_lock);
+ table = radix_tree_lookup(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ if (table)
+ gaddr = __gmap_segment_gaddr(table) + offset;
+ spin_unlock(&gmap->guest_table_lock);
+ if (!table)
+ continue;
+
+ if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next,
+ &gmap->children, list)
+ gmap_shadow_notify(sg, vmaddr, gaddr);
+ spin_unlock(&gmap->shadow_lock);
+ }
+ if (bits & PGSTE_IN_BIT)
+ gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ptep_notify);
+
+static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
+ unsigned long gaddr)
+{
+ pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
+}
+
+/**
+ * gmap_pmdp_xchg - exchange a gmap pmd with another
+ * @gmap: pointer to the guest address space structure
+ * @pmdp: pointer to the pmd entry
+ * @new: replacement entry
+ * @gaddr: the affected guest address
+ *
+ * This function is assumed to be called with the guest_table_lock
+ * held.
+ */
+static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
+ unsigned long gaddr)
+{
+ gaddr &= HPAGE_MASK;
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
+ IDTE_GLOBAL);
+ else if (MACHINE_HAS_IDTE)
+ __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
+ else
+ __pmdp_csp(pmdp);
+ *pmdp = new;
+}
+
+static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
+ int purge)
+{
+ pmd_t *pmdp;
+ struct gmap *gmap;
+ unsigned long gaddr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ spin_lock(&gmap->guest_table_lock);
+ pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ if (pmdp) {
+ gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC));
+ if (purge)
+ __pmdp_csp(pmdp);
+ pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
+ * flushing
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
+{
+ gmap_pmdp_clear(mm, vmaddr, 0);
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
+
+/**
+ * gmap_pmdp_csp - csp all affected guest pmd entries
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
+{
+ gmap_pmdp_clear(mm, vmaddr, 1);
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
+
+/**
+ * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
+{
+ unsigned long *entry, gaddr;
+ struct gmap *gmap;
+ pmd_t *pmdp;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ spin_lock(&gmap->guest_table_lock);
+ entry = radix_tree_delete(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ if (entry) {
+ pmdp = (pmd_t *)entry;
+ gaddr = __gmap_segment_gaddr(entry);
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC));
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
+ gmap->asce, IDTE_LOCAL);
+ else if (MACHINE_HAS_IDTE)
+ __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
+ *entry = _SEGMENT_ENTRY_EMPTY;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
+
+/**
+ * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
+ * @mm: pointer to the process mm_struct
+ * @vmaddr: virtual address in the process address space
+ */
+void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
+{
+ unsigned long *entry, gaddr;
+ struct gmap *gmap;
+ pmd_t *pmdp;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ spin_lock(&gmap->guest_table_lock);
+ entry = radix_tree_delete(&gmap->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ if (entry) {
+ pmdp = (pmd_t *)entry;
+ gaddr = __gmap_segment_gaddr(entry);
+ pmdp_notify_gmap(gmap, pmdp, gaddr);
+ WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC));
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
+ gmap->asce, IDTE_GLOBAL);
+ else if (MACHINE_HAS_IDTE)
+ __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
+ else
+ __pmdp_csp(pmdp);
+ *entry = _SEGMENT_ENTRY_EMPTY;
+ }
+ spin_unlock(&gmap->guest_table_lock);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
+
+/**
+ * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
+ * @gmap: pointer to guest address space
+ * @pmdp: pointer to the pmd to be tested
+ * @gaddr: virtual address in the guest address space
+ *
+ * This function is assumed to be called with the guest_table_lock
+ * held.
+ */
+bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
+ unsigned long gaddr)
+{
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
+ return false;
+
+	/* Memory that is already protected and did not change is clean */
+ if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
+ !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
+ return false;
+
+ /* Clear UC indication and reset protection */
+ pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
+ gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
+ return true;
+}
+
+/**
+ * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
+ * @gmap: pointer to guest address space
+ * @bitmap: dirty bitmap for this pmd
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: virtual address in the host address space
+ *
+ * This function is assumed to be called with the guest_table_lock
+ * held.
+ */
+void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
+ unsigned long gaddr, unsigned long vmaddr)
+{
+ int i;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ spinlock_t *ptl;
+
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (!pmdp)
+ return;
+
+ if (pmd_large(*pmdp)) {
+ if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
+ bitmap_fill(bitmap, _PAGE_ENTRIES);
+ } else {
+ for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
+ ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
+ if (!ptep)
+ continue;
+ if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
+ set_bit(i, bitmap);
+ spin_unlock(ptl);
+ }
+ }
+ gmap_pmd_op_end(gmap, pmdp);
+}
+EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
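+/*
+ * Minimal usage sketch (hypothetical caller collecting dirty bits for one
+ * 1 MB segment; gmap, gaddr and vmaddr are assumed to be set up and
+ * gmap->mm->mmap_sem to be held, mark_page_dirty_in_log is an assumed
+ * helper of the caller):
+ *
+ *	unsigned long bitmap[4] = { 0 };
+ *	int i;
+ *
+ *	gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
+ *	for_each_set_bit(i, bitmap, _PAGE_ENTRIES)
+ *		mark_page_dirty_in_log(gaddr + i * PAGE_SIZE);
+ */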
+
+static inline void thp_split_mm(struct mm_struct *mm)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct vm_area_struct *vma;
+ unsigned long addr;
+
+ for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (addr = vma->vm_start;
+ addr < vma->vm_end;
+ addr += PAGE_SIZE)
+ follow_page(vma, addr, FOLL_SPLIT);
+ vma->vm_flags &= ~VM_HUGEPAGE;
+ vma->vm_flags |= VM_NOHUGEPAGE;
+ }
+ mm->def_flags |= VM_NOHUGEPAGE;
+#endif
+}
+
+/*
+ * Remove all empty zero pages from the mapping for lazy refaulting
+ * - This must be called after mm->context.has_pgste is set, to avoid
+ * future creation of zero pages
+ * - This must be called after THP was enabled
+ */
+static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ unsigned long addr;
+
+ for (addr = start; addr != end; addr += PAGE_SIZE) {
+ pte_t *ptep;
+ spinlock_t *ptl;
+
+ ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (is_zero_pfn(pte_pfn(*ptep)))
+ ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
+ pte_unmap_unlock(ptep, ptl);
+ }
+ return 0;
+}
+
+static inline void zap_zero_pages(struct mm_struct *mm)
+{
+ struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
+
+ walk.mm = mm;
+ walk_page_range(0, TASK_SIZE, &walk);
+}
+
+/*
+ * switch on pgstes for its userspace process (for kvm)
+ */
+int s390_enable_sie(void)
+{
+ struct mm_struct *mm = current->mm;
+
+	/* Do we have pgstes? If yes, we are done */
+ if (mm_has_pgste(mm))
+ return 0;
+ /* Fail if the page tables are 2K */
+ if (!mm_alloc_pgste(mm))
+ return -EINVAL;
+ down_write(&mm->mmap_sem);
+ mm->context.has_pgste = 1;
+ /* split thp mappings and disable thp for future mappings */
+ thp_split_mm(mm);
+ zap_zero_pages(mm);
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(s390_enable_sie);
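+/*
+ * Sketch of the expected call pattern (the exact call site is an
+ * assumption): a hypervisor such as KVM calls this once for the host
+ * process that backs the guest, before creating SIE control blocks, so
+ * that all page tables of the mm carry pgstes:
+ *
+ *	if (s390_enable_sie())
+ *		return -EINVAL;		the mm uses 2K page tables
+ */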
+
+/*
+ * Enable storage key handling from now on and initialize the storage
+ * keys with the default key.
+ */
+static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ /* Clear storage key */
+ ptep_zap_key(walk->mm, addr, pte);
+ return 0;
+}
+
+static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
+ unsigned long hmask, unsigned long next,
+ struct mm_walk *walk)
+{
+ pmd_t *pmd = (pmd_t *)pte;
+ unsigned long start, end;
+ struct page *page = pmd_page(*pmd);
+
+ /*
+ * The write check makes sure we do not set a key on shared
+ * memory. This is needed as the walker does not differentiate
+ * between actual guest memory and the process executable or
+ * shared libraries.
+ */
+ if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
+ !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
+ return 0;
+
+ start = pmd_val(*pmd) & HPAGE_MASK;
+ end = start + HPAGE_SIZE - 1;
+ __storage_key_init_range(start, end);
+ set_bit(PG_arch_1, &page->flags);
+ return 0;
+}
+
+int s390_enable_skey(void)
+{
+ struct mm_walk walk = {
+ .hugetlb_entry = __s390_enable_skey_hugetlb,
+ .pte_entry = __s390_enable_skey_pte,
+ };
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc = 0;
+
+ down_write(&mm->mmap_sem);
+ if (mm_uses_skeys(mm))
+ goto out_up;
+
+ mm->context.uses_skeys = 1;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+ MADV_UNMERGEABLE, &vma->vm_flags)) {
+ mm->context.uses_skeys = 0;
+ rc = -ENOMEM;
+ goto out_up;
+ }
+ }
+ mm->def_flags &= ~VM_MERGEABLE;
+
+ walk.mm = mm;
+ walk_page_range(0, TASK_SIZE, &walk);
+
+out_up:
+ up_write(&mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(s390_enable_skey);
+
+/*
+ * Reset CMMA state, make all pages stable again.
+ */
+static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ ptep_zap_unused(walk->mm, addr, pte, 1);
+ return 0;
+}
+
+void s390_reset_cmma(struct mm_struct *mm)
+{
+ struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
+
+ down_write(&mm->mmap_sem);
+ walk.mm = mm;
+ walk_page_range(0, TASK_SIZE, &walk);
+ up_write(&mm->mmap_sem);
+}
+EXPORT_SYMBOL_GPL(s390_reset_cmma);
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
new file mode 100644
index 000000000..5389bf5bc
--- /dev/null
+++ b/arch/s390/mm/gup.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Lockless get_user_pages_fast for s390
+ *
+ * Copyright IBM Corp. 2010
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance-critical leaf functions are made noinline; otherwise gcc
+ * inlines everything into a single function, which results in too much
+ * register pressure.
+ */
+static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ struct page *head, *page;
+ unsigned long mask;
+ pte_t *ptep, pte;
+
+ mask = (write ? _PAGE_PROTECT : 0) | _PAGE_INVALID | _PAGE_SPECIAL;
+
+ ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr);
+ do {
+ pte = *ptep;
+ barrier();
+ /* Similar to the PMD case, NUMA hinting must take slow path */
+ if (pte_protnone(pte))
+ return 0;
+ if ((pte_val(pte) & mask) != 0)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ head = compound_head(page);
+ if (unlikely(WARN_ON_ONCE(page_ref_count(head) < 0)
+ || !page_cache_get_speculative(head)))
+ return 0;
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_page(head);
+ return 0;
+ }
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ return 1;
+}
+
+static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ struct page *head, *page;
+ unsigned long mask;
+ int refs;
+
+ mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID;
+ if ((pmd_val(pmd) & mask) != 0)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT));
+
+ refs = 0;
+ head = pmd_page(pmd);
+ page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (unlikely(WARN_ON_ONCE(page_ref_count(head) < 0)
+ || !page_cache_add_speculative(head, refs))) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp, pmd;
+
+ pmdp = (pmd_t *) pudp;
+ if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ pmdp = (pmd_t *) pud_deref(pud);
+ pmdp += pmd_index(addr);
+ do {
+ pmd = *pmdp;
+ barrier();
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (unlikely(pmd_large(pmd))) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_protnone(pmd))
+ return 0;
+ if (!gup_huge_pmd(pmdp, pmd, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (!gup_pte_range(pmdp, pmd, addr, next,
+ write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ struct page *head, *page;
+ unsigned long mask;
+ int refs;
+
+ mask = (write ? _REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID;
+ if ((pud_val(pud) & mask) != 0)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
+
+ refs = 0;
+ head = pud_page(pud);
+ page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (unlikely(WARN_ON_ONCE(page_ref_count(head) < 0)
+ || !page_cache_add_speculative(head, refs))) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pud_val(pud) != pud_val(*pudp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+
+ pudp = (pud_t *) p4dp;
+ if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
+ pudp = (pud_t *) p4d_deref(p4d);
+ pudp += pud_index(addr);
+ do {
+ pud = *pudp;
+ barrier();
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (unlikely(pud_large(pud))) {
+ if (!gup_huge_pud(pudp, pud, addr, next, write, pages,
+ nr))
+ return 0;
+ } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages,
+ nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+
+ p4dp = (p4d_t *) pgdp;
+ if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1)
+ p4dp = (p4d_t *) pgd_deref(pgd);
+ p4dp += p4d_index(addr);
+ do {
+ p4d = *p4dp;
+ barrier();
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(p4d))
+ return 0;
+ if (!gup_pud_range(p4dp, p4d, addr, next, write, pages, nr))
+ return 0;
+ } while (p4dp++, addr = next, addr != end);
+
+ return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * Note a difference from get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next, flags;
+ pgd_t *pgdp, pgd;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if ((end <= start) || (end > mm->context.asce_limit))
+ return 0;
+ /*
+ * local_irq_save() doesn't prevent pagetable teardown, but does
+ * prevent the pagetables from being freed on s390.
+ *
+ * So long as we atomically load page table pointers versus teardown,
+ * we can follow the address down to the page and take a ref on it.
+ */
+ local_irq_save(flags);
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd = *pgdp;
+ barrier();
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ break;
+ if (!gup_p4d_range(pgdp, pgd, addr, next, write, pages, &nr))
+ break;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_restore(flags);
+
+ return nr;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ int nr, ret;
+
+ might_sleep();
+ start &= PAGE_MASK;
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ */
+ nr = __get_user_pages_fast(start, nr_pages, 1, pages);
+ if (nr == nr_pages)
+ return nr;
+
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+ ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
+ write ? FOLL_WRITE : 0);
+ /* Have to be a bit careful with return values */
+ if (nr > 0)
+ ret = (ret < 0) ? nr : ret + nr;
+ return ret;
+}
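+/*
+ * Minimal usage sketch (hypothetical caller pinning a page-aligned user
+ * buffer for writing; uaddr and npages are assumed):
+ *
+ *	struct page **pages;
+ *	int i, pinned;
+ *
+ *	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+ *	if (!pages)
+ *		return -ENOMEM;
+ *	pinned = get_user_pages_fast(uaddr, npages, 1, pages);
+ *	if (pinned > 0) {
+ *		... access the pinned pages ...
+ *		for (i = 0; i < pinned; i++)
+ *			put_page(pages[i]);
+ *	}
+ *	kfree(pages);
+ */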
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
new file mode 100644
index 000000000..ff8234bca
--- /dev/null
+++ b/arch/s390/mm/hugetlbpage.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IBM System z Huge TLB Page Support for Kernel.
+ *
+ * Copyright IBM Corp. 2007,2020
+ * Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
+ */
+
+#define KMSG_COMPONENT "hugetlb"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/security.h>
+
+/*
+ * If the bit selected by single-bit bitmask "a" is set within "x", move
+ * it to the position indicated by single-bit bitmask "b".
+ */
+#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b))
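+
+/*
+ * Worked example (values only for illustration): with x = 0x0412,
+ * a = 0x0010 and b = 0x0100, bit a is set in x, so
+ * move_set_bit(x, a, b) == ((0x0412 & 0x0010) >> 4) << 8 == 0x0100;
+ * if bit a were clear in x, the result would be 0.
+ */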
+
+static inline unsigned long __pte_to_rste(pte_t pte)
+{
+ unsigned long rste;
+
+ /*
+ * Convert encoding pte bits pmd / pud bits
+ * lIR.uswrdy.p dy..R...I...wr
+ * empty 010.000000.0 -> 00..0...1...00
+ * prot-none, clean, old 111.000000.1 -> 00..1...1...00
+ * prot-none, clean, young 111.000001.1 -> 01..1...1...00
+ * prot-none, dirty, old 111.000010.1 -> 10..1...1...00
+ * prot-none, dirty, young 111.000011.1 -> 11..1...1...00
+ * read-only, clean, old 111.000100.1 -> 00..1...1...01
+ * read-only, clean, young 101.000101.1 -> 01..1...0...01
+ * read-only, dirty, old 111.000110.1 -> 10..1...1...01
+ * read-only, dirty, young 101.000111.1 -> 11..1...0...01
+ * read-write, clean, old 111.001100.1 -> 00..1...1...11
+ * read-write, clean, young 101.001101.1 -> 01..1...0...11
+ * read-write, dirty, old 110.001110.1 -> 10..0...1...11
+ * read-write, dirty, young 100.001111.1 -> 11..0...0...11
+ * HW-bits: R read-only, I invalid
+ * SW-bits: p present, y young, d dirty, r read, w write, s special,
+ * u unused, l large
+ */
+ if (pte_present(pte)) {
+ rste = pte_val(pte) & PAGE_MASK;
+ rste |= move_set_bit(pte_val(pte), _PAGE_READ,
+ _SEGMENT_ENTRY_READ);
+ rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
+ _SEGMENT_ENTRY_WRITE);
+ rste |= move_set_bit(pte_val(pte), _PAGE_INVALID,
+ _SEGMENT_ENTRY_INVALID);
+ rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT,
+ _SEGMENT_ENTRY_PROTECT);
+ rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY,
+ _SEGMENT_ENTRY_DIRTY);
+ rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG,
+ _SEGMENT_ENTRY_YOUNG);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY,
+ _SEGMENT_ENTRY_SOFT_DIRTY);
+#endif
+ rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
+ _SEGMENT_ENTRY_NOEXEC);
+ } else
+ rste = _SEGMENT_ENTRY_EMPTY;
+ return rste;
+}
+
+static inline pte_t __rste_to_pte(unsigned long rste)
+{
+ int present;
+ pte_t pte;
+
+ if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ present = pud_present(__pud(rste));
+ else
+ present = pmd_present(__pmd(rste));
+
+ /*
+ * Convert encoding pmd / pud bits pte bits
+ * dy..R...I...wr lIR.uswrdy.p
+ * empty 00..0...1...00 -> 010.000000.0
+ * prot-none, clean, old 00..1...1...00 -> 111.000000.1
+ * prot-none, clean, young 01..1...1...00 -> 111.000001.1
+ * prot-none, dirty, old 10..1...1...00 -> 111.000010.1
+ * prot-none, dirty, young 11..1...1...00 -> 111.000011.1
+ * read-only, clean, old 00..1...1...01 -> 111.000100.1
+ * read-only, clean, young 01..1...0...01 -> 101.000101.1
+ * read-only, dirty, old 10..1...1...01 -> 111.000110.1
+ * read-only, dirty, young 11..1...0...01 -> 101.000111.1
+ * read-write, clean, old 00..1...1...11 -> 111.001100.1
+ * read-write, clean, young 01..1...0...11 -> 101.001101.1
+ * read-write, dirty, old 10..0...1...11 -> 110.001110.1
+ * read-write, dirty, young 11..0...0...11 -> 100.001111.1
+ * HW-bits: R read-only, I invalid
+ * SW-bits: p present, y young, d dirty, r read, w write, s special,
+ * u unused, l large
+ */
+ if (present) {
+ pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
+ pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ,
+ _PAGE_READ);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE,
+ _PAGE_WRITE);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID,
+ _PAGE_INVALID);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT,
+ _PAGE_PROTECT);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY,
+ _PAGE_DIRTY);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG,
+ _PAGE_YOUNG);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY,
+ _PAGE_SOFT_DIRTY);
+#endif
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC,
+ _PAGE_NOEXEC);
+ } else
+ pte_val(pte) = _PAGE_INVALID;
+ return pte;
+}
+
+static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
+{
+ struct page *page;
+ unsigned long size, paddr;
+
+ if (!mm_uses_skeys(mm) ||
+ rste & _SEGMENT_ENTRY_INVALID)
+ return;
+
+ if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
+ page = pud_page(__pud(rste));
+ size = PUD_SIZE;
+ paddr = rste & PUD_MASK;
+ } else {
+ page = pmd_page(__pmd(rste));
+ size = PMD_SIZE;
+ paddr = rste & PMD_MASK;
+ }
+
+ if (!test_and_set_bit(PG_arch_1, &page->flags))
+ __storage_key_init_range(paddr, paddr + size - 1);
+}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ unsigned long rste;
+
+ rste = __pte_to_rste(pte);
+ if (!MACHINE_HAS_NX)
+ rste &= ~_SEGMENT_ENTRY_NOEXEC;
+
+ /* Set correct table type for 2G hugepages */
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
+ if (likely(pte_present(pte)))
+ rste |= _REGION3_ENTRY_LARGE;
+ rste |= _REGION_ENTRY_TYPE_R3;
+ } else if (likely(pte_present(pte)))
+ rste |= _SEGMENT_ENTRY_LARGE;
+
+ clear_huge_pte_skeys(mm, rste);
+ pte_val(*ptep) = rste;
+}
+
+pte_t huge_ptep_get(pte_t *ptep)
+{
+ return __rste_to_pte(pte_val(*ptep));
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t pte = huge_ptep_get(ptep);
+ pmd_t *pmdp = (pmd_t *) ptep;
+ pud_t *pudp = (pud_t *) ptep;
+
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY));
+ else
+ pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+ return pte;
+}
+
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp = NULL;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (p4dp) {
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (pudp) {
+ if (sz == PUD_SIZE)
+ return (pte_t *) pudp;
+ else if (sz == PMD_SIZE)
+ pmdp = pmd_alloc(mm, pudp, addr);
+ }
+ }
+ return (pte_t *) pmdp;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp = NULL;
+
+ pgdp = pgd_offset(mm, addr);
+ if (pgd_present(*pgdp)) {
+ p4dp = p4d_offset(pgdp, addr);
+ if (p4d_present(*p4dp)) {
+ pudp = pud_offset(p4dp, addr);
+ if (pud_present(*pudp)) {
+ if (pud_large(*pudp))
+ return (pte_t *) pudp;
+ pmdp = pmd_offset(pudp, addr);
+ }
+ }
+ }
+ return (pte_t *) pmdp;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+ return pmd_large(pmd);
+}
+
+int pud_huge(pud_t pud)
+{
+ return pud_large(pud);
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int flags)
+{
+ if (flags & FOLL_GET)
+ return NULL;
+
+ return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+}
+
+static __init int setup_hugepagesz(char *opt)
+{
+ unsigned long size;
+ char *string = opt;
+
+ size = memparse(opt, &opt);
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else {
+ hugetlb_bad_size();
+ pr_err("hugepagesz= specifies an unsupported page size %s\n",
+ string);
+ return 0;
+ }
+ return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
+
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+ unsigned long addr0, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+ unsigned long addr;
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc;
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (len > TASK_SIZE - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED) {
+ if (prepare_hugepage_range(file, addr, len))
+ return -EINVAL;
+ goto check_asce_limit;
+ }
+
+ if (addr) {
+ addr = ALIGN(addr, huge_page_size(h));
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)))
+ goto check_asce_limit;
+ }
+
+ if (mm->get_unmapped_area == arch_get_unmapped_area)
+ addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
+ pgoff, flags);
+ else
+ addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
+ pgoff, flags);
+ if (addr & ~PAGE_MASK)
+ return addr;
+
+check_asce_limit:
+ if (addr + len > current->mm->context.asce_limit &&
+ addr + len <= TASK_SIZE) {
+ rc = crst_table_upgrade(mm, addr + len);
+ if (rc)
+ return (unsigned long) rc;
+ }
+ return addr;
+}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
new file mode 100644
index 000000000..379a925d9
--- /dev/null
+++ b/arch/s390/mm/init.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * S390 version
+ * Copyright IBM Corp. 1999
+ * Author(s): Hartmut Penner (hp@de.ibm.com)
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1995 Linus Torvalds
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/memory.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/initrd.h>
+#include <linux/export.h>
+#include <linux/cma.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <asm/processor.h>
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/lowcore.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/ctl_reg.h>
+#include <asm/sclp.h>
+#include <asm/set_memory.h>
+
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
+
+unsigned long empty_zero_page, zero_page_mask;
+EXPORT_SYMBOL(empty_zero_page);
+EXPORT_SYMBOL(zero_page_mask);
+
+static void __init setup_zero_pages(void)
+{
+ unsigned int order;
+ struct page *page;
+ int i;
+
+ /* Latest machines require a mapping granularity of 512KB */
+ order = 7;
+
+ /* Limit number of empty zero pages for small memory sizes */
+ while (order > 2 && (totalram_pages >> 10) < (1UL << order))
+ order--;
+
+ empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+ if (!empty_zero_page)
+ panic("Out of memory in setup_zero_pages");
+
+ page = virt_to_page((void *) empty_zero_page);
+ split_page(page, order);
+ for (i = 1 << order; i > 0; i--) {
+ mark_page_reserved(page);
+ page++;
+ }
+
+ zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
+}
+
+/*
+ * paging_init() sets up the page tables
+ */
+void __init paging_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+ unsigned long pgd_type, asce_bits;
+ psw_t psw;
+
+ init_mm.pgd = swapper_pg_dir;
+ if (VMALLOC_END > _REGION2_SIZE) {
+ asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ pgd_type = _REGION2_ENTRY_EMPTY;
+ } else {
+ asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ pgd_type = _REGION3_ENTRY_EMPTY;
+ }
+ init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
+ S390_lowcore.kernel_asce = init_mm.context.asce;
+ S390_lowcore.user_asce = S390_lowcore.kernel_asce;
+ crst_table_init((unsigned long *) init_mm.pgd, pgd_type);
+ vmem_map_init();
+
+ /* enable virtual mapping in kernel mode */
+ __ctl_load(S390_lowcore.kernel_asce, 1, 1);
+ __ctl_load(S390_lowcore.kernel_asce, 7, 7);
+ __ctl_load(S390_lowcore.kernel_asce, 13, 13);
+ psw.mask = __extract_psw();
+ psw_bits(psw).dat = 1;
+ psw_bits(psw).as = PSW_BITS_AS_HOME;
+ __load_psw_mask(psw.mask);
+
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_init();
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS);
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+ free_area_init_nodes(max_zone_pfns);
+}
+
+void mark_rodata_ro(void)
+{
+ unsigned long size = __end_ro_after_init - __start_ro_after_init;
+
+ set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT);
+ pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
+}
+
+void __init mem_init(void)
+{
+ cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
+ cpumask_set_cpu(0, mm_cpumask(&init_mm));
+
+ set_max_mapnr(max_low_pfn);
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+
+ /* Setup guest page hinting */
+ cmma_init();
+
+ /* this will put all low memory onto the freelists */
+ free_all_bootmem();
+ setup_zero_pages(); /* Setup zeroed pages. */
+
+ cmma_init_nodat();
+
+ mem_init_print_info(NULL);
+}
+
+void free_initmem(void)
+{
+ __set_memory((unsigned long)_sinittext,
+ (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
+ SET_MEMORY_RW | SET_MEMORY_NX);
+ free_initmem_default(POISON_FREE_INITMEM);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+ free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+ "initrd");
+}
+#endif
+
+unsigned long memory_block_size_bytes(void)
+{
+ /*
+	 * Make sure the memory block size is always greater than
+	 * or equal to the memory increment size.
+ */
+ return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+#ifdef CONFIG_CMA
+
+/* Prevent memory blocks which contain cma regions from going offline */
+
+struct s390_cma_mem_data {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int s390_cma_check_range(struct cma *cma, void *data)
+{
+ struct s390_cma_mem_data *mem_data;
+ unsigned long start, end;
+
+ mem_data = data;
+ start = cma_get_base(cma);
+ end = start + cma_get_size(cma);
+ if (end < mem_data->start)
+ return 0;
+ if (start >= mem_data->end)
+ return 0;
+ return -EBUSY;
+}
+
+static int s390_cma_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct s390_cma_mem_data mem_data;
+ struct memory_notify *arg;
+ int rc = 0;
+
+ arg = data;
+ mem_data.start = arg->start_pfn << PAGE_SHIFT;
+ mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT);
+ if (action == MEM_GOING_OFFLINE)
+ rc = cma_for_each_area(s390_cma_check_range, &mem_data);
+ return notifier_from_errno(rc);
+}
+
+static struct notifier_block s390_cma_mem_nb = {
+ .notifier_call = s390_cma_mem_notifier,
+};
+
+static int __init s390_cma_mem_init(void)
+{
+ return register_memory_notifier(&s390_cma_mem_nb);
+}
+device_initcall(s390_cma_mem_init);
+
+#endif /* CONFIG_CMA */
+
+int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
+ bool want_memblock)
+{
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long size_pages = PFN_DOWN(size);
+ int rc;
+
+ rc = vmem_add_mapping(start, size);
+ if (rc)
+ return rc;
+
+ rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
+ if (rc)
+ vmem_remove_mapping(start, size);
+ return rc;
+}
+
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ __remove_pages(start_pfn, nr_pages, altmap);
+ vmem_remove_mapping(start, size);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
new file mode 100644
index 000000000..7be064758
--- /dev/null
+++ b/arch/s390/mm/maccess.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Access kernel memory without faulting -- s390 specific implementation.
+ *
+ * Copyright IBM Corp. 2009, 2015
+ *
+ * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
+ *
+ */
+
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/cpu.h>
+#include <asm/ctl_reg.h>
+#include <asm/io.h>
+
+static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size)
+{
+ unsigned long aligned, offset, count;
+ char tmp[8];
+
+ aligned = (unsigned long) dst & ~7UL;
+ offset = (unsigned long) dst & 7UL;
+ size = min(8UL - offset, size);
+ count = size - 1;
+ asm volatile(
+ " bras 1,0f\n"
+ " mvc 0(1,%4),0(%5)\n"
+ "0: mvc 0(8,%3),0(%0)\n"
+ " ex %1,0(1)\n"
+ " lg %1,0(%3)\n"
+ " lra %0,0(%0)\n"
+ " sturg %1,%0\n"
+ : "+&a" (aligned), "+&a" (count), "=m" (tmp)
+ : "a" (&tmp), "a" (&tmp[offset]), "a" (src)
+ : "cc", "memory", "1");
+ return size;
+}
+
+/*
+ * s390_kernel_write - write to kernel memory bypassing DAT
+ * @dst: destination address
+ * @src: source address
+ * @size: number of bytes to copy
+ *
+ * This function writes to kernel memory bypassing DAT and possible page table
+ * write protection. It writes to the destination using the sturg instruction.
+ * Therefore we have a read-modify-write sequence: the function reads eight
+ * bytes from destination at an eight byte boundary, modifies the bytes
+ * requested and writes the result back in a loop.
+ *
+ * Note: this means that this function must not be called concurrently on
+ * several cpus with overlapping words, since this may potentially
+ * cause data corruption.
+ */
+void notrace s390_kernel_write(void *dst, const void *src, size_t size)
+{
+ long copied;
+
+ while (size) {
+ copied = s390_kernel_write_odd(dst, src, size);
+ dst += copied;
+ src += copied;
+ size -= copied;
+ }
+}
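+/*
+ * Minimal usage sketch (hypothetical caller updating a variable that lives
+ * in a write-protected kernel mapping; ro_flag is an assumed symbol):
+ *
+ *	static int ro_flag __ro_after_init;
+ *	int val = 1;
+ *
+ *	s390_kernel_write(&ro_flag, &val, sizeof(val));
+ */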
+
+static int __memcpy_real(void *dest, void *src, size_t count)
+{
+ register unsigned long _dest asm("2") = (unsigned long) dest;
+ register unsigned long _len1 asm("3") = (unsigned long) count;
+ register unsigned long _src asm("4") = (unsigned long) src;
+ register unsigned long _len2 asm("5") = (unsigned long) count;
+ int rc = -EFAULT;
+
+ asm volatile (
+ "0: mvcle %1,%2,0x0\n"
+ "1: jo 0b\n"
+ " lhi %0,0x0\n"
+ "2:\n"
+ EX_TABLE(1b,2b)
+ : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
+ "+d" (_len2), "=m" (*((long *) dest))
+ : "m" (*((long *) src))
+ : "cc", "memory");
+ return rc;
+}
+
+/*
+ * Copy memory in real mode (kernel to kernel)
+ */
+int memcpy_real(void *dest, void *src, size_t count)
+{
+ int irqs_disabled, rc;
+ unsigned long flags;
+
+ if (!count)
+ return 0;
+ flags = __arch_local_irq_stnsm(0xf8UL);
+ irqs_disabled = arch_irqs_disabled_flags(flags);
+ if (!irqs_disabled)
+ trace_hardirqs_off();
+ rc = __memcpy_real(dest, src, count);
+ if (!irqs_disabled)
+ trace_hardirqs_on();
+ __arch_local_irq_ssm(flags);
+ return rc;
+}
+
+/*
+ * Copy memory in absolute mode (kernel to kernel)
+ */
+void memcpy_absolute(void *dest, void *src, size_t count)
+{
+ unsigned long cr0, flags, prefix;
+
+ flags = arch_local_irq_save();
+ __ctl_store(cr0, 0, 0);
+ __ctl_clear_bit(0, 28); /* disable lowcore protection */
+ prefix = store_prefix();
+ if (prefix) {
+ local_mcck_disable();
+ set_prefix(0);
+ memcpy(dest, src, count);
+ set_prefix(prefix);
+ local_mcck_enable();
+ } else {
+ memcpy(dest, src, count);
+ }
+ __ctl_load(cr0, 0, 0);
+ arch_local_irq_restore(flags);
+}
+
+/*
+ * Copy memory from kernel (real) to user (virtual)
+ */
+int copy_to_user_real(void __user *dest, void *src, unsigned long count)
+{
+ int offs = 0, size, rc;
+ char *buf;
+
+ buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ rc = -EFAULT;
+ while (offs < count) {
+ size = min(PAGE_SIZE, count - offs);
+ if (memcpy_real(buf, src + offs, size))
+ goto out;
+ if (copy_to_user(dest + offs, buf, size))
+ goto out;
+ offs += size;
+ }
+ rc = 0;
+out:
+ free_page((unsigned long) buf);
+ return rc;
+}
+
+/*
+ * Check if physical address is within prefix or zero page
+ */
+static int is_swapped(unsigned long addr)
+{
+ unsigned long lc;
+ int cpu;
+
+ if (addr < sizeof(struct lowcore))
+ return 1;
+ for_each_online_cpu(cpu) {
+ lc = (unsigned long) lowcore_ptr[cpu];
+ if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Convert a physical pointer for /dev/mem access
+ *
+ * For swapped prefix pages a new buffer is returned that contains a copy of
+ * the absolute memory. The buffer size is at most one page.
+ */
+void *xlate_dev_mem_ptr(phys_addr_t addr)
+{
+ void *bounce = (void *) addr;
+ unsigned long size;
+
+ get_online_cpus();
+ preempt_disable();
+ if (is_swapped(addr)) {
+ size = PAGE_SIZE - (addr & ~PAGE_MASK);
+ bounce = (void *) __get_free_page(GFP_ATOMIC);
+ if (bounce)
+ memcpy_absolute(bounce, (void *) addr, size);
+ }
+ preempt_enable();
+ put_online_cpus();
+ return bounce;
+}
+
+/*
+ * Free converted buffer for /dev/mem access (if necessary)
+ */
+void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf)
+{
+ if ((void *) addr != buf)
+ free_page((unsigned long) buf);
+}
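+/*
+ * Minimal usage sketch (hypothetical /dev/mem style read path; addr, ubuf
+ * and count are assumed, with count not crossing the translated page):
+ *
+ *	void *ptr = xlate_dev_mem_ptr(addr);
+ *	int rc = 0;
+ *
+ *	if (ptr) {
+ *		if (copy_to_user(ubuf, ptr, count))
+ *			rc = -EFAULT;
+ *		unxlate_dev_mem_ptr(addr, ptr);
+ *	}
+ */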
diff --git a/arch/s390/mm/mem_detect.c b/arch/s390/mm/mem_detect.c
new file mode 100644
index 000000000..21f6c82c8
--- /dev/null
+++ b/arch/s390/mm/mem_detect.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2008, 2009
+ *
+ * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <asm/ipl.h>
+#include <asm/sclp.h>
+#include <asm/setup.h>
+
+#define CHUNK_READ_WRITE 0
+#define CHUNK_READ_ONLY 1
+
+static inline void memblock_physmem_add(phys_addr_t start, phys_addr_t size)
+{
+ memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n",
+ start, start + size - 1);
+ memblock_add_range(&memblock.memory, start, size, 0, 0);
+ memblock_add_range(&memblock.physmem, start, size, 0, 0);
+}
+
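+/*
+ * Memory is probed in increments of the storage increment size (rzm)
+ * using tprot(): consecutive increments that report the same access type
+ * are merged into one chunk, and read-write/read-only chunks are added
+ * to the memblock memory and physmem lists.
+ */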
+void __init detect_memory_memblock(void)
+{
+ unsigned long memsize, rnmax, rzm, addr, size;
+ int type;
+
+ rzm = sclp.rzm;
+ rnmax = sclp.rnmax;
+ memsize = rzm * rnmax;
+ if (!rzm)
+ rzm = 1UL << 17;
+ max_physmem_end = memsize;
+ addr = 0;
+ /* keep memblock lists close to the kernel */
+ memblock_set_bottom_up(true);
+ do {
+ size = 0;
+ /* assume lowcore is writable */
+ type = addr ? tprot(addr) : CHUNK_READ_WRITE;
+ do {
+ size += rzm;
+ if (max_physmem_end && addr + size >= max_physmem_end)
+ break;
+ } while (type == tprot(addr + size));
+ if (type == CHUNK_READ_WRITE || type == CHUNK_READ_ONLY) {
+ if (max_physmem_end && (addr + size > max_physmem_end))
+ size = max_physmem_end - addr;
+ memblock_physmem_add(addr, size);
+ }
+ addr += size;
+ } while (addr < max_physmem_end);
+ memblock_set_bottom_up(false);
+ if (!max_physmem_end)
+ max_physmem_end = memblock_end_of_DRAM();
+ memblock_dump_all();
+}
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
new file mode 100644
index 000000000..0a7627cdb
--- /dev/null
+++ b/arch/s390/mm/mmap.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * Started by Ingo Molnar <mingo@elte.hu>
+ */
+
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
+#include <linux/random.h>
+#include <linux/compat.h>
+#include <linux/security.h>
+#include <asm/pgalloc.h>
+#include <asm/elf.h>
+
+static unsigned long stack_maxrandom_size(void)
+{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+ if (current->personality & ADDR_NO_RANDOMIZE)
+ return 0;
+ return STACK_RND_MASK << PAGE_SHIFT;
+}
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave at least a ~32 MB hole.
+ */
+#define MIN_GAP (32*1024*1024)
+#define MAX_GAP (STACK_TOP/6*5)
+
+static inline int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ return 1;
+ return sysctl_legacy_va_layout;
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
+}
+
+static unsigned long mmap_base_legacy(unsigned long rnd)
+{
+ return TASK_UNMAPPED_BASE + rnd;
+}
+
+static inline unsigned long mmap_base(unsigned long rnd,
+ struct rlimit *rlim_stack)
+{
+ unsigned long gap = rlim_stack->rlim_cur;
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+ gap &= PAGE_MASK;
+ return STACK_TOP - stack_maxrandom_size() - rnd - gap;
+}
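+
+/*
+ * Example: with an 8 MB stack rlimit (a common default) the gap is
+ * clamped to MIN_GAP, so the mmap base ends up 32 MB plus the stack and
+ * mmap randomization below STACK_TOP.
+ */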
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct vm_unmapped_area_info info;
+ int rc;
+
+ if (len > TASK_SIZE - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ goto check_asce_limit;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)))
+ goto check_asce_limit;
+ }
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = mm->mmap_base;
+ info.high_limit = TASK_SIZE;
+ if (filp || (flags & MAP_SHARED))
+ info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
+ else
+ info.align_mask = 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ addr = vm_unmapped_area(&info);
+ if (addr & ~PAGE_MASK)
+ return addr;
+
+check_asce_limit:
+ if (addr + len > current->mm->context.asce_limit &&
+ addr + len <= TASK_SIZE) {
+ rc = crst_table_upgrade(mm, addr + len);
+ if (rc)
+ return (unsigned long) rc;
+ }
+
+ return addr;
+}
+
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
+ int rc;
+
+ /* requested length too big for entire address space */
+ if (len > TASK_SIZE - mmap_min_addr)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ goto check_asce_limit;
+
+ /* requesting a specific address */
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)))
+ goto check_asce_limit;
+ }
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = mm->mmap_base;
+ if (filp || (flags & MAP_SHARED))
+ info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
+ else
+ info.align_mask = 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ if (addr & ~PAGE_MASK)
+ return addr;
+ }
+
+check_asce_limit:
+ if (addr + len > current->mm->context.asce_limit &&
+ addr + len <= TASK_SIZE) {
+ rc = crst_table_upgrade(mm, addr + len);
+ if (rc)
+ return (unsigned long) rc;
+ }
+
+ return addr;
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ /*
+ * Fall back to the standard layout if the personality
+ * bit is set, or if the expected stack growth is unlimited:
+ */
+ if (mmap_is_legacy(rlim_stack)) {
+ mm->mmap_base = mmap_base_legacy(random_factor);
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ }
+}
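+
+/*
+ * Summary of the two layouts: the legacy layout places the mmap area at
+ * TASK_UNMAPPED_BASE (plus randomization) and grows it upwards, while the
+ * default topdown layout places it just below the stack gap computed by
+ * mmap_base() and grows it downwards.
+ */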
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
new file mode 100644
index 000000000..dc3cede7f
--- /dev/null
+++ b/arch/s390/mm/page-states.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2008
+ *
+ * Guest page hinting for unused pages.
+ *
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <asm/facility.h>
+#include <asm/page-states.h>
+
+static int cmma_flag = 1;
+
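+/* Parse the "cmma=" kernel parameter: "on"/"yes" (default) or "off"/"no". */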
+static int __init cmma(char *str)
+{
+ char *parm;
+
+ parm = strstrip(str);
+ if (strcmp(parm, "yes") == 0 || strcmp(parm, "on") == 0) {
+ cmma_flag = 1;
+ return 1;
+ }
+ cmma_flag = 0;
+ if (strcmp(parm, "no") == 0 || strcmp(parm, "off") == 0)
+ return 1;
+ return 0;
+}
+__setup("cmma=", cmma);
+
+static inline int cmma_test_essa(void)
+{
+ register unsigned long tmp asm("0") = 0;
+ register int rc asm("1");
+
+ /* test ESSA_GET_STATE */
+ asm volatile(
+ " .insn rrf,0xb9ab0000,%1,%1,%2,0\n"
+ "0: la %0,0\n"
+ "1:\n"
+ EX_TABLE(0b,1b)
+ : "=&d" (rc), "+&d" (tmp)
+ : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP));
+ return rc;
+}
+
+void __init cmma_init(void)
+{
+ if (!cmma_flag)
+ return;
+ if (cmma_test_essa()) {
+ cmma_flag = 0;
+ return;
+ }
+ if (test_facility(147))
+ cmma_flag = 2;
+}
+
+static inline unsigned char get_page_state(struct page *page)
+{
+ unsigned char state;
+
+ asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0"
+ : "=&d" (state)
+ : "a" (page_to_phys(page)),
+ "i" (ESSA_GET_STATE));
+ return state & 0x3f;
+}
+
+static inline void set_page_unused(struct page *page, int order)
+{
+ int i, rc;
+
+ for (i = 0; i < (1 << order); i++)
+ asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
+ : "=&d" (rc)
+ : "a" (page_to_phys(page + i)),
+ "i" (ESSA_SET_UNUSED));
+}
+
+static inline void set_page_stable_dat(struct page *page, int order)
+{
+ int i, rc;
+
+ for (i = 0; i < (1 << order); i++)
+ asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
+ : "=&d" (rc)
+ : "a" (page_to_phys(page + i)),
+ "i" (ESSA_SET_STABLE));
+}
+
+static inline void set_page_stable_nodat(struct page *page, int order)
+{
+ int i, rc;
+
+ for (i = 0; i < (1 << order); i++)
+ asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
+ : "=&d" (rc)
+ : "a" (page_to_phys(page + i)),
+ "i" (ESSA_SET_STABLE_NODAT));
+}
+
+static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ struct page *page;
+ pmd_t *pmd;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(*pmd) || pmd_large(*pmd))
+ continue;
+ page = virt_to_page(pmd_val(*pmd));
+ set_bit(PG_arch_1, &page->flags);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ struct page *page;
+ pud_t *pud;
+ int i;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none(*pud) || pud_large(*pud))
+ continue;
+ if (!pud_folded(*pud)) {
+ page = virt_to_page(pud_val(*pud));
+ for (i = 0; i < 3; i++)
+ set_bit(PG_arch_1, &page[i].flags);
+ }
+ mark_kernel_pmd(pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ struct page *page;
+ p4d_t *p4d;
+ int i;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(*p4d))
+ continue;
+ if (!p4d_folded(*p4d)) {
+ page = virt_to_page(p4d_val(*p4d));
+ for (i = 0; i < 3; i++)
+ set_bit(PG_arch_1, &page[i].flags);
+ }
+ mark_kernel_pud(p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void mark_kernel_pgd(void)
+{
+ unsigned long addr, next;
+ struct page *page;
+ pgd_t *pgd;
+ int i;
+
+ addr = 0;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, MODULES_END);
+ if (pgd_none(*pgd))
+ continue;
+ if (!pgd_folded(*pgd)) {
+ page = virt_to_page(pgd_val(*pgd));
+ for (i = 0; i < 3; i++)
+ set_bit(PG_arch_1, &page[i].flags);
+ }
+ mark_kernel_p4d(pgd, addr, next);
+ } while (pgd++, addr = next, addr != MODULES_END);
+}
+
+void __init cmma_init_nodat(void)
+{
+ struct memblock_region *reg;
+ struct page *page;
+ unsigned long start, end, ix;
+
+ if (cmma_flag < 2)
+ return;
+ /* Mark pages used in kernel page tables */
+ mark_kernel_pgd();
+
+ /* Set all kernel pages not used for page tables to stable/no-dat */
+ for_each_memblock(memory, reg) {
+ start = memblock_region_memory_base_pfn(reg);
+ end = memblock_region_memory_end_pfn(reg);
+ page = pfn_to_page(start);
+ for (ix = start; ix < end; ix++, page++) {
+ if (__test_and_clear_bit(PG_arch_1, &page->flags))
+ continue; /* skip page table pages */
+ if (!list_empty(&page->lru))
+ continue; /* skip free pages */
+ set_page_stable_nodat(page, 0);
+ }
+ }
+}
+
+void arch_free_page(struct page *page, int order)
+{
+ if (!cmma_flag)
+ return;
+ set_page_unused(page, order);
+}
+
+void arch_alloc_page(struct page *page, int order)
+{
+ if (!cmma_flag)
+ return;
+ if (cmma_flag < 2)
+ set_page_stable_dat(page, order);
+ else
+ set_page_stable_nodat(page, order);
+}
+
+void arch_set_page_dat(struct page *page, int order)
+{
+ if (!cmma_flag)
+ return;
+ set_page_stable_dat(page, order);
+}
+
+void arch_set_page_nodat(struct page *page, int order)
+{
+ if (cmma_flag < 2)
+ return;
+ set_page_stable_nodat(page, order);
+}
+
+int arch_test_page_nodat(struct page *page)
+{
+ unsigned char state;
+
+ if (cmma_flag < 2)
+ return 0;
+ state = get_page_state(page);
+ return !!(state & 0x20);
+}
+
+void arch_set_page_states(int make_stable)
+{
+ unsigned long flags, order, t;
+ struct list_head *l;
+ struct page *page;
+ struct zone *zone;
+
+ if (!cmma_flag)
+ return;
+ if (make_stable)
+ drain_local_pages(NULL);
+ for_each_populated_zone(zone) {
+ spin_lock_irqsave(&zone->lock, flags);
+ for_each_migratetype_order(order, t) {
+ list_for_each(l, &zone->free_area[order].free_list[t]) {
+ page = list_entry(l, struct page, lru);
+ if (make_stable)
+ set_page_stable_dat(page, order);
+ else
+ set_page_unused(page, order);
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
new file mode 100644
index 000000000..f8c6faab4
--- /dev/null
+++ b/arch/s390/mm/pageattr.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2011
+ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com>
+ */
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <asm/cacheflush.h>
+#include <asm/facility.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/page.h>
+#include <asm/set_memory.h>
+
+static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
+{
+ asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0"
+ : [addr] "+a" (addr) : [skey] "d" (skey));
+ return addr;
+}
+
+void __storage_key_init_range(unsigned long start, unsigned long end)
+{
+ unsigned long boundary, size;
+
+ while (start < end) {
+ if (MACHINE_HAS_EDAT1) {
+ /* set storage keys for a 1MB frame */
+ size = 1UL << 20;
+ boundary = (start + size) & ~(size - 1);
+ if (boundary <= end) {
+ do {
+ start = sske_frame(start, PAGE_DEFAULT_KEY);
+ } while (start < boundary);
+ continue;
+ }
+ }
+ page_set_storage_key(start, PAGE_DEFAULT_KEY, 1);
+ start += PAGE_SIZE;
+ }
+}
+
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+
+void arch_report_meminfo(struct seq_file *m)
+{
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2);
+ seq_printf(m, "DirectMap1M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10);
+ seq_printf(m, "DirectMap2G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21);
+}
+#endif /* CONFIG_PROC_FS */
+
+static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
+ unsigned long dtt)
+{
+ unsigned long table, mask;
+
+ mask = 0;
+ if (MACHINE_HAS_EDAT2) {
+ switch (dtt) {
+ case CRDTE_DTT_REGION3:
+ mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1);
+ break;
+ case CRDTE_DTT_SEGMENT:
+ mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
+ break;
+ case CRDTE_DTT_PAGE:
+ mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
+ break;
+ }
+ table = (unsigned long)old & mask;
+ crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
+ } else if (MACHINE_HAS_IDTE) {
+ cspg(old, *old, new);
+ } else {
+ csp((unsigned int *)old + 1, *old, new);
+ }
+}
+
+static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ unsigned long flags)
+{
+ pte_t *ptep, new;
+
+ ptep = pte_offset(pmdp, addr);
+ do {
+ new = *ptep;
+ if (pte_none(new))
+ return -EINVAL;
+ if (flags & SET_MEMORY_RO)
+ new = pte_wrprotect(new);
+ else if (flags & SET_MEMORY_RW)
+ new = pte_mkwrite(pte_mkdirty(new));
+ if (flags & SET_MEMORY_NX)
+ pte_val(new) |= _PAGE_NOEXEC;
+ else if (flags & SET_MEMORY_X)
+ pte_val(new) &= ~_PAGE_NOEXEC;
+ pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
+ ptep++;
+ addr += PAGE_SIZE;
+ cond_resched();
+ } while (addr < end);
+ return 0;
+}
+
+static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
+{
+ unsigned long pte_addr, prot;
+ pte_t *pt_dir, *ptep;
+ pmd_t new;
+ int i, ro, nx;
+
+ pt_dir = vmem_pte_alloc();
+ if (!pt_dir)
+ return -ENOMEM;
+ pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT;
+ ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT);
+ nx = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_NOEXEC);
+ prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL);
+ if (!nx)
+ prot &= ~_PAGE_NOEXEC;
+ ptep = pt_dir;
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_val(*ptep) = pte_addr | prot;
+ pte_addr += PAGE_SIZE;
+ ptep++;
+ }
+ pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+ pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+ update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE);
+ update_page_count(PG_DIRECT_MAP_1M, -1);
+ return 0;
+}
+
+static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
+ unsigned long flags)
+{
+ pmd_t new = *pmdp;
+
+ if (flags & SET_MEMORY_RO)
+ new = pmd_wrprotect(new);
+ else if (flags & SET_MEMORY_RW)
+ new = pmd_mkwrite(pmd_mkdirty(new));
+ if (flags & SET_MEMORY_NX)
+ pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC;
+ else if (flags & SET_MEMORY_X)
+ pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC;
+ pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+}
+
+static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
+ unsigned long flags)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+ int rc = 0;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ if (pmd_none(*pmdp))
+ return -EINVAL;
+ next = pmd_addr_end(addr, end);
+ if (pmd_large(*pmdp)) {
+ if (addr & ~PMD_MASK || addr + PMD_SIZE > next) {
+ rc = split_pmd_page(pmdp, addr);
+ if (rc)
+ return rc;
+ continue;
+ }
+ modify_pmd_page(pmdp, addr, flags);
+ } else {
+ rc = walk_pte_level(pmdp, addr, next, flags);
+ if (rc)
+ return rc;
+ }
+ pmdp++;
+ addr = next;
+ cond_resched();
+ } while (addr < end);
+ return rc;
+}
+
+static int split_pud_page(pud_t *pudp, unsigned long addr)
+{
+ unsigned long pmd_addr, prot;
+ pmd_t *pm_dir, *pmdp;
+ pud_t new;
+ int i, ro, nx;
+
+ pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pm_dir)
+ return -ENOMEM;
+ pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT;
+ ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT);
+ nx = !!(pud_val(*pudp) & _REGION_ENTRY_NOEXEC);
+ prot = pgprot_val(ro ? SEGMENT_KERNEL_RO : SEGMENT_KERNEL);
+ if (!nx)
+ prot &= ~_SEGMENT_ENTRY_NOEXEC;
+ pmdp = pm_dir;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_val(*pmdp) = pmd_addr | prot;
+ pmd_addr += PMD_SIZE;
+ pmdp++;
+ }
+ pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+ pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+ update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD);
+ update_page_count(PG_DIRECT_MAP_2G, -1);
+ return 0;
+}
+
+static void modify_pud_page(pud_t *pudp, unsigned long addr,
+ unsigned long flags)
+{
+ pud_t new = *pudp;
+
+ if (flags & SET_MEMORY_RO)
+ new = pud_wrprotect(new);
+ else if (flags & SET_MEMORY_RW)
+ new = pud_mkwrite(pud_mkdirty(new));
+ if (flags & SET_MEMORY_NX)
+ pud_val(new) |= _REGION_ENTRY_NOEXEC;
+ else if (flags & SET_MEMORY_X)
+ pud_val(new) &= ~_REGION_ENTRY_NOEXEC;
+ pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+}
+
+static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
+ unsigned long flags)
+{
+ unsigned long next;
+ pud_t *pudp;
+ int rc = 0;
+
+ pudp = pud_offset(p4d, addr);
+ do {
+ if (pud_none(*pudp))
+ return -EINVAL;
+ next = pud_addr_end(addr, end);
+ if (pud_large(*pudp)) {
+ if (addr & ~PUD_MASK || addr + PUD_SIZE > next) {
+ rc = split_pud_page(pudp, addr);
+ if (rc)
+ break;
+ continue;
+ }
+ modify_pud_page(pudp, addr, flags);
+ } else {
+ rc = walk_pmd_level(pudp, addr, next, flags);
+ }
+ pudp++;
+ addr = next;
+ cond_resched();
+ } while (addr < end && !rc);
+ return rc;
+}
+
+static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end,
+ unsigned long flags)
+{
+ unsigned long next;
+ p4d_t *p4dp;
+ int rc = 0;
+
+ p4dp = p4d_offset(pgd, addr);
+ do {
+ if (p4d_none(*p4dp))
+ return -EINVAL;
+ next = p4d_addr_end(addr, end);
+ rc = walk_pud_level(p4dp, addr, next, flags);
+ p4dp++;
+ addr = next;
+ cond_resched();
+ } while (addr < end && !rc);
+ return rc;
+}
+
+static DEFINE_MUTEX(cpa_mutex);
+
+static int change_page_attr(unsigned long addr, unsigned long end,
+ unsigned long flags)
+{
+ unsigned long next;
+ int rc = -EINVAL;
+ pgd_t *pgdp;
+
+ if (addr == end)
+ return 0;
+ if (end >= MODULES_END)
+ return -EINVAL;
+ mutex_lock(&cpa_mutex);
+ pgdp = pgd_offset_k(addr);
+ do {
+ if (pgd_none(*pgdp))
+ break;
+ next = pgd_addr_end(addr, end);
+ rc = walk_p4d_level(pgdp, addr, next, flags);
+ if (rc)
+ break;
+ cond_resched();
+ } while (pgdp++, addr = next, addr < end && !rc);
+ mutex_unlock(&cpa_mutex);
+ return rc;
+}
+
+int __set_memory(unsigned long addr, int numpages, unsigned long flags)
+{
+ if (!MACHINE_HAS_NX)
+ flags &= ~(SET_MEMORY_NX | SET_MEMORY_X);
+ if (!flags)
+ return 0;
+ addr &= PAGE_MASK;
+ return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags);
+}
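+
+/*
+ * The set_memory_ro()/set_memory_rw()/set_memory_nx()/set_memory_x()
+ * helpers in asm/set_memory.h are thin wrappers that call __set_memory()
+ * with the matching SET_MEMORY_* flag.
+ */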
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static void ipte_range(pte_t *pte, unsigned long address, int nr)
+{
+ int i;
+
+ if (test_facility(13)) {
+ __ptep_ipte_range(address, nr - 1, pte, IPTE_GLOBAL);
+ return;
+ }
+ for (i = 0; i < nr; i++) {
+ __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL);
+ address += PAGE_SIZE;
+ pte++;
+ }
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ unsigned long address;
+ int nr, i, j;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ for (i = 0; i < numpages;) {
+ address = page_to_phys(page + i);
+ pgd = pgd_offset_k(address);
+ p4d = p4d_offset(pgd, address);
+ pud = pud_offset(p4d, address);
+ pmd = pmd_offset(pud, address);
+ pte = pte_offset_kernel(pmd, address);
+ nr = (unsigned long)pte >> ilog2(sizeof(long));
+ nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1));
+ nr = min(numpages - i, nr);
+ if (enable) {
+ for (j = 0; j < nr; j++) {
+ pte_val(*pte) &= ~_PAGE_INVALID;
+ address += PAGE_SIZE;
+ pte++;
+ }
+ } else {
+ ipte_range(pte, address, nr);
+ }
+ i += nr;
+ }
+}
+
+#ifdef CONFIG_HIBERNATION
+bool kernel_page_present(struct page *page)
+{
+ unsigned long addr;
+ int cc;
+
+ addr = page_to_phys(page);
+ asm volatile(
+ " lra %1,0(%1)\n"
+ " ipm %0\n"
+ " srl %0,28"
+ : "=d" (cc), "+a" (addr) : : "cc");
+ return cc == 0;
+}
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
new file mode 100644
index 000000000..3f3c13a4d
--- /dev/null
+++ b/arch/s390/mm/pgalloc.c
@@ -0,0 +1,681 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Page table allocation functions
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/gmap.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_PGSTE
+
+static int page_table_allocate_pgste_min = 0;
+static int page_table_allocate_pgste_max = 1;
+int page_table_allocate_pgste = 0;
+EXPORT_SYMBOL(page_table_allocate_pgste);
+
+static struct ctl_table page_table_sysctl[] = {
+ {
+ .procname = "allocate_pgste",
+ .data = &page_table_allocate_pgste,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO | S_IWUSR,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &page_table_allocate_pgste_min,
+ .extra2 = &page_table_allocate_pgste_max,
+ },
+ { }
+};
+
+static struct ctl_table page_table_sysctl_dir[] = {
+ {
+ .procname = "vm",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = page_table_sysctl,
+ },
+ { }
+};
+
+static int __init page_table_register_sysctl(void)
+{
+ return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
+}
+__initcall(page_table_register_sysctl);
+
+#endif /* CONFIG_PGSTE */
+
+unsigned long *crst_table_alloc(struct mm_struct *mm)
+{
+ struct page *page = alloc_pages(GFP_KERNEL, 2);
+
+ if (!page)
+ return NULL;
+ arch_set_page_dat(page, 2);
+ return (unsigned long *) page_to_phys(page);
+}
+
+void crst_table_free(struct mm_struct *mm, unsigned long *table)
+{
+ free_pages((unsigned long) table, 2);
+}
+
+static void __crst_table_upgrade(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+	/* we must change all active ASCEs to avoid the creation of new TLB entries */
+ if (current->active_mm == mm) {
+ S390_lowcore.user_asce = mm->context.asce;
+ if (current->thread.mm_segment == USER_DS) {
+ __ctl_load(S390_lowcore.user_asce, 1, 1);
+ /* Mark user-ASCE present in CR1 */
+ clear_cpu_flag(CIF_ASCE_PRIMARY);
+ }
+ if (current->thread.mm_segment == USER_DS_SACF) {
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
+ /* enable_sacf_uaccess does all or nothing */
+ WARN_ON(!test_cpu_flag(CIF_ASCE_SECONDARY));
+ }
+ }
+ __tlb_flush_local();
+}
+
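+/*
+ * Add region table levels on top of the current top-level table until the
+ * address space covers "end".  The new table is published in mm->pgd and
+ * mm->context.asce, and __crst_table_upgrade() is run on all CPUs so that
+ * those currently running this mm reload the new ASCE.
+ */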
+int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
+{
+ unsigned long *table, *pgd;
+ int rc, notify;
+
+ /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
+ VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
+ rc = 0;
+ notify = 0;
+ while (mm->context.asce_limit < end) {
+ table = crst_table_alloc(mm);
+ if (!table) {
+ rc = -ENOMEM;
+ break;
+ }
+ spin_lock_bh(&mm->page_table_lock);
+ pgd = (unsigned long *) mm->pgd;
+ if (mm->context.asce_limit == _REGION2_SIZE) {
+ crst_table_init(table, _REGION2_ENTRY_EMPTY);
+ p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
+ mm->pgd = (pgd_t *) table;
+ mm->context.asce_limit = _REGION1_SIZE;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ mm_inc_nr_puds(mm);
+ } else {
+ crst_table_init(table, _REGION1_ENTRY_EMPTY);
+ pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
+ mm->pgd = (pgd_t *) table;
+ mm->context.asce_limit = -PAGE_SIZE;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
+ }
+ notify = 1;
+ spin_unlock_bh(&mm->page_table_lock);
+ }
+ if (notify)
+ on_each_cpu(__crst_table_upgrade, mm, 0);
+ return rc;
+}
+
+void crst_table_downgrade(struct mm_struct *mm)
+{
+ pgd_t *pgd;
+
+ /* downgrade should only happen from 3 to 2 levels (compat only) */
+ VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
+
+ if (current->active_mm == mm) {
+ clear_user_asce();
+ __tlb_flush_mm(mm);
+ }
+
+ pgd = mm->pgd;
+ mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
+ mm->context.asce_limit = _REGION3_SIZE;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
+ crst_table_free(mm, (unsigned long *) pgd);
+
+ if (current->active_mm == mm)
+ set_user_asce(mm);
+}
+
+static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
+{
+ unsigned int old, new;
+
+ do {
+ old = atomic_read(v);
+ new = old ^ bits;
+ } while (atomic_cmpxchg(v, old, new) != old);
+ return new;
+}
+
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+ struct page *page;
+ u64 *table;
+
+ page = alloc_page(GFP_KERNEL);
+ if (page) {
+ table = (u64 *)page_to_phys(page);
+ memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
+ memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
+ }
+ return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+ __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
+/*
+ * page table entry allocation/free routines.
+ */
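+/*
+ * A 4K page can hold two 2K page table fragments.  The upper byte of
+ * page->_refcount tracks their state: bits 0-1 (of that byte) mark which
+ * 2K halves are allocated, bits 4-5 mark halves that are queued for
+ * (RCU) removal.  Page tables with PGSTEs always use the full 4K page.
+ */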
+unsigned long *page_table_alloc(struct mm_struct *mm)
+{
+ unsigned long *table;
+ struct page *page;
+ unsigned int mask, bit;
+
+ /* Try to get a fragment of a 4K page as a 2K page table */
+ if (!mm_alloc_pgste(mm)) {
+ table = NULL;
+ spin_lock_bh(&mm->context.lock);
+ if (!list_empty(&mm->context.pgtable_list)) {
+ page = list_first_entry(&mm->context.pgtable_list,
+ struct page, lru);
+ mask = atomic_read(&page->_refcount) >> 24;
+ mask = (mask | (mask >> 4)) & 3;
+ if (mask != 3) {
+ table = (unsigned long *) page_to_phys(page);
+ bit = mask & 1; /* =1 -> second 2K */
+ if (bit)
+ table += PTRS_PER_PTE;
+ atomic_xor_bits(&page->_refcount,
+ 1U << (bit + 24));
+ list_del(&page->lru);
+ }
+ }
+ spin_unlock_bh(&mm->context.lock);
+ if (table)
+ return table;
+ }
+ /* Allocate a fresh page */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return NULL;
+ if (!pgtable_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+ arch_set_page_dat(page, 0);
+ /* Initialize page table */
+ table = (unsigned long *) page_to_phys(page);
+ if (mm_alloc_pgste(mm)) {
+ /* Return 4K page table with PGSTEs */
+ atomic_xor_bits(&page->_refcount, 3 << 24);
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
+ } else {
+ /* Return the first 2K fragment of the page */
+ atomic_xor_bits(&page->_refcount, 1 << 24);
+ memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
+ spin_lock_bh(&mm->context.lock);
+ list_add(&page->lru, &mm->context.pgtable_list);
+ spin_unlock_bh(&mm->context.lock);
+ }
+ return table;
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table)
+{
+ struct page *page;
+ unsigned int bit, mask;
+
+ page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ if (!mm_alloc_pgste(mm)) {
+ /* Free 2K page table fragment of a 4K page */
+ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
+ spin_lock_bh(&mm->context.lock);
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+ mask >>= 24;
+ if (mask & 3)
+ list_add(&page->lru, &mm->context.pgtable_list);
+ else
+ list_del(&page->lru);
+ spin_unlock_bh(&mm->context.lock);
+ mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask >>= 24;
+ if (mask != 0)
+ return;
+ } else {
+ atomic_xor_bits(&page->_refcount, 3U << 24);
+ }
+
+ pgtable_page_dtor(page);
+ __free_page(page);
+}
+
+void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
+ unsigned long vmaddr)
+{
+ struct mm_struct *mm;
+ struct page *page;
+ unsigned int bit, mask;
+
+ mm = tlb->mm;
+ page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ if (mm_alloc_pgste(mm)) {
+ gmap_unlink(mm, table, vmaddr);
+ table = (unsigned long *) (__pa(table) | 3);
+ tlb_remove_table(tlb, table);
+ return;
+ }
+ bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
+ spin_lock_bh(&mm->context.lock);
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+ mask >>= 24;
+ if (mask & 3)
+ list_add_tail(&page->lru, &mm->context.pgtable_list);
+ else
+ list_del(&page->lru);
+ spin_unlock_bh(&mm->context.lock);
+ table = (unsigned long *) (__pa(table) | (1U << bit));
+ tlb_remove_table(tlb, table);
+}
+
+static void __tlb_remove_table(void *_table)
+{
+ unsigned int mask = (unsigned long) _table & 3;
+ void *table = (void *)((unsigned long) _table ^ mask);
+ struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+
+ switch (mask) {
+ case 0: /* pmd, pud, or p4d */
+ free_pages((unsigned long) table, 2);
+ break;
+ case 1: /* lower 2K of a 4K page table */
+ case 2: /* higher 2K of a 4K page table */
+ mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+ mask >>= 24;
+ if (mask != 0)
+ break;
+ /* fallthrough */
+ case 3: /* 4K page table with pgstes */
+ if (mask & 3)
+ atomic_xor_bits(&page->_refcount, 3 << 24);
+ pgtable_page_dtor(page);
+ __free_page(page);
+ break;
+ }
+}
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+ /* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ /*
+ * This isn't an RCU grace period and hence the page-tables cannot be
+ * assumed to be actually RCU-freed.
+ *
+ * It is however sufficient for software page-table walkers that rely
+ * on IRQ disabling. See the comment near struct mmu_table_batch.
+ */
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ __tlb_remove_table(table);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+ struct mmu_table_batch *batch;
+ int i;
+
+ batch = container_of(head, struct mmu_table_batch, rcu);
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch) {
+ call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
+ *batch = NULL;
+ }
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ tlb->mm->context.flush_mm = 1;
+ if (*batch == NULL) {
+ *batch = (struct mmu_table_batch *)
+ __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (*batch == NULL) {
+ __tlb_flush_mm_lazy(tlb->mm);
+ tlb_remove_table_one(table);
+ return;
+ }
+ (*batch)->nr = 0;
+ }
+ (*batch)->tables[(*batch)->nr++] = table;
+ if ((*batch)->nr == MAX_TABLE_BATCH)
+ tlb_flush_mmu(tlb);
+}
+
+/*
+ * Base infrastructure required to generate basic asces, region, segment,
+ * and page tables that do not make use of enhanced features like EDAT1.
+ */
+
+static struct kmem_cache *base_pgt_cache;
+
+static unsigned long base_pgt_alloc(void)
+{
+ u64 *table;
+
+ table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
+ if (table)
+ memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
+ return (unsigned long) table;
+}
+
+static void base_pgt_free(unsigned long table)
+{
+ kmem_cache_free(base_pgt_cache, (void *) table);
+}
+
+static unsigned long base_crst_alloc(unsigned long val)
+{
+ unsigned long table;
+
+ table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ if (table)
+ crst_table_init((unsigned long *)table, val);
+ return table;
+}
+
+static void base_crst_free(unsigned long table)
+{
+ free_pages(table, CRST_ALLOC_ORDER);
+}
+
+#define BASE_ADDR_END_FUNC(NAME, SIZE) \
+static inline unsigned long base_##NAME##_addr_end(unsigned long addr, \
+ unsigned long end) \
+{ \
+ unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1); \
+ \
+ return (next - 1) < (end - 1) ? next : end; \
+}
+
+BASE_ADDR_END_FUNC(page, _PAGE_SIZE)
+BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
+BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
+BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
+BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
+
+static inline unsigned long base_lra(unsigned long address)
+{
+ unsigned long real;
+
+ asm volatile(
+ " lra %0,0(%1)\n"
+ : "=d" (real) : "a" (address) : "cc");
+ return real;
+}
+
+static int base_page_walk(unsigned long origin, unsigned long addr,
+ unsigned long end, int alloc)
+{
+ unsigned long *pte, next;
+
+ if (!alloc)
+ return 0;
+ pte = (unsigned long *) origin;
+ pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
+ do {
+ next = base_page_addr_end(addr, end);
+ *pte = base_lra(addr);
+ } while (pte++, addr = next, addr < end);
+ return 0;
+}
+
+static int base_segment_walk(unsigned long origin, unsigned long addr,
+ unsigned long end, int alloc)
+{
+ unsigned long *ste, next, table;
+ int rc;
+
+ ste = (unsigned long *) origin;
+ ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
+ do {
+ next = base_segment_addr_end(addr, end);
+ if (*ste & _SEGMENT_ENTRY_INVALID) {
+ if (!alloc)
+ continue;
+ table = base_pgt_alloc();
+ if (!table)
+ return -ENOMEM;
+ *ste = table | _SEGMENT_ENTRY;
+ }
+ table = *ste & _SEGMENT_ENTRY_ORIGIN;
+ rc = base_page_walk(table, addr, next, alloc);
+ if (rc)
+ return rc;
+ if (!alloc)
+ base_pgt_free(table);
+ cond_resched();
+ } while (ste++, addr = next, addr < end);
+ return 0;
+}
+
+static int base_region3_walk(unsigned long origin, unsigned long addr,
+ unsigned long end, int alloc)
+{
+ unsigned long *rtte, next, table;
+ int rc;
+
+ rtte = (unsigned long *) origin;
+ rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
+ do {
+ next = base_region3_addr_end(addr, end);
+ if (*rtte & _REGION_ENTRY_INVALID) {
+ if (!alloc)
+ continue;
+ table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!table)
+ return -ENOMEM;
+ *rtte = table | _REGION3_ENTRY;
+ }
+ table = *rtte & _REGION_ENTRY_ORIGIN;
+ rc = base_segment_walk(table, addr, next, alloc);
+ if (rc)
+ return rc;
+ if (!alloc)
+ base_crst_free(table);
+ } while (rtte++, addr = next, addr < end);
+ return 0;
+}
+
+static int base_region2_walk(unsigned long origin, unsigned long addr,
+ unsigned long end, int alloc)
+{
+ unsigned long *rste, next, table;
+ int rc;
+
+ rste = (unsigned long *) origin;
+ rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
+ do {
+ next = base_region2_addr_end(addr, end);
+ if (*rste & _REGION_ENTRY_INVALID) {
+ if (!alloc)
+ continue;
+ table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!table)
+ return -ENOMEM;
+ *rste = table | _REGION2_ENTRY;
+ }
+ table = *rste & _REGION_ENTRY_ORIGIN;
+ rc = base_region3_walk(table, addr, next, alloc);
+ if (rc)
+ return rc;
+ if (!alloc)
+ base_crst_free(table);
+ } while (rste++, addr = next, addr < end);
+ return 0;
+}
+
+static int base_region1_walk(unsigned long origin, unsigned long addr,
+ unsigned long end, int alloc)
+{
+ unsigned long *rfte, next, table;
+ int rc;
+
+ rfte = (unsigned long *) origin;
+ rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
+ do {
+ next = base_region1_addr_end(addr, end);
+ if (*rfte & _REGION_ENTRY_INVALID) {
+ if (!alloc)
+ continue;
+ table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!table)
+ return -ENOMEM;
+ *rfte = table | _REGION1_ENTRY;
+ }
+ table = *rfte & _REGION_ENTRY_ORIGIN;
+ rc = base_region2_walk(table, addr, next, alloc);
+ if (rc)
+ return rc;
+ if (!alloc)
+ base_crst_free(table);
+ } while (rfte++, addr = next, addr < end);
+ return 0;
+}
+
+/**
+ * base_asce_free - free asce and tables returned from base_asce_alloc()
+ * @asce: asce to be freed
+ *
+ * Frees all region, segment, and page tables that were allocated with a
+ * corresponding base_asce_alloc() call.
+ */
+void base_asce_free(unsigned long asce)
+{
+ unsigned long table = asce & _ASCE_ORIGIN;
+
+ if (!asce)
+ return;
+ switch (asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_SEGMENT:
+ base_segment_walk(table, 0, _REGION3_SIZE, 0);
+ break;
+ case _ASCE_TYPE_REGION3:
+ base_region3_walk(table, 0, _REGION2_SIZE, 0);
+ break;
+ case _ASCE_TYPE_REGION2:
+ base_region2_walk(table, 0, _REGION1_SIZE, 0);
+ break;
+ case _ASCE_TYPE_REGION1:
+ base_region1_walk(table, 0, -_PAGE_SIZE, 0);
+ break;
+ }
+ base_crst_free(table);
+}
+
+static int base_pgt_cache_init(void)
+{
+ static DEFINE_MUTEX(base_pgt_cache_mutex);
+ unsigned long sz = _PAGE_TABLE_SIZE;
+
+ if (base_pgt_cache)
+ return 0;
+ mutex_lock(&base_pgt_cache_mutex);
+ if (!base_pgt_cache)
+ base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
+ mutex_unlock(&base_pgt_cache_mutex);
+ return base_pgt_cache ? 0 : -ENOMEM;
+}
+
+/**
+ * base_asce_alloc - create kernel mapping without enhanced DAT features
+ * @addr: virtual start address of kernel mapping
+ * @num_pages: number of consecutive pages
+ *
+ * Generate an asce, including all required region, segment and page tables,
+ * that can be used to access the virtual kernel mapping. The difference is
+ * that the returned asce does not make use of any enhanced DAT features like
+ * e.g. large pages. This is required for some I/O functions that pass an
+ * asce, e.g. some service call requests.
+ *
+ * Note: the returned asce may NEVER be attached to any cpu. It may only be
+ * used for I/O requests. TLB entries that might result if the asce were
+ * ever attached to a cpu will not be cleared.
+ */
+unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
+{
+ unsigned long asce, table, end;
+ int rc;
+
+ if (base_pgt_cache_init())
+ return 0;
+ end = addr + num_pages * PAGE_SIZE;
+ if (end <= _REGION3_SIZE) {
+ table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!table)
+ return 0;
+ rc = base_segment_walk(table, addr, end, 1);
+ asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+ } else if (end <= _REGION2_SIZE) {
+ table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!table)
+ return 0;
+ rc = base_region3_walk(table, addr, end, 1);
+ asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ } else if (end <= _REGION1_SIZE) {
+ table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!table)
+ return 0;
+ rc = base_region2_walk(table, addr, end, 1);
+ asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ } else {
+ table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
+ if (!table)
+ return 0;
+ rc = base_region1_walk(table, addr, end, 1);
+ asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+ }
+ if (rc) {
+ base_asce_free(asce);
+ asce = 0;
+ }
+ return asce;
+}
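+
+/*
+ * Usage sketch ("buf" and "nr_pages" are placeholders, do_io_request() is
+ * hypothetical): a caller pairs base_asce_alloc() with base_asce_free():
+ *
+ *	asce = base_asce_alloc((unsigned long) buf, nr_pages);
+ *	if (!asce)
+ *		return -ENOMEM;
+ *	rc = do_io_request(asce);
+ *	base_asce_free(asce);
+ */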
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
new file mode 100644
index 000000000..9f3903089
--- /dev/null
+++ b/arch/s390/mm/pgtable.c
@@ -0,0 +1,1126 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2007, 2011
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/sysctl.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/page-states.h>
+
+static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int nodat)
+{
+ unsigned long opt, asce;
+
+ if (MACHINE_HAS_TLB_GUEST) {
+ opt = 0;
+ asce = READ_ONCE(mm->context.gmap_asce);
+ if (asce == 0UL || nodat)
+ opt |= IPTE_NODAT;
+ if (asce != -1UL) {
+ asce = asce ? : mm->context.asce;
+ opt |= IPTE_GUEST_ASCE;
+ }
+ __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
+ } else {
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
+ }
+}
+
+static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int nodat)
+{
+ unsigned long opt, asce;
+
+ if (MACHINE_HAS_TLB_GUEST) {
+ opt = 0;
+ asce = READ_ONCE(mm->context.gmap_asce);
+ if (asce == 0UL || nodat)
+ opt |= IPTE_NODAT;
+ if (asce != -1UL) {
+ asce = asce ? : mm->context.asce;
+ opt |= IPTE_GUEST_ASCE;
+ }
+ __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
+ } else {
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ }
+}
+
+static inline pte_t ptep_flush_direct(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int nodat)
+{
+ pte_t old;
+
+ old = *ptep;
+ if (unlikely(pte_val(old) & _PAGE_INVALID))
+ return old;
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ ptep_ipte_local(mm, addr, ptep, nodat);
+ else
+ ptep_ipte_global(mm, addr, ptep, nodat);
+ atomic_dec(&mm->context.flush_count);
+ return old;
+}
+
+static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int nodat)
+{
+ pte_t old;
+
+ old = *ptep;
+ if (unlikely(pte_val(old) & _PAGE_INVALID))
+ return old;
+ atomic_inc(&mm->context.flush_count);
+ if (cpumask_equal(&mm->context.cpu_attach_mask,
+ cpumask_of(smp_processor_id()))) {
+ pte_val(*ptep) |= _PAGE_INVALID;
+ mm->context.flush_mm = 1;
+ } else
+ ptep_ipte_global(mm, addr, ptep, nodat);
+ atomic_dec(&mm->context.flush_count);
+ return old;
+}
+
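+/*
+ * The PGSTE for a pte lives in the second half of the 4K page table page,
+ * at ptep[PTRS_PER_PTE].  The PCL bit in the PGSTE serves as a per-entry
+ * lock: pgste_get_lock() spins with compare-and-swap until it wins the
+ * bit, pgste_set_unlock() stores the new value with the bit cleared.
+ */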
+static inline pgste_t pgste_get_lock(pte_t *ptep)
+{
+ unsigned long new = 0;
+#ifdef CONFIG_PGSTE
+ unsigned long old;
+
+ asm(
+ " lg %0,%2\n"
+ "0: lgr %1,%0\n"
+ " nihh %0,0xff7f\n" /* clear PCL bit in old */
+ " oihh %1,0x0080\n" /* set PCL bit in new */
+ " csg %0,%1,%2\n"
+ " jl 0b\n"
+ : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
+ : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
+#endif
+ return __pgste(new);
+}
+
+static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
+{
+#ifdef CONFIG_PGSTE
+ asm(
+ " nihh %1,0xff7f\n" /* clear PCL bit */
+ " stg %1,%0\n"
+ : "=Q" (ptep[PTRS_PER_PTE])
+ : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
+ : "cc", "memory");
+#endif
+}
+
+static inline pgste_t pgste_get(pte_t *ptep)
+{
+ unsigned long pgste = 0;
+#ifdef CONFIG_PGSTE
+ pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
+#endif
+ return __pgste(pgste);
+}
+
+static inline void pgste_set(pte_t *ptep, pgste_t pgste)
+{
+#ifdef CONFIG_PGSTE
+ *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
+#endif
+}
+
+static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
+ struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+ unsigned long address, bits, skey;
+
+ if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
+ return pgste;
+ address = pte_val(pte) & PAGE_MASK;
+ skey = (unsigned long) page_get_storage_key(address);
+ bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
+ /* Transfer page changed & referenced bit to guest bits in pgste */
+ pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */
+ /* Copy page access key and fetch protection bit to pgste */
+ pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+#endif
+ return pgste;
+}
+
+static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
+ struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+ unsigned long address;
+ unsigned long nkey;
+
+ if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
+ return;
+ VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
+ address = pte_val(entry) & PAGE_MASK;
+ /*
+ * Set page access key and fetch protection bit from pgste.
+	 * The guest C/R information is still in the PGSTE; set the real
+	 * key's C/R bits to 0.
+ */
+ nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+ nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
+ page_set_storage_key(address, nkey, 0);
+#endif
+}
+
+static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
+{
+#ifdef CONFIG_PGSTE
+ if ((pte_val(entry) & _PAGE_PRESENT) &&
+ (pte_val(entry) & _PAGE_WRITE) &&
+ !(pte_val(entry) & _PAGE_INVALID)) {
+ if (!MACHINE_HAS_ESOP) {
+ /*
+ * Without enhanced suppression-on-protection force
+ * the dirty bit on for all writable ptes.
+ */
+ pte_val(entry) |= _PAGE_DIRTY;
+ pte_val(entry) &= ~_PAGE_PROTECT;
+ }
+ if (!(pte_val(entry) & _PAGE_PROTECT))
+ /* This pte allows write access, set user-dirty */
+ pgste_val(pgste) |= PGSTE_UC_BIT;
+ }
+#endif
+ *ptep = entry;
+ return pgste;
+}
+
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pgste_t pgste)
+{
+#ifdef CONFIG_PGSTE
+ unsigned long bits;
+
+ bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+ if (bits) {
+ pgste_val(pgste) ^= bits;
+ ptep_notify(mm, addr, ptep, bits);
+ }
+#endif
+ return pgste;
+}
+
+static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ pgste_t pgste = __pgste(0);
+
+ if (mm_has_pgste(mm)) {
+ pgste = pgste_get_lock(ptep);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
+ }
+ return pgste;
+}
+
+static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ pgste_t pgste, pte_t old, pte_t new)
+{
+ if (mm_has_pgste(mm)) {
+ if (pte_val(old) & _PAGE_INVALID)
+ pgste_set_key(ptep, pgste, new, mm);
+ if (pte_val(new) & _PAGE_INVALID) {
+ pgste = pgste_update_all(old, pgste, mm);
+ if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
+ _PGSTE_GPS_USAGE_UNUSED)
+ pte_val(old) |= _PAGE_UNUSED;
+ }
+ pgste = pgste_set_pte(ptep, pgste, new);
+ pgste_set_unlock(ptep, pgste);
+ } else {
+ *ptep = new;
+ }
+ return old;
+}
+
+pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t new)
+{
+ pgste_t pgste;
+ pte_t old;
+ int nodat;
+
+ preempt_disable();
+ pgste = ptep_xchg_start(mm, addr, ptep);
+ nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
+ old = ptep_flush_direct(mm, addr, ptep, nodat);
+ old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+ preempt_enable();
+ return old;
+}
+EXPORT_SYMBOL(ptep_xchg_direct);
+
+pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t new)
+{
+ pgste_t pgste;
+ pte_t old;
+ int nodat;
+
+ preempt_disable();
+ pgste = ptep_xchg_start(mm, addr, ptep);
+ nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
+ old = ptep_flush_lazy(mm, addr, ptep, nodat);
+ old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+ preempt_enable();
+ return old;
+}
+EXPORT_SYMBOL(ptep_xchg_lazy);
+
+pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pgste_t pgste;
+ pte_t old;
+ int nodat;
+
+ preempt_disable();
+ pgste = ptep_xchg_start(mm, addr, ptep);
+ nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
+ old = ptep_flush_lazy(mm, addr, ptep, nodat);
+ if (mm_has_pgste(mm)) {
+ pgste = pgste_update_all(old, pgste, mm);
+ pgste_set(ptep, pgste);
+ }
+ return old;
+}
+EXPORT_SYMBOL(ptep_modify_prot_start);
+
+void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ pgste_t pgste;
+
+ if (!MACHINE_HAS_NX)
+ pte_val(pte) &= ~_PAGE_NOEXEC;
+ if (mm_has_pgste(mm)) {
+ pgste = pgste_get(ptep);
+ pgste_set_key(ptep, pgste, pte, mm);
+ pgste = pgste_set_pte(ptep, pgste, pte);
+ pgste_set_unlock(ptep, pgste);
+ } else {
+ *ptep = pte;
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(ptep_modify_prot_commit);
+
+static inline void pmdp_idte_local(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ if (MACHINE_HAS_TLB_GUEST)
+ __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
+ mm->context.asce, IDTE_LOCAL);
+ else
+ __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_idte_local(mm, addr);
+}
+
+static inline void pmdp_idte_global(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ if (MACHINE_HAS_TLB_GUEST) {
+ __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
+ mm->context.asce, IDTE_GLOBAL);
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_idte_global(mm, addr);
+ } else if (MACHINE_HAS_IDTE) {
+ __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_idte_global(mm, addr);
+ } else {
+ __pmdp_csp(pmdp);
+ if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
+ gmap_pmdp_csp(mm, addr);
+ }
+}
+
+static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old;
+
+ old = *pmdp;
+ if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
+ return old;
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ pmdp_idte_local(mm, addr, pmdp);
+ else
+ pmdp_idte_global(mm, addr, pmdp);
+ atomic_dec(&mm->context.flush_count);
+ return old;
+}
+
+static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old;
+
+ old = *pmdp;
+ if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
+ return old;
+ atomic_inc(&mm->context.flush_count);
+ if (cpumask_equal(&mm->context.cpu_attach_mask,
+ cpumask_of(smp_processor_id()))) {
+ pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
+ mm->context.flush_mm = 1;
+ if (mm_has_pgste(mm))
+ gmap_pmdp_invalidate(mm, addr);
+ } else {
+ pmdp_idte_global(mm, addr, pmdp);
+ }
+ atomic_dec(&mm->context.flush_count);
+ return old;
+}
+
+#ifdef CONFIG_PGSTE
+static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(mm, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return NULL;
+ pmd = pmd_alloc(mm, pud, addr);
+ return pmd;
+}
+#endif
+
+pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t new)
+{
+ pmd_t old;
+
+ preempt_disable();
+ old = pmdp_flush_direct(mm, addr, pmdp);
+ *pmdp = new;
+ preempt_enable();
+ return old;
+}
+EXPORT_SYMBOL(pmdp_xchg_direct);
+
+pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t new)
+{
+ pmd_t old;
+
+ preempt_disable();
+ old = pmdp_flush_lazy(mm, addr, pmdp);
+ *pmdp = new;
+ preempt_enable();
+ return old;
+}
+EXPORT_SYMBOL(pmdp_xchg_lazy);
+
+static inline void pudp_idte_local(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ if (MACHINE_HAS_TLB_GUEST)
+ __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
+ mm->context.asce, IDTE_LOCAL);
+ else
+ __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
+}
+
+static inline void pudp_idte_global(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ if (MACHINE_HAS_TLB_GUEST)
+ __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
+ mm->context.asce, IDTE_GLOBAL);
+ else if (MACHINE_HAS_IDTE)
+ __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
+ else
+ /*
+ * Invalid bit position is the same for pmd and pud, so we can
+		 * re-use __pmdp_csp() here
+ */
+ __pmdp_csp((pmd_t *) pudp);
+}
+
+static inline pud_t pudp_flush_direct(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old;
+
+ old = *pudp;
+ if (pud_val(old) & _REGION_ENTRY_INVALID)
+ return old;
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ pudp_idte_local(mm, addr, pudp);
+ else
+ pudp_idte_global(mm, addr, pudp);
+ atomic_dec(&mm->context.flush_count);
+ return old;
+}
+
+pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t new)
+{
+ pud_t old;
+
+ preempt_disable();
+ old = pudp_flush_direct(mm, addr, pudp);
+ *pudp = new;
+ preempt_enable();
+ return old;
+}
+EXPORT_SYMBOL(pudp_xchg_direct);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ struct list_head *lh = (struct list_head *) pgtable;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ INIT_LIST_HEAD(lh);
+ else
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+ struct list_head *lh;
+ pgtable_t pgtable;
+ pte_t *ptep;
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ pgtable = pmd_huge_pte(mm, pmdp);
+ lh = (struct list_head *) pgtable;
+ if (list_empty(lh))
+ pmd_huge_pte(mm, pmdp) = NULL;
+ else {
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+ list_del(lh);
+ }
+ ptep = (pte_t *) pgtable;
+ pte_val(*ptep) = _PAGE_INVALID;
+ ptep++;
+ pte_val(*ptep) = _PAGE_INVALID;
+ return pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_PGSTE
+void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t entry)
+{
+ pgste_t pgste;
+
+ /* the mm_has_pgste() check is done in set_pte_at() */
+ preempt_disable();
+ pgste = pgste_get_lock(ptep);
+ pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
+ pgste_set_key(ptep, pgste, entry, mm);
+ pgste = pgste_set_pte(ptep, pgste, entry);
+ pgste_set_unlock(ptep, pgste);
+ preempt_enable();
+}
+
+void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ pgste_t pgste;
+
+ preempt_disable();
+ pgste = pgste_get_lock(ptep);
+ pgste_val(pgste) |= PGSTE_IN_BIT;
+ pgste_set_unlock(ptep, pgste);
+ preempt_enable();
+}
+
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int prot, unsigned long bit)
+{
+ pte_t entry;
+ pgste_t pgste;
+ int pte_i, pte_p, nodat;
+
+ pgste = pgste_get_lock(ptep);
+ entry = *ptep;
+ /* Check pte entry after all locks have been acquired */
+ pte_i = pte_val(entry) & _PAGE_INVALID;
+ pte_p = pte_val(entry) & _PAGE_PROTECT;
+ if ((pte_i && (prot != PROT_NONE)) ||
+ (pte_p && (prot & PROT_WRITE))) {
+ pgste_set_unlock(ptep, pgste);
+ return -EAGAIN;
+ }
+ /* Change access rights and set pgste bit */
+ nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
+ if (prot == PROT_NONE && !pte_i) {
+ ptep_flush_direct(mm, addr, ptep, nodat);
+ pgste = pgste_update_all(entry, pgste, mm);
+ pte_val(entry) |= _PAGE_INVALID;
+ }
+ if (prot == PROT_READ && !pte_p) {
+ ptep_flush_direct(mm, addr, ptep, nodat);
+ pte_val(entry) &= ~_PAGE_INVALID;
+ pte_val(entry) |= _PAGE_PROTECT;
+ }
+ pgste_val(pgste) |= bit;
+ pgste = pgste_set_pte(ptep, pgste, entry);
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+ pgste_t spgste, tpgste;
+ pte_t spte, tpte;
+ int rc = -EAGAIN;
+
+ if (!(pte_val(*tptep) & _PAGE_INVALID))
+ return 0; /* already shadowed */
+ spgste = pgste_get_lock(sptep);
+ spte = *sptep;
+ if (!(pte_val(spte) & _PAGE_INVALID) &&
+ !((pte_val(spte) & _PAGE_PROTECT) &&
+ !(pte_val(pte) & _PAGE_PROTECT))) {
+ pgste_val(spgste) |= PGSTE_VSIE_BIT;
+ tpgste = pgste_get_lock(tptep);
+ pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT);
+ /* don't touch the storage key - it belongs to parent pgste */
+ tpgste = pgste_set_pte(tptep, tpgste, tpte);
+ pgste_set_unlock(tptep, tpgste);
+ rc = 1;
+ }
+ pgste_set_unlock(sptep, spgste);
+ return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+ pgste_t pgste;
+ int nodat;
+
+ pgste = pgste_get_lock(ptep);
+ /* notifier is called by the caller */
+ nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
+ ptep_flush_direct(mm, saddr, ptep, nodat);
+ /* don't touch the storage key - it belongs to parent pgste */
+ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+ pgste_set_unlock(ptep, pgste);
+}
+
+static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
+{
+ if (!non_swap_entry(entry))
+ dec_mm_counter(mm, MM_SWAPENTS);
+ else if (is_migration_entry(entry)) {
+ struct page *page = migration_entry_to_page(entry);
+
+ dec_mm_counter(mm, mm_counter(page));
+ }
+ free_swap_and_cache(entry);
+}
+
+void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int reset)
+{
+ unsigned long pgstev;
+ pgste_t pgste;
+ pte_t pte;
+
+ /* Zap unused and logically-zero pages */
+ preempt_disable();
+ pgste = pgste_get_lock(ptep);
+ pgstev = pgste_val(pgste);
+ pte = *ptep;
+ if (!reset && pte_swap(pte) &&
+ ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
+ (pgstev & _PGSTE_GPS_ZERO))) {
+ ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
+ pte_clear(mm, addr, ptep);
+ }
+ if (reset)
+ pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
+ pgste_set_unlock(ptep, pgste);
+ preempt_enable();
+}
+
+void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ unsigned long ptev;
+ pgste_t pgste;
+
+ /* Clear storage key ACC and F, but set R/C */
+ preempt_disable();
+ pgste = pgste_get_lock(ptep);
+ pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
+ ptev = pte_val(*ptep);
+ if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
+ page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
+ pgste_set_unlock(ptep, pgste);
+ preempt_enable();
+}
+
+/*
+ * Test and reset if a guest page is dirty
+ */
+bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pgste_t pgste;
+ pte_t pte;
+ bool dirty;
+ int nodat;
+
+ pgste = pgste_get_lock(ptep);
+ dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
+ pgste_val(pgste) &= ~PGSTE_UC_BIT;
+ pte = *ptep;
+ if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
+ nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
+ ptep_ipte_global(mm, addr, ptep, nodat);
+ if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
+ pte_val(pte) |= _PAGE_PROTECT;
+ else
+ pte_val(pte) |= _PAGE_INVALID;
+ *ptep = pte;
+ }
+ pgste_set_unlock(ptep, pgste);
+ return dirty;
+}
+EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
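+
+/*
+ * Hypothetical sketch: query and clear the dirty state of a single guest
+ * page, as e.g. dirty-log tracking might do. The example_* name and the
+ * use of get_locked_pte() are illustrative assumptions.
+ */
+static bool example_page_was_dirty(struct mm_struct *mm, unsigned long addr)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+	bool dirty = false;
+
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (ptep) {
+		dirty = ptep_test_and_clear_uc(mm, addr, ptep);
+		pte_unmap_unlock(ptep, ptl);
+	}
+	return dirty;
+}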
+
+int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, bool nq)
+{
+ unsigned long keyul, paddr;
+ spinlock_t *ptl;
+ pgste_t old, new;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pmdp = pmd_alloc_map(mm, addr);
+ if (unlikely(!pmdp))
+ return -EFAULT;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (!pmd_present(*pmdp)) {
+ spin_unlock(ptl);
+ return -EFAULT;
+ }
+
+ if (pmd_large(*pmdp)) {
+ paddr = pmd_val(*pmdp) & HPAGE_MASK;
+ paddr |= addr & ~HPAGE_MASK;
+ /*
+ * Huge pmds need quiescing operations; they are
+ * always mapped.
+ */
+ page_set_storage_key(paddr, key, 1);
+ spin_unlock(ptl);
+ return 0;
+ }
+ spin_unlock(ptl);
+
+ ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+
+ new = old = pgste_get_lock(ptep);
+ pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
+ PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ keyul = (unsigned long) key;
+ pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
+ pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ unsigned long bits, skey;
+
+ paddr = pte_val(*ptep) & PAGE_MASK;
+ skey = (unsigned long) page_get_storage_key(paddr);
+ bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
+ skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
+ /* Set storage key ACC and FP */
+ page_set_storage_key(paddr, skey, !nq);
+ /* Merge host changed & referenced into pgste */
+ pgste_val(new) |= bits << 52;
+ }
+ /* changing the guest storage key is considered a change of the page */
+ if ((pgste_val(new) ^ pgste_val(old)) &
+ (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
+ pgste_val(new) |= PGSTE_UC_BIT;
+
+ pgste_set_unlock(ptep, new);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(set_guest_storage_key);
+
+/**
+ * cond_set_guest_storage_key - conditionally set a guest storage key
+ * (handling csske). oldkey will be updated when either mr or mc is set
+ * and a pointer is given.
+ *
+ * Returns 0 if a guest storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc)
+{
+ unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+ int rc;
+
+ /* we can drop the pgste lock between getting and setting the key */
+ if (mr | mc) {
+ rc = get_guest_storage_key(current->mm, addr, &tmp);
+ if (rc)
+ return rc;
+ if (oldkey)
+ *oldkey = tmp;
+ if (!mr)
+ mask |= _PAGE_REFERENCED;
+ if (!mc)
+ mask |= _PAGE_CHANGED;
+ if (!((tmp ^ key) & mask))
+ return 0;
+ }
+ rc = set_guest_storage_key(current->mm, addr, key, nq);
+ return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
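+
+/*
+ * Hypothetical sketch of a conditional (csske-style) key update: only write
+ * the key when the relevant bits differ and hand the previous key back to
+ * the caller. The example_* name and the mmap_sem locking are assumptions
+ * for illustration, not taken from an in-tree caller.
+ */
+static int example_cond_set_key(unsigned long addr, unsigned char key)
+{
+	unsigned char oldkey = 0;
+	int rc;
+
+	down_read(&current->mm->mmap_sem);
+	rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+					false /* nq */, true /* mr */,
+					true /* mc */);
+	up_read(&current->mm->mmap_sem);
+	/* rc: 0 = no update needed, 1 = key updated, < 0 = access error */
+	return rc;
+}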
+
+/**
+ * reset_guest_reference_bit - reset a guest reference bit (rrbe), returning
+ * the reference and changed bit.
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
+{
+ spinlock_t *ptl;
+ unsigned long paddr;
+ pgste_t old, new;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ int cc = 0;
+
+ pmdp = pmd_alloc_map(mm, addr);
+ if (unlikely(!pmdp))
+ return -EFAULT;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (!pmd_present(*pmdp)) {
+ spin_unlock(ptl);
+ return -EFAULT;
+ }
+
+ if (pmd_large(*pmdp)) {
+ paddr = pmd_val(*pmdp) & HPAGE_MASK;
+ paddr |= addr & ~HPAGE_MASK;
+ cc = page_reset_referenced(paddr);
+ spin_unlock(ptl);
+ return cc;
+ }
+ spin_unlock(ptl);
+
+ ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+
+ new = old = pgste_get_lock(ptep);
+ /* Reset guest reference bit only */
+ pgste_val(new) &= ~PGSTE_GR_BIT;
+
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ paddr = pte_val(*ptep) & PAGE_MASK;
+ cc = page_reset_referenced(paddr);
+ /* Merge real referenced bit into host-set */
+ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
+ }
+ /* Reflect guest's logical view, not physical */
+ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+ /* Changing the guest storage key is considered a change of the page */
+ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+ pgste_val(new) |= PGSTE_UC_BIT;
+
+ pgste_set_unlock(ptep, new);
+ pte_unmap_unlock(ptep, ptl);
+ return cc;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
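+
+/*
+ * Hypothetical sketch of rrbe emulation: reset the reference bit of one
+ * guest page and report the previous R/C state as a condition code. The
+ * example_* name and the mmap_sem locking are illustrative assumptions.
+ */
+static int example_reset_reference(struct mm_struct *mm, unsigned long addr)
+{
+	int cc;
+
+	down_read(&mm->mmap_sem);
+	cc = reset_guest_reference_bit(mm, addr);
+	up_read(&mm->mmap_sem);
+	/* cc < 0: access error; otherwise the cc to report to the guest */
+	return cc;
+}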
+
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key)
+{
+ unsigned long paddr;
+ spinlock_t *ptl;
+ pgste_t pgste;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ pmdp = pmd_alloc_map(mm, addr);
+ if (unlikely(!pmdp))
+ return -EFAULT;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (!pmd_present(*pmdp)) {
+ /* Not yet mapped memory has a zero key */
+ spin_unlock(ptl);
+ *key = 0;
+ return 0;
+ }
+
+ if (pmd_large(*pmdp)) {
+ paddr = pmd_val(*pmdp) & HPAGE_MASK;
+ paddr |= addr & ~HPAGE_MASK;
+ *key = page_get_storage_key(paddr);
+ spin_unlock(ptl);
+ return 0;
+ }
+ spin_unlock(ptl);
+
+ ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+
+ pgste = pgste_get_lock(ptep);
+ *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+ paddr = pte_val(*ptep) & PAGE_MASK;
+ if (!(pte_val(*ptep) & _PAGE_INVALID))
+ *key = page_get_storage_key(paddr);
+ /* Reflect guest's logical view, not physical */
+ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
+ pgste_set_unlock(ptep, pgste);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(get_guest_storage_key);
+
+/**
+ * pgste_perform_essa - perform ESSA actions on the PGSTE.
+ * @mm: the memory context. It must have PGSTEs, no check is performed here!
+ * @hva: the host virtual address of the page whose PGSTE is to be processed
+ * @orc: the specific action to perform, see the ESSA_SET_* macros.
+ * @oldpte: the PTE will be saved there if the pointer is not NULL.
+ * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
+ *
+ * Return: 1 if the page is to be added to the CBRL, otherwise 0,
+ * or < 0 in case of error. -EINVAL is returned for invalid values
+ * of orc, -EFAULT for invalid addresses.
+ */
+int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
+ unsigned long *oldpte, unsigned long *oldpgste)
+{
+ struct vm_area_struct *vma;
+ unsigned long pgstev;
+ spinlock_t *ptl;
+ pgste_t pgste;
+ pte_t *ptep;
+ int res = 0;
+
+ WARN_ON_ONCE(orc > ESSA_MAX);
+ if (unlikely(orc > ESSA_MAX))
+ return -EINVAL;
+
+ vma = find_vma(mm, hva);
+ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
+ return -EFAULT;
+ ptep = get_locked_pte(mm, hva, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+ pgste = pgste_get_lock(ptep);
+ pgstev = pgste_val(pgste);
+ if (oldpte)
+ *oldpte = pte_val(*ptep);
+ if (oldpgste)
+ *oldpgste = pgstev;
+
+ switch (orc) {
+ case ESSA_GET_STATE:
+ break;
+ case ESSA_SET_STABLE:
+ pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
+ pgstev |= _PGSTE_GPS_USAGE_STABLE;
+ break;
+ case ESSA_SET_UNUSED:
+ pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+ pgstev |= _PGSTE_GPS_USAGE_UNUSED;
+ if (pte_val(*ptep) & _PAGE_INVALID)
+ res = 1;
+ break;
+ case ESSA_SET_VOLATILE:
+ pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+ pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
+ if (pte_val(*ptep) & _PAGE_INVALID)
+ res = 1;
+ break;
+ case ESSA_SET_POT_VOLATILE:
+ pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
+ break;
+ }
+ if (pgstev & _PGSTE_GPS_ZERO) {
+ pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
+ break;
+ }
+ if (!(pgstev & PGSTE_GC_BIT)) {
+ pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
+ res = 1;
+ break;
+ }
+ break;
+ case ESSA_SET_STABLE_RESIDENT:
+ pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+ pgstev |= _PGSTE_GPS_USAGE_STABLE;
+ /*
+ * Since the resident state can go away any time after this
+ * call, we will not make this page resident. We can revisit
+ * this decision if a guest will ever start using this.
+ */
+ break;
+ case ESSA_SET_STABLE_IF_RESIDENT:
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+ pgstev |= _PGSTE_GPS_USAGE_STABLE;
+ }
+ break;
+ case ESSA_SET_STABLE_NODAT:
+ pgstev &= ~_PGSTE_GPS_USAGE_MASK;
+ pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
+ break;
+ default:
+ /* we should never get here! */
+ break;
+ }
+ /* If we are discarding a page, set it to logical zero */
+ if (res)
+ pgstev |= _PGSTE_GPS_ZERO;
+
+ pgste_val(pgste) = pgstev;
+ pgste_set_unlock(ptep, pgste);
+ pte_unmap_unlock(ptep, ptl);
+ return res;
+}
+EXPORT_SYMBOL(pgste_perform_essa);
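+
+/*
+ * Hypothetical sketch of an ESSA handler step: mark one page as unused and,
+ * if the return value asks for it, record the address for the CBRL. The
+ * example_* name and the flat cbrl array are illustrative assumptions.
+ */
+static int example_essa_set_unused(struct mm_struct *mm, unsigned long hva,
+				   unsigned long *cbrl, unsigned int *entries)
+{
+	int res;
+
+	down_read(&mm->mmap_sem);
+	res = pgste_perform_essa(mm, hva, ESSA_SET_UNUSED, NULL, NULL);
+	up_read(&mm->mmap_sem);
+	if (res < 0)
+		return res;	/* -EINVAL or -EFAULT */
+	if (res == 1)
+		cbrl[(*entries)++] = hva;
+	return 0;
+}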
+
+/**
+ * set_pgste_bits - set specific PGSTE bits.
+ * @mm: the memory context. It must have PGSTEs, no check is performed here!
+ * @hva: the host virtual address of the page whose PGSTE is to be processed
+ * @bits: a bitmask representing the bits that will be touched
+ * @value: the values of the bits to be written. Only the bits in the mask
+ * will be written.
+ *
+ * Return: 0 on success, < 0 in case of error.
+ */
+int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
+ unsigned long bits, unsigned long value)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ pgste_t new;
+ pte_t *ptep;
+
+ vma = find_vma(mm, hva);
+ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
+ return -EFAULT;
+ ptep = get_locked_pte(mm, hva, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+ new = pgste_get_lock(ptep);
+
+ pgste_val(new) &= ~bits;
+ pgste_val(new) |= value & bits;
+
+ pgste_set_unlock(ptep, new);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(set_pgste_bits);
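+
+/*
+ * Hypothetical sketch of the mask/value interface: force the usage state of
+ * a PGSTE to "stable" without touching any other bits. The example_* name
+ * and the mmap_sem locking are illustrative assumptions.
+ */
+static int example_mark_stable(struct mm_struct *mm, unsigned long hva)
+{
+	int rc;
+
+	down_read(&mm->mmap_sem);
+	rc = set_pgste_bits(mm, hva, _PGSTE_GPS_USAGE_MASK,
+			    _PGSTE_GPS_USAGE_STABLE);
+	up_read(&mm->mmap_sem);
+	return rc;
+}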
+
+/**
+ * get_pgste - get the current PGSTE for the given address.
+ * @mm: the memory context. It must have PGSTEs, no check is performed here!
+ * @hva: the host virtual address of the page whose PGSTE is to be processed
+ * @pgstep: will be written with the current PGSTE for the given address.
+ *
+ * Return: 0 on success, < 0 in case of error.
+ */
+int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ vma = find_vma(mm, hva);
+ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
+ return -EFAULT;
+ ptep = get_locked_pte(mm, hva, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+ *pgstep = pgste_val(pgste_get(ptep));
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(get_pgste);
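+
+/*
+ * Hypothetical sketch: read a PGSTE back and extract the guest usage state,
+ * e.g. to inspect the effect of a previous ESSA operation. The example_*
+ * name and the mmap_sem locking are illustrative assumptions.
+ */
+static int example_read_usage_state(struct mm_struct *mm, unsigned long hva,
+				    unsigned long *usage)
+{
+	unsigned long pgstev;
+	int rc;
+
+	down_read(&mm->mmap_sem);
+	rc = get_pgste(mm, hva, &pgstev);
+	up_read(&mm->mmap_sem);
+	if (!rc)
+		*usage = pgstev & _PGSTE_GPS_USAGE_MASK;
+	return rc;
+}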
+#endif
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
new file mode 100644
index 000000000..db55561c5
--- /dev/null
+++ b/arch/s390/mm/vmem.c
@@ -0,0 +1,443 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2006
+ * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
+ */
+
+#include <linux/bootmem.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/hugetlb.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <asm/cacheflush.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/set_memory.h>
+
+static DEFINE_MUTEX(vmem_mutex);
+
+struct memory_segment {
+ struct list_head list;
+ unsigned long start;
+ unsigned long size;
+};
+
+static LIST_HEAD(mem_segs);
+
+static void __ref *vmem_alloc_pages(unsigned int order)
+{
+ unsigned long size = PAGE_SIZE << order;
+
+ if (slab_is_available())
+ return (void *)__get_free_pages(GFP_KERNEL, order);
+ return (void *) memblock_alloc(size, size);
+}
+
+void *vmem_crst_alloc(unsigned long val)
+{
+ unsigned long *table;
+
+ table = vmem_alloc_pages(CRST_ALLOC_ORDER);
+ if (table)
+ crst_table_init(table, val);
+ return table;
+}
+
+pte_t __ref *vmem_pte_alloc(void)
+{
+ unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
+ pte_t *pte;
+
+ if (slab_is_available())
+ pte = (pte_t *) page_table_alloc(&init_mm);
+ else
+ pte = (pte_t *) memblock_alloc(size, size);
+ if (!pte)
+ return NULL;
+ memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
+ return pte;
+}
+
+/*
+ * Add a physical memory range to the 1:1 mapping.
+ */
+static int vmem_add_mem(unsigned long start, unsigned long size)
+{
+ unsigned long pgt_prot, sgt_prot, r3_prot;
+ unsigned long pages4k, pages1m, pages2g;
+ unsigned long end = start + size;
+ unsigned long address = start;
+ pgd_t *pg_dir;
+ p4d_t *p4_dir;
+ pud_t *pu_dir;
+ pmd_t *pm_dir;
+ pte_t *pt_dir;
+ int ret = -ENOMEM;
+
+ pgt_prot = pgprot_val(PAGE_KERNEL);
+ sgt_prot = pgprot_val(SEGMENT_KERNEL);
+ r3_prot = pgprot_val(REGION3_KERNEL);
+ if (!MACHINE_HAS_NX) {
+ pgt_prot &= ~_PAGE_NOEXEC;
+ sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
+ r3_prot &= ~_REGION_ENTRY_NOEXEC;
+ }
+ pages4k = pages1m = pages2g = 0;
+ while (address < end) {
+ pg_dir = pgd_offset_k(address);
+ if (pgd_none(*pg_dir)) {
+ p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4_dir)
+ goto out;
+ pgd_populate(&init_mm, pg_dir, p4_dir);
+ }
+ p4_dir = p4d_offset(pg_dir, address);
+ if (p4d_none(*p4_dir)) {
+ pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pu_dir)
+ goto out;
+ p4d_populate(&init_mm, p4_dir, pu_dir);
+ }
+ pu_dir = pud_offset(p4_dir, address);
+ if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
+ !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
+ !debug_pagealloc_enabled()) {
+ pud_val(*pu_dir) = address | r3_prot;
+ address += PUD_SIZE;
+ pages2g++;
+ continue;
+ }
+ if (pud_none(*pu_dir)) {
+ pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pm_dir)
+ goto out;
+ pud_populate(&init_mm, pu_dir, pm_dir);
+ }
+ pm_dir = pmd_offset(pu_dir, address);
+ if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
+ !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
+ !debug_pagealloc_enabled()) {
+ pmd_val(*pm_dir) = address | sgt_prot;
+ address += PMD_SIZE;
+ pages1m++;
+ continue;
+ }
+ if (pmd_none(*pm_dir)) {
+ pt_dir = vmem_pte_alloc();
+ if (!pt_dir)
+ goto out;
+ pmd_populate(&init_mm, pm_dir, pt_dir);
+ }
+
+ pt_dir = pte_offset_kernel(pm_dir, address);
+ pte_val(*pt_dir) = address | pgt_prot;
+ address += PAGE_SIZE;
+ pages4k++;
+ }
+ ret = 0;
+out:
+ update_page_count(PG_DIRECT_MAP_4K, pages4k);
+ update_page_count(PG_DIRECT_MAP_1M, pages1m);
+ update_page_count(PG_DIRECT_MAP_2G, pages2g);
+ return ret;
+}
+
+/*
+ * Remove a physical memory range from the 1:1 mapping.
+ * Currently only invalidates page table entries.
+ */
+static void vmem_remove_range(unsigned long start, unsigned long size)
+{
+ unsigned long pages4k, pages1m, pages2g;
+ unsigned long end = start + size;
+ unsigned long address = start;
+ pgd_t *pg_dir;
+ p4d_t *p4_dir;
+ pud_t *pu_dir;
+ pmd_t *pm_dir;
+ pte_t *pt_dir;
+
+ pages4k = pages1m = pages2g = 0;
+ while (address < end) {
+ pg_dir = pgd_offset_k(address);
+ if (pgd_none(*pg_dir)) {
+ address += PGDIR_SIZE;
+ continue;
+ }
+ p4_dir = p4d_offset(pg_dir, address);
+ if (p4d_none(*p4_dir)) {
+ address += P4D_SIZE;
+ continue;
+ }
+ pu_dir = pud_offset(p4_dir, address);
+ if (pud_none(*pu_dir)) {
+ address += PUD_SIZE;
+ continue;
+ }
+ if (pud_large(*pu_dir)) {
+ pud_clear(pu_dir);
+ address += PUD_SIZE;
+ pages2g++;
+ continue;
+ }
+ pm_dir = pmd_offset(pu_dir, address);
+ if (pmd_none(*pm_dir)) {
+ address += PMD_SIZE;
+ continue;
+ }
+ if (pmd_large(*pm_dir)) {
+ pmd_clear(pm_dir);
+ address += PMD_SIZE;
+ pages1m++;
+ continue;
+ }
+ pt_dir = pte_offset_kernel(pm_dir, address);
+ pte_clear(&init_mm, address, pt_dir);
+ address += PAGE_SIZE;
+ pages4k++;
+ }
+ flush_tlb_kernel_range(start, end);
+ update_page_count(PG_DIRECT_MAP_4K, -pages4k);
+ update_page_count(PG_DIRECT_MAP_1M, -pages1m);
+ update_page_count(PG_DIRECT_MAP_2G, -pages2g);
+}
+
+/*
+ * Add a backed mem_map array to the virtual mem_map array.
+ */
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ unsigned long pgt_prot, sgt_prot;
+ unsigned long address = start;
+ pgd_t *pg_dir;
+ p4d_t *p4_dir;
+ pud_t *pu_dir;
+ pmd_t *pm_dir;
+ pte_t *pt_dir;
+ int ret = -ENOMEM;
+
+ pgt_prot = pgprot_val(PAGE_KERNEL);
+ sgt_prot = pgprot_val(SEGMENT_KERNEL);
+ if (!MACHINE_HAS_NX) {
+ pgt_prot &= ~_PAGE_NOEXEC;
+ sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
+ }
+ for (address = start; address < end;) {
+ pg_dir = pgd_offset_k(address);
+ if (pgd_none(*pg_dir)) {
+ p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4_dir)
+ goto out;
+ pgd_populate(&init_mm, pg_dir, p4_dir);
+ }
+
+ p4_dir = p4d_offset(pg_dir, address);
+ if (p4d_none(*p4_dir)) {
+ pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pu_dir)
+ goto out;
+ p4d_populate(&init_mm, p4_dir, pu_dir);
+ }
+
+ pu_dir = pud_offset(p4_dir, address);
+ if (pud_none(*pu_dir)) {
+ pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pm_dir)
+ goto out;
+ pud_populate(&init_mm, pu_dir, pm_dir);
+ }
+
+ pm_dir = pmd_offset(pu_dir, address);
+ if (pmd_none(*pm_dir)) {
+ /* Use 1MB frames for vmemmap if available. We always
+ * use large frames even if they are only partially
+ * used. Otherwise we would also need page tables,
+ * since vmemmap_populate() gets called for each
+ * section separately. */
+ if (MACHINE_HAS_EDAT1) {
+ void *new_page;
+
+ new_page = vmemmap_alloc_block(PMD_SIZE, node);
+ if (!new_page)
+ goto out;
+ pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
+ address = (address + PMD_SIZE) & PMD_MASK;
+ continue;
+ }
+ pt_dir = vmem_pte_alloc();
+ if (!pt_dir)
+ goto out;
+ pmd_populate(&init_mm, pm_dir, pt_dir);
+ } else if (pmd_large(*pm_dir)) {
+ address = (address + PMD_SIZE) & PMD_MASK;
+ continue;
+ }
+
+ pt_dir = pte_offset_kernel(pm_dir, address);
+ if (pte_none(*pt_dir)) {
+ void *new_page;
+
+ new_page = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!new_page)
+ goto out;
+ pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
+ }
+ address += PAGE_SIZE;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+void vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+}
+
+/*
+ * Add a memory segment to the segment list if it doesn't overlap with
+ * an already present segment.
+ */
+static int insert_memory_segment(struct memory_segment *seg)
+{
+ struct memory_segment *tmp;
+
+ if (seg->start + seg->size > VMEM_MAX_PHYS ||
+ seg->start + seg->size < seg->start)
+ return -ERANGE;
+
+ list_for_each_entry(tmp, &mem_segs, list) {
+ if (seg->start >= tmp->start + tmp->size)
+ continue;
+ if (seg->start + seg->size <= tmp->start)
+ continue;
+ return -ENOSPC;
+ }
+ list_add(&seg->list, &mem_segs);
+ return 0;
+}
+
+/*
+ * Remove memory segment from the segment list.
+ */
+static void remove_memory_segment(struct memory_segment *seg)
+{
+ list_del(&seg->list);
+}
+
+static void __remove_shared_memory(struct memory_segment *seg)
+{
+ remove_memory_segment(seg);
+ vmem_remove_range(seg->start, seg->size);
+}
+
+int vmem_remove_mapping(unsigned long start, unsigned long size)
+{
+ struct memory_segment *seg;
+ int ret;
+
+ mutex_lock(&vmem_mutex);
+
+ ret = -ENOENT;
+ list_for_each_entry(seg, &mem_segs, list) {
+ if (seg->start == start && seg->size == size)
+ break;
+ }
+
+ if (seg->start != start || seg->size != size)
+ goto out;
+
+ ret = 0;
+ __remove_shared_memory(seg);
+ kfree(seg);
+out:
+ mutex_unlock(&vmem_mutex);
+ return ret;
+}
+
+int vmem_add_mapping(unsigned long start, unsigned long size)
+{
+ struct memory_segment *seg;
+ int ret;
+
+ mutex_lock(&vmem_mutex);
+ ret = -ENOMEM;
+ seg = kzalloc(sizeof(*seg), GFP_KERNEL);
+ if (!seg)
+ goto out;
+ seg->start = start;
+ seg->size = size;
+
+ ret = insert_memory_segment(seg);
+ if (ret)
+ goto out_free;
+
+ ret = vmem_add_mem(start, size);
+ if (ret)
+ goto out_remove;
+ goto out;
+
+out_remove:
+ __remove_shared_memory(seg);
+out_free:
+ kfree(seg);
+out:
+ mutex_unlock(&vmem_mutex);
+ return ret;
+}
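+
+/*
+ * Hypothetical sketch of the add/remove pairing used by callers such as the
+ * DCSS (extmem) code: map a physical range into the 1:1 mapping and tear it
+ * down again if the caller's own setup step fails. example_map_segment()
+ * and the caller-provided "use" callback are made up for illustration.
+ */
+static int example_map_segment(unsigned long start, unsigned long size,
+			       int (*use)(unsigned long, unsigned long))
+{
+	int rc;
+
+	rc = vmem_add_mapping(start, size);
+	if (rc)
+		return rc;	/* -ENOMEM, -ERANGE or -ENOSPC */
+	rc = use(start, size);	/* caller-provided consumer of the mapping */
+	if (rc)
+		vmem_remove_mapping(start, size);
+	return rc;
+}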
+
+/*
+ * Map the whole physical memory into virtual memory (identity mapping).
+ * We reserve enough space in the vmalloc area for the vmemmap so that
+ * additional memory segments can be hotplugged later.
+ */
+void __init vmem_map_init(void)
+{
+ struct memblock_region *reg;
+
+ for_each_memblock(memory, reg)
+ vmem_add_mem(reg->base, reg->size);
+ __set_memory((unsigned long)_stext,
+ (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
+ SET_MEMORY_RO | SET_MEMORY_X);
+ __set_memory((unsigned long)_etext,
+ (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT,
+ SET_MEMORY_RO);
+ __set_memory((unsigned long)_sinittext,
+ (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
+ SET_MEMORY_RO | SET_MEMORY_X);
+ pr_info("Write protected kernel read-only data: %luk\n",
+ (unsigned long)(__end_rodata - _stext) >> 10);
+}
+
+/*
+ * Convert memblock.memory to a memory segment list so there is a single
+ * list that contains all memory segments.
+ */
+static int __init vmem_convert_memory_chunk(void)
+{
+ struct memblock_region *reg;
+ struct memory_segment *seg;
+
+ mutex_lock(&vmem_mutex);
+ for_each_memblock(memory, reg) {
+ seg = kzalloc(sizeof(*seg), GFP_KERNEL);
+ if (!seg)
+ panic("Out of memory...\n");
+ seg->start = reg->base;
+ seg->size = reg->size;
+ insert_memory_segment(seg);
+ }
+ mutex_unlock(&vmem_mutex);
+ return 0;
+}
+
+core_initcall(vmem_convert_memory_chunk);