summaryrefslogtreecommitdiffstats
path: root/fs/proc
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /fs/proc
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/Kconfig110
-rw-r--r--fs/proc/Makefile36
-rw-r--r--fs/proc/array.c803
-rw-r--r--fs/proc/base.c3928
-rw-r--r--fs/proc/bootconfig.c96
-rw-r--r--fs/proc/cmdline.c19
-rw-r--r--fs/proc/consoles.c98
-rw-r--r--fs/proc/cpuinfo.c28
-rw-r--r--fs/proc/devices.c64
-rw-r--r--fs/proc/fd.c381
-rw-r--r--fs/proc/fd.h21
-rw-r--r--fs/proc/generic.c819
-rw-r--r--fs/proc/inode.c704
-rw-r--r--fs/proc/internal.h318
-rw-r--r--fs/proc/interrupts.c42
-rw-r--r--fs/proc/kcore.c701
-rw-r--r--fs/proc/kmsg.c63
-rw-r--r--fs/proc/loadavg.c37
-rw-r--r--fs/proc/meminfo.c173
-rw-r--r--fs/proc/namespaces.c183
-rw-r--r--fs/proc/nommu.c116
-rw-r--r--fs/proc/page.c342
-rw-r--r--fs/proc/proc_net.c416
-rw-r--r--fs/proc/proc_sysctl.c1951
-rw-r--r--fs/proc/proc_tty.c177
-rw-r--r--fs/proc/root.c375
-rw-r--r--fs/proc/self.c73
-rw-r--r--fs/proc/softirqs.c37
-rw-r--r--fs/proc/stat.c242
-rw-r--r--fs/proc/task_mmu.c2026
-rw-r--r--fs/proc/task_nommu.c309
-rw-r--r--fs/proc/thread_self.c73
-rw-r--r--fs/proc/uptime.c49
-rw-r--r--fs/proc/util.c24
-rw-r--r--fs/proc/version.c27
-rw-r--r--fs/proc/vmcore.c1603
36 files changed, 16464 insertions, 0 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
new file mode 100644
index 000000000..32b1116ae
--- /dev/null
+++ b/fs/proc/Kconfig
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config PROC_FS
+ bool "/proc file system support" if EXPERT
+ default y
+ help
+ This is a virtual file system providing information about the status
+ of the system. "Virtual" means that it doesn't take up any space on
+ your hard disk: the files are created on the fly by the kernel when
+ you try to access them. Also, you cannot read the files with older
+ version of the program less: you need to use more or cat.
+
+ It's totally cool; for example, "cat /proc/interrupts" gives
+ information about what the different IRQs are used for at the moment
+ (there is a small number of Interrupt ReQuest lines in your computer
+ that are used by the attached devices to gain the CPU's attention --
+ often a source of trouble if two devices are mistakenly configured
+ to use the same IRQ). The program procinfo to display some
+ information about your system gathered from the /proc file system.
+
+ Before you can use the /proc file system, it has to be mounted,
+ meaning it has to be given a location in the directory hierarchy.
+ That location should be /proc. A command such as "mount -t proc proc
+ /proc" or the equivalent line in /etc/fstab does the job.
+
+ The /proc file system is explained in the file
+ <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage
+ ("man 5 proc").
+
+ This option will enlarge your kernel by about 67 KB. Several
+ programs depend on this, so everyone should say Y here.
+
+config PROC_KCORE
+ bool "/proc/kcore support" if !ARM
+ depends on PROC_FS && MMU
+ select CRASH_CORE
+ help
+ Provides a virtual ELF core file of the live kernel. This can
+ be read with gdb and other ELF tools. No modifications can be
+ made using this mechanism.
+
+config PROC_VMCORE
+ bool "/proc/vmcore support"
+ depends on PROC_FS && CRASH_DUMP
+ default y
+ help
+ Exports the dump image of crashed kernel in ELF format.
+
+config PROC_VMCORE_DEVICE_DUMP
+ bool "Device Hardware/Firmware Log Collection"
+ depends on PROC_VMCORE
+ default n
+ help
+ After kernel panic, device drivers can collect the device
+ specific snapshot of their hardware or firmware before the
+ underlying devices are initialized in crash recovery kernel.
+ Note that the device driver must be present in the crash
+ recovery kernel's initramfs to collect its underlying device
+ snapshot.
+
+ If you say Y here, the collected device dumps will be added
+ as ELF notes to /proc/vmcore. You can still disable device
+ dump using the kernel command line option 'novmcoredd'.
+
+config PROC_SYSCTL
+ bool "Sysctl support (/proc/sys)" if EXPERT
+ depends on PROC_FS
+ select SYSCTL
+ default y
+ help
+ The sysctl interface provides a means of dynamically changing
+ certain kernel parameters and variables on the fly without requiring
+ a recompile of the kernel or reboot of the system. The primary
+ interface is through /proc/sys. If you say Y here a tree of
+ modifiable sysctl entries will be generated beneath the
+ /proc/sys directory. They are explained in the files
+ in <file:Documentation/admin-guide/sysctl/>. Note that enabling this
+ option will enlarge the kernel by at least 8 KB.
+
+ As it is generally a good thing, you should say Y here unless
+ building a kernel for install/rescue disks or your system is very
+ limited in memory.
+
+config PROC_PAGE_MONITOR
+ default y
+ depends on PROC_FS && MMU
+ bool "Enable /proc page monitoring" if EXPERT
+ help
+ Various /proc files exist to monitor process memory utilization:
+ /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
+ /proc/kpagecount, and /proc/kpageflags. Disabling these
+ interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_CHILDREN
+ bool "Include /proc/<pid>/task/<tid>/children file"
+ depends on PROC_FS
+ default n
+ help
+ Provides a fast way to retrieve first level children pids of a task. See
+ <file:Documentation/filesystems/proc.rst> for more information.
+
+ Say Y if you are running any user-space software which takes benefit from
+ this interface. For example, rkt is such a piece of software.
+
+config PROC_PID_ARCH_STATUS
+ def_bool n
+ depends on PROC_FS
+
+config PROC_CPU_RESCTRL
+ def_bool n
+ depends on PROC_FS
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
new file mode 100644
index 000000000..bd08616ed
--- /dev/null
+++ b/fs/proc/Makefile
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux proc filesystem routines.
+#
+
+obj-y += proc.o
+
+CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
+proc-y := nommu.o task_nommu.o
+proc-$(CONFIG_MMU) := task_mmu.o
+
+proc-y += inode.o root.o base.o generic.o array.o \
+ fd.o
+proc-$(CONFIG_TTY) += proc_tty.o
+proc-y += cmdline.o
+proc-y += consoles.o
+proc-y += cpuinfo.o
+proc-y += devices.o
+proc-y += interrupts.o
+proc-y += loadavg.o
+proc-y += meminfo.o
+proc-y += stat.o
+proc-y += uptime.o
+proc-y += util.o
+proc-y += version.o
+proc-y += softirqs.o
+proc-y += namespaces.o
+proc-y += self.o
+proc-y += thread_self.o
+proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
+proc-$(CONFIG_NET) += proc_net.o
+proc-$(CONFIG_PROC_KCORE) += kcore.o
+proc-$(CONFIG_PROC_VMCORE) += vmcore.o
+proc-$(CONFIG_PRINTK) += kmsg.o
+proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
new file mode 100644
index 000000000..49283b810
--- /dev/null
+++ b/fs/proc/array.c
@@ -0,0 +1,803 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/proc/array.c
+ *
+ * Copyright (C) 1992 by Linus Torvalds
+ * based on ideas by Darren Senn
+ *
+ * Fixes:
+ * Michael. K. Johnson: stat,statm extensions.
+ * <johnsonm@stolaf.edu>
+ *
+ * Pauline Middelink : Made cmdline,envline only break at '\0's, to
+ * make sure SET_PROCTITLE works. Also removed
+ * bad '!' which forced address recalculation for
+ * EVERY character on the current page.
+ * <middelin@polyware.iaf.nl>
+ *
+ * Danny ter Haar : added cpuinfo
+ * <dth@cistron.nl>
+ *
+ * Alessandro Rubini : profile extension.
+ * <rubini@ipvvis.unipv.it>
+ *
+ * Jeff Tranter : added BogoMips field to cpuinfo
+ * <Jeff_Tranter@Mitel.COM>
+ *
+ * Bruno Haible : remove 4K limit for the maps file
+ * <haible@ma2s2.mathematik.uni-karlsruhe.de>
+ *
+ * Yves Arrouye : remove removal of trailing spaces in get_array.
+ * <Yves.Arrouye@marin.fdn.fr>
+ *
+ * Jerome Forissier : added per-CPU time information to /proc/stat
+ * and /proc/<pid>/cpu extension
+ * <forissier@isia.cma.fr>
+ * - Incorporation and non-SMP safe operation
+ * of forissier patch in 2.1.78 by
+ * Hans Marcus <crowbar@concepts.nl>
+ *
+ * aeb@cwi.nl : /proc/partitions
+ *
+ *
+ * Alan Cox : security fixes.
+ * <alan@lxorguk.ukuu.org.uk>
+ *
+ * Al Viro : safe handling of mm_struct
+ *
+ * Gerhard Wichert : added BIGMEM support
+ * Siemens AG <Gerhard.Wichert@pdb.siemens.de>
+ *
+ * Al Viro & Jeff Garzik : moved most of the thing into base.c and
+ * : proc_misc.c. The rest may eventually go into
+ * : base.c too.
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/time_namespace.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/tty.h>
+#include <linux/string.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
+#include <linux/proc_fs.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/signal.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/times.h>
+#include <linux/cpuset.h>
+#include <linux/rcupdate.h>
+#include <linux/delayacct.h>
+#include <linux/seq_file.h>
+#include <linux/pid_namespace.h>
+#include <linux/prctl.h>
+#include <linux/ptrace.h>
+#include <linux/string_helpers.h>
+#include <linux/user_namespace.h>
+#include <linux/fs_struct.h>
+#include <linux/kthread.h>
+
+#include <asm/processor.h>
+#include "internal.h"
+
+void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
+{
+ char tcomm[64];
+
+ /*
+ * Test before PF_KTHREAD because all workqueue worker threads are
+ * kernel threads.
+ */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else if (p->flags & PF_KTHREAD)
+ get_kthread_comm(tcomm, sizeof(tcomm), p);
+ else
+ __get_task_comm(tcomm, sizeof(tcomm), p);
+
+ if (escape)
+ seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ else
+ seq_printf(m, "%.64s", tcomm);
+}
+
+/*
+ * The task state array is a strange "bitmap" of
+ * reasons to sleep. Thus "running" is zero, and
+ * you can test for combinations of others with
+ * simple bit tests.
+ */
+static const char * const task_state_array[] = {
+
+ /* states in TASK_REPORT: */
+ "R (running)", /* 0x00 */
+ "S (sleeping)", /* 0x01 */
+ "D (disk sleep)", /* 0x02 */
+ "T (stopped)", /* 0x04 */
+ "t (tracing stop)", /* 0x08 */
+ "X (dead)", /* 0x10 */
+ "Z (zombie)", /* 0x20 */
+ "P (parked)", /* 0x40 */
+
+ /* states beyond TASK_REPORT: */
+ "I (idle)", /* 0x80 */
+};
+
+static inline const char *get_task_state(struct task_struct *tsk)
+{
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
+ return task_state_array[task_state_index(tsk)];
+}
+
+static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *p)
+{
+ struct user_namespace *user_ns = seq_user_ns(m);
+ struct group_info *group_info;
+ int g, umask = -1;
+ struct task_struct *tracer;
+ const struct cred *cred;
+ pid_t ppid, tpid = 0, tgid, ngid;
+ unsigned int max_fds = 0;
+
+ rcu_read_lock();
+ ppid = pid_alive(p) ?
+ task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+
+ tracer = ptrace_parent(p);
+ if (tracer)
+ tpid = task_pid_nr_ns(tracer, ns);
+
+ tgid = task_tgid_nr_ns(p, ns);
+ ngid = task_numa_group_id(p);
+ cred = get_task_cred(p);
+
+ task_lock(p);
+ if (p->fs)
+ umask = p->fs->umask;
+ if (p->files)
+ max_fds = files_fdtable(p->files)->max_fds;
+ task_unlock(p);
+ rcu_read_unlock();
+
+ if (umask >= 0)
+ seq_printf(m, "Umask:\t%#04o\n", umask);
+ seq_puts(m, "State:\t");
+ seq_puts(m, get_task_state(p));
+
+ seq_put_decimal_ull(m, "\nTgid:\t", tgid);
+ seq_put_decimal_ull(m, "\nNgid:\t", ngid);
+ seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns));
+ seq_put_decimal_ull(m, "\nPPid:\t", ppid);
+ seq_put_decimal_ull(m, "\nTracerPid:\t", tpid);
+ seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid));
+ seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid));
+ seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
+ seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
+ seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
+
+ seq_puts(m, "\nGroups:\t");
+ group_info = cred->group_info;
+ for (g = 0; g < group_info->ngroups; g++)
+ seq_put_decimal_ull(m, g ? " " : "",
+ from_kgid_munged(user_ns, group_info->gid[g]));
+ put_cred(cred);
+ /* Trailing space shouldn't have been added in the first place. */
+ seq_putc(m, ' ');
+
+#ifdef CONFIG_PID_NS
+ seq_puts(m, "\nNStgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSsid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
+#endif
+ seq_putc(m, '\n');
+}
+
+void render_sigset_t(struct seq_file *m, const char *header,
+ sigset_t *set)
+{
+ int i;
+
+ seq_puts(m, header);
+
+ i = _NSIG;
+ do {
+ int x = 0;
+
+ i -= 4;
+ if (sigismember(set, i+1)) x |= 1;
+ if (sigismember(set, i+2)) x |= 2;
+ if (sigismember(set, i+3)) x |= 4;
+ if (sigismember(set, i+4)) x |= 8;
+ seq_putc(m, hex_asc[x]);
+ } while (i >= 4);
+
+ seq_putc(m, '\n');
+}
+
+static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign,
+ sigset_t *sigcatch)
+{
+ struct k_sigaction *k;
+ int i;
+
+ k = p->sighand->action;
+ for (i = 1; i <= _NSIG; ++i, ++k) {
+ if (k->sa.sa_handler == SIG_IGN)
+ sigaddset(sigign, i);
+ else if (k->sa.sa_handler != SIG_DFL)
+ sigaddset(sigcatch, i);
+ }
+}
+
+static inline void task_sig(struct seq_file *m, struct task_struct *p)
+{
+ unsigned long flags;
+ sigset_t pending, shpending, blocked, ignored, caught;
+ int num_threads = 0;
+ unsigned int qsize = 0;
+ unsigned long qlim = 0;
+
+ sigemptyset(&pending);
+ sigemptyset(&shpending);
+ sigemptyset(&blocked);
+ sigemptyset(&ignored);
+ sigemptyset(&caught);
+
+ if (lock_task_sighand(p, &flags)) {
+ pending = p->pending.signal;
+ shpending = p->signal->shared_pending.signal;
+ blocked = p->blocked;
+ collect_sigign_sigcatch(p, &ignored, &caught);
+ num_threads = get_nr_threads(p);
+ rcu_read_lock(); /* FIXME: is this correct? */
+ qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
+ rcu_read_unlock();
+ qlim = task_rlimit(p, RLIMIT_SIGPENDING);
+ unlock_task_sighand(p, &flags);
+ }
+
+ seq_put_decimal_ull(m, "Threads:\t", num_threads);
+ seq_put_decimal_ull(m, "\nSigQ:\t", qsize);
+ seq_put_decimal_ull(m, "/", qlim);
+
+ /* render them all */
+ render_sigset_t(m, "\nSigPnd:\t", &pending);
+ render_sigset_t(m, "ShdPnd:\t", &shpending);
+ render_sigset_t(m, "SigBlk:\t", &blocked);
+ render_sigset_t(m, "SigIgn:\t", &ignored);
+ render_sigset_t(m, "SigCgt:\t", &caught);
+}
+
+static void render_cap_t(struct seq_file *m, const char *header,
+ kernel_cap_t *a)
+{
+ unsigned __capi;
+
+ seq_puts(m, header);
+ CAP_FOR_EACH_U32(__capi) {
+ seq_put_hex_ll(m, NULL,
+ a->cap[CAP_LAST_U32 - __capi], 8);
+ }
+ seq_putc(m, '\n');
+}
+
+static inline void task_cap(struct seq_file *m, struct task_struct *p)
+{
+ const struct cred *cred;
+ kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+ cap_bset, cap_ambient;
+
+ rcu_read_lock();
+ cred = __task_cred(p);
+ cap_inheritable = cred->cap_inheritable;
+ cap_permitted = cred->cap_permitted;
+ cap_effective = cred->cap_effective;
+ cap_bset = cred->cap_bset;
+ cap_ambient = cred->cap_ambient;
+ rcu_read_unlock();
+
+ render_cap_t(m, "CapInh:\t", &cap_inheritable);
+ render_cap_t(m, "CapPrm:\t", &cap_permitted);
+ render_cap_t(m, "CapEff:\t", &cap_effective);
+ render_cap_t(m, "CapBnd:\t", &cap_bset);
+ render_cap_t(m, "CapAmb:\t", &cap_ambient);
+}
+
+static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
+{
+ seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
+#ifdef CONFIG_SECCOMP
+ seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+#ifdef CONFIG_SECCOMP_FILTER
+ seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
+ atomic_read(&p->seccomp.filter_count));
+#endif
+#endif
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
+ switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
+ case -EINVAL:
+ seq_puts(m, "unknown");
+ break;
+ case PR_SPEC_NOT_AFFECTED:
+ seq_puts(m, "not vulnerable");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+ seq_puts(m, "thread force mitigated");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+ seq_puts(m, "thread mitigated");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+ seq_puts(m, "thread vulnerable");
+ break;
+ case PR_SPEC_DISABLE:
+ seq_puts(m, "globally mitigated");
+ break;
+ default:
+ seq_puts(m, "vulnerable");
+ break;
+ }
+
+ seq_puts(m, "\nSpeculationIndirectBranch:\t");
+ switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) {
+ case -EINVAL:
+ seq_puts(m, "unsupported");
+ break;
+ case PR_SPEC_NOT_AFFECTED:
+ seq_puts(m, "not affected");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+ seq_puts(m, "conditional force disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+ seq_puts(m, "conditional disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+ seq_puts(m, "conditional enabled");
+ break;
+ case PR_SPEC_ENABLE:
+ seq_puts(m, "always enabled");
+ break;
+ case PR_SPEC_DISABLE:
+ seq_puts(m, "always disabled");
+ break;
+ default:
+ seq_puts(m, "unknown");
+ break;
+ }
+ seq_putc(m, '\n');
+}
+
+static inline void task_context_switch_counts(struct seq_file *m,
+ struct task_struct *p)
+{
+ seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw);
+ seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw);
+ seq_putc(m, '\n');
+}
+
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+ seq_printf(m, "Cpus_allowed:\t%*pb\n",
+ cpumask_pr_args(&task->cpus_mask));
+ seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
+ cpumask_pr_args(&task->cpus_mask));
+}
+
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
+{
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
+ seq_putc(m, '\n');
+}
+
+static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
+{
+ bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
+
+ if (thp_enabled)
+ thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags);
+ seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
+}
+
+int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm = get_task_mm(task);
+
+ seq_puts(m, "Name:\t");
+ proc_task_name(m, task, true);
+ seq_putc(m, '\n');
+
+ task_state(m, ns, pid, task);
+
+ if (mm) {
+ task_mem(m, mm);
+ task_core_dumping(m, task);
+ task_thp_status(m, mm);
+ mmput(mm);
+ }
+ task_sig(m, task);
+ task_cap(m, task);
+ task_seccomp(m, task);
+ task_cpus_allowed(m, task);
+ cpuset_task_status_allowed(m, task);
+ task_context_switch_counts(m, task);
+ return 0;
+}
+
+static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task, int whole)
+{
+ unsigned long vsize, eip, esp, wchan = 0;
+ int priority, nice;
+ int tty_pgrp = -1, tty_nr = 0;
+ sigset_t sigign, sigcatch;
+ char state;
+ pid_t ppid = 0, pgid = -1, sid = -1;
+ int num_threads = 0;
+ int permitted;
+ struct mm_struct *mm;
+ unsigned long long start_time;
+ unsigned long cmin_flt = 0, cmaj_flt = 0;
+ unsigned long min_flt = 0, maj_flt = 0;
+ u64 cutime, cstime, utime, stime;
+ u64 cgtime, gtime;
+ unsigned long rsslim = 0;
+ unsigned long flags;
+ int exit_code = task->exit_code;
+
+ state = *get_task_state(task);
+ vsize = eip = esp = 0;
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT);
+ mm = get_task_mm(task);
+ if (mm) {
+ vsize = task_vsize(mm);
+ /*
+ * esp and eip are intentionally zeroed out. There is no
+ * non-racy way to read them without freezing the task.
+ * Programs that need reliable values can use ptrace(2).
+ *
+ * The only exception is if the task is core dumping because
+ * a program is not able to use ptrace(2) in that case. It is
+ * safe because the task has stopped executing permanently.
+ */
+ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
+ if (try_get_task_stack(task)) {
+ eip = KSTK_EIP(task);
+ esp = KSTK_ESP(task);
+ put_task_stack(task);
+ }
+ }
+ }
+
+ sigemptyset(&sigign);
+ sigemptyset(&sigcatch);
+ cutime = cstime = utime = stime = 0;
+ cgtime = gtime = 0;
+
+ if (lock_task_sighand(task, &flags)) {
+ struct signal_struct *sig = task->signal;
+
+ if (sig->tty) {
+ struct pid *pgrp = tty_get_pgrp(sig->tty);
+ tty_pgrp = pid_nr_ns(pgrp, ns);
+ put_pid(pgrp);
+ tty_nr = new_encode_dev(tty_devnum(sig->tty));
+ }
+
+ num_threads = get_nr_threads(task);
+ collect_sigign_sigcatch(task, &sigign, &sigcatch);
+
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
+ cgtime = sig->cgtime;
+ rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
+
+ /* add up live thread stats at the group level */
+ if (whole) {
+ struct task_struct *t = task;
+ do {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ gtime += task_gtime(t);
+ } while_each_thread(task, t);
+
+ min_flt += sig->min_flt;
+ maj_flt += sig->maj_flt;
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ gtime += sig->gtime;
+
+ if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
+ exit_code = sig->group_exit_code;
+ }
+
+ sid = task_session_nr_ns(task, ns);
+ ppid = task_tgid_nr_ns(task->real_parent, ns);
+ pgid = task_pgrp_nr_ns(task, ns);
+
+ unlock_task_sighand(task, &flags);
+ }
+
+ if (permitted && (!whole || num_threads < 2))
+ wchan = !task_is_running(task);
+ if (!whole) {
+ min_flt = task->min_flt;
+ maj_flt = task->maj_flt;
+ task_cputime_adjusted(task, &utime, &stime);
+ gtime = task_gtime(task);
+ }
+
+ /* scale priority and nice values from timeslices to -20..20 */
+ /* to make it look like a "normal" Unix priority/nice value */
+ priority = task_prio(task);
+ nice = task_nice(task);
+
+ /* apply timens offset for boottime and convert nsec -> ticks */
+ start_time =
+ nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));
+
+ seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
+ seq_puts(m, " (");
+ proc_task_name(m, task, false);
+ seq_puts(m, ") ");
+ seq_putc(m, state);
+ seq_put_decimal_ll(m, " ", ppid);
+ seq_put_decimal_ll(m, " ", pgid);
+ seq_put_decimal_ll(m, " ", sid);
+ seq_put_decimal_ll(m, " ", tty_nr);
+ seq_put_decimal_ll(m, " ", tty_pgrp);
+ seq_put_decimal_ull(m, " ", task->flags);
+ seq_put_decimal_ull(m, " ", min_flt);
+ seq_put_decimal_ull(m, " ", cmin_flt);
+ seq_put_decimal_ull(m, " ", maj_flt);
+ seq_put_decimal_ull(m, " ", cmaj_flt);
+ seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime));
+ seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime));
+ seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime));
+ seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime));
+ seq_put_decimal_ll(m, " ", priority);
+ seq_put_decimal_ll(m, " ", nice);
+ seq_put_decimal_ll(m, " ", num_threads);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", start_time);
+ seq_put_decimal_ull(m, " ", vsize);
+ seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0);
+ seq_put_decimal_ull(m, " ", rsslim);
+ seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0);
+ seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0);
+ seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0);
+ seq_put_decimal_ull(m, " ", esp);
+ seq_put_decimal_ull(m, " ", eip);
+ /* The signal information here is obsolete.
+ * It must be decimal for Linux 2.0 compatibility.
+ * Use /proc/#/status for real-time signals.
+ */
+ seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL);
+
+ /*
+ * We used to output the absolute kernel address, but that's an
+ * information leak - so instead we show a 0/1 flag here, to signal
+ * to user-space whether there's a wchan field in /proc/PID/wchan.
+ *
+ * This works with older implementations of procps as well.
+ */
+ seq_put_decimal_ull(m, " ", wchan);
+
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ll(m, " ", task->exit_signal);
+ seq_put_decimal_ll(m, " ", task_cpu(task));
+ seq_put_decimal_ull(m, " ", task->rt_priority);
+ seq_put_decimal_ull(m, " ", task->policy);
+ seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
+ seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime));
+ seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime));
+
+ if (mm && permitted) {
+ seq_put_decimal_ull(m, " ", mm->start_data);
+ seq_put_decimal_ull(m, " ", mm->end_data);
+ seq_put_decimal_ull(m, " ", mm->start_brk);
+ seq_put_decimal_ull(m, " ", mm->arg_start);
+ seq_put_decimal_ull(m, " ", mm->arg_end);
+ seq_put_decimal_ull(m, " ", mm->env_start);
+ seq_put_decimal_ull(m, " ", mm->env_end);
+ } else
+ seq_puts(m, " 0 0 0 0 0 0 0");
+
+ if (permitted)
+ seq_put_decimal_ll(m, " ", exit_code);
+ else
+ seq_puts(m, " 0");
+
+ seq_putc(m, '\n');
+ if (mm)
+ mmput(mm);
+ return 0;
+}
+
+int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ return do_task_stat(m, ns, pid, task, 0);
+}
+
+int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ return do_task_stat(m, ns, pid, task, 1);
+}
+
+int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm = get_task_mm(task);
+
+ if (mm) {
+ unsigned long size;
+ unsigned long resident = 0;
+ unsigned long shared = 0;
+ unsigned long text = 0;
+ unsigned long data = 0;
+
+ size = task_statm(mm, &shared, &text, &data, &resident);
+ mmput(mm);
+
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, "", size);
+ seq_put_decimal_ull(m, " ", resident);
+ seq_put_decimal_ull(m, " ", shared);
+ seq_put_decimal_ull(m, " ", text);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_put_decimal_ull(m, " ", data);
+ seq_put_decimal_ull(m, " ", 0);
+ seq_putc(m, '\n');
+ } else {
+ seq_write(m, "0 0 0 0 0 0 0\n", 14);
+ }
+ return 0;
+}
+
+#ifdef CONFIG_PROC_CHILDREN
+static struct pid *
+get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
+{
+ struct task_struct *start, *task;
+ struct pid *pid = NULL;
+
+ read_lock(&tasklist_lock);
+
+ start = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (!start)
+ goto out;
+
+ /*
+ * Lets try to continue searching first, this gives
+ * us significant speedup on children-rich processes.
+ */
+ if (pid_prev) {
+ task = pid_task(pid_prev, PIDTYPE_PID);
+ if (task && task->real_parent == start &&
+ !(list_empty(&task->sibling))) {
+ if (list_is_last(&task->sibling, &start->children))
+ goto out;
+ task = list_first_entry(&task->sibling,
+ struct task_struct, sibling);
+ pid = get_pid(task_pid(task));
+ goto out;
+ }
+ }
+
+ /*
+ * Slow search case.
+ *
+ * We might miss some children here if children
+ * are exited while we were not holding the lock,
+ * but it was never promised to be accurate that
+ * much.
+ *
+ * "Just suppose that the parent sleeps, but N children
+ * exit after we printed their tids. Now the slow paths
+ * skips N extra children, we miss N tasks." (c)
+ *
+ * So one need to stop or freeze the leader and all
+ * its children to get a precise result.
+ */
+ list_for_each_entry(task, &start->children, sibling) {
+ if (pos-- == 0) {
+ pid = get_pid(task_pid(task));
+ break;
+ }
+ }
+
+out:
+ read_unlock(&tasklist_lock);
+ return pid;
+}
+
+static int children_seq_show(struct seq_file *seq, void *v)
+{
+ struct inode *inode = file_inode(seq->file);
+
+ seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb)));
+ return 0;
+}
+
+static void *children_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return get_children_pid(file_inode(seq->file), NULL, *pos);
+}
+
+static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct pid *pid;
+
+ pid = get_children_pid(file_inode(seq->file), v, *pos + 1);
+ put_pid(v);
+
+ ++*pos;
+ return pid;
+}
+
+static void children_seq_stop(struct seq_file *seq, void *v)
+{
+ put_pid(v);
+}
+
+static const struct seq_operations children_seq_ops = {
+ .start = children_seq_start,
+ .next = children_seq_next,
+ .stop = children_seq_stop,
+ .show = children_seq_show,
+};
+
+static int children_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &children_seq_ops);
+}
+
+const struct file_operations proc_tid_children_operations = {
+ .open = children_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROC_CHILDREN */
diff --git a/fs/proc/base.c b/fs/proc/base.c
new file mode 100644
index 000000000..74442e017
--- /dev/null
+++ b/fs/proc/base.c
@@ -0,0 +1,3928 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/proc/base.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * proc base directory handling functions
+ *
+ * 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
+ * Instead of using magical inumbers to determine the kind of object
+ * we allocate and fill in-core inodes upon lookup. They don't even
+ * go into icache. We cache the reference to task_struct upon lookup too.
+ * Eventually it should become a filesystem in its own. We don't use the
+ * rest of procfs anymore.
+ *
+ *
+ * Changelog:
+ * 17-Jan-2005
+ * Allan Bezerra
+ * Bruna Moreira <bruna.moreira@indt.org.br>
+ * Edjard Mota <edjard.mota@indt.org.br>
+ * Ilias Biris <ilias.biris@indt.org.br>
+ * Mauricio Lin <mauricio.lin@indt.org.br>
+ *
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ * A new process specific entry (smaps) included in /proc. It shows the
+ * size of rss for each memory area. The maps entry lacks information
+ * about physical memory size (rss) for each mapped file, i.e.,
+ * rss information for executables and library files.
+ * This additional information is useful for any tools that need to know
+ * about physical memory consumption for a process specific library.
+ *
+ * Changelog:
+ * 21-Feb-2005
+ * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ * Pud inclusion in the page table walking.
+ *
+ * ChangeLog:
+ * 10-Mar-2005
+ * 10LE Instituto Nokia de Tecnologia - INdT:
+ * A better way to walks through the page table as suggested by Hugh Dickins.
+ *
+ * Simo Piiroinen <simo.piiroinen@nokia.com>:
+ * Smaps information related to shared, private, clean and dirty pages.
+ *
+ * Paul Mundt <paul.mundt@nokia.com>:
+ * Overall revision about smaps.
+ */
+
+#include <linux/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/generic-radix-tree.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
+#include <linux/resource.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/ptrace.h>
+#include <linux/printk.h>
+#include <linux/cache.h>
+#include <linux/cgroup.h>
+#include <linux/cpuset.h>
+#include <linux/audit.h>
+#include <linux/poll.h>
+#include <linux/nsproxy.h>
+#include <linux/oom.h>
+#include <linux/elf.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/fs_struct.h>
+#include <linux/slab.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/stat.h>
+#include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
+#include <linux/resctrl.h>
+#include <linux/cn_proc.h>
+#include <trace/events/oom.h>
+#include "internal.h"
+#include "fd.h"
+
+#include "../../lib/kstrtox.h"
+
+/* NOTE:
+ * Implementing inode permission operations in /proc is almost
+ * certainly an error. Permission checks need to happen during
+ * each system call not at open time. The reason is that most of
+ * what we wish to check for permissions in /proc varies at runtime.
+ *
+ * The classic example of a problem is opening file descriptors
+ * in /proc for a task before it execs a suid executable.
+ */
+
+static u8 nlink_tid __ro_after_init;
+static u8 nlink_tgid __ro_after_init;
+
+struct pid_entry {
+ const char *name;
+ unsigned int len;
+ umode_t mode;
+ const struct inode_operations *iop;
+ const struct file_operations *fop;
+ union proc_op op;
+};
+
+#define NOD(NAME, MODE, IOP, FOP, OP) { \
+ .name = (NAME), \
+ .len = sizeof(NAME) - 1, \
+ .mode = MODE, \
+ .iop = IOP, \
+ .fop = FOP, \
+ .op = OP, \
+}
+
+#define DIR(NAME, MODE, iops, fops) \
+ NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
+#define LNK(NAME, get_link) \
+ NOD(NAME, (S_IFLNK|S_IRWXUGO), \
+ &proc_pid_link_inode_operations, NULL, \
+ { .proc_get_link = get_link } )
+#define REG(NAME, MODE, fops) \
+ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
+#define ONE(NAME, MODE, show) \
+ NOD(NAME, (S_IFREG|(MODE)), \
+ NULL, &proc_single_file_operations, \
+ { .proc_show = show } )
+#define ATTR(LSM, NAME, MODE) \
+ NOD(NAME, (S_IFREG|(MODE)), \
+ NULL, &proc_pid_attr_operations, \
+ { .lsm = LSM })
+
+/*
+ * Count the number of hardlinks for the pid_entry table, excluding the .
+ * and .. links.
+ */
+static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
+ unsigned int n)
+{
+ unsigned int i;
+ unsigned int count;
+
+ count = 2;
+ for (i = 0; i < n; ++i) {
+ if (S_ISDIR(entries[i].mode))
+ ++count;
+ }
+
+ return count;
+}
+
+static int get_task_root(struct task_struct *task, struct path *root)
+{
+ int result = -ENOENT;
+
+ task_lock(task);
+ if (task->fs) {
+ get_fs_root(task->fs, root);
+ result = 0;
+ }
+ task_unlock(task);
+ return result;
+}
+
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
+{
+ struct task_struct *task = get_proc_task(d_inode(dentry));
+ int result = -ENOENT;
+
+ if (task) {
+ task_lock(task);
+ if (task->fs) {
+ get_fs_pwd(task->fs, path);
+ result = 0;
+ }
+ task_unlock(task);
+ put_task_struct(task);
+ }
+ return result;
+}
+
+static int proc_root_link(struct dentry *dentry, struct path *path)
+{
+ struct task_struct *task = get_proc_task(d_inode(dentry));
+ int result = -ENOENT;
+
+ if (task) {
+ result = get_task_root(task, path);
+ put_task_struct(task);
+ }
+ return result;
+}
+
+/*
+ * If the user used setproctitle(), we just get the string from
+ * user space at arg_start, and limit it to a maximum of one page.
+ */
+static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
+ size_t count, unsigned long pos,
+ unsigned long arg_start)
+{
+ char *page;
+ int ret, got;
+
+ if (pos >= PAGE_SIZE)
+ return 0;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ ret = 0;
+ got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
+ if (got > 0) {
+ int len = strnlen(page, got);
+
+ /* Include the NUL character if it was found */
+ if (len < got)
+ len++;
+
+ if (len > pos) {
+ len -= pos;
+ if (len > count)
+ len = count;
+ len -= copy_to_user(buf, page+pos, len);
+ if (!len)
+ len = -EFAULT;
+ ret = len;
+ }
+ }
+ free_page((unsigned long)page);
+ return ret;
+}
+
+static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long arg_start, arg_end, env_start, env_end;
+ unsigned long pos, len;
+ char *page, c;
+
+ /* Check if process spawned far enough to have cmdline. */
+ if (!mm->env_end)
+ return 0;
+
+ spin_lock(&mm->arg_lock);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ spin_unlock(&mm->arg_lock);
+
+ if (arg_start >= arg_end)
+ return 0;
+
+ /*
+ * We allow setproctitle() to overwrite the argument
+ * strings, and overflow past the original end. But
+ * only when it overflows into the environment area.
+ */
+ if (env_start != arg_end || env_end < env_start)
+ env_start = env_end = arg_end;
+ len = env_end - arg_start;
+
+ /* We're not going to care if "*ppos" has high bits set */
+ pos = *ppos;
+ if (pos >= len)
+ return 0;
+ if (count > len - pos)
+ count = len - pos;
+ if (!count)
+ return 0;
+
+ /*
+ * Magical special case: if the argv[] end byte is not
+ * zero, the user has overwritten it with setproctitle(3).
+ *
+ * Possible future enhancement: do this only once when
+ * pos is 0, and set a flag in the 'struct file'.
+ */
+ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
+ return get_mm_proctitle(mm, buf, count, pos, arg_start);
+
+ /*
+ * For the non-setproctitle() case we limit things strictly
+ * to the [arg_start, arg_end[ range.
+ */
+ pos += arg_start;
+ if (pos < arg_start || pos >= arg_end)
+ return 0;
+ if (count > arg_end - pos)
+ count = arg_end - pos;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ len = 0;
+ while (count) {
+ int got;
+ size_t size = min_t(size_t, PAGE_SIZE, count);
+
+ got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
+ if (got <= 0)
+ break;
+ got -= copy_to_user(buf, page, got);
+ if (unlikely(!got)) {
+ if (!len)
+ len = -EFAULT;
+ break;
+ }
+ pos += got;
+ buf += got;
+ len += got;
+ count -= got;
+ }
+
+ free_page((unsigned long)page);
+ return len;
+}
+
+static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mm_struct *mm;
+ ssize_t ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = get_mm_cmdline(mm, buf, count, pos);
+ mmput(mm);
+ return ret;
+}
+
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct task_struct *tsk;
+ ssize_t ret;
+
+ BUG_ON(*pos < 0);
+
+ tsk = get_proc_task(file_inode(file));
+ if (!tsk)
+ return -ESRCH;
+ ret = get_task_cmdline(tsk, buf, count, pos);
+ put_task_struct(tsk);
+ if (ret > 0)
+ *pos += ret;
+ return ret;
+}
+
+static const struct file_operations proc_pid_cmdline_ops = {
+ .read = proc_pid_cmdline_read,
+ .llseek = generic_file_llseek,
+};
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * Provides a wchan file via kallsyms in a proper one-value-per-file format.
+ * Returns the resolved symbol. If that fails, simply return the address.
+ */
+static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ unsigned long wchan;
+ char symname[KSYM_NAME_LEN];
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto print0;
+
+ wchan = get_wchan(task);
+ if (wchan && !lookup_symbol_name(wchan, symname)) {
+ seq_puts(m, symname);
+ return 0;
+ }
+
+print0:
+ seq_putc(m, '0');
+ return 0;
+}
+#endif /* CONFIG_KALLSYMS */
+
+static int lock_trace(struct task_struct *task)
+{
+ int err = down_read_killable(&task->signal->exec_update_lock);
+ if (err)
+ return err;
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
+ up_read(&task->signal->exec_update_lock);
+ return -EPERM;
+ }
+ return 0;
+}
+
+static void unlock_trace(struct task_struct *task)
+{
+ up_read(&task->signal->exec_update_lock);
+}
+
+#ifdef CONFIG_STACKTRACE
+
+#define MAX_STACK_TRACE_DEPTH 64
+
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ unsigned long *entries;
+ int err;
+
+ /*
+ * The ability to racily run the kernel stack unwinder on a running task
+ * and then observe the unwinder output is scary; while it is useful for
+ * debugging kernel issues, it can also allow an attacker to leak kernel
+ * stack contents.
+ * Doing this in a manner that is at least safe from races would require
+ * some work to ensure that the remote task can not be scheduled; and
+ * even then, this would still expose the unwinder as local attack
+ * surface.
+ * Therefore, this interface is restricted to root.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
+ entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
+ GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ err = lock_trace(task);
+ if (!err) {
+ unsigned int i, nr_entries;
+
+ nr_entries = stack_trace_save_tsk(task, entries,
+ MAX_STACK_TRACE_DEPTH, 0);
+
+ for (i = 0; i < nr_entries; i++) {
+ seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
+ }
+
+ unlock_trace(task);
+ }
+ kfree(entries);
+
+ return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_INFO
+/*
+ * Provides /proc/PID/schedstat
+ */
+static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ if (unlikely(!sched_info_on()))
+ seq_puts(m, "0 0 0\n");
+ else
+ seq_printf(m, "%llu %llu %lu\n",
+ (unsigned long long)task->se.sum_exec_runtime,
+ (unsigned long long)task->sched_info.run_delay,
+ task->sched_info.pcount);
+
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+ int i;
+ struct inode *inode = m->private;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+ seq_puts(m, "Latency Top version : v0.1\n");
+ for (i = 0; i < LT_SAVECOUNT; i++) {
+ struct latency_record *lr = &task->latency_record[i];
+ if (lr->backtrace[0]) {
+ int q;
+ seq_printf(m, "%i %li %li",
+ lr->count, lr->time, lr->max);
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ unsigned long bt = lr->backtrace[q];
+
+ if (!bt)
+ break;
+ seq_printf(m, " %ps", (void *)bt);
+ }
+ seq_putc(m, '\n');
+ }
+
+ }
+ put_task_struct(task);
+ return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lstats_show_proc, inode);
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+
+ if (!task)
+ return -ESRCH;
+ clear_tsk_latency_tracing(task);
+ put_task_struct(task);
+
+ return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+ .open = lstats_open,
+ .read = seq_read,
+ .write = lstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif
+
+static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ unsigned long totalpages = totalram_pages() + total_swap_pages;
+ unsigned long points = 0;
+ long badness;
+
+ badness = oom_badness(task, totalpages);
+ /*
+ * Special case OOM_SCORE_ADJ_MIN for all others scale the
+ * badness value into [0, 2000] range which we have been
+ * exporting for a long time so userspace might depend on it.
+ */
+ if (badness != LONG_MIN)
+ points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
+
+ seq_printf(m, "%lu\n", points);
+
+ return 0;
+}
+
+struct limit_names {
+ const char *name;
+ const char *unit;
+};
+
+static const struct limit_names lnames[RLIM_NLIMITS] = {
+ [RLIMIT_CPU] = {"Max cpu time", "seconds"},
+ [RLIMIT_FSIZE] = {"Max file size", "bytes"},
+ [RLIMIT_DATA] = {"Max data size", "bytes"},
+ [RLIMIT_STACK] = {"Max stack size", "bytes"},
+ [RLIMIT_CORE] = {"Max core file size", "bytes"},
+ [RLIMIT_RSS] = {"Max resident set", "bytes"},
+ [RLIMIT_NPROC] = {"Max processes", "processes"},
+ [RLIMIT_NOFILE] = {"Max open files", "files"},
+ [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
+ [RLIMIT_AS] = {"Max address space", "bytes"},
+ [RLIMIT_LOCKS] = {"Max file locks", "locks"},
+ [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
+ [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
+ [RLIMIT_NICE] = {"Max nice priority", NULL},
+ [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
+ [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
+};
+
+/* Display limits for a process */
+static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ unsigned int i;
+ unsigned long flags;
+
+ struct rlimit rlim[RLIM_NLIMITS];
+
+ if (!lock_task_sighand(task, &flags))
+ return 0;
+ memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
+ unlock_task_sighand(task, &flags);
+
+ /*
+ * print the file header
+ */
+ seq_puts(m, "Limit "
+ "Soft Limit "
+ "Hard Limit "
+ "Units \n");
+
+ for (i = 0; i < RLIM_NLIMITS; i++) {
+ if (rlim[i].rlim_cur == RLIM_INFINITY)
+ seq_printf(m, "%-25s %-20s ",
+ lnames[i].name, "unlimited");
+ else
+ seq_printf(m, "%-25s %-20lu ",
+ lnames[i].name, rlim[i].rlim_cur);
+
+ if (rlim[i].rlim_max == RLIM_INFINITY)
+ seq_printf(m, "%-20s ", "unlimited");
+ else
+ seq_printf(m, "%-20lu ", rlim[i].rlim_max);
+
+ if (lnames[i].unit)
+ seq_printf(m, "%-10s\n", lnames[i].unit);
+ else
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct syscall_info info;
+ u64 *args = &info.data.args[0];
+ int res;
+
+ res = lock_trace(task);
+ if (res)
+ return res;
+
+ if (task_current_syscall(task, &info))
+ seq_puts(m, "running\n");
+ else if (info.data.nr < 0)
+ seq_printf(m, "%d 0x%llx 0x%llx\n",
+ info.data.nr, info.sp, info.data.instruction_pointer);
+ else
+ seq_printf(m,
+ "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
+ info.data.nr,
+ args[0], args[1], args[2], args[3], args[4], args[5],
+ info.sp, info.data.instruction_pointer);
+ unlock_trace(task);
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+
+/************************************************************************/
+/* Here the fs part begins */
+/************************************************************************/
+
+/* permission checks */
+static bool proc_fd_access_allowed(struct inode *inode)
+{
+ struct task_struct *task;
+ bool allowed = false;
+ /* Allow access to a task's file descriptors if it is us or we
+ * may use ptrace attach to the process and find out that
+ * information.
+ */
+ task = get_proc_task(inode);
+ if (task) {
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ put_task_struct(task);
+ }
+ return allowed;
+}
+
+int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
+{
+ int error;
+ struct inode *inode = d_inode(dentry);
+
+ if (attr->ia_valid & ATTR_MODE)
+ return -EPERM;
+
+ error = setattr_prepare(&init_user_ns, dentry, attr);
+ if (error)
+ return error;
+
+ setattr_copy(&init_user_ns, inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
+}
+
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct proc_fs_info *fs_info,
+ struct task_struct *task,
+ enum proc_hidepid hide_pid_min)
+{
+ /*
+ * If 'hidpid' mount option is set force a ptrace check,
+ * we indicate that we are using a filesystem syscall
+ * by passing PTRACE_MODE_READ_FSCREDS
+ */
+ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
+ return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+
+ if (fs_info->hide_pid < hide_pid_min)
+ return true;
+ if (in_group_p(fs_info->pid_gid))
+ return true;
+ return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+}
+
+
+static int proc_pid_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
+ struct task_struct *task;
+ bool has_perms;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
+ put_task_struct(task);
+
+ if (!has_perms) {
+ if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
+ /*
+ * Let's make getdents(), stat(), and open()
+ * consistent with each other. If a process
+ * may not stat() a file, it shouldn't be seen
+ * in procfs at all.
+ */
+ return -ENOENT;
+ }
+
+ return -EPERM;
+ }
+ return generic_permission(&init_user_ns, inode, mask);
+}
+
+
+
+static const struct inode_operations proc_def_inode_operations = {
+ .setattr = proc_setattr,
+};
+
+static int proc_single_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
+ struct pid *pid = proc_pid(inode);
+ struct task_struct *task;
+ int ret;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
+
+ ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
+
+ put_task_struct(task);
+ return ret;
+}
+
+static int proc_single_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, proc_single_show, inode);
+}
+
+static const struct file_operations proc_single_file_operations = {
+ .open = proc_single_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
+{
+ struct task_struct *task = get_proc_task(inode);
+ struct mm_struct *mm = ERR_PTR(-ESRCH);
+
+ if (task) {
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
+ put_task_struct(task);
+
+ if (!IS_ERR_OR_NULL(mm)) {
+ /* ensure this mm_struct can't be freed */
+ mmgrab(mm);
+ /* but do not pin its memory */
+ mmput(mm);
+ }
+ }
+
+ return mm;
+}
+
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+{
+ struct mm_struct *mm = proc_mem_open(inode, mode);
+
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
+ file->private_data = mm;
+ return 0;
+}
+
+static int mem_open(struct inode *inode, struct file *file)
+{
+ int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
+
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
+
+ return ret;
+}
+
+static ssize_t mem_rw(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, int write)
+{
+ struct mm_struct *mm = file->private_data;
+ unsigned long addr = *ppos;
+ ssize_t copied;
+ char *page;
+ unsigned int flags;
+
+ if (!mm)
+ return 0;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ copied = 0;
+ if (!mmget_not_zero(mm))
+ goto free;
+
+ flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+
+ while (count > 0) {
+ size_t this_len = min_t(size_t, count, PAGE_SIZE);
+
+ if (write && copy_from_user(page, buf, this_len)) {
+ copied = -EFAULT;
+ break;
+ }
+
+ this_len = access_remote_vm(mm, addr, page, this_len, flags);
+ if (!this_len) {
+ if (!copied)
+ copied = -EIO;
+ break;
+ }
+
+ if (!write && copy_to_user(buf, page, this_len)) {
+ copied = -EFAULT;
+ break;
+ }
+
+ buf += this_len;
+ addr += this_len;
+ copied += this_len;
+ count -= this_len;
+ }
+ *ppos = addr;
+
+ mmput(mm);
+free:
+ free_page((unsigned long) page);
+ return copied;
+}
+
+static ssize_t mem_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
+loff_t mem_lseek(struct file *file, loff_t offset, int orig)
+{
+ switch (orig) {
+ case 0:
+ file->f_pos = offset;
+ break;
+ case 1:
+ file->f_pos += offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+ force_successful_syscall_return();
+ return file->f_pos;
+}
+
+static int mem_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
+static const struct file_operations proc_mem_operations = {
+ .llseek = mem_lseek,
+ .read = mem_read,
+ .write = mem_write,
+ .open = mem_open,
+ .release = mem_release,
+};
+
+static int environ_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_READ);
+}
+
+static ssize_t environ_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *page;
+ unsigned long src = *ppos;
+ int ret = 0;
+ struct mm_struct *mm = file->private_data;
+ unsigned long env_start, env_end;
+
+ /* Ensure the process spawned far enough to have an environment. */
+ if (!mm || !mm->env_end)
+ return 0;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ ret = 0;
+ if (!mmget_not_zero(mm))
+ goto free;
+
+ spin_lock(&mm->arg_lock);
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ spin_unlock(&mm->arg_lock);
+
+ while (count > 0) {
+ size_t this_len, max_len;
+ int retval;
+
+ if (src >= (env_end - env_start))
+ break;
+
+ this_len = env_end - (env_start + src);
+
+ max_len = min_t(size_t, PAGE_SIZE, count);
+ this_len = min(max_len, this_len);
+
+ retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
+
+ if (retval <= 0) {
+ ret = retval;
+ break;
+ }
+
+ if (copy_to_user(buf, page, retval)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret += retval;
+ src += retval;
+ buf += retval;
+ count -= retval;
+ }
+ *ppos = src;
+ mmput(mm);
+
+free:
+ free_page((unsigned long) page);
+ return ret;
+}
+
+static const struct file_operations proc_environ_operations = {
+ .open = environ_open,
+ .read = environ_read,
+ .llseek = generic_file_llseek,
+ .release = mem_release,
+};
+
+static int auxv_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+}
+
+static ssize_t auxv_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct mm_struct *mm = file->private_data;
+ unsigned int nwords = 0;
+
+ if (!mm)
+ return 0;
+ do {
+ nwords += 2;
+ } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
+ nwords * sizeof(mm->saved_auxv[0]));
+}
+
+static const struct file_operations proc_auxv_operations = {
+ .open = auxv_open,
+ .read = auxv_read,
+ .llseek = generic_file_llseek,
+ .release = mem_release,
+};
+
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ char buffer[PROC_NUMBUF];
+ int oom_adj = OOM_ADJUST_MIN;
+ size_t len;
+
+ if (!task)
+ return -ESRCH;
+ if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+ oom_adj = OOM_ADJUST_MAX;
+ else
+ oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+ OOM_SCORE_ADJ_MAX;
+ put_task_struct(task);
+ if (oom_adj > OOM_ADJUST_MAX)
+ oom_adj = OOM_ADJUST_MAX;
+ len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+{
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ int err = 0;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+
+ mutex_lock(&oom_adj_mutex);
+ if (legacy) {
+ if (oom_adj < task->signal->oom_score_adj &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_unlock;
+ }
+ /*
+ * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ * /proc/pid/oom_score_adj instead.
+ */
+ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ current->comm, task_pid_nr(current), task_pid_nr(task),
+ task_pid_nr(task));
+ } else {
+ if ((short)oom_adj < task->signal->oom_score_adj_min &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_unlock;
+ }
+ }
+
+ /*
+ * Make sure we will check other processes sharing the mm if this is
+ * not vfrok which wants its own oom_score_adj.
+ * pin the mm so it doesn't go away and get reused after task_unlock
+ */
+ if (!task->vfork_done) {
+ struct task_struct *p = find_lock_task_mm(task);
+
+ if (p) {
+ if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
+ mm = p->mm;
+ mmgrab(mm);
+ }
+ task_unlock(p);
+ }
+ }
+
+ task->signal->oom_score_adj = oom_adj;
+ if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ task->signal->oom_score_adj_min = (short)oom_adj;
+ trace_oom_score_adj_update(task);
+
+ if (mm) {
+ struct task_struct *p;
+
+ rcu_read_lock();
+ for_each_process(p) {
+ if (same_thread_group(task, p))
+ continue;
+
+ /* do not touch kernel threads or the global init */
+ if (p->flags & PF_KTHREAD || is_global_init(p))
+ continue;
+
+ task_lock(p);
+ if (!p->vfork_done && process_shares_mm(p, mm)) {
+ p->signal->oom_score_adj = oom_adj;
+ if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ p->signal->oom_score_adj_min = (short)oom_adj;
+ }
+ task_unlock(p);
+ }
+ rcu_read_unlock();
+ mmdrop(mm);
+ }
+err_unlock:
+ mutex_unlock(&oom_adj_mutex);
+ put_task_struct(task);
+ return err;
+}
+
+/*
+ * /proc/pid/oom_adj exists solely for backwards compatibility with previous
+ * kernels. The effective policy is defined by oom_score_adj, which has a
+ * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
+ * Values written to oom_adj are simply mapped linearly to oom_score_adj.
+ * Processes that become oom disabled via oom_adj will still be oom disabled
+ * with this implementation.
+ *
+ * oom_adj cannot be removed since existing userspace binaries use it.
+ */
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char buffer[PROC_NUMBUF];
+ int oom_adj;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+ if (err)
+ goto out;
+ if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+ oom_adj != OOM_DISABLE) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+ * value is always attainable.
+ */
+ if (oom_adj == OOM_ADJUST_MAX)
+ oom_adj = OOM_SCORE_ADJ_MAX;
+ else
+ oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+ err = __set_oom_adj(file, oom_adj, true);
+out:
+ return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+ .read = oom_adj_read,
+ .write = oom_adj_write,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ char buffer[PROC_NUMBUF];
+ short oom_score_adj = OOM_SCORE_ADJ_MIN;
+ size_t len;
+
+ if (!task)
+ return -ESRCH;
+ oom_score_adj = task->signal->oom_score_adj;
+ put_task_struct(task);
+ len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char buffer[PROC_NUMBUF];
+ int oom_score_adj;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
+ if (err)
+ goto out;
+ if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+ oom_score_adj > OOM_SCORE_ADJ_MAX) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __set_oom_adj(file, oom_score_adj, false);
+out:
+ return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_score_adj_operations = {
+ .read = oom_score_adj_read,
+ .write = oom_score_adj_write,
+ .llseek = default_llseek,
+};
+
+#ifdef CONFIG_AUDIT
+#define TMPBUFLEN 11
+static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode * inode = file_inode(file);
+ struct task_struct *task = get_proc_task(inode);
+ ssize_t length;
+ char tmpbuf[TMPBUFLEN];
+
+ if (!task)
+ return -ESRCH;
+ length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
+ from_kuid(file->f_cred->user_ns,
+ audit_get_loginuid(task)));
+ put_task_struct(task);
+ return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode * inode = file_inode(file);
+ uid_t loginuid;
+ kuid_t kloginuid;
+ int rv;
+
+ /* Don't let kthreads write their own loginuid */
+ if (current->flags & PF_KTHREAD)
+ return -EPERM;
+
+ rcu_read_lock();
+ if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+ rcu_read_unlock();
+
+ if (*ppos != 0) {
+ /* No partial writes. */
+ return -EINVAL;
+ }
+
+ rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+ if (rv < 0)
+ return rv;
+
+ /* is userspace tring to explicitly UNSET the loginuid? */
+ if (loginuid == AUDIT_UID_UNSET) {
+ kloginuid = INVALID_UID;
+ } else {
+ kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
+ if (!uid_valid(kloginuid))
+ return -EINVAL;
+ }
+
+ rv = audit_set_loginuid(kloginuid);
+ if (rv < 0)
+ return rv;
+ return count;
+}
+
+static const struct file_operations proc_loginuid_operations = {
+ .read = proc_loginuid_read,
+ .write = proc_loginuid_write,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode * inode = file_inode(file);
+ struct task_struct *task = get_proc_task(inode);
+ ssize_t length;
+ char tmpbuf[TMPBUFLEN];
+
+ if (!task)
+ return -ESRCH;
+ length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
+ audit_get_sessionid(task));
+ put_task_struct(task);
+ return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
+}
+
+static const struct file_operations proc_sessionid_operations = {
+ .read = proc_sessionid_read,
+ .llseek = generic_file_llseek,
+};
+#endif
+
+#ifdef CONFIG_FAULT_INJECTION
+static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ char buffer[PROC_NUMBUF];
+ size_t len;
+ int make_it_fail;
+
+ if (!task)
+ return -ESRCH;
+ make_it_fail = task->make_it_fail;
+ put_task_struct(task);
+
+ len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
+
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t proc_fault_inject_write(struct file * file,
+ const char __user * buf, size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF];
+ int make_it_fail;
+ int rv;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+ rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+ if (rv < 0)
+ return rv;
+ if (make_it_fail < 0 || make_it_fail > 1)
+ return -EINVAL;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+ task->make_it_fail = make_it_fail;
+ put_task_struct(task);
+
+ return count;
+}
+
+static const struct file_operations proc_fault_inject_operations = {
+ .read = proc_fault_inject_read,
+ .write = proc_fault_inject_write,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ int err;
+ unsigned int n;
+
+ err = kstrtouint_from_user(buf, count, 0, &n);
+ if (err)
+ return err;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+ task->fail_nth = n;
+ put_task_struct(task);
+
+ return count;
+}
+
+static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char numbuf[PROC_NUMBUF];
+ ssize_t len;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+ len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
+ put_task_struct(task);
+ return simple_read_from_buffer(buf, count, ppos, numbuf, len);
+}
+
+static const struct file_operations proc_fail_nth_operations = {
+ .read = proc_fail_nth_read,
+ .write = proc_fail_nth_write,
+};
+#endif
+
+
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * Print out various scheduling related per-task fields:
+ */
+static int sched_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
+ struct task_struct *p;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_show_task(p, ns, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t
+sched_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_set_task(p);
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int sched_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_show, inode);
+}
+
+static const struct file_operations proc_pid_sched_operations = {
+ .open = sched_open,
+ .read = seq_read,
+ .write = sched_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_autogroup_show_task(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+ char buffer[PROC_NUMBUF];
+ int nice;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+
+ err = kstrtoint(strstrip(buffer), 0, &nice);
+ if (err < 0)
+ return err;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ err = proc_sched_autogroup_set_nice(p, nice);
+ if (err)
+ count = err;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = single_open(filp, sched_autogroup_show, NULL);
+ if (!ret) {
+ struct seq_file *m = filp->private_data;
+
+ m->private = inode;
+ }
+ return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+ .open = sched_autogroup_open,
+ .read = seq_read,
+ .write = sched_autogroup_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
+#ifdef CONFIG_TIME_NS
+static int timens_offsets_show(struct seq_file *m, void *v)
+{
+ struct task_struct *p;
+
+ p = get_proc_task(file_inode(m->file));
+ if (!p)
+ return -ESRCH;
+ proc_timens_show_offsets(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode *inode = file_inode(file);
+ struct proc_timens_offset offsets[2];
+ char *kbuf = NULL, *pos, *next_line;
+ struct task_struct *p;
+ int ret, noffsets;
+
+ /* Only allow < page size writes at the beginning of the file */
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ return -EINVAL;
+
+ /* Slurp in the user data */
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ /* Parse the user data */
+ ret = -EINVAL;
+ noffsets = 0;
+ for (pos = kbuf; pos; pos = next_line) {
+ struct proc_timens_offset *off = &offsets[noffsets];
+ char clock[10];
+ int err;
+
+ /* Find the end of line and ensure we don't look past it */
+ next_line = strchr(pos, '\n');
+ if (next_line) {
+ *next_line = '\0';
+ next_line++;
+ if (*next_line == '\0')
+ next_line = NULL;
+ }
+
+ err = sscanf(pos, "%9s %lld %lu", clock,
+ &off->val.tv_sec, &off->val.tv_nsec);
+ if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
+ goto out;
+
+ clock[sizeof(clock) - 1] = 0;
+ if (strcmp(clock, "monotonic") == 0 ||
+ strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
+ off->clockid = CLOCK_MONOTONIC;
+ else if (strcmp(clock, "boottime") == 0 ||
+ strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
+ off->clockid = CLOCK_BOOTTIME;
+ else
+ goto out;
+
+ noffsets++;
+ if (noffsets == ARRAY_SIZE(offsets)) {
+ if (next_line)
+ count = next_line - kbuf;
+ break;
+ }
+ }
+
+ ret = -ESRCH;
+ p = get_proc_task(inode);
+ if (!p)
+ goto out;
+ ret = proc_timens_set_offset(file, p, offsets, noffsets);
+ put_task_struct(p);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static int timens_offsets_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timens_offsets_show, inode);
+}
+
+static const struct file_operations proc_timens_offsets_operations = {
+ .open = timens_offsets_open,
+ .read = seq_read,
+ .write = timens_offsets_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_TIME_NS */
+
+static ssize_t comm_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+ char buffer[TASK_COMM_LEN];
+ const size_t maxlen = sizeof(buffer) - 1;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
+ return -EFAULT;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (same_thread_group(current, p)) {
+ set_task_comm(p, buffer);
+ proc_comm_connector(p);
+ }
+ else
+ count = -EINVAL;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int comm_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ proc_task_name(m, p, false);
+ seq_putc(m, '\n');
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static int comm_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, comm_show, inode);
+}
+
+static const struct file_operations proc_pid_set_comm_operations = {
+ .open = comm_open,
+ .read = seq_read,
+ .write = comm_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
+{
+ struct task_struct *task;
+ struct file *exe_file;
+
+ task = get_proc_task(d_inode(dentry));
+ if (!task)
+ return -ENOENT;
+ exe_file = get_task_exe_file(task);
+ put_task_struct(task);
+ if (exe_file) {
+ *exe_path = exe_file->f_path;
+ path_get(&exe_file->f_path);
+ fput(exe_file);
+ return 0;
+ } else
+ return -ENOENT;
+}
+
+static const char *proc_pid_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct path path;
+ int error = -EACCES;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ /* Are we allowed to snoop on the tasks file descriptors? */
+ if (!proc_fd_access_allowed(inode))
+ goto out;
+
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+ if (error)
+ goto out;
+
+ error = nd_jump_link(&path);
+out:
+ return ERR_PTR(error);
+}
+
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
+{
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
+ char *pathname;
+ int len;
+
+ if (!tmp)
+ return -ENOMEM;
+
+ pathname = d_path(path, tmp, PATH_MAX);
+ len = PTR_ERR(pathname);
+ if (IS_ERR(pathname))
+ goto out;
+ len = tmp + PATH_MAX - 1 - pathname;
+
+ if (len > buflen)
+ len = buflen;
+ if (copy_to_user(buffer, pathname, len))
+ len = -EFAULT;
+ out:
+ kfree(tmp);
+ return len;
+}
+
+static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
+{
+ int error = -EACCES;
+ struct inode *inode = d_inode(dentry);
+ struct path path;
+
+ /* Are we allowed to snoop on the tasks file descriptors? */
+ if (!proc_fd_access_allowed(inode))
+ goto out;
+
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+ if (error)
+ goto out;
+
+ error = do_proc_readlink(&path, buffer, buflen);
+ path_put(&path);
+out:
+ return error;
+}
+
+const struct inode_operations proc_pid_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .get_link = proc_pid_get_link,
+ .setattr = proc_setattr,
+};
+
+
+/* building an inode */
+
+void task_dump_owner(struct task_struct *task, umode_t mode,
+ kuid_t *ruid, kgid_t *rgid)
+{
+ /* Depending on the state of dumpable compute who should own a
+ * proc file for a task.
+ */
+ const struct cred *cred;
+ kuid_t uid;
+ kgid_t gid;
+
+ if (unlikely(task->flags & PF_KTHREAD)) {
+ *ruid = GLOBAL_ROOT_UID;
+ *rgid = GLOBAL_ROOT_GID;
+ return;
+ }
+
+ /* Default to the tasks effective ownership */
+ rcu_read_lock();
+ cred = __task_cred(task);
+ uid = cred->euid;
+ gid = cred->egid;
+ rcu_read_unlock();
+
+ /*
+ * Before the /proc/pid/status file was created the only way to read
+ * the effective uid of a /process was to stat /proc/pid. Reading
+ * /proc/pid/status is slow enough that procps and other packages
+ * kept stating /proc/pid. To keep the rules in /proc simple I have
+ * made this apply to all per process world readable and executable
+ * directories.
+ */
+ if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
+ struct mm_struct *mm;
+ task_lock(task);
+ mm = task->mm;
+ /* Make non-dumpable tasks owned by some root */
+ if (mm) {
+ if (get_dumpable(mm) != SUID_DUMP_USER) {
+ struct user_namespace *user_ns = mm->user_ns;
+
+ uid = make_kuid(user_ns, 0);
+ if (!uid_valid(uid))
+ uid = GLOBAL_ROOT_UID;
+
+ gid = make_kgid(user_ns, 0);
+ if (!gid_valid(gid))
+ gid = GLOBAL_ROOT_GID;
+ }
+ } else {
+ uid = GLOBAL_ROOT_UID;
+ gid = GLOBAL_ROOT_GID;
+ }
+ task_unlock(task);
+ }
+ *ruid = uid;
+ *rgid = gid;
+}
+
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+ struct pid *pid = ei->pid;
+
+ if (S_ISDIR(ei->vfs_inode.i_mode)) {
+ spin_lock(&pid->lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(&pid->lock);
+ }
+
+ put_pid(pid);
+}
+
+struct inode *proc_pid_make_inode(struct super_block *sb,
+ struct task_struct *task, umode_t mode)
+{
+ struct inode * inode;
+ struct proc_inode *ei;
+ struct pid *pid;
+
+ /* We need a new inode */
+
+ inode = new_inode(sb);
+ if (!inode)
+ goto out;
+
+ /* Common stuff */
+ ei = PROC_I(inode);
+ inode->i_mode = mode;
+ inode->i_ino = get_next_ino();
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_op = &proc_def_inode_operations;
+
+ /*
+ * grab the reference to task.
+ */
+ pid = get_task_pid(task, PIDTYPE_PID);
+ if (!pid)
+ goto out_unlock;
+
+ /* Let the pid remember us for quick removal */
+ ei->pid = pid;
+
+ task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+ security_task_to_inode(task, inode);
+
+out:
+ return inode;
+
+out_unlock:
+ iput(inode);
+ return NULL;
+}
+
+/*
+ * Generating an inode and adding it into @pid->inodes, so that task will
+ * invalidate inode's dentry before being released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' can also be released by
+ * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
+ * thread exiting situation: Any one of threads should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+ struct task_struct *task, umode_t mode)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct pid *pid;
+
+ inode = proc_pid_make_inode(sb, task, mode);
+ if (!inode)
+ return NULL;
+
+ /* Let proc_flush_pid find this directory inode */
+ ei = PROC_I(inode);
+ pid = ei->pid;
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+
+ return inode;
+}
+
+int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
+ struct task_struct *task;
+
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ stat->uid = GLOBAL_ROOT_UID;
+ stat->gid = GLOBAL_ROOT_GID;
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (task) {
+ if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
+ rcu_read_unlock();
+ /*
+ * This doesn't prevent learning whether PID exists,
+ * it only makes getattr() consistent with readdir().
+ */
+ return -ENOENT;
+ }
+ task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/* dentry stuff */
+
+/*
+ * Set <pid>/... inode ownership (can change due to setuid(), etc.)
+ */
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+ task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+
+ inode->i_mode &= ~(S_ISUID | S_ISGID);
+ security_task_to_inode(task, inode);
+}
+
+/*
+ * Rewrite the inode's ownerships here because the owning task may have
+ * performed a setuid(), etc.
+ *
+ */
+static int pid_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct inode *inode;
+ struct task_struct *task;
+ int ret = 0;
+
+ rcu_read_lock();
+ inode = d_inode_rcu(dentry);
+ if (!inode)
+ goto out;
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+
+ if (task) {
+ pid_update_inode(task, inode);
+ ret = 1;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static inline bool proc_inode_is_dead(struct inode *inode)
+{
+ return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
+}
+
+int pid_delete_dentry(const struct dentry *dentry)
+{
+ /* Is the task we represent dead?
+ * If so, then don't put the dentry on the lru list,
+ * kill it immediately.
+ */
+ return proc_inode_is_dead(d_inode(dentry));
+}
+
+const struct dentry_operations pid_dentry_operations =
+{
+ .d_revalidate = pid_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+/* Lookups */
+
+/*
+ * Fill a directory entry.
+ *
+ * If possible create the dcache entry and derive our inode number and
+ * file type from dcache entry.
+ *
+ * Since all of the proc inode numbers are dynamically generated, the inode
+ * numbers do not exist until the inode is cache. This means creating
+ * the dcache entry in readdir is necessary to keep the inode numbers
+ * reported by readdir in sync with the inode numbers reported
+ * by stat.
+ */
+bool proc_fill_cache(struct file *file, struct dir_context *ctx,
+ const char *name, unsigned int len,
+ instantiate_t instantiate, struct task_struct *task, const void *ptr)
+{
+ struct dentry *child, *dir = file->f_path.dentry;
+ struct qstr qname = QSTR_INIT(name, len);
+ struct inode *inode;
+ unsigned type = DT_UNKNOWN;
+ ino_t ino = 1;
+
+ child = d_hash_and_lookup(dir, &qname);
+ if (!child) {
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ child = d_alloc_parallel(dir, &qname, &wq);
+ if (IS_ERR(child))
+ goto end_instantiate;
+ if (d_in_lookup(child)) {
+ struct dentry *res;
+ res = instantiate(child, task, ptr);
+ d_lookup_done(child);
+ if (unlikely(res)) {
+ dput(child);
+ child = res;
+ if (IS_ERR(child))
+ goto end_instantiate;
+ }
+ }
+ }
+ inode = d_inode(child);
+ ino = inode->i_ino;
+ type = inode->i_mode >> 12;
+ dput(child);
+end_instantiate:
+ return dir_emit(ctx, name, len, ino, type);
+}
+
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+ unsigned long *start, unsigned long *end)
+{
+ const char *str = dentry->d_name.name;
+ unsigned long long sval, eval;
+ unsigned int len;
+
+ if (str[0] == '0' && str[1] != '-')
+ return -EINVAL;
+ len = _parse_integer(str, 16, &sval);
+ if (len & KSTRTOX_OVERFLOW)
+ return -EINVAL;
+ if (sval != (unsigned long)sval)
+ return -EINVAL;
+ str += len;
+
+ if (*str != '-')
+ return -EINVAL;
+ str++;
+
+ if (str[0] == '0' && str[1])
+ return -EINVAL;
+ len = _parse_integer(str, 16, &eval);
+ if (len & KSTRTOX_OVERFLOW)
+ return -EINVAL;
+ if (eval != (unsigned long)eval)
+ return -EINVAL;
+ str += len;
+
+ if (*str != '\0')
+ return -EINVAL;
+
+ *start = sval;
+ *end = eval;
+
+ return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ unsigned long vm_start, vm_end;
+ bool exact_vma_exists = false;
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ struct inode *inode;
+ int status = 0;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ inode = d_inode(dentry);
+ task = get_proc_task(inode);
+ if (!task)
+ goto out_notask;
+
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ if (IS_ERR_OR_NULL(mm))
+ goto out;
+
+ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+ status = mmap_read_lock_killable(mm);
+ if (!status) {
+ exact_vma_exists = !!find_exact_vma(mm, vm_start,
+ vm_end);
+ mmap_read_unlock(mm);
+ }
+ }
+
+ mmput(mm);
+
+ if (exact_vma_exists) {
+ task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
+ security_task_to_inode(task, inode);
+ status = 1;
+ }
+
+out:
+ put_task_struct(task);
+
+out_notask:
+ return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+ .d_revalidate = map_files_d_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+static int map_files_get_link(struct dentry *dentry, struct path *path)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ int rc;
+
+ rc = -ENOENT;
+ task = get_proc_task(d_inode(dentry));
+ if (!task)
+ goto out;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+ if (rc)
+ goto out_mmput;
+
+ rc = mmap_read_lock_killable(mm);
+ if (rc)
+ goto out_mmput;
+
+ rc = -ENOENT;
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (vma && vma->vm_file) {
+ *path = vma->vm_file->f_path;
+ path_get(path);
+ rc = 0;
+ }
+ mmap_read_unlock(mm);
+
+out_mmput:
+ mmput(mm);
+out:
+ return rc;
+}
+
+struct map_files_info {
+ unsigned long start;
+ unsigned long end;
+ fmode_t mode;
+};
+
+/*
+ * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
+ * to concerns about how the symlinks may be used to bypass permissions on
+ * ancestor directories in the path to the file in question.
+ */
+static const char *
+proc_map_files_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ if (!checkpoint_restore_ns_capable(&init_user_ns))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_get_link(dentry, inode, done);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for get_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .get_link = proc_map_files_get_link,
+ .setattr = proc_setattr,
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ fmode_t mode = (fmode_t)(unsigned long)ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
+ ((mode & FMODE_READ ) ? S_IRUSR : 0) |
+ ((mode & FMODE_WRITE) ? S_IWUSR : 0));
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ ei->op.proc_get_link = map_files_get_link;
+
+ inode->i_op = &proc_map_files_link_inode_operations;
+ inode->i_size = 64;
+
+ d_set_d_op(dentry, &tid_map_files_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct dentry *result;
+ struct mm_struct *mm;
+
+ result = ERR_PTR(-ENOENT);
+ task = get_proc_task(dir);
+ if (!task)
+ goto out;
+
+ result = ERR_PTR(-EACCES);
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto out_put_task;
+
+ result = ERR_PTR(-ENOENT);
+ if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+ goto out_put_task;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+
+ result = ERR_PTR(-EINTR);
+ if (mmap_read_lock_killable(mm))
+ goto out_put_mm;
+
+ result = ERR_PTR(-ENOENT);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (!vma)
+ goto out_no_vma;
+
+ if (vma->vm_file)
+ result = proc_map_files_instantiate(dentry, task,
+ (void *)(unsigned long)vma->vm_file->f_mode);
+
+out_no_vma:
+ mmap_read_unlock(mm);
+out_put_mm:
+ mmput(mm);
+out_put_task:
+ put_task_struct(task);
+out:
+ return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+ .lookup = proc_map_files_lookup,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned long nr_files, pos, i;
+ GENRADIX(struct map_files_info) fa;
+ struct map_files_info *p;
+ int ret;
+ struct vma_iterator vmi;
+
+ genradix_init(&fa);
+
+ ret = -ENOENT;
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ goto out;
+
+ ret = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto out_put_task;
+
+ ret = 0;
+ if (!dir_emit_dots(file, ctx))
+ goto out_put_task;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ mmput(mm);
+ goto out_put_task;
+ }
+
+ nr_files = 0;
+
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_lock taken
+ * 2) Release mmap_lock and instantiate entries
+ *
+ * otherwise we get lockdep complained, since filldir()
+ * routine might require mmap_lock taken in might_fault().
+ */
+
+ pos = 2;
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= ctx->pos)
+ continue;
+
+ p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+ if (!p) {
+ ret = -ENOMEM;
+ mmap_read_unlock(mm);
+ mmput(mm);
+ goto out_put_task;
+ }
+
+ p->start = vma->vm_start;
+ p->end = vma->vm_end;
+ p->mode = vma->vm_file->f_mode;
+ }
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ for (i = 0; i < nr_files; i++) {
+ char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
+ unsigned int len;
+
+ p = genradix_ptr(&fa, i);
+ len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
+ if (!proc_fill_cache(file, ctx,
+ buf, len,
+ proc_map_files_instantiate,
+ task,
+ (void *)(unsigned long)p->mode))
+ break;
+ ctx->pos++;
+ }
+
+out_put_task:
+ put_task_struct(task);
+out:
+ genradix_free(&fa);
+ return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_map_files_readdir,
+ .llseek = generic_file_llseek,
+};
+
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
+struct timers_private {
+ struct pid *pid;
+ struct task_struct *task;
+ struct sighand_struct *sighand;
+ struct pid_namespace *ns;
+ unsigned long flags;
+};
+
+static void *timers_start(struct seq_file *m, loff_t *pos)
+{
+ struct timers_private *tp = m->private;
+
+ tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
+ if (!tp->task)
+ return ERR_PTR(-ESRCH);
+
+ tp->sighand = lock_task_sighand(tp->task, &tp->flags);
+ if (!tp->sighand)
+ return ERR_PTR(-ESRCH);
+
+ return seq_list_start(&tp->task->signal->posix_timers, *pos);
+}
+
+static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct timers_private *tp = m->private;
+ return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+}
+
+static void timers_stop(struct seq_file *m, void *v)
+{
+ struct timers_private *tp = m->private;
+
+ if (tp->sighand) {
+ unlock_task_sighand(tp->task, &tp->flags);
+ tp->sighand = NULL;
+ }
+
+ if (tp->task) {
+ put_task_struct(tp->task);
+ tp->task = NULL;
+ }
+}
+
+static int show_timer(struct seq_file *m, void *v)
+{
+ struct k_itimer *timer;
+ struct timers_private *tp = m->private;
+ int notify;
+ static const char * const nstr[] = {
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
+ };
+
+ timer = list_entry((struct list_head *)v, struct k_itimer, list);
+ notify = timer->it_sigev_notify;
+
+ seq_printf(m, "ID: %d\n", timer->it_id);
+ seq_printf(m, "signal: %d/%px\n",
+ timer->sigq->info.si_signo,
+ timer->sigq->info.si_value.sival_ptr);
+ seq_printf(m, "notify: %s/%s.%d\n",
+ nstr[notify & ~SIGEV_THREAD_ID],
+ (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+ pid_nr_ns(timer->it_pid, tp->ns));
+ seq_printf(m, "ClockID: %d\n", timer->it_clock);
+
+ return 0;
+}
+
+static const struct seq_operations proc_timers_seq_ops = {
+ .start = timers_start,
+ .next = timers_next,
+ .stop = timers_stop,
+ .show = show_timer,
+};
+
+static int proc_timers_open(struct inode *inode, struct file *file)
+{
+ struct timers_private *tp;
+
+ tp = __seq_open_private(file, &proc_timers_seq_ops,
+ sizeof(struct timers_private));
+ if (!tp)
+ return -ENOMEM;
+
+ tp->pid = proc_pid(inode);
+ tp->ns = proc_pid_ns(inode->i_sb);
+ return 0;
+}
+
+static const struct file_operations proc_timers_operations = {
+ .open = proc_timers_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif
+
+static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+ u64 slack_ns;
+ int err;
+
+ err = kstrtoull_from_user(buf, count, 10, &slack_ns);
+ if (err < 0)
+ return err;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (p != current) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ count = -EPERM;
+ goto out;
+ }
+ rcu_read_unlock();
+
+ err = security_task_setscheduler(p);
+ if (err) {
+ count = err;
+ goto out;
+ }
+ }
+
+ task_lock(p);
+ if (slack_ns == 0)
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ else
+ p->timer_slack_ns = slack_ns;
+ task_unlock(p);
+
+out:
+ put_task_struct(p);
+
+ return count;
+}
+
+static int timerslack_ns_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+ int err = 0;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (p != current) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ err = -EPERM;
+ goto out;
+ }
+ rcu_read_unlock();
+
+ err = security_task_getscheduler(p);
+ if (err)
+ goto out;
+ }
+
+ task_lock(p);
+ seq_printf(m, "%llu\n", p->timer_slack_ns);
+ task_unlock(p);
+
+out:
+ put_task_struct(p);
+
+ return err;
+}
+
+static int timerslack_ns_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timerslack_ns_show, inode);
+}
+
+static const struct file_operations proc_pid_set_timerslack_ns_operations = {
+ .open = timerslack_ns_open,
+ .read = seq_read,
+ .write = timerslack_ns_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *proc_pident_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ const struct pid_entry *p = ptr;
+ struct inode *inode;
+ struct proc_inode *ei;
+
+ inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ if (S_ISDIR(inode->i_mode))
+ set_nlink(inode, 2); /* Use getattr to fix if necessary */
+ if (p->iop)
+ inode->i_op = p->iop;
+ if (p->fop)
+ inode->i_fop = p->fop;
+ ei->op = p->op;
+ pid_update_inode(task, inode);
+ d_set_d_op(dentry, &pid_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+static struct dentry *proc_pident_lookup(struct inode *dir,
+ struct dentry *dentry,
+ const struct pid_entry *p,
+ const struct pid_entry *end)
+{
+ struct task_struct *task = get_proc_task(dir);
+ struct dentry *res = ERR_PTR(-ENOENT);
+
+ if (!task)
+ goto out_no_task;
+
+ /*
+ * Yes, it does not scale. And it should not. Don't add
+ * new entries into /proc/<tgid>/ without very good reasons.
+ */
+ for (; p < end; p++) {
+ if (p->len != dentry->d_name.len)
+ continue;
+ if (!memcmp(dentry->d_name.name, p->name, p->len)) {
+ res = proc_pident_instantiate(dentry, task, p);
+ break;
+ }
+ }
+ put_task_struct(task);
+out_no_task:
+ return res;
+}
+
+static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
+ const struct pid_entry *ents, unsigned int nents)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ const struct pid_entry *p;
+
+ if (!task)
+ return -ENOENT;
+
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+
+ if (ctx->pos >= nents + 2)
+ goto out;
+
+ for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
+ if (!proc_fill_cache(file, ctx, p->name, p->len,
+ proc_pident_instantiate, task, p))
+ break;
+ ctx->pos++;
+ }
+out:
+ put_task_struct(task);
+ return 0;
+}
+
+#ifdef CONFIG_SECURITY
+static int proc_pid_attr_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+ return 0;
+}
+
+static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode * inode = file_inode(file);
+ char *p = NULL;
+ ssize_t length;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+
+ length = security_getprocattr(task, PROC_I(inode)->op.lsm,
+ file->f_path.dentry->d_name.name,
+ &p);
+ put_task_struct(task);
+ if (length > 0)
+ length = simple_read_from_buffer(buf, count, ppos, p, length);
+ kfree(p);
+ return length;
+}
+
+static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode * inode = file_inode(file);
+ struct task_struct *task;
+ void *page;
+ int rv;
+
+ /* A task may only write when it was the opener. */
+ if (file->private_data != current->mm)
+ return -EPERM;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ /* A task may only write its own attributes. */
+ if (current != task) {
+ rcu_read_unlock();
+ return -EACCES;
+ }
+ /* Prevent changes to overridden credentials. */
+ if (current_cred() != current_real_cred()) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+ rcu_read_unlock();
+
+ if (count > PAGE_SIZE)
+ count = PAGE_SIZE;
+
+ /* No partial writes. */
+ if (*ppos != 0)
+ return -EINVAL;
+
+ page = memdup_user(buf, count);
+ if (IS_ERR(page)) {
+ rv = PTR_ERR(page);
+ goto out;
+ }
+
+ /* Guard against adverse ptrace interaction */
+ rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
+ if (rv < 0)
+ goto out_free;
+
+ rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ file->f_path.dentry->d_name.name, page,
+ count);
+ mutex_unlock(&current->signal->cred_guard_mutex);
+out_free:
+ kfree(page);
+out:
+ return rv;
+}
+
+static const struct file_operations proc_pid_attr_operations = {
+ .open = proc_pid_attr_open,
+ .read = proc_pid_attr_read,
+ .write = proc_pid_attr_write,
+ .llseek = generic_file_llseek,
+ .release = mem_release,
+};
+
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+ struct dir_context *ctx) \
+{ \
+ return proc_pident_readdir(filp, ctx, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+ .read = generic_read_dir, \
+ .iterate = proc_##LSM##_attr_dir_iterate, \
+ .llseek = default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+ struct dentry *dentry, unsigned int flags) \
+{ \
+ return proc_pident_lookup(dir, dentry, \
+ LSM##_attr_dir_stuff, \
+ LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+ .lookup = proc_##LSM##_attr_dir_lookup, \
+ .getattr = pid_getattr, \
+ .setattr = proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+ ATTR("smack", "current", 0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
+#ifdef CONFIG_SECURITY_APPARMOR
+static const struct pid_entry apparmor_attr_dir_stuff[] = {
+ ATTR("apparmor", "current", 0666),
+ ATTR("apparmor", "prev", 0444),
+ ATTR("apparmor", "exec", 0666),
+};
+LSM_DIR_OPS(apparmor);
+#endif
+
+static const struct pid_entry attr_dir_stuff[] = {
+ ATTR(NULL, "current", 0666),
+ ATTR(NULL, "prev", 0444),
+ ATTR(NULL, "exec", 0666),
+ ATTR(NULL, "fscreate", 0666),
+ ATTR(NULL, "keycreate", 0666),
+ ATTR(NULL, "sockcreate", 0666),
+#ifdef CONFIG_SECURITY_SMACK
+ DIR("smack", 0555,
+ proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
+#ifdef CONFIG_SECURITY_APPARMOR
+ DIR("apparmor", 0555,
+ proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
+#endif
+};
+
+static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+ return proc_pident_readdir(file, ctx,
+ attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+}
+
+static const struct file_operations proc_attr_dir_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_attr_dir_readdir,
+ .llseek = generic_file_llseek,
+};
+
+static struct dentry *proc_attr_dir_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
+{
+ return proc_pident_lookup(dir, dentry,
+ attr_dir_stuff,
+ attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
+}
+
+static const struct inode_operations proc_attr_dir_inode_operations = {
+ .lookup = proc_attr_dir_lookup,
+ .getattr = pid_getattr,
+ .setattr = proc_setattr,
+};
+
+#endif
+
+#ifdef CONFIG_ELF_CORE
+static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ struct mm_struct *mm;
+ char buffer[PROC_NUMBUF];
+ size_t len;
+ int ret;
+
+ if (!task)
+ return -ESRCH;
+
+ ret = 0;
+ mm = get_task_mm(task);
+ if (mm) {
+ len = snprintf(buffer, sizeof(buffer), "%08lx\n",
+ ((mm->flags & MMF_DUMP_FILTER_MASK) >>
+ MMF_DUMP_FILTER_SHIFT));
+ mmput(mm);
+ ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
+ }
+
+ put_task_struct(task);
+
+ return ret;
+}
+
+static ssize_t proc_coredump_filter_write(struct file *file,
+ const char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct task_struct *task;
+ struct mm_struct *mm;
+ unsigned int val;
+ int ret;
+ int i;
+ unsigned long mask;
+
+ ret = kstrtouint_from_user(buf, count, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ ret = -ESRCH;
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ goto out_no_task;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_no_mm;
+ ret = 0;
+
+ for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
+ if (val & mask)
+ set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ else
+ clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ }
+
+ mmput(mm);
+ out_no_mm:
+ put_task_struct(task);
+ out_no_task:
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static const struct file_operations proc_coredump_filter_operations = {
+ .read = proc_coredump_filter_read,
+ .write = proc_coredump_filter_write,
+ .llseek = generic_file_llseek,
+};
+#endif
+
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
+{
+ struct task_io_accounting acct = task->ioac;
+ unsigned long flags;
+ int result;
+
+ result = down_read_killable(&task->signal->exec_update_lock);
+ if (result)
+ return result;
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+ result = -EACCES;
+ goto out_unlock;
+ }
+
+ if (whole && lock_task_sighand(task, &flags)) {
+ struct task_struct *t = task;
+
+ task_io_accounting_add(&acct, &task->signal->ioac);
+ while_each_thread(task, t)
+ task_io_accounting_add(&acct, &t->ioac);
+
+ unlock_task_sighand(task, &flags);
+ }
+ seq_printf(m,
+ "rchar: %llu\n"
+ "wchar: %llu\n"
+ "syscr: %llu\n"
+ "syscw: %llu\n"
+ "read_bytes: %llu\n"
+ "write_bytes: %llu\n"
+ "cancelled_write_bytes: %llu\n",
+ (unsigned long long)acct.rchar,
+ (unsigned long long)acct.wchar,
+ (unsigned long long)acct.syscr,
+ (unsigned long long)acct.syscw,
+ (unsigned long long)acct.read_bytes,
+ (unsigned long long)acct.write_bytes,
+ (unsigned long long)acct.cancelled_write_bytes);
+ result = 0;
+
+out_unlock:
+ up_read(&task->signal->exec_update_lock);
+ return result;
+}
+
+static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ return do_io_accounting(task, m, 0);
+}
+
+static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ return do_io_accounting(task, m, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
+
+#ifdef CONFIG_USER_NS
+static int proc_id_map_open(struct inode *inode, struct file *file,
+ const struct seq_operations *seq_ops)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ struct seq_file *seq;
+ int ret = -EINVAL;
+
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ ret = seq_open(file, seq_ops);
+ if (ret)
+ goto err_put_ns;
+
+ seq = file->private_data;
+ seq->private = ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_id_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ put_user_ns(ns);
+ return seq_release(inode, file);
+}
+
+static int proc_uid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_uid_seq_operations);
+}
+
+static int proc_gid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_gid_seq_operations);
+}
+
+static int proc_projid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_projid_seq_operations);
+}
+
+static const struct file_operations proc_uid_map_operations = {
+ .open = proc_uid_map_open,
+ .write = proc_uid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+
+static const struct file_operations proc_gid_map_operations = {
+ .open = proc_gid_map_open,
+ .write = proc_gid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+
+static const struct file_operations proc_projid_map_operations = {
+ .open = proc_projid_map_open,
+ .write = proc_projid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+
+static int proc_setgroups_open(struct inode *inode, struct file *file)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ int ret;
+
+ ret = -ESRCH;
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ if (file->f_mode & FMODE_WRITE) {
+ ret = -EACCES;
+ if (!ns_capable(ns, CAP_SYS_ADMIN))
+ goto err_put_ns;
+ }
+
+ ret = single_open(file, &proc_setgroups_show, ns);
+ if (ret)
+ goto err_put_ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_setgroups_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ int ret = single_release(inode, file);
+ put_user_ns(ns);
+ return ret;
+}
+
+static const struct file_operations proc_setgroups_operations = {
+ .open = proc_setgroups_open,
+ .write = proc_setgroups_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_setgroups_release,
+};
+#endif /* CONFIG_USER_NS */
+
+static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ int err = lock_trace(task);
+ if (!err) {
+ seq_printf(m, "%08x\n", task->personality);
+ unlock_trace(task);
+ }
+ return err;
+}
+
+#ifdef CONFIG_LIVEPATCH
+static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ seq_printf(m, "%d\n", task->patch_state);
+ return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
+#ifdef CONFIG_KSM
+static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "%lu\n", mm->ksm_merging_pages);
+ mmput(mm);
+ }
+
+ return 0;
+}
+static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ mmput(mm);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KSM */
+
+#ifdef CONFIG_STACKLEAK_METRICS
+static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ unsigned long prev_depth = THREAD_SIZE -
+ (task->prev_lowest_stack & (THREAD_SIZE - 1));
+ unsigned long depth = THREAD_SIZE -
+ (task->lowest_stack & (THREAD_SIZE - 1));
+
+ seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+ prev_depth, depth);
+ return 0;
+}
+#endif /* CONFIG_STACKLEAK_METRICS */
+
+/*
+ * Thread groups
+ */
+static const struct file_operations proc_task_operations;
+static const struct inode_operations proc_task_inode_operations;
+
+static const struct pid_entry tgid_base_stuff[] = {
+ DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
+ DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+#ifdef CONFIG_NET
+ DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+#endif
+ REG("environ", S_IRUSR, proc_environ_operations),
+ REG("auxv", S_IRUSR, proc_auxv_operations),
+ ONE("status", S_IRUGO, proc_pid_status),
+ ONE("personality", S_IRUSR, proc_pid_personality),
+ ONE("limits", S_IRUGO, proc_pid_limits),
+#ifdef CONFIG_SCHED_DEBUG
+ REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+ REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
+#endif
+#ifdef CONFIG_TIME_NS
+ REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
+#endif
+ REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ ONE("syscall", S_IRUSR, proc_pid_syscall),
+#endif
+ REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
+ ONE("stat", S_IRUGO, proc_tgid_stat),
+ ONE("statm", S_IRUGO, proc_pid_statm),
+ REG("maps", S_IRUGO, proc_pid_maps_operations),
+#ifdef CONFIG_NUMA
+ REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
+#endif
+ REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
+ LNK("cwd", proc_cwd_link),
+ LNK("root", proc_root_link),
+ LNK("exe", proc_exe_link),
+ REG("mounts", S_IRUGO, proc_mounts_operations),
+ REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
+ REG("mountstats", S_IRUSR, proc_mountstats_operations),
+#ifdef CONFIG_PROC_PAGE_MONITOR
+ REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+ REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
+#endif
+#ifdef CONFIG_SECURITY
+ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
+#endif
+#ifdef CONFIG_KALLSYMS
+ ONE("wchan", S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+ ONE("stack", S_IRUSR, proc_pid_stack),
+#endif
+#ifdef CONFIG_SCHED_INFO
+ ONE("schedstat", S_IRUGO, proc_pid_schedstat),
+#endif
+#ifdef CONFIG_LATENCYTOP
+ REG("latency", S_IRUGO, proc_lstats_operations),
+#endif
+#ifdef CONFIG_PROC_PID_CPUSET
+ ONE("cpuset", S_IRUGO, proc_cpuset_show),
+#endif
+#ifdef CONFIG_CGROUPS
+ ONE("cgroup", S_IRUGO, proc_cgroup_show),
+#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
+ ONE("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+#ifdef CONFIG_AUDIT
+ REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
+ REG("sessionid", S_IRUGO, proc_sessionid_operations),
+#endif
+#ifdef CONFIG_FAULT_INJECTION
+ REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+ REG("fail-nth", 0644, proc_fail_nth_operations),
+#endif
+#ifdef CONFIG_ELF_CORE
+ REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ ONE("io", S_IRUSR, proc_tgid_io_accounting),
+#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
+#endif
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
+ REG("timers", S_IRUGO, proc_timers_operations),
+#endif
+ REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
+#ifdef CONFIG_LIVEPATCH
+ ONE("patch_state", S_IRUSR, proc_pid_patch_state),
+#endif
+#ifdef CONFIG_STACKLEAK_METRICS
+ ONE("stack_depth", S_IRUGO, proc_stack_depth),
+#endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+ ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
+};
+
+static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
+{
+ return proc_pident_readdir(file, ctx,
+ tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+}
+
+static const struct file_operations proc_tgid_base_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_tgid_base_readdir,
+ .llseek = generic_file_llseek,
+};
+
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+ if (file->f_op != &proc_tgid_base_operations)
+ return ERR_PTR(-EBADF);
+
+ return proc_pid(file_inode(file));
+}
+
+static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ return proc_pident_lookup(dir, dentry,
+ tgid_base_stuff,
+ tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
+}
+
+static const struct inode_operations proc_tgid_base_inode_operations = {
+ .lookup = proc_tgid_base_lookup,
+ .getattr = pid_getattr,
+ .setattr = proc_setattr,
+ .permission = proc_pid_permission,
+};
+
+/**
+ * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
+ *
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
+ *
+ * It is safe and reasonable to cache /proc entries for a task until
+ * that task exits. After that they just clog up the dcache with
+ * useless entries, possibly causing useful dcache entries to be
+ * flushed instead. This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
+ *
+ * NOTE: This routine is just an optimization so it does not guarantee
+ * that no dcache entries will exist after a process is reaped
+ * it just makes it very unlikely that any will persist.
+ */
+
+void proc_flush_pid(struct pid *pid)
+{
+ proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
+}
+
+static struct dentry *proc_pid_instantiate(struct dentry * dentry,
+ struct task_struct *task, const void *ptr)
+{
+ struct inode *inode;
+
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ inode->i_op = &proc_tgid_base_inode_operations;
+ inode->i_fop = &proc_tgid_base_operations;
+ inode->i_flags|=S_IMMUTABLE;
+
+ set_nlink(inode, nlink_tgid);
+ pid_update_inode(task, inode);
+
+ d_set_d_op(dentry, &pid_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
+{
+ struct task_struct *task;
+ unsigned tgid;
+ struct proc_fs_info *fs_info;
+ struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
+
+ tgid = name_to_int(&dentry->d_name);
+ if (tgid == ~0U)
+ goto out;
+
+ fs_info = proc_sb_info(dentry->d_sb);
+ ns = fs_info->pid_ns;
+ rcu_read_lock();
+ task = find_task_by_pid_ns(tgid, ns);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+ if (!task)
+ goto out;
+
+ /* Limit procfs to only ptraceable tasks */
+ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
+ if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
+ goto out_put_task;
+ }
+
+ result = proc_pid_instantiate(dentry, task, NULL);
+out_put_task:
+ put_task_struct(task);
+out:
+ return result;
+}
+
+/*
+ * Find the first task with tgid >= tgid
+ *
+ */
+struct tgid_iter {
+ unsigned int tgid;
+ struct task_struct *task;
+};
+static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
+{
+ struct pid *pid;
+
+ if (iter.task)
+ put_task_struct(iter.task);
+ rcu_read_lock();
+retry:
+ iter.task = NULL;
+ pid = find_ge_pid(iter.tgid, ns);
+ if (pid) {
+ iter.tgid = pid_nr_ns(pid, ns);
+ iter.task = pid_task(pid, PIDTYPE_TGID);
+ if (!iter.task) {
+ iter.tgid += 1;
+ goto retry;
+ }
+ get_task_struct(iter.task);
+ }
+ rcu_read_unlock();
+ return iter;
+}
+
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
+
+/* for the /proc/ directory itself, after non-process stuff has been done */
+int proc_pid_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct tgid_iter iter;
+ struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
+ struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
+ loff_t pos = ctx->pos;
+
+ if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
+ return 0;
+
+ if (pos == TGID_OFFSET - 2) {
+ struct inode *inode = d_inode(fs_info->proc_self);
+ if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+ return 0;
+ ctx->pos = pos = pos + 1;
+ }
+ if (pos == TGID_OFFSET - 1) {
+ struct inode *inode = d_inode(fs_info->proc_thread_self);
+ if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+ return 0;
+ ctx->pos = pos = pos + 1;
+ }
+ iter.tgid = pos - TGID_OFFSET;
+ iter.task = NULL;
+ for (iter = next_tgid(ns, iter);
+ iter.task;
+ iter.tgid += 1, iter = next_tgid(ns, iter)) {
+ char name[10 + 1];
+ unsigned int len;
+
+ cond_resched();
+ if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
+ continue;
+
+ len = snprintf(name, sizeof(name), "%u", iter.tgid);
+ ctx->pos = iter.tgid + TGID_OFFSET;
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_pid_instantiate, iter.task, NULL)) {
+ put_task_struct(iter.task);
+ return 0;
+ }
+ }
+ ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
+ return 0;
+}
+
+/*
+ * proc_tid_comm_permission is a special permission function exclusively
+ * used for the node /proc/<pid>/task/<tid>/comm.
+ * It bypasses generic permission checks in the case where a task of the same
+ * task group attempts to access the node.
+ * The rationale behind this is that glibc and bionic access this node for
+ * cross thread naming (pthread_set/getname_np(!self)). However, if
+ * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
+ * which locks out the cross thread naming implementation.
+ * This function makes sure that the node is always accessible for members of
+ * same thread group.
+ */
+static int proc_tid_comm_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ bool is_same_tgroup;
+ struct task_struct *task;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ is_same_tgroup = same_thread_group(current, task);
+ put_task_struct(task);
+
+ if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
+ /* This file (/proc/<pid>/task/<tid>/comm) can always be
+ * read or written by the members of the corresponding
+ * thread group.
+ */
+ return 0;
+ }
+
+ return generic_permission(&init_user_ns, inode, mask);
+}
+
+static const struct inode_operations proc_tid_comm_inode_operations = {
+ .setattr = proc_setattr,
+ .permission = proc_tid_comm_permission,
+};
+
+/*
+ * Tasks
+ */
+static const struct pid_entry tid_base_stuff[] = {
+ DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+#ifdef CONFIG_NET
+ DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+#endif
+ REG("environ", S_IRUSR, proc_environ_operations),
+ REG("auxv", S_IRUSR, proc_auxv_operations),
+ ONE("status", S_IRUGO, proc_pid_status),
+ ONE("personality", S_IRUSR, proc_pid_personality),
+ ONE("limits", S_IRUGO, proc_pid_limits),
+#ifdef CONFIG_SCHED_DEBUG
+ REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+ NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
+ &proc_tid_comm_inode_operations,
+ &proc_pid_set_comm_operations, {}),
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ ONE("syscall", S_IRUSR, proc_pid_syscall),
+#endif
+ REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
+ ONE("stat", S_IRUGO, proc_tid_stat),
+ ONE("statm", S_IRUGO, proc_pid_statm),
+ REG("maps", S_IRUGO, proc_pid_maps_operations),
+#ifdef CONFIG_PROC_CHILDREN
+ REG("children", S_IRUGO, proc_tid_children_operations),
+#endif
+#ifdef CONFIG_NUMA
+ REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
+#endif
+ REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
+ LNK("cwd", proc_cwd_link),
+ LNK("root", proc_root_link),
+ LNK("exe", proc_exe_link),
+ REG("mounts", S_IRUGO, proc_mounts_operations),
+ REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
+#ifdef CONFIG_PROC_PAGE_MONITOR
+ REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+ REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
+#endif
+#ifdef CONFIG_SECURITY
+ DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
+#endif
+#ifdef CONFIG_KALLSYMS
+ ONE("wchan", S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+ ONE("stack", S_IRUSR, proc_pid_stack),
+#endif
+#ifdef CONFIG_SCHED_INFO
+ ONE("schedstat", S_IRUGO, proc_pid_schedstat),
+#endif
+#ifdef CONFIG_LATENCYTOP
+ REG("latency", S_IRUGO, proc_lstats_operations),
+#endif
+#ifdef CONFIG_PROC_PID_CPUSET
+ ONE("cpuset", S_IRUGO, proc_cpuset_show),
+#endif
+#ifdef CONFIG_CGROUPS
+ ONE("cgroup", S_IRUGO, proc_cgroup_show),
+#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
+ ONE("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+ REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+#ifdef CONFIG_AUDIT
+ REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
+ REG("sessionid", S_IRUGO, proc_sessionid_operations),
+#endif
+#ifdef CONFIG_FAULT_INJECTION
+ REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+ REG("fail-nth", 0644, proc_fail_nth_operations),
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ ONE("io", S_IRUSR, proc_tid_io_accounting),
+#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
+#endif
+#ifdef CONFIG_LIVEPATCH
+ ONE("patch_state", S_IRUSR, proc_pid_patch_state),
+#endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+ ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+ ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
+#endif
+};
+
+static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
+{
+ return proc_pident_readdir(file, ctx,
+ tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+}
+
+static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ return proc_pident_lookup(dir, dentry,
+ tid_base_stuff,
+ tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
+}
+
+static const struct file_operations proc_tid_base_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_tid_base_readdir,
+ .llseek = generic_file_llseek,
+};
+
+static const struct inode_operations proc_tid_base_inode_operations = {
+ .lookup = proc_tid_base_lookup,
+ .getattr = pid_getattr,
+ .setattr = proc_setattr,
+};
+
+static struct dentry *proc_task_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ struct inode *inode;
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ inode->i_op = &proc_tid_base_inode_operations;
+ inode->i_fop = &proc_tid_base_operations;
+ inode->i_flags |= S_IMMUTABLE;
+
+ set_nlink(inode, nlink_tid);
+ pid_update_inode(task, inode);
+
+ d_set_d_op(dentry, &pid_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+{
+ struct task_struct *task;
+ struct task_struct *leader = get_proc_task(dir);
+ unsigned tid;
+ struct proc_fs_info *fs_info;
+ struct pid_namespace *ns;
+ struct dentry *result = ERR_PTR(-ENOENT);
+
+ if (!leader)
+ goto out_no_task;
+
+ tid = name_to_int(&dentry->d_name);
+ if (tid == ~0U)
+ goto out;
+
+ fs_info = proc_sb_info(dentry->d_sb);
+ ns = fs_info->pid_ns;
+ rcu_read_lock();
+ task = find_task_by_pid_ns(tid, ns);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+ if (!task)
+ goto out;
+ if (!same_thread_group(leader, task))
+ goto out_drop_task;
+
+ result = proc_task_instantiate(dentry, task, NULL);
+out_drop_task:
+ put_task_struct(task);
+out:
+ put_task_struct(leader);
+out_no_task:
+ return result;
+}
+
+/*
+ * Find the first tid of a thread group to return to user space.
+ *
+ * Usually this is just the thread group leader, but if the users
+ * buffer was too small or there was a seek into the middle of the
+ * directory we have more work todo.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with the leader and walk nr
+ * threads past it.
+ */
+static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
+ struct pid_namespace *ns)
+{
+ struct task_struct *pos, *task;
+ unsigned long nr = f_pos;
+
+ if (nr != f_pos) /* 32bit overflow? */
+ return NULL;
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ goto fail;
+
+ /* Attempt to start with the tid of a thread */
+ if (tid && nr) {
+ pos = find_task_by_pid_ns(tid, ns);
+ if (pos && same_thread_group(pos, task))
+ goto found;
+ }
+
+ /* If nr exceeds the number of threads there is nothing todo */
+ if (nr >= get_nr_threads(task))
+ goto fail;
+
+ /* If we haven't found our starting place yet start
+ * with the leader and walk nr threads forward.
+ */
+ pos = task = task->group_leader;
+ do {
+ if (!nr--)
+ goto found;
+ } while_each_thread(task, pos);
+fail:
+ pos = NULL;
+ goto out;
+found:
+ get_task_struct(pos);
+out:
+ rcu_read_unlock();
+ return pos;
+}
+
+/*
+ * Find the next thread in the thread list.
+ * Return NULL if there is an error or no next thread.
+ *
+ * The reference to the input task_struct is released.
+ */
+static struct task_struct *next_tid(struct task_struct *start)
+{
+ struct task_struct *pos = NULL;
+ rcu_read_lock();
+ if (pid_alive(start)) {
+ pos = next_thread(start);
+ if (thread_group_leader(pos))
+ pos = NULL;
+ else
+ get_task_struct(pos);
+ }
+ rcu_read_unlock();
+ put_task_struct(start);
+ return pos;
+}
+
+/* for the /proc/TGID/task/ directories */
+static int proc_task_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *task;
+ struct pid_namespace *ns;
+ int tid;
+
+ if (proc_inode_is_dead(inode))
+ return -ENOENT;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ /* f_version caches the tgid value that the last readdir call couldn't
+ * return. lseek aka telldir automagically resets f_version to 0.
+ */
+ ns = proc_pid_ns(inode->i_sb);
+ tid = (int)file->f_version;
+ file->f_version = 0;
+ for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
+ task;
+ task = next_tid(task), ctx->pos++) {
+ char name[10 + 1];
+ unsigned int len;
+
+ tid = task_pid_nr_ns(task, ns);
+ if (!tid)
+ continue; /* The task has just exited. */
+ len = snprintf(name, sizeof(name), "%u", tid);
+ if (!proc_fill_cache(file, ctx, name, len,
+ proc_task_instantiate, task, NULL)) {
+ /* returning this tgid failed, save it as the first
+ * pid for the next readir call */
+ file->f_version = (u64)tid;
+ put_task_struct(task);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int proc_task_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct task_struct *p = get_proc_task(inode);
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ if (p) {
+ stat->nlink += get_nr_threads(p);
+ put_task_struct(p);
+ }
+
+ return 0;
+}
+
+static const struct inode_operations proc_task_inode_operations = {
+ .lookup = proc_task_lookup,
+ .getattr = proc_task_getattr,
+ .setattr = proc_setattr,
+ .permission = proc_pid_permission,
+};
+
+static const struct file_operations proc_task_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_task_readdir,
+ .llseek = generic_file_llseek,
+};
+
+void __init set_proc_pid_nlink(void)
+{
+ nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+}
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
new file mode 100644
index 000000000..2e244ada1
--- /dev/null
+++ b/fs/proc/bootconfig.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/bootconfig - Extra boot configuration
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/bootconfig.h>
+#include <linux/slab.h>
+
+static char *saved_boot_config;
+
+static int boot_config_proc_show(struct seq_file *m, void *v)
+{
+ if (saved_boot_config)
+ seq_puts(m, saved_boot_config);
+ return 0;
+}
+
+/* Rest size of buffer */
+#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0)
+
+/* Return the needed total length if @size is 0 */
+static int __init copy_xbc_key_value_list(char *dst, size_t size)
+{
+ struct xbc_node *leaf, *vnode;
+ char *key, *end = dst + size;
+ const char *val;
+ char q;
+ int ret = 0;
+
+ key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
+
+ xbc_for_each_key_value(leaf, val) {
+ ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
+ if (ret < 0)
+ break;
+ ret = snprintf(dst, rest(dst, end), "%s = ", key);
+ if (ret < 0)
+ break;
+ dst += ret;
+ vnode = xbc_node_get_child(leaf);
+ if (vnode) {
+ xbc_array_for_each_value(vnode, val) {
+ if (strchr(val, '"'))
+ q = '\'';
+ else
+ q = '"';
+ ret = snprintf(dst, rest(dst, end), "%c%s%c%s",
+ q, val, q, xbc_node_is_array(vnode) ? ", " : "\n");
+ if (ret < 0)
+ goto out;
+ dst += ret;
+ }
+ } else {
+ ret = snprintf(dst, rest(dst, end), "\"\"\n");
+ if (ret < 0)
+ break;
+ dst += ret;
+ }
+ }
+out:
+ kfree(key);
+
+ return ret < 0 ? ret : dst - (end - size);
+}
+
+static int __init proc_boot_config_init(void)
+{
+ int len;
+
+ len = copy_xbc_key_value_list(NULL, 0);
+ if (len < 0)
+ return len;
+
+ if (len > 0) {
+ saved_boot_config = kzalloc(len + 1, GFP_KERNEL);
+ if (!saved_boot_config)
+ return -ENOMEM;
+
+ len = copy_xbc_key_value_list(saved_boot_config, len + 1);
+ if (len < 0) {
+ kfree(saved_boot_config);
+ return len;
+ }
+ }
+
+ proc_create_single("bootconfig", 0, NULL, boot_config_proc_show);
+
+ return 0;
+}
+fs_initcall(proc_boot_config_init);
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
new file mode 100644
index 000000000..fa762c5fb
--- /dev/null
+++ b/fs/proc/cmdline.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int cmdline_proc_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, saved_command_line);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int __init proc_cmdline_init(void)
+{
+ proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ return 0;
+}
+fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
new file mode 100644
index 000000000..dfe6ce350
--- /dev/null
+++ b/fs/proc/consoles.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2010 Werner Fink, Jiri Slaby
+ */
+
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/tty_driver.h>
+
+/*
+ * This is handler for /proc/consoles
+ */
+static int show_console_dev(struct seq_file *m, void *v)
+{
+ static const struct {
+ short flag;
+ char name;
+ } con_flags[] = {
+ { CON_ENABLED, 'E' },
+ { CON_CONSDEV, 'C' },
+ { CON_BOOT, 'B' },
+ { CON_PRINTBUFFER, 'p' },
+ { CON_BRL, 'b' },
+ { CON_ANYTIME, 'a' },
+ };
+ char flags[ARRAY_SIZE(con_flags) + 1];
+ struct console *con = v;
+ unsigned int a;
+ dev_t dev = 0;
+
+ if (con->device) {
+ const struct tty_driver *driver;
+ int index;
+ driver = con->device(con, &index);
+ if (driver) {
+ dev = MKDEV(driver->major, driver->minor_start);
+ dev += index;
+ }
+ }
+
+ for (a = 0; a < ARRAY_SIZE(con_flags); a++)
+ flags[a] = (con->flags & con_flags[a].flag) ?
+ con_flags[a].name : ' ';
+ flags[a] = 0;
+
+ seq_setwidth(m, 21 - 1);
+ seq_printf(m, "%s%d", con->name, con->index);
+ seq_pad(m, ' ');
+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
+ con->write ? 'W' : '-', con->unblank ? 'U' : '-',
+ flags);
+ if (dev)
+ seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+ struct console *con;
+ loff_t off = 0;
+
+ console_lock();
+ for_each_console(con)
+ if (off++ == *pos)
+ break;
+
+ return con;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct console *con = v;
+ ++*pos;
+ return con->next;
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+ console_unlock();
+}
+
+static const struct seq_operations consoles_op = {
+ .start = c_start,
+ .next = c_next,
+ .stop = c_stop,
+ .show = show_console_dev
+};
+
+static int __init proc_consoles_init(void)
+{
+ proc_create_seq("consoles", 0, NULL, &consoles_op);
+ return 0;
+}
+fs_initcall(proc_consoles_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
new file mode 100644
index 000000000..f38bda5b8
--- /dev/null
+++ b/fs/proc/cpuinfo.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpufreq.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+extern const struct seq_operations cpuinfo_op;
+
+static int cpuinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &cpuinfo_op);
+}
+
+static const struct proc_ops cpuinfo_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = cpuinfo_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+};
+
+static int __init proc_cpuinfo_init(void)
+{
+ proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops);
+ return 0;
+}
+fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
new file mode 100644
index 000000000..fe7bfcb7d
--- /dev/null
+++ b/fs/proc/devices.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include "internal.h"
+
+static int devinfo_show(struct seq_file *f, void *v)
+{
+ int i = *(loff_t *) v;
+
+ if (i < CHRDEV_MAJOR_MAX) {
+ if (i == 0)
+ seq_puts(f, "Character devices:\n");
+ chrdev_show(f, i);
+ }
+#ifdef CONFIG_BLOCK
+ else {
+ i -= CHRDEV_MAJOR_MAX;
+ if (i == 0)
+ seq_puts(f, "\nBlock devices:\n");
+ blkdev_show(f, i);
+ }
+#endif
+ return 0;
+}
+
+static void *devinfo_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
+ return pos;
+ return NULL;
+}
+
+static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ (*pos)++;
+ if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
+ return NULL;
+ return pos;
+}
+
+static void devinfo_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+static const struct seq_operations devinfo_ops = {
+ .start = devinfo_start,
+ .next = devinfo_next,
+ .stop = devinfo_stop,
+ .show = devinfo_show
+};
+
+static int __init proc_devices_init(void)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_seq("devices", 0, NULL, &devinfo_ops);
+ pde_make_permanent(pde);
+ return 0;
+}
+fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
new file mode 100644
index 000000000..913bef0d2
--- /dev/null
+++ b/fs/proc/fd.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched/signal.h>
+#include <linux/errno.h>
+#include <linux/dcache.h>
+#include <linux/path.h>
+#include <linux/fdtable.h>
+#include <linux/namei.h>
+#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+
+#include <linux/proc_fs.h>
+
+#include "../mount.h"
+#include "internal.h"
+#include "fd.h"
+
+static int seq_show(struct seq_file *m, void *v)
+{
+ struct files_struct *files = NULL;
+ int f_flags = 0, ret = -ENOENT;
+ struct file *file = NULL;
+ struct task_struct *task;
+
+ task = get_proc_task(m->private);
+ if (!task)
+ return -ENOENT;
+
+ task_lock(task);
+ files = task->files;
+ if (files) {
+ unsigned int fd = proc_fd(m->private);
+
+ spin_lock(&files->file_lock);
+ file = files_lookup_fd_locked(files, fd);
+ if (file) {
+ struct fdtable *fdt = files_fdtable(files);
+
+ f_flags = file->f_flags;
+ if (close_on_exec(fd, fdt))
+ f_flags |= O_CLOEXEC;
+
+ get_file(file);
+ ret = 0;
+ }
+ spin_unlock(&files->file_lock);
+ }
+ task_unlock(task);
+ put_task_struct(task);
+
+ if (ret)
+ return ret;
+
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
+ (long long)file->f_pos, f_flags,
+ real_mount(file->f_path.mnt)->mnt_id,
+ file_inode(file)->i_ino);
+
+ /* show_fd_locks() never deferences files so a stale value is safe */
+ show_fd_locks(m, file, files);
+ if (seq_has_overflowed(m))
+ goto out;
+
+ if (file->f_op->show_fdinfo)
+ file->f_op->show_fdinfo(m, file);
+
+out:
+ fput(file);
+ return 0;
+}
+
+static int proc_fdinfo_access_allowed(struct inode *inode)
+{
+ bool allowed = false;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ put_task_struct(task);
+
+ if (!allowed)
+ return -EACCES;
+
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return single_open(file, seq_show, inode);
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+ .open = seq_fdinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
+{
+ struct file *file;
+
+ rcu_read_lock();
+ file = task_lookup_fd_rcu(task, fd);
+ if (file)
+ *mode = file->f_mode;
+ rcu_read_unlock();
+ return !!file;
+}
+
+static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
+ fmode_t f_mode)
+{
+ task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
+ if (S_ISLNK(inode->i_mode)) {
+ unsigned i_mode = S_IFLNK;
+ if (f_mode & FMODE_READ)
+ i_mode |= S_IRUSR | S_IXUSR;
+ if (f_mode & FMODE_WRITE)
+ i_mode |= S_IWUSR | S_IXUSR;
+ inode->i_mode = i_mode;
+ }
+ security_task_to_inode(task, inode);
+}
+
+static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct task_struct *task;
+ struct inode *inode;
+ unsigned int fd;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ inode = d_inode(dentry);
+ task = get_proc_task(inode);
+ fd = proc_fd(inode);
+
+ if (task) {
+ fmode_t f_mode;
+ if (tid_fd_mode(task, fd, &f_mode)) {
+ tid_fd_update_inode(task, inode, f_mode);
+ put_task_struct(task);
+ return 1;
+ }
+ put_task_struct(task);
+ }
+ return 0;
+}
+
+static const struct dentry_operations tid_fd_dentry_operations = {
+ .d_revalidate = tid_fd_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+static int proc_fd_link(struct dentry *dentry, struct path *path)
+{
+ struct task_struct *task;
+ int ret = -ENOENT;
+
+ task = get_proc_task(d_inode(dentry));
+ if (task) {
+ unsigned int fd = proc_fd(d_inode(dentry));
+ struct file *fd_file;
+
+ fd_file = fget_task(task, fd);
+ if (fd_file) {
+ *path = fd_file->f_path;
+ path_get(&fd_file->f_path);
+ ret = 0;
+ fput(fd_file);
+ }
+ put_task_struct(task);
+ }
+
+ return ret;
+}
+
+struct fd_data {
+ fmode_t mode;
+ unsigned fd;
+};
+
+static struct dentry *proc_fd_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ const struct fd_data *data = ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ ei->fd = data->fd;
+
+ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+
+ ei->op.proc_get_link = proc_fd_link;
+ tid_fd_update_inode(task, inode, data->mode);
+
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+static struct dentry *proc_lookupfd_common(struct inode *dir,
+ struct dentry *dentry,
+ instantiate_t instantiate)
+{
+ struct task_struct *task = get_proc_task(dir);
+ struct fd_data data = {.fd = name_to_int(&dentry->d_name)};
+ struct dentry *result = ERR_PTR(-ENOENT);
+
+ if (!task)
+ goto out_no_task;
+ if (data.fd == ~0U)
+ goto out;
+ if (!tid_fd_mode(task, data.fd, &data.mode))
+ goto out;
+
+ result = instantiate(dentry, task, &data);
+out:
+ put_task_struct(task);
+out_no_task:
+ return result;
+}
+
+static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+ instantiate_t instantiate)
+{
+ struct task_struct *p = get_proc_task(file_inode(file));
+ unsigned int fd;
+
+ if (!p)
+ return -ENOENT;
+
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+
+ rcu_read_lock();
+ for (fd = ctx->pos - 2;; fd++) {
+ struct file *f;
+ struct fd_data data;
+ char name[10 + 1];
+ unsigned int len;
+
+ f = task_lookup_next_fd_rcu(p, &fd);
+ ctx->pos = fd + 2LL;
+ if (!f)
+ break;
+ data.mode = f->f_mode;
+ rcu_read_unlock();
+ data.fd = fd;
+
+ len = snprintf(name, sizeof(name), "%u", fd);
+ if (!proc_fill_cache(file, ctx,
+ name, len, instantiate, p,
+ &data))
+ goto out;
+ cond_resched();
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out:
+ put_task_struct(p);
+ return 0;
+}
+
+static int proc_readfd(struct file *file, struct dir_context *ctx)
+{
+ return proc_readfd_common(file, ctx, proc_fd_instantiate);
+}
+
+const struct file_operations proc_fd_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_readfd,
+ .llseek = generic_file_llseek,
+};
+
+static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
+}
+
+/*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+int proc_fd_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ struct task_struct *p;
+ int rv;
+
+ rv = generic_permission(&init_user_ns, inode, mask);
+ if (rv == 0)
+ return rv;
+
+ rcu_read_lock();
+ p = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (p && same_thread_group(p, current))
+ rv = 0;
+ rcu_read_unlock();
+
+ return rv;
+}
+
+const struct inode_operations proc_fd_inode_operations = {
+ .lookup = proc_lookupfd,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ const struct fd_data *data = ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ ei->fd = data->fd;
+
+ inode->i_fop = &proc_fdinfo_file_operations;
+ tid_fd_update_inode(task, inode, 0);
+
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+static struct dentry *
+proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
+}
+
+static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+{
+ return proc_readfd_common(file, ctx,
+ proc_fdinfo_instantiate);
+}
+
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+const struct inode_operations proc_fdinfo_inode_operations = {
+ .lookup = proc_lookupfdinfo,
+ .setattr = proc_setattr,
+};
+
+const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
+ .read = generic_read_dir,
+ .iterate_shared = proc_readfdinfo,
+ .llseek = generic_file_llseek,
+};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
new file mode 100644
index 000000000..c5a921a06
--- /dev/null
+++ b/fs/proc/fd.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PROCFS_FD_H__
+#define __PROCFS_FD_H__
+
+#include <linux/fs.h>
+
+extern const struct file_operations proc_fd_operations;
+extern const struct inode_operations proc_fd_inode_operations;
+
+extern const struct file_operations proc_fdinfo_operations;
+extern const struct inode_operations proc_fdinfo_inode_operations;
+
+extern int proc_fd_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask);
+
+static inline unsigned int proc_fd(struct inode *inode)
+{
+ return PROC_I(inode)->fd;
+}
+
+#endif /* __PROCFS_FD_H__ */
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
new file mode 100644
index 000000000..587b91d9d
--- /dev/null
+++ b/fs/proc/generic.c
@@ -0,0 +1,819 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * proc/fs/generic.c --- generic routines for the proc-fs
+ *
+ * This file contains generic proc-fs routines for handling
+ * directories and files.
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds.
+ * Copyright (C) 1997 Theodore Ts'o
+ */
+
+#include <linux/cache.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/printk.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/idr.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+
+static DEFINE_RWLOCK(proc_subdir_lock);
+
+struct kmem_cache *proc_dir_entry_cache __ro_after_init;
+
+void pde_free(struct proc_dir_entry *pde)
+{
+ if (S_ISLNK(pde->mode))
+ kfree(pde->data);
+ if (pde->name != pde->inline_name)
+ kfree(pde->name);
+ kmem_cache_free(proc_dir_entry_cache, pde);
+}
+
+static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len)
+{
+ if (len < de->namelen)
+ return -1;
+ if (len > de->namelen)
+ return 1;
+
+ return memcmp(name, de->name, len);
+}
+
+static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
+{
+ return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
+ subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
+{
+ return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry,
+ subdir_node);
+}
+
+static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+ const char *name,
+ unsigned int len)
+{
+ struct rb_node *node = dir->subdir.rb_node;
+
+ while (node) {
+ struct proc_dir_entry *de = rb_entry(node,
+ struct proc_dir_entry,
+ subdir_node);
+ int result = proc_match(name, de, len);
+
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return de;
+ }
+ return NULL;
+}
+
+static bool pde_subdir_insert(struct proc_dir_entry *dir,
+ struct proc_dir_entry *de)
+{
+ struct rb_root *root = &dir->subdir;
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct proc_dir_entry *this = rb_entry(*new,
+ struct proc_dir_entry,
+ subdir_node);
+ int result = proc_match(de->name, this, de->namelen);
+
+ parent = *new;
+ if (result < 0)
+ new = &(*new)->rb_left;
+ else if (result > 0)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&de->subdir_node, parent, new);
+ rb_insert_color(&de->subdir_node, root);
+ return true;
+}
+
+static int proc_notify_change(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct proc_dir_entry *de = PDE(inode);
+ int error;
+
+ error = setattr_prepare(&init_user_ns, dentry, iattr);
+ if (error)
+ return error;
+
+ setattr_copy(&init_user_ns, inode, iattr);
+ mark_inode_dirty(inode);
+
+ proc_set_user(de, inode->i_uid, inode->i_gid);
+ de->mode = inode->i_mode;
+ return 0;
+}
+
+static int proc_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct proc_dir_entry *de = PDE(inode);
+ if (de) {
+ nlink_t nlink = READ_ONCE(de->nlink);
+ if (nlink > 0) {
+ set_nlink(inode, nlink);
+ }
+ }
+
+ generic_fillattr(&init_user_ns, inode, stat);
+ return 0;
+}
+
+static const struct inode_operations proc_file_inode_operations = {
+ .setattr = proc_notify_change,
+};
+
+/*
+ * This function parses a name such as "tty/driver/serial", and
+ * returns the struct proc_dir_entry for "/proc/tty/driver", and
+ * returns "serial" in residual.
+ */
+static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+ const char **residual)
+{
+ const char *cp = name, *next;
+ struct proc_dir_entry *de;
+
+ de = *ret ?: &proc_root;
+ while ((next = strchr(cp, '/')) != NULL) {
+ de = pde_subdir_find(de, cp, next - cp);
+ if (!de) {
+ WARN(1, "name '%s'\n", name);
+ return -ENOENT;
+ }
+ cp = next + 1;
+ }
+ *residual = cp;
+ *ret = de;
+ return 0;
+}
+
+static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+ const char **residual)
+{
+ int rv;
+
+ read_lock(&proc_subdir_lock);
+ rv = __xlate_proc_name(name, ret, residual);
+ read_unlock(&proc_subdir_lock);
+ return rv;
+}
+
+static DEFINE_IDA(proc_inum_ida);
+
+#define PROC_DYNAMIC_FIRST 0xF0000000U
+
+/*
+ * Return an inode number between PROC_DYNAMIC_FIRST and
+ * 0xffffffff, or zero on failure.
+ */
+int proc_alloc_inum(unsigned int *inum)
+{
+ int i;
+
+ i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1,
+ GFP_KERNEL);
+ if (i < 0)
+ return i;
+
+ *inum = PROC_DYNAMIC_FIRST + (unsigned int)i;
+ return 0;
+}
+
+void proc_free_inum(unsigned int inum)
+{
+ ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+}
+
+static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0)
+ return 0; /* revalidate */
+ return 1;
+}
+
+static int proc_misc_d_delete(const struct dentry *dentry)
+{
+ return atomic_read(&PDE(d_inode(dentry))->in_use) < 0;
+}
+
+static const struct dentry_operations proc_misc_dentry_ops = {
+ .d_revalidate = proc_misc_d_revalidate,
+ .d_delete = proc_misc_d_delete,
+};
+
+/*
+ * Don't create negative dentries here, return -ENOENT by hand
+ * instead.
+ */
+struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
+ struct proc_dir_entry *de)
+{
+ struct inode *inode;
+
+ read_lock(&proc_subdir_lock);
+ de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
+ if (de) {
+ pde_get(de);
+ read_unlock(&proc_subdir_lock);
+ inode = proc_get_inode(dir->i_sb, de);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ d_set_d_op(dentry, de->proc_dops);
+ return d_splice_alias(inode, dentry);
+ }
+ read_unlock(&proc_subdir_lock);
+ return ERR_PTR(-ENOENT);
+}
+
+struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb);
+
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ return ERR_PTR(-ENOENT);
+
+ return proc_lookup_de(dir, dentry, PDE(dir));
+}
+
+/*
+ * This returns non-zero if at EOF, so that the /proc
+ * root directory can use this and check if it should
+ * continue with the <pid> entries..
+ *
+ * Note that the VFS-layer doesn't care about the return
+ * value of the readdir() call, as long as it's non-negative
+ * for success..
+ */
+int proc_readdir_de(struct file *file, struct dir_context *ctx,
+ struct proc_dir_entry *de)
+{
+ int i;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ i = ctx->pos - 2;
+ read_lock(&proc_subdir_lock);
+ de = pde_subdir_first(de);
+ for (;;) {
+ if (!de) {
+ read_unlock(&proc_subdir_lock);
+ return 0;
+ }
+ if (!i)
+ break;
+ de = pde_subdir_next(de);
+ i--;
+ }
+
+ do {
+ struct proc_dir_entry *next;
+ pde_get(de);
+ read_unlock(&proc_subdir_lock);
+ if (!dir_emit(ctx, de->name, de->namelen,
+ de->low_ino, de->mode >> 12)) {
+ pde_put(de);
+ return 0;
+ }
+ ctx->pos++;
+ read_lock(&proc_subdir_lock);
+ next = pde_subdir_next(de);
+ pde_put(de);
+ de = next;
+ } while (de);
+ read_unlock(&proc_subdir_lock);
+ return 1;
+}
+
+int proc_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
+
+ if (fs_info->pidonly == PROC_PIDONLY_ON)
+ return 1;
+
+ return proc_readdir_de(file, ctx, PDE(inode));
+}
+
+/*
+ * These are the generic /proc directory operations. They
+ * use the in-memory "struct proc_dir_entry" tree to parse
+ * the /proc directory.
+ */
+static const struct file_operations proc_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = proc_readdir,
+};
+
+static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return 0;
+}
+
+const struct dentry_operations proc_net_dentry_ops = {
+ .d_revalidate = proc_net_d_revalidate,
+ .d_delete = always_delete_dentry,
+};
+
+/*
+ * proc directories can do almost nothing..
+ */
+static const struct inode_operations proc_dir_inode_operations = {
+ .lookup = proc_lookup,
+ .getattr = proc_getattr,
+ .setattr = proc_notify_change,
+};
+
+/* returns the registered entry, or frees dp and returns NULL on failure */
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+ struct proc_dir_entry *dp)
+{
+ if (proc_alloc_inum(&dp->low_ino))
+ goto out_free_entry;
+
+ write_lock(&proc_subdir_lock);
+ dp->parent = dir;
+ if (pde_subdir_insert(dir, dp) == false) {
+ WARN(1, "proc_dir_entry '%s/%s' already registered\n",
+ dir->name, dp->name);
+ write_unlock(&proc_subdir_lock);
+ goto out_free_inum;
+ }
+ dir->nlink++;
+ write_unlock(&proc_subdir_lock);
+
+ return dp;
+out_free_inum:
+ proc_free_inum(dp->low_ino);
+out_free_entry:
+ pde_free(dp);
+ return NULL;
+}
+
+static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
+ const char *name,
+ umode_t mode,
+ nlink_t nlink)
+{
+ struct proc_dir_entry *ent = NULL;
+ const char *fn;
+ struct qstr qstr;
+
+ if (xlate_proc_name(name, parent, &fn) != 0)
+ goto out;
+ qstr.name = fn;
+ qstr.len = strlen(fn);
+ if (qstr.len == 0 || qstr.len >= 256) {
+ WARN(1, "name len %u\n", qstr.len);
+ return NULL;
+ }
+ if (qstr.len == 1 && fn[0] == '.') {
+ WARN(1, "name '.'\n");
+ return NULL;
+ }
+ if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') {
+ WARN(1, "name '..'\n");
+ return NULL;
+ }
+ if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
+ WARN(1, "create '/proc/%s' by hand\n", qstr.name);
+ return NULL;
+ }
+ if (is_empty_pde(*parent)) {
+ WARN(1, "attempt to add to permanently empty directory");
+ return NULL;
+ }
+
+ ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
+ if (!ent)
+ goto out;
+
+ if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) {
+ ent->name = ent->inline_name;
+ } else {
+ ent->name = kmalloc(qstr.len + 1, GFP_KERNEL);
+ if (!ent->name) {
+ pde_free(ent);
+ return NULL;
+ }
+ }
+
+ memcpy(ent->name, fn, qstr.len + 1);
+ ent->namelen = qstr.len;
+ ent->mode = mode;
+ ent->nlink = nlink;
+ ent->subdir = RB_ROOT;
+ refcount_set(&ent->refcnt, 1);
+ spin_lock_init(&ent->pde_unload_lock);
+ INIT_LIST_HEAD(&ent->pde_openers);
+ proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+
+ ent->proc_dops = &proc_misc_dentry_ops;
+ /* Revalidate everything under /proc/${pid}/net */
+ if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ pde_force_lookup(ent);
+
+out:
+ return ent;
+}
+
+struct proc_dir_entry *proc_symlink(const char *name,
+ struct proc_dir_entry *parent, const char *dest)
+{
+ struct proc_dir_entry *ent;
+
+ ent = __proc_create(&parent, name,
+ (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
+
+ if (ent) {
+ ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
+ if (ent->data) {
+ strcpy((char*)ent->data,dest);
+ ent->proc_iops = &proc_link_inode_operations;
+ ent = proc_register(parent, ent);
+ } else {
+ pde_free(ent);
+ ent = NULL;
+ }
+ }
+ return ent;
+}
+EXPORT_SYMBOL(proc_symlink);
+
+struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data, bool force_lookup)
+{
+ struct proc_dir_entry *ent;
+
+ if (mode == 0)
+ mode = S_IRUGO | S_IXUGO;
+
+ ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
+ if (ent) {
+ ent->data = data;
+ ent->proc_dir_ops = &proc_dir_operations;
+ ent->proc_iops = &proc_dir_inode_operations;
+ if (force_lookup) {
+ pde_force_lookup(ent);
+ }
+ ent = proc_register(parent, ent);
+ }
+ return ent;
+}
+EXPORT_SYMBOL_GPL(_proc_mkdir);
+
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data)
+{
+ return _proc_mkdir(name, mode, parent, data, false);
+}
+EXPORT_SYMBOL_GPL(proc_mkdir_data);
+
+struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
+ struct proc_dir_entry *parent)
+{
+ return proc_mkdir_data(name, mode, parent, NULL);
+}
+EXPORT_SYMBOL(proc_mkdir_mode);
+
+struct proc_dir_entry *proc_mkdir(const char *name,
+ struct proc_dir_entry *parent)
+{
+ return proc_mkdir_data(name, 0, parent, NULL);
+}
+EXPORT_SYMBOL(proc_mkdir);
+
+struct proc_dir_entry *proc_create_mount_point(const char *name)
+{
+ umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ struct proc_dir_entry *ent, *parent = NULL;
+
+ ent = __proc_create(&parent, name, mode, 2);
+ if (ent) {
+ ent->data = NULL;
+ ent->proc_dir_ops = NULL;
+ ent->proc_iops = NULL;
+ ent = proc_register(parent, ent);
+ }
+ return ent;
+}
+EXPORT_SYMBOL(proc_create_mount_point);
+
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+ struct proc_dir_entry **parent, void *data)
+{
+ struct proc_dir_entry *p;
+
+ if ((mode & S_IFMT) == 0)
+ mode |= S_IFREG;
+ if ((mode & S_IALLUGO) == 0)
+ mode |= S_IRUGO;
+ if (WARN_ON_ONCE(!S_ISREG(mode)))
+ return NULL;
+
+ p = __proc_create(parent, name, mode, 1);
+ if (p) {
+ p->proc_iops = &proc_file_inode_operations;
+ p->data = data;
+ }
+ return p;
+}
+
+static inline void pde_set_flags(struct proc_dir_entry *pde)
+{
+ if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct proc_ops *proc_ops, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_ops = proc_ops;
+ pde_set_flags(p);
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_data);
+
+struct proc_dir_entry *proc_create(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct proc_ops *proc_ops)
+{
+ return proc_create_data(name, mode, parent, proc_ops, NULL);
+}
+EXPORT_SYMBOL(proc_create);
+
+static int proc_seq_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ if (de->state_size)
+ return seq_open_private(file, de->seq_ops, de->state_size);
+ return seq_open(file, de->seq_ops);
+}
+
+static int proc_seq_release(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ if (de->state_size)
+ return seq_release_private(inode, file);
+ return seq_release(inode, file);
+}
+
+static const struct proc_ops proc_seq_ops = {
+ /* not permanent -- can call into arbitrary seq_operations */
+ .proc_open = proc_seq_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = proc_seq_release,
+};
+
+struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, const struct seq_operations *ops,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_ops = &proc_seq_ops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_seq_private);
+
+static int proc_single_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+
+ return single_open(file, de->single_show, de->data);
+}
+
+static const struct proc_ops proc_single_ops = {
+ /* not permanent -- can call into arbitrary ->single_show */
+ .proc_open = proc_single_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+};
+
+struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *), void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ p->proc_ops = &proc_single_ops;
+ p->single_show = show;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL(proc_create_single_data);
+
+void proc_set_size(struct proc_dir_entry *de, loff_t size)
+{
+ de->size = size;
+}
+EXPORT_SYMBOL(proc_set_size);
+
+void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
+{
+ de->uid = uid;
+ de->gid = gid;
+}
+EXPORT_SYMBOL(proc_set_user);
+
+void pde_put(struct proc_dir_entry *pde)
+{
+ if (refcount_dec_and_test(&pde->refcnt)) {
+ proc_free_inum(pde->low_ino);
+ pde_free(pde);
+ }
+}
+
+/*
+ * Remove a /proc entry and free it if it's not currently in use.
+ */
+void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+ struct proc_dir_entry *de = NULL;
+ const char *fn = name;
+ unsigned int len;
+
+ write_lock(&proc_subdir_lock);
+ if (__xlate_proc_name(name, &parent, &fn) != 0) {
+ write_unlock(&proc_subdir_lock);
+ return;
+ }
+ len = strlen(fn);
+
+ de = pde_subdir_find(parent, fn, len);
+ if (de) {
+ if (unlikely(pde_is_permanent(de))) {
+ WARN(1, "removing permanent /proc entry '%s'", de->name);
+ de = NULL;
+ } else {
+ rb_erase(&de->subdir_node, &parent->subdir);
+ if (S_ISDIR(de->mode))
+ parent->nlink--;
+ }
+ }
+ write_unlock(&proc_subdir_lock);
+ if (!de) {
+ WARN(1, "name '%s'\n", name);
+ return;
+ }
+
+ proc_entry_rundown(de);
+
+ WARN(pde_subdir_first(de),
+ "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
+ __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
+ pde_put(de);
+}
+EXPORT_SYMBOL(remove_proc_entry);
+
+int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
+{
+ struct proc_dir_entry *root = NULL, *de, *next;
+ const char *fn = name;
+ unsigned int len;
+
+ write_lock(&proc_subdir_lock);
+ if (__xlate_proc_name(name, &parent, &fn) != 0) {
+ write_unlock(&proc_subdir_lock);
+ return -ENOENT;
+ }
+ len = strlen(fn);
+
+ root = pde_subdir_find(parent, fn, len);
+ if (!root) {
+ write_unlock(&proc_subdir_lock);
+ return -ENOENT;
+ }
+ if (unlikely(pde_is_permanent(root))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ root->parent->name, root->name);
+ return -EINVAL;
+ }
+ rb_erase(&root->subdir_node, &parent->subdir);
+
+ de = root;
+ while (1) {
+ next = pde_subdir_first(de);
+ if (next) {
+ if (unlikely(pde_is_permanent(next))) {
+ write_unlock(&proc_subdir_lock);
+ WARN(1, "removing permanent /proc entry '%s/%s'",
+ next->parent->name, next->name);
+ return -EINVAL;
+ }
+ rb_erase(&next->subdir_node, &de->subdir);
+ de = next;
+ continue;
+ }
+ next = de->parent;
+ if (S_ISDIR(de->mode))
+ next->nlink--;
+ write_unlock(&proc_subdir_lock);
+
+ proc_entry_rundown(de);
+ if (de == root)
+ break;
+ pde_put(de);
+
+ write_lock(&proc_subdir_lock);
+ de = next;
+ }
+ pde_put(root);
+ return 0;
+}
+EXPORT_SYMBOL(remove_proc_subtree);
+
+void *proc_get_parent_data(const struct inode *inode)
+{
+ struct proc_dir_entry *de = PDE(inode);
+ return de->parent->data;
+}
+EXPORT_SYMBOL_GPL(proc_get_parent_data);
+
+void proc_remove(struct proc_dir_entry *de)
+{
+ if (de)
+ remove_proc_subtree(de->name, de->parent);
+}
+EXPORT_SYMBOL(proc_remove);
+
+/*
+ * Pull a user buffer into memory and pass it to the file's write handler if
+ * one is supplied. The ->write() method is permitted to modify the
+ * kernel-side buffer.
+ */
+ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size,
+ loff_t *_pos)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(f));
+ char *buf;
+ int ret;
+
+ if (!pde->write)
+ return -EACCES;
+ if (size == 0 || size > PAGE_SIZE - 1)
+ return -EINVAL;
+ buf = memdup_user_nul(ubuf, size);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+ ret = pde->write(f, buf, size);
+ kfree(buf);
+ return ret == 0 ? size : ret;
+}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
new file mode 100644
index 000000000..f495fdb39
--- /dev/null
+++ b/fs/proc/inode.c
@@ -0,0 +1,704 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/proc/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/cache.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <linux/pid_namespace.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/completion.h>
+#include <linux/poll.h>
+#include <linux/printk.h>
+#include <linux/file.h>
+#include <linux/limits.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/bug.h>
+
+#include "internal.h"
+
+static void proc_evict_inode(struct inode *inode)
+{
+ struct proc_dir_entry *de;
+ struct ctl_table_header *head;
+ struct proc_inode *ei = PROC_I(inode);
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+
+ /* Stop tracking associated processes */
+ if (ei->pid) {
+ proc_pid_evict_inode(ei);
+ ei->pid = NULL;
+ }
+
+ /* Let go of any associated proc directory entry */
+ de = ei->pde;
+ if (de) {
+ pde_put(de);
+ ei->pde = NULL;
+ }
+
+ head = ei->sysctl;
+ if (head) {
+ RCU_INIT_POINTER(ei->sysctl, NULL);
+ proc_sys_evict_inode(inode, head);
+ }
+}
+
+static struct kmem_cache *proc_inode_cachep __ro_after_init;
+static struct kmem_cache *pde_opener_cache __ro_after_init;
+
+static struct inode *proc_alloc_inode(struct super_block *sb)
+{
+ struct proc_inode *ei;
+
+ ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);
+ if (!ei)
+ return NULL;
+ ei->pid = NULL;
+ ei->fd = 0;
+ ei->op.proc_get_link = NULL;
+ ei->pde = NULL;
+ ei->sysctl = NULL;
+ ei->sysctl_entry = NULL;
+ INIT_HLIST_NODE(&ei->sibling_inodes);
+ ei->ns_ops = NULL;
+ return &ei->vfs_inode;
+}
+
+static void proc_free_inode(struct inode *inode)
+{
+ kmem_cache_free(proc_inode_cachep, PROC_I(inode));
+}
+
+static void init_once(void *foo)
+{
+ struct proc_inode *ei = (struct proc_inode *) foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+void __init proc_init_kmemcache(void)
+{
+ proc_inode_cachep = kmem_cache_create("proc_inode_cache",
+ sizeof(struct proc_inode),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_PANIC),
+ init_once);
+ pde_opener_cache =
+ kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0,
+ SLAB_ACCOUNT|SLAB_PANIC, NULL);
+ proc_dir_entry_cache = kmem_cache_create_usercopy(
+ "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC,
+ offsetof(struct proc_dir_entry, inline_name),
+ SIZEOF_PDE_INLINE_NAME, NULL);
+ BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
+}
+
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct hlist_node *node;
+ struct super_block *old_sb = NULL;
+
+ rcu_read_lock();
+ for (;;) {
+ struct super_block *sb;
+ node = hlist_first_rcu(inodes);
+ if (!node)
+ break;
+ ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ spin_lock(lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(lock);
+
+ inode = &ei->vfs_inode;
+ sb = inode->i_sb;
+ if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
+ continue;
+ inode = igrab(inode);
+ rcu_read_unlock();
+ if (sb != old_sb) {
+ if (old_sb)
+ deactivate_super(old_sb);
+ old_sb = sb;
+ }
+ if (unlikely(!inode)) {
+ rcu_read_lock();
+ continue;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *dir = d_find_any_alias(inode);
+ if (dir) {
+ d_invalidate(dir);
+ dput(dir);
+ }
+ } else {
+ struct dentry *dentry;
+ while ((dentry = d_find_alias(inode))) {
+ d_invalidate(dentry);
+ dput(dentry);
+ }
+ }
+ iput(inode);
+
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ if (old_sb)
+ deactivate_super(old_sb);
+}
+
+static inline const char *hidepid2str(enum proc_hidepid v)
+{
+ switch (v) {
+ case HIDEPID_OFF: return "off";
+ case HIDEPID_NO_ACCESS: return "noaccess";
+ case HIDEPID_INVISIBLE: return "invisible";
+ case HIDEPID_NOT_PTRACEABLE: return "ptraceable";
+ }
+ WARN_ONCE(1, "bad hide_pid value: %d\n", v);
+ return "unknown";
+}
+
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct proc_fs_info *fs_info = proc_sb_info(root->d_sb);
+
+ if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid));
+ if (fs_info->hide_pid != HIDEPID_OFF)
+ seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid));
+ if (fs_info->pidonly != PROC_PIDONLY_OFF)
+ seq_printf(seq, ",subset=pid");
+
+ return 0;
+}
+
+const struct super_operations proc_sops = {
+ .alloc_inode = proc_alloc_inode,
+ .free_inode = proc_free_inode,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = proc_evict_inode,
+ .statfs = simple_statfs,
+ .show_options = proc_show_options,
+};
+
+enum {BIAS = -1U<<31};
+
+static inline int use_pde(struct proc_dir_entry *pde)
+{
+ return likely(atomic_inc_unless_negative(&pde->in_use));
+}
+
+static void unuse_pde(struct proc_dir_entry *pde)
+{
+ if (unlikely(atomic_dec_return(&pde->in_use) == BIAS))
+ complete(pde->pde_unload_completion);
+}
+
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
+static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
+ __releases(&pde->pde_unload_lock)
+{
+ /*
+ * close() (proc_reg_release()) can't delete an entry and proceed:
+ * ->release hook needs to be available at the right moment.
+ *
+ * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
+ * "struct file" needs to be available at the right moment.
+ */
+ if (pdeo->closing) {
+ /* somebody else is doing that, just wait */
+ DECLARE_COMPLETION_ONSTACK(c);
+ pdeo->c = &c;
+ spin_unlock(&pde->pde_unload_lock);
+ wait_for_completion(&c);
+ } else {
+ struct file *file;
+ struct completion *c;
+
+ pdeo->closing = true;
+ spin_unlock(&pde->pde_unload_lock);
+
+ file = pdeo->file;
+ pde->proc_ops->proc_release(file_inode(file), file);
+
+ spin_lock(&pde->pde_unload_lock);
+ /* Strictly after ->proc_release, see above. */
+ list_del(&pdeo->lh);
+ c = pdeo->c;
+ spin_unlock(&pde->pde_unload_lock);
+ if (unlikely(c))
+ complete(c);
+ kmem_cache_free(pde_opener_cache, pdeo);
+ }
+}
+
+void proc_entry_rundown(struct proc_dir_entry *de)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+ /* Wait until all existing callers into module are done. */
+ de->pde_unload_completion = &c;
+ if (atomic_add_return(BIAS, &de->in_use) != BIAS)
+ wait_for_completion(&c);
+
+ /* ->pde_openers list can't grow from now on. */
+
+ spin_lock(&de->pde_unload_lock);
+ while (!list_empty(&de->pde_openers)) {
+ struct pde_opener *pdeo;
+ pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
+ close_pdeo(de, pdeo);
+ spin_lock(&de->pde_unload_lock);
+ }
+ spin_unlock(&de->pde_unload_lock);
+}
+
+static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ loff_t rv = -EINVAL;
+
+ if (pde_is_permanent(pde)) {
+ return pde->proc_ops->proc_lseek(file, offset, whence);
+ } else if (use_pde(pde)) {
+ rv = pde->proc_ops->proc_lseek(file, offset, whence);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
+static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
+ ssize_t ret;
+
+ if (pde_is_permanent(pde))
+ return pde->proc_ops->proc_read_iter(iocb, iter);
+
+ if (!use_pde(pde))
+ return -EIO;
+ ret = pde->proc_ops->proc_read_iter(iocb, iter);
+ unuse_pde(pde);
+ return ret;
+}
+
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_read) read;
+
+ read = pde->proc_ops->proc_read;
+ if (read)
+ return read(file, buf, count, ppos);
+ return -EIO;
+}
+
+static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ ssize_t rv = -EIO;
+
+ if (pde_is_permanent(pde)) {
+ return pde_read(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_read(pde, file, buf, count, ppos);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+ typeof_member(struct proc_ops, proc_write) write;
+
+ write = pde->proc_ops->proc_write;
+ if (write)
+ return write(file, buf, count, ppos);
+ return -EIO;
+}
+
+static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ ssize_t rv = -EIO;
+
+ if (pde_is_permanent(pde)) {
+ return pde_write(pde, file, buf, count, ppos);
+ } else if (use_pde(pde)) {
+ rv = pde_write(pde, file, buf, count, ppos);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+ typeof_member(struct proc_ops, proc_poll) poll;
+
+ poll = pde->proc_ops->proc_poll;
+ if (poll)
+ return poll(file, pts);
+ return DEFAULT_POLLMASK;
+}
+
+static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ __poll_t rv = DEFAULT_POLLMASK;
+
+ if (pde_is_permanent(pde)) {
+ return pde_poll(pde, file, pts);
+ } else if (use_pde(pde)) {
+ rv = pde_poll(pde, file, pts);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_ioctl) ioctl;
+
+ ioctl = pde->proc_ops->proc_ioctl;
+ if (ioctl)
+ return ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
+static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ long rv = -ENOTTY;
+
+ if (pde_is_permanent(pde)) {
+ return pde_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_ioctl(pde, file, cmd, arg);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
+#ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+ typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
+
+ compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ if (compat_ioctl)
+ return compat_ioctl(file, cmd, arg);
+ return -ENOTTY;
+}
+
+static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ long rv = -ENOTTY;
+ if (pde_is_permanent(pde)) {
+ return pde_compat_ioctl(pde, file, cmd, arg);
+ } else if (use_pde(pde)) {
+ rv = pde_compat_ioctl(pde, file, cmd, arg);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+#endif
+
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+ typeof_member(struct proc_ops, proc_mmap) mmap;
+
+ mmap = pde->proc_ops->proc_mmap;
+ if (mmap)
+ return mmap(file, vma);
+ return -EIO;
+}
+
+static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ int rv = -EIO;
+
+ if (pde_is_permanent(pde)) {
+ return pde_mmap(pde, file, vma);
+ } else if (use_pde(pde)) {
+ rv = pde_mmap(pde, file, vma);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
+static unsigned long
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+
+ get_area = pde->proc_ops->proc_get_unmapped_area;
+#ifdef CONFIG_MMU
+ if (!get_area)
+ get_area = current->mm->get_unmapped_area;
+#endif
+ if (get_area)
+ return get_area(file, orig_addr, len, pgoff, flags);
+ return orig_addr;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct proc_dir_entry *pde = PDE(file_inode(file));
+ unsigned long rv = -EIO;
+
+ if (pde_is_permanent(pde)) {
+ return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+ } else if (use_pde(pde)) {
+ rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+ unuse_pde(pde);
+ }
+ return rv;
+}
+
+static int proc_reg_open(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *pde = PDE(inode);
+ int rv = 0;
+ typeof_member(struct proc_ops, proc_open) open;
+ typeof_member(struct proc_ops, proc_release) release;
+ struct pde_opener *pdeo;
+
+ if (!pde->proc_ops->proc_lseek)
+ file->f_mode &= ~FMODE_LSEEK;
+
+ if (pde_is_permanent(pde)) {
+ open = pde->proc_ops->proc_open;
+ if (open)
+ rv = open(inode, file);
+ return rv;
+ }
+
+ /*
+ * Ensure that
+ * 1) PDE's ->release hook will be called no matter what
+ * either normally by close()/->release, or forcefully by
+ * rmmod/remove_proc_entry.
+ *
+ * 2) rmmod isn't blocked by opening file in /proc and sitting on
+ * the descriptor (including "rmmod foo </proc/foo" scenario).
+ *
+ * Save every "struct file" with custom ->release hook.
+ */
+ if (!use_pde(pde))
+ return -ENOENT;
+
+ release = pde->proc_ops->proc_release;
+ if (release) {
+ pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
+ if (!pdeo) {
+ rv = -ENOMEM;
+ goto out_unuse;
+ }
+ }
+
+ open = pde->proc_ops->proc_open;
+ if (open)
+ rv = open(inode, file);
+
+ if (release) {
+ if (rv == 0) {
+ /* To know what to release. */
+ pdeo->file = file;
+ pdeo->closing = false;
+ pdeo->c = NULL;
+ spin_lock(&pde->pde_unload_lock);
+ list_add(&pdeo->lh, &pde->pde_openers);
+ spin_unlock(&pde->pde_unload_lock);
+ } else
+ kmem_cache_free(pde_opener_cache, pdeo);
+ }
+
+out_unuse:
+ unuse_pde(pde);
+ return rv;
+}
+
+static int proc_reg_release(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *pde = PDE(inode);
+ struct pde_opener *pdeo;
+
+ if (pde_is_permanent(pde)) {
+ typeof_member(struct proc_ops, proc_release) release;
+
+ release = pde->proc_ops->proc_release;
+ if (release) {
+ return release(inode, file);
+ }
+ return 0;
+ }
+
+ spin_lock(&pde->pde_unload_lock);
+ list_for_each_entry(pdeo, &pde->pde_openers, lh) {
+ if (pdeo->file == file) {
+ close_pdeo(pde, pdeo);
+ return 0;
+ }
+ }
+ spin_unlock(&pde->pde_unload_lock);
+ return 0;
+}
+
+static const struct file_operations proc_reg_file_ops = {
+ .llseek = proc_reg_llseek,
+ .read = proc_reg_read,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .write = proc_reg_write,
+ .splice_read = generic_file_splice_read,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+#ifdef CONFIG_COMPAT
+static const struct file_operations proc_reg_file_ops_compat = {
+ .llseek = proc_reg_llseek,
+ .read = proc_reg_read,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+
+static const struct file_operations proc_iter_file_ops_compat = {
+ .llseek = proc_reg_llseek,
+ .read_iter = proc_reg_read_iter,
+ .splice_read = generic_file_splice_read,
+ .write = proc_reg_write,
+ .poll = proc_reg_poll,
+ .unlocked_ioctl = proc_reg_unlocked_ioctl,
+ .compat_ioctl = proc_reg_compat_ioctl,
+ .mmap = proc_reg_mmap,
+ .get_unmapped_area = proc_reg_get_unmapped_area,
+ .open = proc_reg_open,
+ .release = proc_reg_release,
+};
+#endif
+
+static void proc_put_link(void *p)
+{
+ unuse_pde(p);
+}
+
+static const char *proc_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct proc_dir_entry *pde = PDE(inode);
+ if (!use_pde(pde))
+ return ERR_PTR(-EINVAL);
+ set_delayed_call(done, proc_put_link, pde);
+ return pde->data;
+}
+
+const struct inode_operations proc_link_inode_operations = {
+ .get_link = proc_get_link,
+};
+
+struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
+{
+ struct inode *inode = new_inode(sb);
+
+ if (!inode) {
+ pde_put(de);
+ return NULL;
+ }
+
+ inode->i_private = de->data;
+ inode->i_ino = de->low_ino;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ PROC_I(inode)->pde = de;
+ if (is_empty_pde(de)) {
+ make_empty_dir_inode(inode);
+ return inode;
+ }
+
+ if (de->mode) {
+ inode->i_mode = de->mode;
+ inode->i_uid = de->uid;
+ inode->i_gid = de->gid;
+ }
+ if (de->size)
+ inode->i_size = de->size;
+ if (de->nlink)
+ set_nlink(inode, de->nlink);
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ if (de->proc_ops->proc_read_iter)
+ inode->i_fop = &proc_iter_file_ops;
+ else
+ inode->i_fop = &proc_reg_file_ops;
+#ifdef CONFIG_COMPAT
+ if (de->proc_ops->proc_compat_ioctl) {
+ if (de->proc_ops->proc_read_iter)
+ inode->i_fop = &proc_iter_file_ops_compat;
+ else
+ inode->i_fop = &proc_reg_file_ops_compat;
+ }
+#endif
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = de->proc_dir_ops;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = de->proc_iops;
+ inode->i_fop = NULL;
+ } else {
+ BUG();
+ }
+ return inode;
+}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
new file mode 100644
index 000000000..6b921826d
--- /dev/null
+++ b/fs/proc/internal.h
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal procfs definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/refcount.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/binfmts.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
+
+struct ctl_table_header;
+struct mempolicy;
+
+/*
+ * This is not completely implemented yet. The idea is to
+ * create an in-memory tree (like the actual /proc filesystem
+ * tree) of these proc_dir_entries, so that we can dynamically
+ * add new files to /proc.
+ *
+ * parent/subdir are used for the directory structure (every /proc file has a
+ * parent, but "subdir" is empty for all non-directory entries).
+ * subdir_node is used to build the rb tree "subdir" of the parent.
+ */
+struct proc_dir_entry {
+ /*
+ * number of callers into module in progress;
+ * negative -> it's going away RSN
+ */
+ atomic_t in_use;
+ refcount_t refcnt;
+ struct list_head pde_openers; /* who did ->open, but not ->release */
+ /* protects ->pde_openers and all struct pde_opener instances */
+ spinlock_t pde_unload_lock;
+ struct completion *pde_unload_completion;
+ const struct inode_operations *proc_iops;
+ union {
+ const struct proc_ops *proc_ops;
+ const struct file_operations *proc_dir_ops;
+ };
+ const struct dentry_operations *proc_dops;
+ union {
+ const struct seq_operations *seq_ops;
+ int (*single_show)(struct seq_file *, void *);
+ };
+ proc_write_t write;
+ void *data;
+ unsigned int state_size;
+ unsigned int low_ino;
+ nlink_t nlink;
+ kuid_t uid;
+ kgid_t gid;
+ loff_t size;
+ struct proc_dir_entry *parent;
+ struct rb_root subdir;
+ struct rb_node subdir_node;
+ char *name;
+ umode_t mode;
+ u8 flags;
+ u8 namelen;
+ char inline_name[];
+} __randomize_layout;
+
+#define SIZEOF_PDE ( \
+ sizeof(struct proc_dir_entry) < 128 ? 128 : \
+ sizeof(struct proc_dir_entry) < 192 ? 192 : \
+ sizeof(struct proc_dir_entry) < 256 ? 256 : \
+ sizeof(struct proc_dir_entry) < 512 ? 512 : \
+ 0)
+#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
+
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
+static inline void pde_make_permanent(struct proc_dir_entry *pde)
+{
+ pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
+extern struct kmem_cache *proc_dir_entry_cache;
+void pde_free(struct proc_dir_entry *pde);
+
+union proc_op {
+ int (*proc_get_link)(struct dentry *, struct path *);
+ int (*proc_show)(struct seq_file *m,
+ struct pid_namespace *ns, struct pid *pid,
+ struct task_struct *task);
+ const char *lsm;
+};
+
+struct proc_inode {
+ struct pid *pid;
+ unsigned int fd;
+ union proc_op op;
+ struct proc_dir_entry *pde;
+ struct ctl_table_header *sysctl;
+ struct ctl_table *sysctl_entry;
+ struct hlist_node sibling_inodes;
+ const struct proc_ns_operations *ns_ops;
+ struct inode vfs_inode;
+} __randomize_layout;
+
+/*
+ * General functions
+ */
+static inline struct proc_inode *PROC_I(const struct inode *inode)
+{
+ return container_of(inode, struct proc_inode, vfs_inode);
+}
+
+static inline struct proc_dir_entry *PDE(const struct inode *inode)
+{
+ return PROC_I(inode)->pde;
+}
+
+static inline struct pid *proc_pid(const struct inode *inode)
+{
+ return PROC_I(inode)->pid;
+}
+
+static inline struct task_struct *get_proc_task(const struct inode *inode)
+{
+ return get_pid_task(proc_pid(inode), PIDTYPE_PID);
+}
+
+void task_dump_owner(struct task_struct *task, umode_t mode,
+ kuid_t *ruid, kgid_t *rgid);
+
+unsigned name_to_int(const struct qstr *qstr);
+/*
+ * Offset of the first process in the /proc root directory..
+ */
+#define FIRST_PROCESS_ENTRY 256
+
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 13
+
+/*
+ * array.c
+ */
+extern const struct file_operations proc_tid_children_operations;
+
+extern void proc_task_name(struct seq_file *m, struct task_struct *p,
+ bool escape);
+extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
+ struct pid *, struct task_struct *);
+
+/*
+ * base.c
+ */
+extern const struct dentry_operations pid_dentry_operations;
+extern int pid_getattr(struct user_namespace *, const struct path *,
+ struct kstat *, u32, unsigned int);
+extern int proc_setattr(struct user_namespace *, struct dentry *,
+ struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
+extern void pid_update_inode(struct task_struct *, struct inode *);
+extern int pid_delete_dentry(const struct dentry *);
+extern int proc_pid_readdir(struct file *, struct dir_context *);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
+extern loff_t mem_lseek(struct file *, loff_t, int);
+
+/* Lookups */
+typedef struct dentry *instantiate_t(struct dentry *,
+ struct task_struct *, const void *);
+bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
+ instantiate_t, struct task_struct *, const void *);
+
+/*
+ * generic.c
+ */
+struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
+ struct proc_dir_entry **parent, void *data);
+struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
+ struct proc_dir_entry *dp);
+extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
+extern int proc_readdir(struct file *, struct dir_context *);
+int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);
+
+static inline void pde_get(struct proc_dir_entry *pde)
+{
+ refcount_inc(&pde->refcnt);
+}
+extern void pde_put(struct proc_dir_entry *);
+
+static inline bool is_empty_pde(const struct proc_dir_entry *pde)
+{
+ return S_ISDIR(pde->mode) && !pde->proc_iops;
+}
+extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *);
+
+/*
+ * inode.c
+ */
+struct pde_opener {
+ struct list_head lh;
+ struct file *file;
+ bool closing;
+ struct completion *c;
+} __randomize_layout;
+extern const struct inode_operations proc_link_inode_operations;
+extern const struct inode_operations proc_pid_link_inode_operations;
+extern const struct super_operations proc_sops;
+
+void proc_init_kmemcache(void);
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
+void set_proc_pid_nlink(void);
+extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+extern void proc_entry_rundown(struct proc_dir_entry *);
+
+/*
+ * proc_namespaces.c
+ */
+extern const struct inode_operations proc_ns_dir_inode_operations;
+extern const struct file_operations proc_ns_dir_operations;
+
+/*
+ * proc_net.c
+ */
+extern const struct file_operations proc_net_operations;
+extern const struct inode_operations proc_net_inode_operations;
+
+#ifdef CONFIG_NET
+extern int proc_net_init(void);
+#else
+static inline int proc_net_init(void) { return 0; }
+#endif
+
+/*
+ * proc_self.c
+ */
+extern int proc_setup_self(struct super_block *);
+
+/*
+ * proc_thread_self.c
+ */
+extern int proc_setup_thread_self(struct super_block *);
+extern void proc_thread_self_init(void);
+
+/*
+ * proc_sysctl.c
+ */
+#ifdef CONFIG_PROC_SYSCTL
+extern int proc_sys_init(void);
+extern void proc_sys_evict_inode(struct inode *inode,
+ struct ctl_table_header *head);
+#else
+static inline void proc_sys_init(void) { }
+static inline void proc_sys_evict_inode(struct inode *inode,
+ struct ctl_table_header *head) { }
+#endif
+
+/*
+ * proc_tty.c
+ */
+#ifdef CONFIG_TTY
+extern void proc_tty_init(void);
+#else
+static inline void proc_tty_init(void) {}
+#endif
+
+/*
+ * root.c
+ */
+extern struct proc_dir_entry proc_root;
+
+extern void proc_self_init(void);
+
+/*
+ * task_[no]mmu.c
+ */
+struct mem_size_stats;
+struct proc_maps_private {
+ struct inode *inode;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct vma_iterator iter;
+#ifdef CONFIG_NUMA
+ struct mempolicy *task_mempolicy;
+#endif
+} __randomize_layout;
+
+struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
+
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_pid_smaps_rollup_operations;
+extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
+
+extern unsigned long task_vsize(struct mm_struct *);
+extern unsigned long task_statm(struct mm_struct *,
+ unsigned long *, unsigned long *,
+ unsigned long *, unsigned long *);
+extern void task_mem(struct seq_file *, struct mm_struct *);
+
+extern const struct dentry_operations proc_net_dentry_ops;
+static inline void pde_force_lookup(struct proc_dir_entry *pde)
+{
+ /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+ pde->proc_dops = &proc_net_dentry_ops;
+}
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
new file mode 100644
index 000000000..cb0edc7cb
--- /dev/null
+++ b/fs/proc/interrupts.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irqnr.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/interrupts
+ */
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+ return (*pos <= nr_irqs) ? pos : NULL;
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ (*pos)++;
+ if (*pos > nr_irqs)
+ return NULL;
+ return pos;
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+static const struct seq_operations int_seq_ops = {
+ .start = int_seq_start,
+ .next = int_seq_next,
+ .stop = int_seq_stop,
+ .show = show_interrupts
+};
+
+static int __init proc_interrupts_init(void)
+{
+ proc_create_seq("interrupts", 0, NULL, &int_seq_ops);
+ return 0;
+}
+fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
new file mode 100644
index 000000000..dff921f7c
--- /dev/null
+++ b/fs/proc/kcore.c
@@ -0,0 +1,701 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/proc/kcore.c kernel ELF core dumper
+ *
+ * Modelled on fs/exec.c:aout_core_dump()
+ * Jeremy Fitzhardinge <jeremy@sw.oz.au>
+ * ELF version written by David Howells <David.Howells@nexor.co.uk>
+ * Modified and incorporated into 2.3.x by Tigran Aivazian <tigran@veritas.com>
+ * Support to dump vmalloc'd areas (ELF only), Tigran Aivazian <tigran@veritas.com>
+ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
+ */
+
+#include <linux/crash_core.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/kcore.h>
+#include <linux/user.h>
+#include <linux/capability.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/notifier.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/io.h>
+#include <linux/list.h>
+#include <linux/ioport.h>
+#include <linux/memory.h>
+#include <linux/sched/task.h>
+#include <linux/security.h>
+#include <asm/sections.h>
+#include "internal.h"
+
+#define CORE_STR "CORE"
+
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS 0
+#endif
+
+static struct proc_dir_entry *proc_root_kcore;
+
+
+#ifndef kc_vaddr_to_offset
+#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
+#endif
+#ifndef kc_offset_to_vaddr
+#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
+#endif
+
+static LIST_HEAD(kclist_head);
+static DECLARE_RWSEM(kclist_lock);
+static int kcore_need_update = 1;
+
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * Same as oldmem_pfn_is_ram in vmcore
+ */
+static int (*mem_pfn_is_ram)(unsigned long pfn);
+
+int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+ if (mem_pfn_is_ram)
+ return -EBUSY;
+ mem_pfn_is_ram = fn;
+ return 0;
+}
+
+static int pfn_is_ram(unsigned long pfn)
+{
+ if (mem_pfn_is_ram)
+ return mem_pfn_is_ram(pfn);
+ else
+ return 1;
+}
+
+/* This doesn't grab kclist_lock, so it should only be used at init time. */
+void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
+ int type)
+{
+ new->addr = (unsigned long)addr;
+ new->size = size;
+ new->type = type;
+
+ list_add_tail(&new->list, &kclist_head);
+}
+
+static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
+ size_t *data_offset)
+{
+ size_t try, size;
+ struct kcore_list *m;
+
+ *nphdr = 1; /* PT_NOTE */
+ size = 0;
+
+ list_for_each_entry(m, &kclist_head, list) {
+ try = kc_vaddr_to_offset((size_t)m->addr + m->size);
+ if (try > size)
+ size = try;
+ *nphdr = *nphdr + 1;
+ }
+
+ *phdrs_len = *nphdr * sizeof(struct elf_phdr);
+ *notes_len = (4 * sizeof(struct elf_note) +
+ 3 * ALIGN(sizeof(CORE_STR), 4) +
+ VMCOREINFO_NOTE_NAME_BYTES +
+ ALIGN(sizeof(struct elf_prstatus), 4) +
+ ALIGN(sizeof(struct elf_prpsinfo), 4) +
+ ALIGN(arch_task_struct_size, 4) +
+ ALIGN(vmcoreinfo_size, 4));
+ *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len +
+ *notes_len);
+ return *data_offset + size;
+}
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
+ * because memory hole is not as big as !HIGHMEM case.
+ * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
+ */
+static int kcore_ram_list(struct list_head *head)
+{
+ struct kcore_list *ent;
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->addr = (unsigned long)__va(0);
+ ent->size = max_low_pfn << PAGE_SHIFT;
+ ent->type = KCORE_RAM;
+ list_add(&ent->list, head);
+ return 0;
+}
+
+#else /* !CONFIG_HIGHMEM */
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/* calculate vmemmap's address from given system ram pfn and register it */
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+ unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
+ unsigned long nr_pages = ent->size >> PAGE_SHIFT;
+ unsigned long start, end;
+ struct kcore_list *vmm, *tmp;
+
+
+ start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
+ end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
+ end = PAGE_ALIGN(end);
+ /* overlap check (because we have to align page */
+ list_for_each_entry(tmp, head, list) {
+ if (tmp->type != KCORE_VMEMMAP)
+ continue;
+ if (start < tmp->addr + tmp->size)
+ if (end > tmp->addr)
+ end = tmp->addr;
+ }
+ if (start < end) {
+ vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
+ if (!vmm)
+ return 0;
+ vmm->addr = start;
+ vmm->size = end - start;
+ vmm->type = KCORE_VMEMMAP;
+ list_add_tail(&vmm->list, head);
+ }
+ return 1;
+
+}
+#else
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+{
+ return 1;
+}
+
+#endif
+
+static int
+kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+ struct list_head *head = (struct list_head *)arg;
+ struct kcore_list *ent;
+ struct page *p;
+
+ if (!pfn_valid(pfn))
+ return 1;
+
+ p = pfn_to_page(pfn);
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->addr = (unsigned long)page_to_virt(p);
+ ent->size = nr_pages << PAGE_SHIFT;
+
+ if (!virt_addr_valid(ent->addr))
+ goto free_out;
+
+ /* cut not-mapped area. ....from ppc-32 code. */
+ if (ULONG_MAX - ent->addr < ent->size)
+ ent->size = ULONG_MAX - ent->addr;
+
+ /*
+ * We've already checked virt_addr_valid so we know this address
+ * is a valid pointer, therefore we can check against it to determine
+ * if we need to trim
+ */
+ if (VMALLOC_START > ent->addr) {
+ if (VMALLOC_START - ent->addr < ent->size)
+ ent->size = VMALLOC_START - ent->addr;
+ }
+
+ ent->type = KCORE_RAM;
+ list_add_tail(&ent->list, head);
+
+ if (!get_sparsemem_vmemmap_info(ent, head)) {
+ list_del(&ent->list);
+ goto free_out;
+ }
+
+ return 0;
+free_out:
+ kfree(ent);
+ return 1;
+}
+
+static int kcore_ram_list(struct list_head *list)
+{
+ int nid, ret;
+ unsigned long end_pfn;
+
+ /* Not inialized....update now */
+ /* find out "max pfn" */
+ end_pfn = 0;
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long node_end;
+ node_end = node_end_pfn(nid);
+ if (end_pfn < node_end)
+ end_pfn = node_end;
+ }
+ /* scan 0 to max_pfn */
+ ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private);
+ if (ret)
+ return -ENOMEM;
+ return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+static int kcore_update_ram(void)
+{
+ LIST_HEAD(list);
+ LIST_HEAD(garbage);
+ int nphdr;
+ size_t phdrs_len, notes_len, data_offset;
+ struct kcore_list *tmp, *pos;
+ int ret = 0;
+
+ down_write(&kclist_lock);
+ if (!xchg(&kcore_need_update, 0))
+ goto out;
+
+ ret = kcore_ram_list(&list);
+ if (ret) {
+ /* Couldn't get the RAM list, try again next time. */
+ WRITE_ONCE(kcore_need_update, 1);
+ list_splice_tail(&list, &garbage);
+ goto out;
+ }
+
+ list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
+ if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
+ list_move(&pos->list, &garbage);
+ }
+ list_splice_tail(&list, &kclist_head);
+
+ proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len,
+ &data_offset);
+
+out:
+ up_write(&kclist_lock);
+ list_for_each_entry_safe(pos, tmp, &garbage, list) {
+ list_del(&pos->list);
+ kfree(pos);
+ }
+ return ret;
+}
+
+static void append_kcore_note(char *notes, size_t *i, const char *name,
+ unsigned int type, const void *desc,
+ size_t descsz)
+{
+ struct elf_note *note = (struct elf_note *)&notes[*i];
+
+ note->n_namesz = strlen(name) + 1;
+ note->n_descsz = descsz;
+ note->n_type = type;
+ *i += sizeof(*note);
+ memcpy(&notes[*i], name, note->n_namesz);
+ *i = ALIGN(*i + note->n_namesz, 4);
+ memcpy(&notes[*i], desc, descsz);
+ *i = ALIGN(*i + descsz, 4);
+}
+
+static ssize_t
+read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+{
+ char *buf = file->private_data;
+ size_t phdrs_offset, notes_offset, data_offset;
+ size_t page_offline_frozen = 1;
+ size_t phdrs_len, notes_len;
+ struct kcore_list *m;
+ size_t tsz;
+ int nphdr;
+ unsigned long start;
+ size_t orig_buflen = buflen;
+ int ret = 0;
+
+ down_read(&kclist_lock);
+ /*
+ * Don't race against drivers that set PageOffline() and expect no
+ * further page access.
+ */
+ page_offline_freeze();
+
+ get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
+ phdrs_offset = sizeof(struct elfhdr);
+ notes_offset = phdrs_offset + phdrs_len;
+
+ /* ELF file header. */
+ if (buflen && *fpos < sizeof(struct elfhdr)) {
+ struct elfhdr ehdr = {
+ .e_ident = {
+ [EI_MAG0] = ELFMAG0,
+ [EI_MAG1] = ELFMAG1,
+ [EI_MAG2] = ELFMAG2,
+ [EI_MAG3] = ELFMAG3,
+ [EI_CLASS] = ELF_CLASS,
+ [EI_DATA] = ELF_DATA,
+ [EI_VERSION] = EV_CURRENT,
+ [EI_OSABI] = ELF_OSABI,
+ },
+ .e_type = ET_CORE,
+ .e_machine = ELF_ARCH,
+ .e_version = EV_CURRENT,
+ .e_phoff = sizeof(struct elfhdr),
+ .e_flags = ELF_CORE_EFLAGS,
+ .e_ehsize = sizeof(struct elfhdr),
+ .e_phentsize = sizeof(struct elf_phdr),
+ .e_phnum = nphdr,
+ };
+
+ tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
+ if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ buffer += tsz;
+ buflen -= tsz;
+ *fpos += tsz;
+ }
+
+ /* ELF program headers. */
+ if (buflen && *fpos < phdrs_offset + phdrs_len) {
+ struct elf_phdr *phdrs, *phdr;
+
+ phdrs = kzalloc(phdrs_len, GFP_KERNEL);
+ if (!phdrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ phdrs[0].p_type = PT_NOTE;
+ phdrs[0].p_offset = notes_offset;
+ phdrs[0].p_filesz = notes_len;
+
+ phdr = &phdrs[1];
+ list_for_each_entry(m, &kclist_head, list) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R | PF_W | PF_X;
+ phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
+ phdr->p_vaddr = (size_t)m->addr;
+ if (m->type == KCORE_RAM)
+ phdr->p_paddr = __pa(m->addr);
+ else if (m->type == KCORE_TEXT)
+ phdr->p_paddr = __pa_symbol(m->addr);
+ else
+ phdr->p_paddr = (elf_addr_t)-1;
+ phdr->p_filesz = phdr->p_memsz = m->size;
+ phdr->p_align = PAGE_SIZE;
+ phdr++;
+ }
+
+ tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
+ if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
+ tsz)) {
+ kfree(phdrs);
+ ret = -EFAULT;
+ goto out;
+ }
+ kfree(phdrs);
+
+ buffer += tsz;
+ buflen -= tsz;
+ *fpos += tsz;
+ }
+
+ /* ELF note segment. */
+ if (buflen && *fpos < notes_offset + notes_len) {
+ struct elf_prstatus prstatus = {};
+ struct elf_prpsinfo prpsinfo = {
+ .pr_sname = 'R',
+ .pr_fname = "vmlinux",
+ };
+ char *notes;
+ size_t i = 0;
+
+ strlcpy(prpsinfo.pr_psargs, saved_command_line,
+ sizeof(prpsinfo.pr_psargs));
+
+ notes = kzalloc(notes_len, GFP_KERNEL);
+ if (!notes) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+ sizeof(prpsinfo));
+ append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+ arch_task_struct_size);
+ /*
+ * vmcoreinfo_size is mostly constant after init time, but it
+ * can be changed by crash_save_vmcoreinfo(). Racing here with a
+ * panic on another CPU before the machine goes down is insanely
+ * unlikely, but it's better to not leave potential buffer
+ * overflows lying around, regardless.
+ */
+ append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
+ vmcoreinfo_data,
+ min(vmcoreinfo_size, notes_len - i));
+
+ tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
+ if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ kfree(notes);
+ ret = -EFAULT;
+ goto out;
+ }
+ kfree(notes);
+
+ buffer += tsz;
+ buflen -= tsz;
+ *fpos += tsz;
+ }
+
+ /*
+ * Check to see if our file offset matches with any of
+ * the addresses in the elf_phdr on our list.
+ */
+ start = kc_offset_to_vaddr(*fpos - data_offset);
+ if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+ tsz = buflen;
+
+ m = NULL;
+ while (buflen) {
+ struct page *page;
+ unsigned long pfn;
+
+ /*
+ * If this is the first iteration or the address is not within
+ * the previous entry, search for a matching entry.
+ */
+ if (!m || start < m->addr || start >= m->addr + m->size) {
+ struct kcore_list *iter;
+
+ m = NULL;
+ list_for_each_entry(iter, &kclist_head, list) {
+ if (start >= iter->addr &&
+ start < iter->addr + iter->size) {
+ m = iter;
+ break;
+ }
+ }
+ }
+
+ if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) {
+ page_offline_thaw();
+ cond_resched();
+ page_offline_freeze();
+ }
+
+ if (!m) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ goto skip;
+ }
+
+ switch (m->type) {
+ case KCORE_VMALLOC:
+ vread(buf, (char *)start, tsz);
+ /* we have to zero-fill user buffer even if no read */
+ if (copy_to_user(buffer, buf, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ case KCORE_USER:
+ /* User page is handled prior to normal kernel page: */
+ if (copy_to_user(buffer, (char *)start, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ case KCORE_RAM:
+ pfn = __pa(start) >> PAGE_SHIFT;
+ page = pfn_to_online_page(pfn);
+
+ /*
+ * Don't read offline sections, logically offline pages
+ * (e.g., inflated in a balloon), hwpoisoned pages,
+ * and explicitly excluded physical ranges.
+ */
+ if (!page || PageOffline(page) ||
+ is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ }
+ fallthrough;
+ case KCORE_VMEMMAP:
+ case KCORE_TEXT:
+ if (kern_addr_valid(start)) {
+ /*
+ * Using bounce buffer to bypass the
+ * hardened user copy kernel text checks.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)start,
+ tsz)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ } else {
+ if (copy_to_user(buffer, buf, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ } else {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ break;
+ default:
+ pr_warn_once("Unhandled KCORE type: %d\n", m->type);
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+skip:
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ start += tsz;
+ tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
+ }
+
+out:
+ page_offline_thaw();
+ up_read(&kclist_lock);
+ if (ret)
+ return ret;
+ return orig_buflen - buflen;
+}
+
+static int open_kcore(struct inode *inode, struct file *filp)
+{
+ int ret = security_locked_down(LOCKDOWN_KCORE);
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ if (ret)
+ return ret;
+
+ filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+
+ if (kcore_need_update)
+ kcore_update_ram();
+ if (i_size_read(inode) != proc_root_kcore->size) {
+ inode_lock(inode);
+ i_size_write(inode, proc_root_kcore->size);
+ inode_unlock(inode);
+ }
+ return 0;
+}
+
+static int release_kcore(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static const struct proc_ops kcore_proc_ops = {
+ .proc_read = read_kcore,
+ .proc_open = open_kcore,
+ .proc_release = release_kcore,
+ .proc_lseek = default_llseek,
+};
+
+/* just remember that we have to update kcore */
+static int __meminit kcore_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ switch (action) {
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ kcore_need_update = 1;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kcore_callback_nb __meminitdata = {
+ .notifier_call = kcore_callback,
+ .priority = 0,
+};
+
+static struct kcore_list kcore_vmalloc;
+
+#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
+static struct kcore_list kcore_text;
+/*
+ * If defined, special segment is used for mapping kernel text instead of
+ * direct-map area. We need to create special TEXT section.
+ */
+static void __init proc_kcore_text_init(void)
+{
+ kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
+}
+#else
+static void __init proc_kcore_text_init(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+/*
+ * MODULES_VADDR has no intersection with VMALLOC_ADDR.
+ */
+static struct kcore_list kcore_modules;
+static void __init add_modules_range(void)
+{
+ if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
+ kclist_add(&kcore_modules, (void *)MODULES_VADDR,
+ MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
+ }
+}
+#else
+static void __init add_modules_range(void)
+{
+}
+#endif
+
+static int __init proc_kcore_init(void)
+{
+ proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops);
+ if (!proc_root_kcore) {
+ pr_err("couldn't create /proc/kcore\n");
+ return 0; /* Always returns 0. */
+ }
+ /* Store text area if it's special */
+ proc_kcore_text_init();
+ /* Store vmalloc area */
+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
+ add_modules_range();
+ /* Store direct-map area from physical memory map */
+ kcore_update_ram();
+ register_hotmemory_notifier(&kcore_callback_nb);
+
+ return 0;
+}
+fs_initcall(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
new file mode 100644
index 000000000..2fc92a13f
--- /dev/null
+++ b/fs/proc/kmsg.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/proc/kmsg.c
+ *
+ * Copyright (C) 1992 by Linus Torvalds
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/syslog.h>
+
+#include <asm/io.h>
+
+static int kmsg_open(struct inode * inode, struct file * file)
+{
+ return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
+}
+
+static int kmsg_release(struct inode * inode, struct file * file)
+{
+ (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
+ return 0;
+}
+
+static ssize_t kmsg_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ if ((file->f_flags & O_NONBLOCK) &&
+ !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
+ return -EAGAIN;
+ return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
+}
+
+static __poll_t kmsg_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &log_wait, wait);
+ if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+
+static const struct proc_ops kmsg_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_read = kmsg_read,
+ .proc_poll = kmsg_poll,
+ .proc_open = kmsg_open,
+ .proc_release = kmsg_release,
+ .proc_lseek = generic_file_llseek,
+};
+
+static int __init proc_kmsg_init(void)
+{
+ proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops);
+ return 0;
+}
+fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
new file mode 100644
index 000000000..817981e57
--- /dev/null
+++ b/fs/proc/loadavg.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/time.h>
+#include "internal.h"
+
+static int loadavg_proc_show(struct seq_file *m, void *v)
+{
+ unsigned long avnrun[3];
+
+ get_avenrun(avnrun, FIXED_1/200, 0);
+
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
+ LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+ LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+ LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+ nr_running(), nr_threads,
+ idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
+ return 0;
+}
+
+static int __init proc_loadavg_init(void)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
+ pde_make_permanent(pde);
+ return 0;
+}
+fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
new file mode 100644
index 000000000..440960110
--- /dev/null
+++ b/fs/proc/meminfo.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/mman.h>
+#include <linux/mmzone.h>
+#include <linux/proc_fs.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+#include <linux/atomic.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_CMA
+#include <linux/cma.h>
+#endif
+#include <asm/page.h>
+#include "internal.h"
+
+void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
+{
+}
+
+static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
+{
+ seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
+ seq_write(m, " kB\n", 4);
+}
+
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+ struct sysinfo i;
+ unsigned long committed;
+ long cached;
+ long available;
+ unsigned long pages[NR_LRU_LISTS];
+ unsigned long sreclaimable, sunreclaim;
+ int lru;
+
+ si_meminfo(&i);
+ si_swapinfo(&i);
+ committed = vm_memory_committed();
+
+ cached = global_node_page_state(NR_FILE_PAGES) -
+ total_swapcache_pages() - i.bufferram;
+ if (cached < 0)
+ cached = 0;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ available = si_mem_available();
+ sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+ sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
+
+ show_val_kb(m, "MemTotal: ", i.totalram);
+ show_val_kb(m, "MemFree: ", i.freeram);
+ show_val_kb(m, "MemAvailable: ", available);
+ show_val_kb(m, "Buffers: ", i.bufferram);
+ show_val_kb(m, "Cached: ", cached);
+ show_val_kb(m, "SwapCached: ", total_swapcache_pages());
+ show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] +
+ pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] +
+ pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]);
+ show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
+ show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
+ show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
+
+#ifdef CONFIG_HIGHMEM
+ show_val_kb(m, "HighTotal: ", i.totalhigh);
+ show_val_kb(m, "HighFree: ", i.freehigh);
+ show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh);
+ show_val_kb(m, "LowFree: ", i.freeram - i.freehigh);
+#endif
+
+#ifndef CONFIG_MMU
+ show_val_kb(m, "MmapCopy: ",
+ (unsigned long)atomic_long_read(&mmap_pages_allocated));
+#endif
+
+ show_val_kb(m, "SwapTotal: ", i.totalswap);
+ show_val_kb(m, "SwapFree: ", i.freeswap);
+#ifdef CONFIG_ZSWAP
+ seq_printf(m, "Zswap: %8lu kB\n",
+ (unsigned long)(zswap_pool_total_size >> 10));
+ seq_printf(m, "Zswapped: %8lu kB\n",
+ (unsigned long)atomic_read(&zswap_stored_pages) <<
+ (PAGE_SHIFT - 10));
+#endif
+ show_val_kb(m, "Dirty: ",
+ global_node_page_state(NR_FILE_DIRTY));
+ show_val_kb(m, "Writeback: ",
+ global_node_page_state(NR_WRITEBACK));
+ show_val_kb(m, "AnonPages: ",
+ global_node_page_state(NR_ANON_MAPPED));
+ show_val_kb(m, "Mapped: ",
+ global_node_page_state(NR_FILE_MAPPED));
+ show_val_kb(m, "Shmem: ", i.sharedram);
+ show_val_kb(m, "KReclaimable: ", sreclaimable +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
+ show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
+ show_val_kb(m, "SReclaimable: ", sreclaimable);
+ show_val_kb(m, "SUnreclaim: ", sunreclaim);
+ seq_printf(m, "KernelStack: %8lu kB\n",
+ global_node_page_state(NR_KERNEL_STACK_KB));
+#ifdef CONFIG_SHADOW_CALL_STACK
+ seq_printf(m, "ShadowCallStack:%8lu kB\n",
+ global_node_page_state(NR_KERNEL_SCS_KB));
+#endif
+ show_val_kb(m, "PageTables: ",
+ global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
+
+ show_val_kb(m, "NFS_Unstable: ", 0);
+ show_val_kb(m, "Bounce: ",
+ global_zone_page_state(NR_BOUNCE));
+ show_val_kb(m, "WritebackTmp: ",
+ global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "CommitLimit: ", vm_commit_limit());
+ show_val_kb(m, "Committed_AS: ", committed);
+ seq_printf(m, "VmallocTotal: %8lu kB\n",
+ (unsigned long)VMALLOC_TOTAL >> 10);
+ show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages());
+ show_val_kb(m, "VmallocChunk: ", 0ul);
+ show_val_kb(m, "Percpu: ", pcpu_nr_pages());
+
+#ifdef CONFIG_MEMORY_FAILURE
+ seq_printf(m, "HardwareCorrupted: %5lu kB\n",
+ atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ show_val_kb(m, "AnonHugePages: ",
+ global_node_page_state(NR_ANON_THPS));
+ show_val_kb(m, "ShmemHugePages: ",
+ global_node_page_state(NR_SHMEM_THPS));
+ show_val_kb(m, "ShmemPmdMapped: ",
+ global_node_page_state(NR_SHMEM_PMDMAPPED));
+ show_val_kb(m, "FileHugePages: ",
+ global_node_page_state(NR_FILE_THPS));
+ show_val_kb(m, "FilePmdMapped: ",
+ global_node_page_state(NR_FILE_PMDMAPPED));
+#endif
+
+#ifdef CONFIG_CMA
+ show_val_kb(m, "CmaTotal: ", totalcma_pages);
+ show_val_kb(m, "CmaFree: ",
+ global_zone_page_state(NR_FREE_CMA_PAGES));
+#endif
+
+ hugetlb_report_meminfo(m);
+
+ arch_report_meminfo(m);
+
+ return 0;
+}
+
+static int __init proc_meminfo_init(void)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
+ pde_make_permanent(pde);
+ return 0;
+}
+fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 000000000..8e159fc78
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/proc_fs.h>
+#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include "internal.h"
+
+
+static const struct proc_ns_operations *ns_entries[] = {
+#ifdef CONFIG_NET_NS
+ &netns_operations,
+#endif
+#ifdef CONFIG_UTS_NS
+ &utsns_operations,
+#endif
+#ifdef CONFIG_IPC_NS
+ &ipcns_operations,
+#endif
+#ifdef CONFIG_PID_NS
+ &pidns_operations,
+ &pidns_for_children_operations,
+#endif
+#ifdef CONFIG_USER_NS
+ &userns_operations,
+#endif
+ &mntns_operations,
+#ifdef CONFIG_CGROUPS
+ &cgroupns_operations,
+#endif
+#ifdef CONFIG_TIME_NS
+ &timens_operations,
+ &timens_for_children_operations,
+#endif
+};
+
+static const char *proc_ns_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+ struct task_struct *task;
+ struct path ns_path;
+ int error = -EACCES;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ task = get_proc_task(inode);
+ if (!task)
+ return ERR_PTR(-EACCES);
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto out;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (error)
+ goto out;
+
+ error = nd_jump_link(&ns_path);
+out:
+ put_task_struct(task);
+ return ERR_PTR(error);
+}
+
+static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+ struct inode *inode = d_inode(dentry);
+ const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+ struct task_struct *task;
+ char name[50];
+ int res = -EACCES;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return res;
+
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
+ res = ns_get_name(name, sizeof(name), task, ns_ops);
+ if (res >= 0)
+ res = readlink_copy(buffer, buflen, name);
+ }
+ put_task_struct(task);
+ return res;
+}
+
+static const struct inode_operations proc_ns_link_inode_operations = {
+ .readlink = proc_ns_readlink,
+ .get_link = proc_ns_get_link,
+ .setattr = proc_setattr,
+};
+
+static struct dentry *proc_ns_instantiate(struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ const struct proc_ns_operations *ns_ops = ptr;
+ struct inode *inode;
+ struct proc_inode *ei;
+
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ inode->i_op = &proc_ns_link_inode_operations;
+ ei->ns_ops = ns_ops;
+ pid_update_inode(task, inode);
+
+ d_set_d_op(dentry, &pid_dentry_operations);
+ return d_splice_alias(inode, dentry);
+}
+
+static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct task_struct *task = get_proc_task(file_inode(file));
+ const struct proc_ns_operations **entry, **last;
+
+ if (!task)
+ return -ENOENT;
+
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+ if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
+ goto out;
+ entry = ns_entries + (ctx->pos - 2);
+ last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+ while (entry <= last) {
+ const struct proc_ns_operations *ops = *entry;
+ if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
+ proc_ns_instantiate, task, ops))
+ break;
+ ctx->pos++;
+ entry++;
+ }
+out:
+ put_task_struct(task);
+ return 0;
+}
+
+const struct file_operations proc_ns_dir_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_ns_dir_readdir,
+ .llseek = generic_file_llseek,
+};
+
+static struct dentry *proc_ns_dir_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
+{
+ struct task_struct *task = get_proc_task(dir);
+ const struct proc_ns_operations **entry, **last;
+ unsigned int len = dentry->d_name.len;
+ struct dentry *res = ERR_PTR(-ENOENT);
+
+ if (!task)
+ goto out_no_task;
+
+ last = &ns_entries[ARRAY_SIZE(ns_entries)];
+ for (entry = ns_entries; entry < last; entry++) {
+ if (strlen((*entry)->name) != len)
+ continue;
+ if (!memcmp(dentry->d_name.name, (*entry)->name, len))
+ break;
+ }
+ if (entry == last)
+ goto out;
+
+ res = proc_ns_instantiate(dentry, task, *entry);
+out:
+ put_task_struct(task);
+out_no_task:
+ return res;
+}
+
+const struct inode_operations proc_ns_dir_inode_operations = {
+ .lookup = proc_ns_dir_lookup,
+ .getattr = pid_getattr,
+ .setattr = proc_setattr,
+};
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
new file mode 100644
index 000000000..4d3493579
--- /dev/null
+++ b/fs/proc/nommu.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* nommu.c: mmu-less memory info files
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mman.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/seq_file.h>
+#include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
+#include <asm/tlb.h>
+#include <asm/div64.h>
+#include "internal.h"
+
+/*
+ * display a single region to a sequenced file
+ */
+static int nommu_region_show(struct seq_file *m, struct vm_region *region)
+{
+ unsigned long ino = 0;
+ struct file *file;
+ dev_t dev = 0;
+ int flags;
+
+ flags = region->vm_flags;
+ file = region->vm_file;
+
+ if (file) {
+ struct inode *inode = file_inode(region->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ }
+
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_printf(m,
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+ region->vm_start,
+ region->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+ ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
+ MAJOR(dev), MINOR(dev), ino);
+
+ if (file) {
+ seq_pad(m, ' ');
+ seq_file_path(m, file, "");
+ }
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+/*
+ * display a list of all the REGIONs the kernel knows about
+ * - nommu kernels have a single flat list
+ */
+static int nommu_region_list_show(struct seq_file *m, void *_p)
+{
+ struct rb_node *p = _p;
+
+ return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
+}
+
+static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
+{
+ struct rb_node *p;
+ loff_t pos = *_pos;
+
+ down_read(&nommu_region_sem);
+
+ for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
+ if (pos-- == 0)
+ return p;
+ return NULL;
+}
+
+static void nommu_region_list_stop(struct seq_file *m, void *v)
+{
+ up_read(&nommu_region_sem);
+}
+
+static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return rb_next((struct rb_node *) v);
+}
+
+static const struct seq_operations proc_nommu_region_list_seqop = {
+ .start = nommu_region_list_start,
+ .next = nommu_region_list_next,
+ .stop = nommu_region_list_stop,
+ .show = nommu_region_list_show
+};
+
+static int __init proc_nommu_init(void)
+{
+ proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop);
+ return 0;
+}
+
+fs_initcall(proc_nommu_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
new file mode 100644
index 000000000..f2273b164
--- /dev/null
+++ b/fs/proc/page.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/memblock.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/ksm.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/huge_mm.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/hugetlb.h>
+#include <linux/memremap.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/kernel-page-flags.h>
+#include <linux/uaccess.h>
+#include "internal.h"
+
+#define KPMSIZE sizeof(u64)
+#define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+
+static inline unsigned long get_max_dump_pfn(void)
+{
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * The memmap of early sections is completely populated and marked
+ * online even if max_pfn does not fall on a section boundary -
+ * pfn_to_online_page() will succeed on all pages. Allow inspecting
+ * these memmaps.
+ */
+ return round_up(max_pfn, PAGES_PER_SECTION);
+#else
+ return max_pfn;
+#endif
+}
+
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 pcount;
+
+ pfn = src / KPMSIZE;
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
+
+ while (count > 0) {
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
+
+ if (!ppage || PageSlab(ppage) || page_has_type(ppage))
+ pcount = 0;
+ else
+ pcount = page_mapcount(ppage);
+
+ if (put_user(pcount, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct proc_ops kpagecount_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecount_read,
+};
+
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
+
+static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
+{
+ return ((kflags >> kbit) & 1) << ubit;
+}
+
+u64 stable_page_flags(struct page *page)
+{
+ u64 k;
+ u64 u;
+
+ /*
+ * pseudo flag: KPF_NOPAGE
+ * it differentiates a memory hole from a page with no flags
+ */
+ if (!page)
+ return 1 << KPF_NOPAGE;
+
+ k = page->flags;
+ u = 0;
+
+ /*
+ * pseudo flags for the well known (anonymous) memory mapped pages
+ *
+ * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
+ * simple test in page_mapped() is not enough.
+ */
+ if (!PageSlab(page) && page_mapped(page))
+ u |= 1 << KPF_MMAP;
+ if (PageAnon(page))
+ u |= 1 << KPF_ANON;
+ if (PageKsm(page))
+ u |= 1 << KPF_KSM;
+
+ /*
+ * compound pages: export both head/tail info
+ * they together define a compound page's start/end pos and order
+ */
+ if (PageHead(page))
+ u |= 1 << KPF_COMPOUND_HEAD;
+ if (PageTail(page))
+ u |= 1 << KPF_COMPOUND_TAIL;
+ if (PageHuge(page))
+ u |= 1 << KPF_HUGE;
+ /*
+ * PageTransCompound can be true for non-huge compound pages (slab
+ * pages or pages allocated by drivers with __GFP_COMP) because it
+ * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+ * to make sure a given page is a thp, not a non-huge compound page.
+ */
+ else if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+
+ if (PageLRU(head) || PageAnon(head))
+ u |= 1 << KPF_THP;
+ else if (is_huge_zero_page(head)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ }
+ } else if (is_zero_pfn(page_to_pfn(page)))
+ u |= 1 << KPF_ZERO_PAGE;
+
+
+ /*
+ * Caveats on high order pages: page->_refcount will only be set
+ * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+ * SLOB won't set PG_slab at all on compound pages.
+ */
+ if (PageBuddy(page))
+ u |= 1 << KPF_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ u |= 1 << KPF_BUDDY;
+
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
+ if (PageTable(page))
+ u |= 1 << KPF_PGTABLE;
+
+ if (page_is_idle(page))
+ u |= 1 << KPF_IDLE;
+
+ u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
+
+ u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
+ if (PageTail(page) && PageSlab(compound_head(page)))
+ u |= 1 << KPF_SLAB;
+
+ u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
+ u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
+ u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
+ u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
+
+ u |= kpf_copy_bit(k, KPF_LRU, PG_lru);
+ u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced);
+ u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
+ u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
+
+ if (PageSwapCache(page))
+ u |= 1 << KPF_SWAPCACHE;
+ u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
+
+ u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
+ u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
+
+#ifdef CONFIG_MEMORY_FAILURE
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+#endif
+
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+ u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
+#endif
+
+ u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
+ u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
+ u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
+ u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
+ u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
+ u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
+#ifdef CONFIG_64BIT
+ u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
+
+ return u;
+};
+
+static ssize_t kpageflags_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+
+ pfn = src / KPMSIZE;
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
+
+ while (count > 0) {
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
+
+ if (put_user(stable_page_flags(ppage), out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct proc_ops kpageflags_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_lseek = mem_lseek,
+ .proc_read = kpageflags_read,
+};
+
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 ino;
+
+ pfn = src / KPMSIZE;
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
+
+ while (count > 0) {
+ /*
+ * TODO: ZONE_DEVICE support requires to identify
+ * memmaps that were actually initialized.
+ */
+ ppage = pfn_to_online_page(pfn);
+
+ if (ppage)
+ ino = page_cgroup_ino(ppage);
+ else
+ ino = 0;
+
+ if (put_user(ino, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct proc_ops kpagecgroup_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_lseek = mem_lseek,
+ .proc_read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
+static int __init proc_page_init(void)
+{
+ proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops);
+ proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops);
+#ifdef CONFIG_MEMCG
+ proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops);
+#endif
+ return 0;
+}
+fs_initcall(proc_page_init);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
new file mode 100644
index 000000000..856839b8a
--- /dev/null
+++ b/fs/proc/proc_net.c
@@ -0,0 +1,416 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/proc/net.c
+ *
+ * Copyright (C) 2007
+ *
+ * Author: Eric Biederman <ebiederm@xmission.com>
+ *
+ * proc net directory handling functions
+ */
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/uidgid.h>
+#include <net/net_namespace.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+
+static inline struct net *PDE_NET(struct proc_dir_entry *pde)
+{
+ return pde->parent->data;
+}
+
+static struct net *get_proc_net(const struct inode *inode)
+{
+ return maybe_get_net(PDE_NET(PDE(inode)));
+}
+
+static int seq_open_net(struct inode *inode, struct file *file)
+{
+ unsigned int state_size = PDE(inode)->state_size;
+ struct seq_net_private *p;
+ struct net *net;
+
+ WARN_ON_ONCE(state_size < sizeof(*p));
+
+ if (file->f_mode & FMODE_WRITE && !PDE(inode)->write)
+ return -EACCES;
+
+ net = get_proc_net(inode);
+ if (!net)
+ return -ENXIO;
+
+ p = __seq_open_private(file, PDE(inode)->seq_ops, state_size);
+ if (!p) {
+ put_net(net);
+ return -ENOMEM;
+ }
+#ifdef CONFIG_NET_NS
+ p->net = net;
+ netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL);
+#endif
+ return 0;
+}
+
+static void seq_file_net_put_net(struct seq_file *seq)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *priv = seq->private;
+
+ put_net_track(priv->net, &priv->ns_tracker);
+#else
+ put_net(&init_net);
+#endif
+}
+
+static int seq_release_net(struct inode *ino, struct file *f)
+{
+ struct seq_file *seq = f->private_data;
+
+ seq_file_net_put_net(seq);
+ seq_release_private(ino, f);
+ return 0;
+}
+
+static const struct proc_ops proc_net_seq_ops = {
+ .proc_open = seq_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_net,
+};
+
+int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *p = priv_data;
+
+ p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker,
+ GFP_KERNEL);
+#endif
+ return 0;
+}
+
+void bpf_iter_fini_seq_net(void *priv_data)
+{
+#ifdef CONFIG_NET_NS
+ struct seq_net_private *p = priv_data;
+
+ put_net_track(p->net, &p->ns_tracker);
+#endif
+}
+
+struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, const struct seq_operations *ops,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_seq_ops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_data);
+
+/**
+ * proc_create_net_data_write - Create a writable net_ns-specific proc file
+ * @name: The name of the file.
+ * @mode: The file's access mode.
+ * @parent: The parent directory in which to create.
+ * @ops: The seq_file ops with which to read the file.
+ * @write: The write method with which to 'modify' the file.
+ * @data: Data for retrieval by pde_data().
+ *
+ * Create a network namespaced proc file in the @parent directory with the
+ * specified @name and @mode that allows reading of a file that displays a
+ * series of elements and also provides for the file accepting writes that have
+ * some arbitrary effect.
+ *
+ * The functions in the @ops table are used to iterate over items to be
+ * presented and extract the readable content using the seq_file interface.
+ *
+ * The @write function is called with the data copied into a kernel space
+ * scratch buffer and has a NUL appended for convenience. The buffer may be
+ * modified by the @write function. @write should return 0 on success.
+ *
+ * The @data value is accessible from the @show and @write functions by calling
+ * pde_data() on the file inode. The network namespace must be accessed by
+ * calling seq_file_net() on the seq_file struct.
+ */
+struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct seq_operations *ops,
+ proc_write_t write,
+ unsigned int state_size, void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_seq_ops;
+ p->seq_ops = ops;
+ p->state_size = state_size;
+ p->write = write;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_data_write);
+
+static int single_open_net(struct inode *inode, struct file *file)
+{
+ struct proc_dir_entry *de = PDE(inode);
+ struct net *net;
+ int err;
+
+ net = get_proc_net(inode);
+ if (!net)
+ return -ENXIO;
+
+ err = single_open(file, de->single_show, net);
+ if (err)
+ put_net(net);
+ return err;
+}
+
+static int single_release_net(struct inode *ino, struct file *f)
+{
+ struct seq_file *seq = f->private_data;
+ put_net(seq->private);
+ return single_release(ino, f);
+}
+
+static const struct proc_ops proc_net_single_ops = {
+ .proc_open = single_open_net,
+ .proc_read = seq_read,
+ .proc_write = proc_simple_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release_net,
+};
+
+struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *), void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_single_ops;
+ p->single_show = show;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_single);
+
+/**
+ * proc_create_net_single_write - Create a writable net_ns-specific proc file
+ * @name: The name of the file.
+ * @mode: The file's access mode.
+ * @parent: The parent directory in which to create.
+ * @show: The seqfile show method with which to read the file.
+ * @write: The write method with which to 'modify' the file.
+ * @data: Data for retrieval by pde_data().
+ *
+ * Create a network-namespaced proc file in the @parent directory with the
+ * specified @name and @mode that allows reading of a file that displays a
+ * single element rather than a series and also provides for the file accepting
+ * writes that have some arbitrary effect.
+ *
+ * The @show function is called to extract the readable content via the
+ * seq_file interface.
+ *
+ * The @write function is called with the data copied into a kernel space
+ * scratch buffer and has a NUL appended for convenience. The buffer may be
+ * modified by the @write function. @write should return 0 on success.
+ *
+ * The @data value is accessible from the @show and @write functions by calling
+ * pde_data() on the file inode. The network namespace must be accessed by
+ * calling seq_file_single_net() on the seq_file struct.
+ */
+struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ int (*show)(struct seq_file *, void *),
+ proc_write_t write,
+ void *data)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create_reg(name, mode, &parent, data);
+ if (!p)
+ return NULL;
+ pde_force_lookup(p);
+ p->proc_ops = &proc_net_single_ops;
+ p->single_show = show;
+ p->write = write;
+ return proc_register(parent, p);
+}
+EXPORT_SYMBOL_GPL(proc_create_net_single_write);
+
+static struct net *get_proc_task_net(struct inode *dir)
+{
+ struct task_struct *task;
+ struct nsproxy *ns;
+ struct net *net = NULL;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(dir), PIDTYPE_PID);
+ if (task != NULL) {
+ task_lock(task);
+ ns = task->nsproxy;
+ if (ns != NULL)
+ net = get_net(ns->net_ns);
+ task_unlock(task);
+ }
+ rcu_read_unlock();
+
+ return net;
+}
+
+static struct dentry *proc_tgid_net_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *de;
+ struct net *net;
+
+ de = ERR_PTR(-ENOENT);
+ net = get_proc_task_net(dir);
+ if (net != NULL) {
+ de = proc_lookup_de(dir, dentry, net->proc_net);
+ put_net(net);
+ }
+ return de;
+}
+
+static int proc_tgid_net_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct net *net;
+
+ net = get_proc_task_net(inode);
+
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ if (net != NULL) {
+ stat->nlink = net->proc_net->nlink;
+ put_net(net);
+ }
+
+ return 0;
+}
+
+const struct inode_operations proc_net_inode_operations = {
+ .lookup = proc_tgid_net_lookup,
+ .getattr = proc_tgid_net_getattr,
+};
+
+static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
+{
+ int ret;
+ struct net *net;
+
+ ret = -EINVAL;
+ net = get_proc_task_net(file_inode(file));
+ if (net != NULL) {
+ ret = proc_readdir_de(file, ctx, net->proc_net);
+ put_net(net);
+ }
+ return ret;
+}
+
+const struct file_operations proc_net_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = proc_tgid_net_readdir,
+};
+
+static __net_init int proc_net_ns_init(struct net *net)
+{
+ struct proc_dir_entry *netd, *net_statd;
+ kuid_t uid;
+ kgid_t gid;
+ int err;
+
+ /*
+ * This PDE acts only as an anchor for /proc/${pid}/net hierarchy.
+ * Corresponding inode (PDE(inode) == net->proc_net) is never
+ * instantiated therefore blanket zeroing is fine.
+ * net->proc_net_stat inode is instantiated normally.
+ */
+ err = -ENOMEM;
+ netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
+ if (!netd)
+ goto out;
+
+ netd->subdir = RB_ROOT;
+ netd->data = net;
+ netd->nlink = 2;
+ netd->namelen = 3;
+ netd->parent = &proc_root;
+ netd->name = netd->inline_name;
+ memcpy(netd->name, "net", 4);
+
+ uid = make_kuid(net->user_ns, 0);
+ if (!uid_valid(uid))
+ uid = netd->uid;
+
+ gid = make_kgid(net->user_ns, 0);
+ if (!gid_valid(gid))
+ gid = netd->gid;
+
+ proc_set_user(netd, uid, gid);
+
+ /* Seed dentry revalidation for /proc/${pid}/net */
+ pde_force_lookup(netd);
+
+ err = -EEXIST;
+ net_statd = proc_net_mkdir(net, "stat", netd);
+ if (!net_statd)
+ goto free_net;
+
+ net->proc_net = netd;
+ net->proc_net_stat = net_statd;
+ return 0;
+
+free_net:
+ pde_free(netd);
+out:
+ return err;
+}
+
+static __net_exit void proc_net_ns_exit(struct net *net)
+{
+ remove_proc_entry("stat", net->proc_net);
+ pde_free(net->proc_net);
+}
+
+static struct pernet_operations __net_initdata proc_net_ns_ops = {
+ .init = proc_net_ns_init,
+ .exit = proc_net_ns_exit,
+};
+
+int __init proc_net_init(void)
+{
+ proc_symlink("net", NULL, "self/net");
+
+ return register_pernet_subsys(&proc_net_ns_ops);
+}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
new file mode 100644
index 000000000..4a4c04a3b
--- /dev/null
+++ b/fs/proc/proc_sysctl.c
@@ -0,0 +1,1951 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/sys support
+ */
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/printk.h>
+#include <linux/security.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/module.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/mount.h>
+#include <linux/kmemleak.h>
+#include "internal.h"
+
+#define list_for_each_table_entry(entry, table) \
+ for ((entry) = (table); (entry)->procname; (entry)++)
+
+static const struct dentry_operations proc_sys_dentry_operations;
+static const struct file_operations proc_sys_file_operations;
+static const struct inode_operations proc_sys_inode_operations;
+static const struct file_operations proc_sys_dir_file_operations;
+static const struct inode_operations proc_sys_dir_operations;
+
+/* Support for permanently empty directories */
+
+struct ctl_table sysctl_mount_point[] = {
+ { }
+};
+
+/**
+ * register_sysctl_mount_point() - registers a sysctl mount point
+ * @path: path for the mount point
+ *
+ * Used to create a permanently empty directory to serve as mount point.
+ * There are some subtle but important permission checks this allows in the
+ * case of unprivileged mounts.
+ */
+struct ctl_table_header *register_sysctl_mount_point(const char *path)
+{
+ return register_sysctl(path, sysctl_mount_point);
+}
+EXPORT_SYMBOL(register_sysctl_mount_point);
+
+static bool is_empty_dir(struct ctl_table_header *head)
+{
+ return head->ctl_table[0].child == sysctl_mount_point;
+}
+
+static void set_empty_dir(struct ctl_dir *dir)
+{
+ dir->header.ctl_table[0].child = sysctl_mount_point;
+}
+
+static void clear_empty_dir(struct ctl_dir *dir)
+
+{
+ dir->header.ctl_table[0].child = NULL;
+}
+
+void proc_sys_poll_notify(struct ctl_table_poll *poll)
+{
+ if (!poll)
+ return;
+
+ atomic_inc(&poll->event);
+ wake_up_interruptible(&poll->wait);
+}
+
+static struct ctl_table root_table[] = {
+ {
+ .procname = "",
+ .mode = S_IFDIR|S_IRUGO|S_IXUGO,
+ },
+ { }
+};
+static struct ctl_table_root sysctl_table_root = {
+ .default_set.dir.header = {
+ {{.count = 1,
+ .nreg = 1,
+ .ctl_table = root_table }},
+ .ctl_table_arg = root_table,
+ .root = &sysctl_table_root,
+ .set = &sysctl_table_root.default_set,
+ },
+};
+
+static DEFINE_SPINLOCK(sysctl_lock);
+
+static void drop_sysctl_table(struct ctl_table_header *header);
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry);
+static int insert_links(struct ctl_table_header *head);
+static void put_links(struct ctl_table_header *header);
+
+static void sysctl_print_dir(struct ctl_dir *dir)
+{
+ if (dir->header.parent)
+ sysctl_print_dir(dir->header.parent);
+ pr_cont("%s/", dir->header.ctl_table[0].procname);
+}
+
+static int namecmp(const char *name1, int len1, const char *name2, int len2)
+{
+ int cmp;
+
+ cmp = memcmp(name1, name2, min(len1, len2));
+ if (cmp == 0)
+ cmp = len1 - len2;
+ return cmp;
+}
+
+/* Called under sysctl_lock */
+static struct ctl_table *find_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir, const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+ struct rb_node *node = dir->root.rb_node;
+
+ while (node)
+ {
+ struct ctl_node *ctl_node;
+ const char *procname;
+ int cmp;
+
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ procname = entry->procname;
+
+ cmp = namecmp(name, namelen, procname, strlen(procname));
+ if (cmp < 0)
+ node = node->rb_left;
+ else if (cmp > 0)
+ node = node->rb_right;
+ else {
+ *phead = head;
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+ struct rb_node **p = &head->parent->root.rb_node;
+ struct rb_node *parent = NULL;
+ const char *name = entry->procname;
+ int namelen = strlen(name);
+
+ while (*p) {
+ struct ctl_table_header *parent_head;
+ struct ctl_table *parent_entry;
+ struct ctl_node *parent_node;
+ const char *parent_name;
+ int cmp;
+
+ parent = *p;
+ parent_node = rb_entry(parent, struct ctl_node, node);
+ parent_head = parent_node->header;
+ parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
+ parent_name = parent_entry->procname;
+
+ cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
+ pr_err("sysctl duplicate entry: ");
+ sysctl_print_dir(head->parent);
+ pr_cont("%s\n", entry->procname);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, &head->parent->root);
+ return 0;
+}
+
+static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+
+ rb_erase(node, &head->parent->root);
+}
+
+static void init_header(struct ctl_table_header *head,
+ struct ctl_table_root *root, struct ctl_table_set *set,
+ struct ctl_node *node, struct ctl_table *table)
+{
+ head->ctl_table = table;
+ head->ctl_table_arg = table;
+ head->used = 0;
+ head->count = 1;
+ head->nreg = 1;
+ head->unregistering = NULL;
+ head->root = root;
+ head->set = set;
+ head->parent = NULL;
+ head->node = node;
+ INIT_HLIST_HEAD(&head->inodes);
+ if (node) {
+ struct ctl_table *entry;
+
+ list_for_each_table_entry(entry, table) {
+ node->header = head;
+ node++;
+ }
+ }
+}
+
+static void erase_header(struct ctl_table_header *head)
+{
+ struct ctl_table *entry;
+
+ list_for_each_table_entry(entry, head->ctl_table)
+ erase_entry(head, entry);
+}
+
+static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
+{
+ struct ctl_table *entry;
+ int err;
+
+ /* Is this a permanently empty directory? */
+ if (is_empty_dir(&dir->header))
+ return -EROFS;
+
+ /* Am I creating a permanently empty directory? */
+ if (header->ctl_table == sysctl_mount_point) {
+ if (!RB_EMPTY_ROOT(&dir->root))
+ return -EINVAL;
+ set_empty_dir(dir);
+ }
+
+ dir->header.nreg++;
+ header->parent = dir;
+ err = insert_links(header);
+ if (err)
+ goto fail_links;
+ list_for_each_table_entry(entry, header->ctl_table) {
+ err = insert_entry(header, entry);
+ if (err)
+ goto fail;
+ }
+ return 0;
+fail:
+ erase_header(header);
+ put_links(header);
+fail_links:
+ if (header->ctl_table == sysctl_mount_point)
+ clear_empty_dir(dir);
+ header->parent = NULL;
+ drop_sysctl_table(&dir->header);
+ return err;
+}
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+ if (unlikely(p->unregistering))
+ return 0;
+ p->used++;
+ return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+ if (!--p->used)
+ if (unlikely(p->unregistering))
+ complete(p->unregistering);
+}
+
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
+{
+ proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+ /*
+ * if p->used is 0, nobody will ever touch that entry again;
+ * we'll eliminate all paths to it before dropping sysctl_lock
+ */
+ if (unlikely(p->used)) {
+ struct completion wait;
+ init_completion(&wait);
+ p->unregistering = &wait;
+ spin_unlock(&sysctl_lock);
+ wait_for_completion(&wait);
+ } else {
+ /* anything non-NULL; we'll never dereference it */
+ p->unregistering = ERR_PTR(-EINVAL);
+ spin_unlock(&sysctl_lock);
+ }
+ /*
+ * Invalidate dentries for unregistered sysctls: namespaced sysctls
+ * can have duplicate names and contaminate dcache very badly.
+ */
+ proc_sys_invalidate_dcache(p);
+ /*
+ * do not remove from the list until nobody holds it; walking the
+ * list in do_sysctl() relies on that.
+ */
+ spin_lock(&sysctl_lock);
+ erase_header(p);
+}
+
+static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+ BUG_ON(!head);
+ spin_lock(&sysctl_lock);
+ if (!use_table(head))
+ head = ERR_PTR(-ENOENT);
+ spin_unlock(&sysctl_lock);
+ return head;
+}
+
+static void sysctl_head_finish(struct ctl_table_header *head)
+{
+ if (!head)
+ return;
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root)
+{
+ struct ctl_table_set *set = &root->default_set;
+ if (root->lookup)
+ set = root->lookup(root);
+ return set;
+}
+
+static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ spin_lock(&sysctl_lock);
+ entry = find_entry(&head, dir, name, namelen);
+ if (entry && use_table(head))
+ *phead = head;
+ else
+ entry = NULL;
+ spin_unlock(&sysctl_lock);
+ return entry;
+}
+
+static struct ctl_node *first_usable_entry(struct rb_node *node)
+{
+ struct ctl_node *ctl_node;
+
+ for (;node; node = rb_next(node)) {
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ if (use_table(ctl_node->header))
+ return ctl_node;
+ }
+ return NULL;
+}
+
+static void first_entry(struct ctl_dir *dir,
+ struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = NULL;
+ struct ctl_table *entry = NULL;
+ struct ctl_node *ctl_node;
+
+ spin_lock(&sysctl_lock);
+ ctl_node = first_usable_entry(rb_first(&dir->root));
+ spin_unlock(&sysctl_lock);
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = *phead;
+ struct ctl_table *entry = *pentry;
+ struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
+
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+
+ ctl_node = first_usable_entry(rb_next(&ctl_node->node));
+ spin_unlock(&sysctl_lock);
+ head = NULL;
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+/*
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+ if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
+ mode >>= 6;
+ else if (in_egroup_p(GLOBAL_ROOT_GID))
+ mode >>= 3;
+ if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
+ return 0;
+ return -EACCES;
+}
+
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+{
+ struct ctl_table_root *root = head->root;
+ int mode;
+
+ if (root->permissions)
+ mode = root->permissions(head, table);
+ else
+ mode = table->mode;
+
+ return test_perm(mode, op);
+}
+
+static struct inode *proc_sys_make_inode(struct super_block *sb,
+ struct ctl_table_header *head, struct ctl_table *table)
+{
+ struct ctl_table_root *root = head->root;
+ struct inode *inode;
+ struct proc_inode *ei;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ inode->i_ino = get_next_ino();
+
+ ei = PROC_I(inode);
+
+ spin_lock(&sysctl_lock);
+ if (unlikely(head->unregistering)) {
+ spin_unlock(&sysctl_lock);
+ iput(inode);
+ return ERR_PTR(-ENOENT);
+ }
+ ei->sysctl = head;
+ ei->sysctl_entry = table;
+ hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mode = table->mode;
+ if (!S_ISDIR(table->mode)) {
+ inode->i_mode |= S_IFREG;
+ inode->i_op = &proc_sys_inode_operations;
+ inode->i_fop = &proc_sys_file_operations;
+ } else {
+ inode->i_mode |= S_IFDIR;
+ inode->i_op = &proc_sys_dir_operations;
+ inode->i_fop = &proc_sys_dir_file_operations;
+ if (is_empty_dir(head))
+ make_empty_dir_inode(inode);
+ }
+
+ if (root->set_ownership)
+ root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+ else {
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
+
+ return inode;
+}
+
+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_header *grab_header(struct inode *inode)
+{
+ struct ctl_table_header *head = PROC_I(inode)->sysctl;
+ if (!head)
+ head = &sysctl_table_root.default_set.dir.header;
+ return sysctl_head_grab(head);
+}
+
+static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct ctl_table_header *head = grab_header(dir);
+ struct ctl_table_header *h = NULL;
+ const struct qstr *name = &dentry->d_name;
+ struct ctl_table *p;
+ struct inode *inode;
+ struct dentry *err = ERR_PTR(-ENOENT);
+ struct ctl_dir *ctl_dir;
+ int ret;
+
+ if (IS_ERR(head))
+ return ERR_CAST(head);
+
+ ctl_dir = container_of(head, struct ctl_dir, header);
+
+ p = lookup_entry(&h, ctl_dir, name->name, name->len);
+ if (!p)
+ goto out;
+
+ if (S_ISLNK(p->mode)) {
+ ret = sysctl_follow_link(&h, &p);
+ err = ERR_PTR(ret);
+ if (ret)
+ goto out;
+ }
+
+ inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
+ if (IS_ERR(inode)) {
+ err = ERR_CAST(inode);
+ goto out;
+ }
+
+ d_set_d_op(dentry, &proc_sys_dentry_operations);
+ err = d_splice_alias(inode, dentry);
+
+out:
+ if (h)
+ sysctl_head_finish(h);
+ sysctl_head_finish(head);
+ return err;
+}
+
+static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
+ int write)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ size_t count = iov_iter_count(iter);
+ char *kbuf;
+ ssize_t error;
+
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ /*
+ * At this point we know that the sysctl was not unregistered
+ * and won't be until we finish.
+ */
+ error = -EPERM;
+ if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
+ goto out;
+
+ /* if that can happen at all, it should be -EINVAL, not -EISDIR */
+ error = -EINVAL;
+ if (!table->proc_handler)
+ goto out;
+
+ /* don't even try if the size is too large */
+ error = -ENOMEM;
+ if (count >= KMALLOC_MAX_SIZE)
+ goto out;
+ kbuf = kvzalloc(count + 1, GFP_KERNEL);
+ if (!kbuf)
+ goto out;
+
+ if (write) {
+ error = -EFAULT;
+ if (!copy_from_iter_full(kbuf, count, iter))
+ goto out_free_buf;
+ kbuf[count] = '\0';
+ }
+
+ error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count,
+ &iocb->ki_pos);
+ if (error)
+ goto out_free_buf;
+
+ /* careful: calling conventions are nasty here */
+ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos);
+ if (error)
+ goto out_free_buf;
+
+ if (!write) {
+ error = -EFAULT;
+ if (copy_to_iter(kbuf, count, iter) < count)
+ goto out_free_buf;
+ }
+
+ error = count;
+out_free_buf:
+ kvfree(kbuf);
+out:
+ sysctl_head_finish(head);
+
+ return error;
+}
+
+static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
+{
+ return proc_sys_call_handler(iocb, iter, 0);
+}
+
+static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ return proc_sys_call_handler(iocb, iter, 1);
+}
+
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ if (table->poll)
+ filp->private_data = proc_sys_poll_event(table->poll);
+
+ sysctl_head_finish(head);
+
+ return 0;
+}
+
+static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
+{
+ struct inode *inode = file_inode(filp);
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ __poll_t ret = DEFAULT_POLLMASK;
+ unsigned long event;
+
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return EPOLLERR | EPOLLHUP;
+
+ if (!table->proc_handler)
+ goto out;
+
+ if (!table->poll)
+ goto out;
+
+ event = (unsigned long)filp->private_data;
+ poll_wait(filp, &table->poll->wait, wait);
+
+ if (event != atomic_read(&table->poll->event)) {
+ filp->private_data = proc_sys_poll_event(table->poll);
+ ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
+ }
+
+out:
+ sysctl_head_finish(head);
+
+ return ret;
+}
+
+static bool proc_sys_fill_cache(struct file *file,
+ struct dir_context *ctx,
+ struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ struct dentry *child, *dir = file->f_path.dentry;
+ struct inode *inode;
+ struct qstr qname;
+ ino_t ino = 0;
+ unsigned type = DT_UNKNOWN;
+
+ qname.name = table->procname;
+ qname.len = strlen(table->procname);
+ qname.hash = full_name_hash(dir, qname.name, qname.len);
+
+ child = d_lookup(dir, &qname);
+ if (!child) {
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ child = d_alloc_parallel(dir, &qname, &wq);
+ if (IS_ERR(child))
+ return false;
+ if (d_in_lookup(child)) {
+ struct dentry *res;
+ inode = proc_sys_make_inode(dir->d_sb, head, table);
+ if (IS_ERR(inode)) {
+ d_lookup_done(child);
+ dput(child);
+ return false;
+ }
+ d_set_d_op(child, &proc_sys_dentry_operations);
+ res = d_splice_alias(inode, child);
+ d_lookup_done(child);
+ if (unlikely(res)) {
+ if (IS_ERR(res)) {
+ dput(child);
+ return false;
+ }
+ dput(child);
+ child = res;
+ }
+ }
+ }
+ inode = d_inode(child);
+ ino = inode->i_ino;
+ type = inode->i_mode >> 12;
+ dput(child);
+ return dir_emit(ctx, qname.name, qname.len, ino, type);
+}
+
+static bool proc_sys_link_fill_cache(struct file *file,
+ struct dir_context *ctx,
+ struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ bool ret = true;
+
+ head = sysctl_head_grab(head);
+ if (IS_ERR(head))
+ return false;
+
+ /* It is not an error if we can not follow the link ignore it */
+ if (sysctl_follow_link(&head, &table))
+ goto out;
+
+ ret = proc_sys_fill_cache(file, ctx, head, table);
+out:
+ sysctl_head_finish(head);
+ return ret;
+}
+
+static int scan(struct ctl_table_header *head, struct ctl_table *table,
+ unsigned long *pos, struct file *file,
+ struct dir_context *ctx)
+{
+ bool res;
+
+ if ((*pos)++ < ctx->pos)
+ return true;
+
+ if (unlikely(S_ISLNK(table->mode)))
+ res = proc_sys_link_fill_cache(file, ctx, head, table);
+ else
+ res = proc_sys_fill_cache(file, ctx, head, table);
+
+ if (res)
+ ctx->pos = *pos;
+
+ return res;
+}
+
+static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct ctl_table_header *head = grab_header(file_inode(file));
+ struct ctl_table_header *h = NULL;
+ struct ctl_table *entry;
+ struct ctl_dir *ctl_dir;
+ unsigned long pos;
+
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ ctl_dir = container_of(head, struct ctl_dir, header);
+
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+
+ pos = 2;
+
+ for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
+ if (!scan(h, entry, &pos, file, ctx)) {
+ sysctl_head_finish(h);
+ break;
+ }
+ }
+out:
+ sysctl_head_finish(head);
+ return 0;
+}
+
+static int proc_sys_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ /*
+ * sysctl entries that are not writeable,
+ * are _NOT_ writeable, capabilities or not.
+ */
+ struct ctl_table_header *head;
+ struct ctl_table *table;
+ int error;
+
+ /* Executable files are not allowed under /proc/sys/ */
+ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
+ return -EACCES;
+
+ head = grab_header(inode);
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ table = PROC_I(inode)->sysctl_entry;
+ if (!table) /* global root - r-xr-xr-x */
+ error = mask & MAY_WRITE ? -EACCES : 0;
+ else /* Use the permissions on the sysctl table entry */
+ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
+
+ sysctl_head_finish(head);
+ return error;
+}
+
+static int proc_sys_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ int error;
+
+ if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
+ return -EPERM;
+
+ error = setattr_prepare(&init_user_ns, dentry, attr);
+ if (error)
+ return error;
+
+ setattr_copy(&init_user_ns, inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
+}
+
+static int proc_sys_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ generic_fillattr(&init_user_ns, inode, stat);
+ if (table)
+ stat->mode = (stat->mode & S_IFMT) | table->mode;
+
+ sysctl_head_finish(head);
+ return 0;
+}
+
+static const struct file_operations proc_sys_file_operations = {
+ .open = proc_sys_open,
+ .poll = proc_sys_poll,
+ .read_iter = proc_sys_read,
+ .write_iter = proc_sys_write,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations proc_sys_dir_file_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_sys_readdir,
+ .llseek = generic_file_llseek,
+};
+
+static const struct inode_operations proc_sys_inode_operations = {
+ .permission = proc_sys_permission,
+ .setattr = proc_sys_setattr,
+ .getattr = proc_sys_getattr,
+};
+
+static const struct inode_operations proc_sys_dir_operations = {
+ .lookup = proc_sys_lookup,
+ .permission = proc_sys_permission,
+ .setattr = proc_sys_setattr,
+ .getattr = proc_sys_getattr,
+};
+
+static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return !PROC_I(d_inode(dentry))->sysctl->unregistering;
+}
+
+static int proc_sys_delete(const struct dentry *dentry)
+{
+ return !!PROC_I(d_inode(dentry))->sysctl->unregistering;
+}
+
+static int sysctl_is_seen(struct ctl_table_header *p)
+{
+ struct ctl_table_set *set = p->set;
+ int res;
+ spin_lock(&sysctl_lock);
+ if (p->unregistering)
+ res = 0;
+ else if (!set->is_seen)
+ res = 1;
+ else
+ res = set->is_seen(set);
+ spin_unlock(&sysctl_lock);
+ return res;
+}
+
+static int proc_sys_compare(const struct dentry *dentry,
+ unsigned int len, const char *str, const struct qstr *name)
+{
+ struct ctl_table_header *head;
+ struct inode *inode;
+
+ /* Although proc doesn't have negative dentries, rcu-walk means
+ * that inode here can be NULL */
+ /* AV: can it, indeed? */
+ inode = d_inode_rcu(dentry);
+ if (!inode)
+ return 1;
+ if (name->len != len)
+ return 1;
+ if (memcmp(name->name, str, len))
+ return 1;
+ head = rcu_dereference(PROC_I(inode)->sysctl);
+ return !head || !sysctl_is_seen(head);
+}
+
+static const struct dentry_operations proc_sys_dentry_operations = {
+ .d_revalidate = proc_sys_revalidate,
+ .d_delete = proc_sys_delete,
+ .d_compare = proc_sys_compare,
+};
+
+static struct ctl_dir *find_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ entry = find_entry(&head, dir, name, namelen);
+ if (!entry)
+ return ERR_PTR(-ENOENT);
+ if (!S_ISDIR(entry->mode))
+ return ERR_PTR(-ENOTDIR);
+ return container_of(head, struct ctl_dir, header);
+}
+
+static struct ctl_dir *new_dir(struct ctl_table_set *set,
+ const char *name, int namelen)
+{
+ struct ctl_table *table;
+ struct ctl_dir *new;
+ struct ctl_node *node;
+ char *new_name;
+
+ new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
+ sizeof(struct ctl_table)*2 + namelen + 1,
+ GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ node = (struct ctl_node *)(new + 1);
+ table = (struct ctl_table *)(node + 1);
+ new_name = (char *)(table + 2);
+ memcpy(new_name, name, namelen);
+ table[0].procname = new_name;
+ table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
+ init_header(&new->header, set->dir.header.root, set, node, table);
+
+ return new;
+}
+
+/**
+ * get_subdir - find or create a subdir with the specified name.
+ * @dir: Directory to create the subdirectory in
+ * @name: The name of the subdirectory to find or create
+ * @namelen: The length of name
+ *
+ * Takes a directory with an elevated reference count so we know that
+ * if we drop the lock the directory will not go away. Upon success
+ * the reference is moved from @dir to the returned subdirectory.
+ * Upon error an error code is returned and the reference on @dir is
+ * simply dropped.
+ */
+static struct ctl_dir *get_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_set *set = dir->header.set;
+ struct ctl_dir *subdir, *new = NULL;
+ int err;
+
+ spin_lock(&sysctl_lock);
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ spin_unlock(&sysctl_lock);
+ new = new_dir(set, name, namelen);
+ spin_lock(&sysctl_lock);
+ subdir = ERR_PTR(-ENOMEM);
+ if (!new)
+ goto failed;
+
+ /* Was the subdir added while we dropped the lock? */
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ /* Nope. Use the our freshly made directory entry. */
+ err = insert_header(dir, &new->header);
+ subdir = ERR_PTR(err);
+ if (err)
+ goto failed;
+ subdir = new;
+found:
+ subdir->header.nreg++;
+failed:
+ if (IS_ERR(subdir)) {
+ pr_err("sysctl could not get directory: ");
+ sysctl_print_dir(dir);
+ pr_cont("%*.*s %ld\n", namelen, namelen, name,
+ PTR_ERR(subdir));
+ }
+ drop_sysctl_table(&dir->header);
+ if (new)
+ drop_sysctl_table(&new->header);
+ spin_unlock(&sysctl_lock);
+ return subdir;
+}
+
+static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
+{
+ struct ctl_dir *parent;
+ const char *procname;
+ if (!dir->header.parent)
+ return &set->dir;
+ parent = xlate_dir(set, dir->header.parent);
+ if (IS_ERR(parent))
+ return parent;
+ procname = dir->header.ctl_table[0].procname;
+ return find_subdir(parent, procname, strlen(procname));
+}
+
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry)
+{
+ struct ctl_table_header *head;
+ struct ctl_table_root *root;
+ struct ctl_table_set *set;
+ struct ctl_table *entry;
+ struct ctl_dir *dir;
+ int ret;
+
+ spin_lock(&sysctl_lock);
+ root = (*pentry)->data;
+ set = lookup_header_set(root);
+ dir = xlate_dir(set, (*phead)->parent);
+ if (IS_ERR(dir))
+ ret = PTR_ERR(dir);
+ else {
+ const char *procname = (*pentry)->procname;
+ head = NULL;
+ entry = find_entry(&head, dir, procname, strlen(procname));
+ ret = -ENOENT;
+ if (entry && use_table(head)) {
+ unuse_table(*phead);
+ *phead = head;
+ *pentry = entry;
+ ret = 0;
+ }
+ }
+
+ spin_unlock(&sysctl_lock);
+ return ret;
+}
+
+static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("sysctl table check failed: %s/%s %pV\n",
+ path, table->procname, &vaf);
+
+ va_end(args);
+ return -EINVAL;
+}
+
+static int sysctl_check_table_array(const char *path, struct ctl_table *table)
+{
+ int err = 0;
+
+ if ((table->proc_handler == proc_douintvec) ||
+ (table->proc_handler == proc_douintvec_minmax)) {
+ if (table->maxlen != sizeof(unsigned int))
+ err |= sysctl_err(path, table, "array not allowed");
+ }
+
+ if (table->proc_handler == proc_dou8vec_minmax) {
+ if (table->maxlen != sizeof(u8))
+ err |= sysctl_err(path, table, "array not allowed");
+ }
+
+ return err;
+}
+
+static int sysctl_check_table(const char *path, struct ctl_table *table)
+{
+ struct ctl_table *entry;
+ int err = 0;
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ err |= sysctl_err(path, entry, "Not a file");
+
+ if ((entry->proc_handler == proc_dostring) ||
+ (entry->proc_handler == proc_dointvec) ||
+ (entry->proc_handler == proc_douintvec) ||
+ (entry->proc_handler == proc_douintvec_minmax) ||
+ (entry->proc_handler == proc_dointvec_minmax) ||
+ (entry->proc_handler == proc_dou8vec_minmax) ||
+ (entry->proc_handler == proc_dointvec_jiffies) ||
+ (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (entry->proc_handler == proc_dointvec_ms_jiffies) ||
+ (entry->proc_handler == proc_doulongvec_minmax) ||
+ (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!entry->data)
+ err |= sysctl_err(path, entry, "No data");
+ if (!entry->maxlen)
+ err |= sysctl_err(path, entry, "No maxlen");
+ else
+ err |= sysctl_check_table_array(path, entry);
+ }
+ if (!entry->proc_handler)
+ err |= sysctl_err(path, entry, "No proc_handler");
+
+ if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
+ err |= sysctl_err(path, entry, "bogus .mode 0%o",
+ entry->mode);
+ }
+ return err;
+}
+
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
+ struct ctl_table_root *link_root)
+{
+ struct ctl_table *link_table, *entry, *link;
+ struct ctl_table_header *links;
+ struct ctl_node *node;
+ char *link_name;
+ int nr_entries, name_bytes;
+
+ name_bytes = 0;
+ nr_entries = 0;
+ list_for_each_table_entry(entry, table) {
+ nr_entries++;
+ name_bytes += strlen(entry->procname) + 1;
+ }
+
+ links = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries +
+ sizeof(struct ctl_table)*(nr_entries + 1) +
+ name_bytes,
+ GFP_KERNEL);
+
+ if (!links)
+ return NULL;
+
+ node = (struct ctl_node *)(links + 1);
+ link_table = (struct ctl_table *)(node + nr_entries);
+ link_name = (char *)&link_table[nr_entries + 1];
+ link = link_table;
+
+ list_for_each_table_entry(entry, table) {
+ int len = strlen(entry->procname) + 1;
+ memcpy(link_name, entry->procname, len);
+ link->procname = link_name;
+ link->mode = S_IFLNK|S_IRWXUGO;
+ link->data = link_root;
+ link_name += len;
+ link++;
+ }
+ init_header(links, dir->header.root, dir->header.set, node, link_table);
+ links->nreg = nr_entries;
+
+ return links;
+}
+
+static bool get_links(struct ctl_dir *dir,
+ struct ctl_table *table, struct ctl_table_root *link_root)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry, *link;
+
+ /* Are there links available for every entry in table? */
+ list_for_each_table_entry(entry, table) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ if (!link)
+ return false;
+ if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
+ continue;
+ if (S_ISLNK(link->mode) && (link->data == link_root))
+ continue;
+ return false;
+ }
+
+ /* The checks passed. Increase the registration count on the links */
+ list_for_each_table_entry(entry, table) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ head->nreg++;
+ }
+ return true;
+}
+
+static int insert_links(struct ctl_table_header *head)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_dir *core_parent;
+ struct ctl_table_header *links;
+ int err;
+
+ if (head->set == root_set)
+ return 0;
+
+ core_parent = xlate_dir(root_set, head->parent);
+ if (IS_ERR(core_parent))
+ return 0;
+
+ if (get_links(core_parent, head->ctl_table, head->root))
+ return 0;
+
+ core_parent->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ links = new_links(core_parent, head->ctl_table, head->root);
+
+ spin_lock(&sysctl_lock);
+ err = -ENOMEM;
+ if (!links)
+ goto out;
+
+ err = 0;
+ if (get_links(core_parent, head->ctl_table, head->root)) {
+ kfree(links);
+ goto out;
+ }
+
+ err = insert_header(core_parent, links);
+ if (err)
+ kfree(links);
+out:
+ drop_sysctl_table(&core_parent->header);
+ return err;
+}
+
+/**
+ * __register_sysctl_table - register a leaf sysctl table
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure without any child. This table
+ * should not be free'd after registration. So it should not be
+ * used on stack. It can either be a global or dynamically allocated
+ * by the caller and free'd later after sysctl unregistration.
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * The members of the &struct ctl_table structure are used as follows:
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ * enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file
+ *
+ * child - must be %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ * XXX: we should eventually modify these to use long min / max [0]
+ * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
+ * sysctl_check_table() verifies this.
+ *
+ * There must be a proc_handler routine for any terminal nodes.
+ * Several default handlers are available to cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *__register_sysctl_table(
+ struct ctl_table_set *set,
+ const char *path, struct ctl_table *table)
+{
+ struct ctl_table_root *root = set->dir.header.root;
+ struct ctl_table_header *header;
+ const char *name, *nextname;
+ struct ctl_dir *dir;
+ struct ctl_table *entry;
+ struct ctl_node *node;
+ int nr_entries = 0;
+
+ list_for_each_table_entry(entry, table)
+ nr_entries++;
+
+ header = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
+ if (!header)
+ return NULL;
+
+ node = (struct ctl_node *)(header + 1);
+ init_header(header, root, set, node, table);
+ if (sysctl_check_table(path, table))
+ goto fail;
+
+ spin_lock(&sysctl_lock);
+ dir = &set->dir;
+ /* Reference moved down the directory tree get_subdir */
+ dir->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ /* Find the directory for the ctl_table */
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ /*
+ * namelen ensures if name is "foo/bar/yay" only foo is
+ * registered first. We traverse as if using mkdir -p and
+ * return a ctl_dir for the last directory entry.
+ */
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ goto fail;
+ }
+
+ spin_lock(&sysctl_lock);
+ if (insert_header(dir, header))
+ goto fail_put_dir_locked;
+
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+
+ return header;
+
+fail_put_dir_locked:
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+fail:
+ kfree(header);
+ dump_stack();
+ return NULL;
+}
+
+/**
+ * register_sysctl - register a sysctl table
+ * @path: The path to the directory the sysctl table is in. If the path
+ * doesn't exist we will create it for you.
+ * @table: the table structure. The calller must ensure the life of the @table
+ * will be kept during the lifetime use of the syctl. It must not be freed
+ * until unregister_sysctl_table() is called with the given returned table
+ * with this registration. If your code is non modular then you don't need
+ * to call unregister_sysctl_table() and can instead use something like
+ * register_sysctl_init() which does not care for the result of the syctl
+ * registration.
+ *
+ * Register a sysctl table. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+{
+ return __register_sysctl_table(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl);
+
+/**
+ * __register_sysctl_init() - register sysctl table to path
+ * @path: path name for sysctl base. If that path doesn't exist we will create
+ * it for you.
+ * @table: This is the sysctl table that needs to be registered to the path.
+ * The caller must ensure the life of the @table will be kept during the
+ * lifetime use of the sysctl.
+ * @table_name: The name of sysctl table, only used for log printing when
+ * registration fails
+ *
+ * The sysctl interface is used by userspace to query or modify at runtime
+ * a predefined value set on a variable. These variables however have default
+ * values pre-set. Code which depends on these variables will always work even
+ * if register_sysctl() fails. If register_sysctl() fails you'd just loose the
+ * ability to query or modify the sysctls dynamically at run time. Chances of
+ * register_sysctl() failing on init are extremely low, and so for both reasons
+ * this function does not return any error as it is used by initialization code.
+ *
+ * Context: if your base directory does not exist it will be created for you.
+ */
+void __init __register_sysctl_init(const char *path, struct ctl_table *table,
+ const char *table_name)
+{
+ struct ctl_table_header *hdr = register_sysctl(path, table);
+
+ if (unlikely(!hdr)) {
+ pr_err("failed when register_sysctl %s to %s\n", table_name, path);
+ return;
+ }
+ kmemleak_not_leak(hdr);
+}
+
+static char *append_path(const char *path, char *pos, const char *name)
+{
+ int namelen;
+ namelen = strlen(name);
+ if (((pos - path) + namelen + 2) >= PATH_MAX)
+ return NULL;
+ memcpy(pos, name, namelen);
+ pos[namelen] = '/';
+ pos[namelen + 1] = '\0';
+ pos += namelen + 1;
+ return pos;
+}
+
+static int count_subheaders(struct ctl_table *table)
+{
+ int has_files = 0;
+ int nr_subheaders = 0;
+ struct ctl_table *entry;
+
+ /* special case: no directory and empty directory */
+ if (!table || !table->procname)
+ return 1;
+
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ nr_subheaders += count_subheaders(entry->child);
+ else
+ has_files = 1;
+ }
+ return nr_subheaders + has_files;
+}
+
+static int register_leaf_sysctl_tables(const char *path, char *pos,
+ struct ctl_table_header ***subheader, struct ctl_table_set *set,
+ struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = NULL;
+ struct ctl_table *entry, *files;
+ int nr_files = 0;
+ int nr_dirs = 0;
+ int err = -ENOMEM;
+
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ nr_dirs++;
+ else
+ nr_files++;
+ }
+
+ files = table;
+ /* If there are mixed files and directories we need a new table */
+ if (nr_dirs && nr_files) {
+ struct ctl_table *new;
+ files = kcalloc(nr_files + 1, sizeof(struct ctl_table),
+ GFP_KERNEL);
+ if (!files)
+ goto out;
+
+ ctl_table_arg = files;
+ new = files;
+
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ continue;
+ *new = *entry;
+ new++;
+ }
+ }
+
+ /* Register everything except a directory full of subdirectories */
+ if (nr_files || !nr_dirs) {
+ struct ctl_table_header *header;
+ header = __register_sysctl_table(set, path, files);
+ if (!header) {
+ kfree(ctl_table_arg);
+ goto out;
+ }
+
+ /* Remember if we need to free the file table */
+ header->ctl_table_arg = ctl_table_arg;
+ **subheader = header;
+ (*subheader)++;
+ }
+
+ /* Recurse into the subdirectories. */
+ list_for_each_table_entry(entry, table) {
+ char *child_pos;
+
+ if (!entry->child)
+ continue;
+
+ err = -ENAMETOOLONG;
+ child_pos = append_path(path, pos, entry->procname);
+ if (!child_pos)
+ goto out;
+
+ err = register_leaf_sysctl_tables(path, child_pos, subheader,
+ set, entry->child);
+ pos[0] = '\0';
+ if (err)
+ goto out;
+ }
+ err = 0;
+out:
+ /* On failure our caller will unregister all registered subheaders */
+ return err;
+}
+
+/**
+ * __register_sysctl_paths - register a sysctl table hierarchy
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ * We are slowly deprecating this call so avoid its use.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *__register_sysctl_paths(
+ struct ctl_table_set *set,
+ const struct ctl_path *path, struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = table;
+ int nr_subheaders = count_subheaders(table);
+ struct ctl_table_header *header = NULL, **subheaders, **subheader;
+ const struct ctl_path *component;
+ char *new_path, *pos;
+
+ pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!new_path)
+ return NULL;
+
+ pos[0] = '\0';
+ for (component = path; component->procname; component++) {
+ pos = append_path(new_path, pos, component->procname);
+ if (!pos)
+ goto out;
+ }
+ while (table->procname && table->child && !table[1].procname) {
+ pos = append_path(new_path, pos, table->procname);
+ if (!pos)
+ goto out;
+ table = table->child;
+ }
+ if (nr_subheaders == 1) {
+ header = __register_sysctl_table(set, new_path, table);
+ if (header)
+ header->ctl_table_arg = ctl_table_arg;
+ } else {
+ header = kzalloc(sizeof(*header) +
+ sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
+ if (!header)
+ goto out;
+
+ subheaders = (struct ctl_table_header **) (header + 1);
+ subheader = subheaders;
+ header->ctl_table_arg = ctl_table_arg;
+
+ if (register_leaf_sysctl_tables(new_path, pos, &subheader,
+ set, table))
+ goto err_register_leaves;
+ }
+
+out:
+ kfree(new_path);
+ return header;
+
+err_register_leaves:
+ while (subheader > subheaders) {
+ struct ctl_table_header *subh = *(--subheader);
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ header = NULL;
+ goto out;
+}
+
+/**
+ * register_sysctl_paths - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ * We are slowly deprecating this caller so avoid future uses of it.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ struct ctl_table *table)
+{
+ return __register_sysctl_paths(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl_paths);
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+ static const struct ctl_path null_path[] = { {} };
+
+ return register_sysctl_paths(null_path, table);
+}
+EXPORT_SYMBOL(register_sysctl_table);
+
+int __register_sysctl_base(struct ctl_table *base_table)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_table(base_table);
+ kmemleak_not_leak(hdr);
+ return 0;
+}
+
+static void put_links(struct ctl_table_header *header)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_table_root *root = header->root;
+ struct ctl_dir *parent = header->parent;
+ struct ctl_dir *core_parent;
+ struct ctl_table *entry;
+
+ if (header->set == root_set)
+ return;
+
+ core_parent = xlate_dir(root_set, parent);
+ if (IS_ERR(core_parent))
+ return;
+
+ list_for_each_table_entry(entry, header->ctl_table) {
+ struct ctl_table_header *link_head;
+ struct ctl_table *link;
+ const char *name = entry->procname;
+
+ link = find_entry(&link_head, core_parent, name, strlen(name));
+ if (link &&
+ ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
+ (S_ISLNK(link->mode) && (link->data == root)))) {
+ drop_sysctl_table(link_head);
+ }
+ else {
+ pr_err("sysctl link missing during unregister: ");
+ sysctl_print_dir(parent);
+ pr_cont("%s\n", name);
+ }
+ }
+}
+
+static void drop_sysctl_table(struct ctl_table_header *header)
+{
+ struct ctl_dir *parent = header->parent;
+
+ if (--header->nreg)
+ return;
+
+ if (parent) {
+ put_links(header);
+ start_unregistering(header);
+ }
+
+ if (!--header->count)
+ kfree_rcu(header, rcu);
+
+ if (parent)
+ drop_sysctl_table(&parent->header);
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+ int nr_subheaders;
+ might_sleep();
+
+ if (header == NULL)
+ return;
+
+ nr_subheaders = count_subheaders(header->ctl_table_arg);
+ if (unlikely(nr_subheaders > 1)) {
+ struct ctl_table_header **subheaders;
+ int i;
+
+ subheaders = (struct ctl_table_header **)(header + 1);
+ for (i = nr_subheaders -1; i >= 0; i--) {
+ struct ctl_table_header *subh = subheaders[i];
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ return;
+ }
+
+ spin_lock(&sysctl_lock);
+ drop_sysctl_table(header);
+ spin_unlock(&sysctl_lock);
+}
+EXPORT_SYMBOL(unregister_sysctl_table);
+
+void setup_sysctl_set(struct ctl_table_set *set,
+ struct ctl_table_root *root,
+ int (*is_seen)(struct ctl_table_set *))
+{
+ memset(set, 0, sizeof(*set));
+ set->is_seen = is_seen;
+ init_header(&set->dir.header, root, set, NULL, root_table);
+}
+
+void retire_sysctl_set(struct ctl_table_set *set)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
+}
+
+int __init proc_sys_init(void)
+{
+ struct proc_dir_entry *proc_sys_root;
+
+ proc_sys_root = proc_mkdir("sys", NULL);
+ proc_sys_root->proc_iops = &proc_sys_dir_operations;
+ proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
+ proc_sys_root->nlink = 0;
+
+ return sysctl_init_bases();
+}
+
+struct sysctl_alias {
+ const char *kernel_param;
+ const char *sysctl_param;
+};
+
+/*
+ * Historically some settings had both sysctl and a command line parameter.
+ * With the generic sysctl. parameter support, we can handle them at a single
+ * place and only keep the historical name for compatibility. This is not meant
+ * to add brand new aliases. When adding existing aliases, consider whether
+ * the possibly different moment of changing the value (e.g. from early_param
+ * to the moment do_sysctl_args() is called) is an issue for the specific
+ * parameter.
+ */
+static const struct sysctl_alias sysctl_aliases[] = {
+ {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" },
+ {"hung_task_panic", "kernel.hung_task_panic" },
+ {"numa_zonelist_order", "vm.numa_zonelist_order" },
+ {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
+ { }
+};
+
+static const char *sysctl_find_alias(char *param)
+{
+ const struct sysctl_alias *alias;
+
+ for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
+ if (strcmp(alias->kernel_param, param) == 0)
+ return alias->sysctl_param;
+ }
+
+ return NULL;
+}
+
+bool sysctl_is_alias(char *param)
+{
+ const char *alias = sysctl_find_alias(param);
+
+ return alias != NULL;
+}
+
+/* Set sysctl value passed on kernel command line. */
+static int process_sysctl_arg(char *param, char *val,
+ const char *unused, void *arg)
+{
+ char *path;
+ struct vfsmount **proc_mnt = arg;
+ struct file_system_type *proc_fs_type;
+ struct file *file;
+ int len;
+ int err;
+ loff_t pos = 0;
+ ssize_t wret;
+
+ if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) {
+ param += sizeof("sysctl") - 1;
+
+ if (param[0] != '/' && param[0] != '.')
+ return 0;
+
+ param++;
+ } else {
+ param = (char *) sysctl_find_alias(param);
+ if (!param)
+ return 0;
+ }
+
+ if (!val)
+ return -EINVAL;
+ len = strlen(val);
+ if (len == 0)
+ return -EINVAL;
+
+ /*
+ * To set sysctl options, we use a temporary mount of proc, look up the
+ * respective sys/ file and write to it. To avoid mounting it when no
+ * options were given, we mount it only when the first sysctl option is
+ * found. Why not a persistent mount? There are problems with a
+ * persistent mount of proc in that it forces userspace not to use any
+ * proc mount options.
+ */
+ if (!*proc_mnt) {
+ proc_fs_type = get_fs_type("proc");
+ if (!proc_fs_type) {
+ pr_err("Failed to find procfs to set sysctl from command line\n");
+ return 0;
+ }
+ *proc_mnt = kern_mount(proc_fs_type);
+ put_filesystem(proc_fs_type);
+ if (IS_ERR(*proc_mnt)) {
+ pr_err("Failed to mount procfs to set sysctl from command line\n");
+ return 0;
+ }
+ }
+
+ path = kasprintf(GFP_KERNEL, "sys/%s", param);
+ if (!path)
+ panic("%s: Failed to allocate path for %s\n", __func__, param);
+ strreplace(path, '.', '/');
+
+ file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ if (err == -ENOENT)
+ pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n",
+ param, val);
+ else if (err == -EACCES)
+ pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n",
+ param, val);
+ else
+ pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n",
+ file, param, val);
+ goto out;
+ }
+ wret = kernel_write(file, val, len, &pos);
+ if (wret < 0) {
+ err = wret;
+ if (err == -EINVAL)
+ pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n",
+ param, val);
+ else
+ pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n",
+ ERR_PTR(err), param, val);
+ } else if (wret != len) {
+ pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n",
+ wret, len, path, param, val);
+ }
+
+ err = filp_close(file, NULL);
+ if (err)
+ pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n",
+ ERR_PTR(err), param, val);
+out:
+ kfree(path);
+ return 0;
+}
+
+void do_sysctl_args(void)
+{
+ char *command_line;
+ struct vfsmount *proc_mnt = NULL;
+
+ command_line = kstrdup(saved_command_line, GFP_KERNEL);
+ if (!command_line)
+ panic("%s: Failed to allocate copy of command line\n", __func__);
+
+ parse_args("Setting sysctl args", command_line,
+ NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg);
+
+ if (proc_mnt)
+ kern_unmount(proc_mnt);
+
+ kfree(command_line);
+}
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
new file mode 100644
index 000000000..5c6a5ceab
--- /dev/null
+++ b/fs/proc/proc_tty.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * proc_tty.c -- handles /proc/tty
+ *
+ * Copyright 1997, Theodore Ts'o
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/tty.h>
+#include <linux/seq_file.h>
+#include <linux/bitops.h>
+#include "internal.h"
+
+/*
+ * The /proc/tty directory inodes...
+ */
+static struct proc_dir_entry *proc_tty_driver;
+
+/*
+ * This is the handler for /proc/tty/drivers
+ */
+static void show_tty_range(struct seq_file *m, struct tty_driver *p,
+ dev_t from, int num)
+{
+ seq_printf(m, "%-20s ", p->driver_name ? p->driver_name : "unknown");
+ seq_printf(m, "/dev/%-8s ", p->name);
+ if (p->num > 1) {
+ seq_printf(m, "%3d %d-%d ", MAJOR(from), MINOR(from),
+ MINOR(from) + num - 1);
+ } else {
+ seq_printf(m, "%3d %7d ", MAJOR(from), MINOR(from));
+ }
+ switch (p->type) {
+ case TTY_DRIVER_TYPE_SYSTEM:
+ seq_puts(m, "system");
+ if (p->subtype == SYSTEM_TYPE_TTY)
+ seq_puts(m, ":/dev/tty");
+ else if (p->subtype == SYSTEM_TYPE_SYSCONS)
+ seq_puts(m, ":console");
+ else if (p->subtype == SYSTEM_TYPE_CONSOLE)
+ seq_puts(m, ":vtmaster");
+ break;
+ case TTY_DRIVER_TYPE_CONSOLE:
+ seq_puts(m, "console");
+ break;
+ case TTY_DRIVER_TYPE_SERIAL:
+ seq_puts(m, "serial");
+ break;
+ case TTY_DRIVER_TYPE_PTY:
+ if (p->subtype == PTY_TYPE_MASTER)
+ seq_puts(m, "pty:master");
+ else if (p->subtype == PTY_TYPE_SLAVE)
+ seq_puts(m, "pty:slave");
+ else
+ seq_puts(m, "pty");
+ break;
+ default:
+ seq_printf(m, "type:%d.%d", p->type, p->subtype);
+ }
+ seq_putc(m, '\n');
+}
+
+static int show_tty_driver(struct seq_file *m, void *v)
+{
+ struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers);
+ dev_t from = MKDEV(p->major, p->minor_start);
+ dev_t to = from + p->num;
+
+ if (&p->tty_drivers == tty_drivers.next) {
+ /* pseudo-drivers first */
+ seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
+ seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
+ seq_puts(m, "system:/dev/tty\n");
+ seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
+ seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
+ seq_puts(m, "system:console\n");
+#ifdef CONFIG_UNIX98_PTYS
+ seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
+ seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
+ seq_puts(m, "system\n");
+#endif
+#ifdef CONFIG_VT
+ seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
+ seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
+ seq_puts(m, "system:vtmaster\n");
+#endif
+ }
+
+ while (MAJOR(from) < MAJOR(to)) {
+ dev_t next = MKDEV(MAJOR(from)+1, 0);
+ show_tty_range(m, p, from, next - from);
+ from = next;
+ }
+ if (from != to)
+ show_tty_range(m, p, from, to - from);
+ return 0;
+}
+
+/* iterator */
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&tty_mutex);
+ return seq_list_start(&tty_drivers, *pos);
+}
+
+static void *t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &tty_drivers, pos);
+}
+
+static void t_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&tty_mutex);
+}
+
+static const struct seq_operations tty_drivers_op = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = show_tty_driver
+};
+
+/*
+ * This function is called by tty_register_driver() to handle
+ * registering the driver's /proc handler into /proc/tty/driver/<foo>
+ */
+void proc_tty_register_driver(struct tty_driver *driver)
+{
+ struct proc_dir_entry *ent;
+
+ if (!driver->driver_name || driver->proc_entry ||
+ !driver->ops->proc_show)
+ return;
+
+ ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver,
+ driver->ops->proc_show, driver);
+ driver->proc_entry = ent;
+}
+
+/*
+ * This function is called by tty_unregister_driver()
+ */
+void proc_tty_unregister_driver(struct tty_driver *driver)
+{
+ struct proc_dir_entry *ent;
+
+ ent = driver->proc_entry;
+ if (!ent)
+ return;
+
+ remove_proc_entry(ent->name, proc_tty_driver);
+
+ driver->proc_entry = NULL;
+}
+
+/*
+ * Called by proc_root_init() to initialize the /proc/tty subtree
+ */
+void __init proc_tty_init(void)
+{
+ if (!proc_mkdir("tty", NULL))
+ return;
+ proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */
+ /*
+ * /proc/tty/driver/serial reveals the exact character counts for
+ * serial links which is just too easy to abuse for inferring
+ * password lengths and inter-keystroke timings during password
+ * entry.
+ */
+ proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL);
+ proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops);
+ proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op);
+}
diff --git a/fs/proc/root.c b/fs/proc/root.c
new file mode 100644
index 000000000..3c2ee3eb1
--- /dev/null
+++ b/fs/proc/root.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/proc/root.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * proc root directory handling functions
+ */
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/sched/stat.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/user_namespace.h>
+#include <linux/fs_context.h>
+#include <linux/mount.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs_parser.h>
+#include <linux/cred.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+struct proc_fs_context {
+ struct pid_namespace *pid_ns;
+ unsigned int mask;
+ enum proc_hidepid hidepid;
+ int gid;
+ enum proc_pidonly pidonly;
+};
+
+enum proc_param {
+ Opt_gid,
+ Opt_hidepid,
+ Opt_subset,
+};
+
+static const struct fs_parameter_spec proc_fs_parameters[] = {
+ fsparam_u32("gid", Opt_gid),
+ fsparam_string("hidepid", Opt_hidepid),
+ fsparam_string("subset", Opt_subset),
+ {}
+};
+
+static inline int valid_hidepid(unsigned int value)
+{
+ return (value == HIDEPID_OFF ||
+ value == HIDEPID_NO_ACCESS ||
+ value == HIDEPID_INVISIBLE ||
+ value == HIDEPID_NOT_PTRACEABLE);
+}
+
+static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid);
+ struct fs_parse_result result;
+ int base = (unsigned long)hidepid_u32_spec.data;
+
+ if (param->type != fs_value_is_string)
+ return invalf(fc, "proc: unexpected type of hidepid value\n");
+
+ if (!kstrtouint(param->string, base, &result.uint_32)) {
+ if (!valid_hidepid(result.uint_32))
+ return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string);
+ ctx->hidepid = result.uint_32;
+ return 0;
+ }
+
+ if (!strcmp(param->string, "off"))
+ ctx->hidepid = HIDEPID_OFF;
+ else if (!strcmp(param->string, "noaccess"))
+ ctx->hidepid = HIDEPID_NO_ACCESS;
+ else if (!strcmp(param->string, "invisible"))
+ ctx->hidepid = HIDEPID_INVISIBLE;
+ else if (!strcmp(param->string, "ptraceable"))
+ ctx->hidepid = HIDEPID_NOT_PTRACEABLE;
+ else
+ return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string);
+
+ return 0;
+}
+
+static int proc_parse_subset_param(struct fs_context *fc, char *value)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ while (value) {
+ char *ptr = strchr(value, ',');
+
+ if (ptr != NULL)
+ *ptr++ = '\0';
+
+ if (*value != '\0') {
+ if (!strcmp(value, "pid")) {
+ ctx->pidonly = PROC_PIDONLY_ON;
+ } else {
+ return invalf(fc, "proc: unsupported subset option - %s\n", value);
+ }
+ }
+ value = ptr;
+ }
+
+ return 0;
+}
+
+static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, proc_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_gid:
+ ctx->gid = result.uint_32;
+ break;
+
+ case Opt_hidepid:
+ if (proc_parse_hidepid_param(fc, param))
+ return -EINVAL;
+ break;
+
+ case Opt_subset:
+ if (proc_parse_subset_param(fc, param->string) < 0)
+ return -EINVAL;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ ctx->mask |= 1 << opt;
+ return 0;
+}
+
+static void proc_apply_options(struct proc_fs_info *fs_info,
+ struct fs_context *fc,
+ struct user_namespace *user_ns)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ if (ctx->mask & (1 << Opt_gid))
+ fs_info->pid_gid = make_kgid(user_ns, ctx->gid);
+ if (ctx->mask & (1 << Opt_hidepid))
+ fs_info->hide_pid = ctx->hidepid;
+ if (ctx->mask & (1 << Opt_subset))
+ fs_info->pidonly = ctx->pidonly;
+}
+
+static int proc_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct inode *root_inode;
+ struct proc_fs_info *fs_info;
+ int ret;
+
+ fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ proc_apply_options(fs_info, fc, current_user_ns());
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = PROC_SUPER_MAGIC;
+ s->s_op = &proc_sops;
+ s->s_time_gran = 1;
+ s->s_fs_info = fs_info;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+
+ /* procfs dentries and inodes don't require IO to create */
+ s->s_shrink.seeks = 0;
+
+ pde_get(&proc_root);
+ root_inode = proc_get_inode(s, &proc_root);
+ if (!root_inode) {
+ pr_err("proc_fill_super: get root inode failed\n");
+ return -ENOMEM;
+ }
+
+ s->s_root = d_make_root(root_inode);
+ if (!s->s_root) {
+ pr_err("proc_fill_super: allocate dentry failed\n");
+ return -ENOMEM;
+ }
+
+ ret = proc_setup_self(s);
+ if (ret) {
+ return ret;
+ }
+ return proc_setup_thread_self(s);
+}
+
+static int proc_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct proc_fs_info *fs_info = proc_sb_info(sb);
+
+ sync_filesystem(sb);
+
+ proc_apply_options(fs_info, fc, current_user_ns());
+ return 0;
+}
+
+static int proc_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, proc_fill_super);
+}
+
+static void proc_fs_context_free(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+
+ put_pid_ns(ctx->pid_ns);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations proc_fs_context_ops = {
+ .free = proc_fs_context_free,
+ .parse_param = proc_parse_param,
+ .get_tree = proc_get_tree,
+ .reconfigure = proc_reconfigure,
+};
+
+static int proc_init_fs_context(struct fs_context *fc)
+{
+ struct proc_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ fc->fs_private = ctx;
+ fc->ops = &proc_fs_context_ops;
+ return 0;
+}
+
+static void proc_kill_sb(struct super_block *sb)
+{
+ struct proc_fs_info *fs_info = proc_sb_info(sb);
+
+ if (!fs_info) {
+ kill_anon_super(sb);
+ return;
+ }
+
+ dput(fs_info->proc_self);
+ dput(fs_info->proc_thread_self);
+
+ kill_anon_super(sb);
+ put_pid_ns(fs_info->pid_ns);
+ kfree(fs_info);
+}
+
+static struct file_system_type proc_fs_type = {
+ .name = "proc",
+ .init_fs_context = proc_init_fs_context,
+ .parameters = proc_fs_parameters,
+ .kill_sb = proc_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
+};
+
+void __init proc_root_init(void)
+{
+ proc_init_kmemcache();
+ set_proc_pid_nlink();
+ proc_self_init();
+ proc_thread_self_init();
+ proc_symlink("mounts", NULL, "self/mounts");
+
+ proc_net_init();
+ proc_mkdir("fs", NULL);
+ proc_mkdir("driver", NULL);
+ proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
+#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
+ /* just give it a mountpoint */
+ proc_create_mount_point("openprom");
+#endif
+ proc_tty_init();
+ proc_mkdir("bus", NULL);
+ proc_sys_init();
+
+ /*
+ * Last things last. It is not like userspace processes eager
+ * to open /proc files exist at this point but register last
+ * anyway.
+ */
+ register_filesystem(&proc_fs_type);
+}
+
+static int proc_root_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ generic_fillattr(&init_user_ns, d_inode(path->dentry), stat);
+ stat->nlink = proc_root.nlink + nr_processes();
+ return 0;
+}
+
+static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
+{
+ if (!proc_pid_lookup(dentry, flags))
+ return NULL;
+
+ return proc_lookup(dir, dentry, flags);
+}
+
+static int proc_root_readdir(struct file *file, struct dir_context *ctx)
+{
+ if (ctx->pos < FIRST_PROCESS_ENTRY) {
+ int error = proc_readdir(file, ctx);
+ if (unlikely(error <= 0))
+ return error;
+ ctx->pos = FIRST_PROCESS_ENTRY;
+ }
+
+ return proc_pid_readdir(file, ctx);
+}
+
+/*
+ * The root /proc directory is special, as it has the
+ * <pid> directories. Thus we don't use the generic
+ * directory handling functions for that..
+ */
+static const struct file_operations proc_root_operations = {
+ .read = generic_read_dir,
+ .iterate_shared = proc_root_readdir,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * proc root can do almost nothing..
+ */
+static const struct inode_operations proc_root_inode_operations = {
+ .lookup = proc_root_lookup,
+ .getattr = proc_root_getattr,
+};
+
+/*
+ * This is the root "inode" in the /proc tree..
+ */
+struct proc_dir_entry proc_root = {
+ .low_ino = PROC_ROOT_INO,
+ .namelen = 5,
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ .nlink = 2,
+ .refcnt = REFCOUNT_INIT(1),
+ .proc_iops = &proc_root_inode_operations,
+ .proc_dir_ops = &proc_root_operations,
+ .parent = &proc_root,
+ .subdir = RB_ROOT,
+ .name = "/proc",
+};
diff --git a/fs/proc/self.c b/fs/proc/self.c
new file mode 100644
index 000000000..72cd69bca
--- /dev/null
+++ b/fs/proc/self.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+/*
+ * /proc/self:
+ */
+static const char *proc_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
+ pid_t tgid = task_tgid_nr_ns(current, ns);
+ char *name;
+
+ if (!tgid)
+ return ERR_PTR(-ENOENT);
+ /* max length of unsigned int in decimal + NULL term */
+ name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
+ sprintf(name, "%u", tgid);
+ set_delayed_call(done, kfree_link, name);
+ return name;
+}
+
+static const struct inode_operations proc_self_inode_operations = {
+ .get_link = proc_self_get_link,
+};
+
+static unsigned self_inum __ro_after_init;
+
+int proc_setup_self(struct super_block *s)
+{
+ struct inode *root_inode = d_inode(s->s_root);
+ struct proc_fs_info *fs_info = proc_sb_info(s);
+ struct dentry *self;
+ int ret = -ENOMEM;
+
+ inode_lock(root_inode);
+ self = d_alloc_name(s->s_root, "self");
+ if (self) {
+ struct inode *inode = new_inode(s);
+ if (inode) {
+ inode->i_ino = self_inum;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_op = &proc_self_inode_operations;
+ d_add(self, inode);
+ ret = 0;
+ } else {
+ dput(self);
+ }
+ }
+ inode_unlock(root_inode);
+
+ if (ret)
+ pr_err("proc_fill_super: can't allocate /proc/self\n");
+ else
+ fs_info->proc_self = self;
+
+ return ret;
+}
+
+void __init proc_self_init(void)
+{
+ proc_alloc_inum(&self_inum);
+}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
new file mode 100644
index 000000000..f4616083f
--- /dev/null
+++ b/fs/proc/softirqs.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * /proc/softirqs ... display the number of softirqs
+ */
+static int show_softirqs(struct seq_file *p, void *v)
+{
+ int i, j;
+
+ seq_puts(p, " ");
+ for_each_possible_cpu(i)
+ seq_printf(p, "CPU%-8d", i);
+ seq_putc(p, '\n');
+
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ seq_printf(p, "%12s:", softirq_to_name[i]);
+ for_each_possible_cpu(j)
+ seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+ seq_putc(p, '\n');
+ }
+ return 0;
+}
+
+static int __init proc_softirqs_init(void)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("softirqs", 0, NULL, show_softirqs);
+ pde_make_permanent(pde);
+ return 0;
+}
+fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
new file mode 100644
index 000000000..4fb8729a6
--- /dev/null
+++ b/fs/proc/stat.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpumask.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/sched/stat.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/time_namespace.h>
+#include <linux/irqnr.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
+#ifdef arch_idle_time
+
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+{
+ u64 idle;
+
+ idle = kcs->cpustat[CPUTIME_IDLE];
+ if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+ idle += arch_idle_time(cpu);
+ return idle;
+}
+
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
+{
+ u64 iowait;
+
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
+ if (cpu_online(cpu) && nr_iowait_cpu(cpu))
+ iowait += arch_idle_time(cpu);
+ return iowait;
+}
+
+#else
+
+u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
+{
+ u64 idle, idle_usecs = -1ULL;
+
+ if (cpu_online(cpu))
+ idle_usecs = get_cpu_idle_time_us(cpu, NULL);
+
+ if (idle_usecs == -1ULL)
+ /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
+ idle = kcs->cpustat[CPUTIME_IDLE];
+ else
+ idle = idle_usecs * NSEC_PER_USEC;
+
+ return idle;
+}
+
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
+{
+ u64 iowait, iowait_usecs = -1ULL;
+
+ if (cpu_online(cpu))
+ iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
+
+ if (iowait_usecs == -1ULL)
+ /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
+ else
+ iowait = iowait_usecs * NSEC_PER_USEC;
+
+ return iowait;
+}
+
+#endif
+
+static void show_irq_gap(struct seq_file *p, unsigned int gap)
+{
+ static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+ while (gap > 0) {
+ unsigned int inc;
+
+ inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
+ seq_write(p, zeros, 2 * inc);
+ gap -= inc;
+ }
+}
+
+static void show_all_irqs(struct seq_file *p)
+{
+ unsigned int i, next = 0;
+
+ for_each_active_irq(i) {
+ show_irq_gap(p, i - next);
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+ next = i + 1;
+ }
+ show_irq_gap(p, nr_irqs - next);
+}
+
+static int show_stat(struct seq_file *p, void *v)
+{
+ int i, j;
+ u64 user, nice, system, idle, iowait, irq, softirq, steal;
+ u64 guest, guest_nice;
+ u64 sum = 0;
+ u64 sum_softirq = 0;
+ unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
+ struct timespec64 boottime;
+
+ user = nice = system = idle = iowait =
+ irq = softirq = steal = 0;
+ guest = guest_nice = 0;
+ getboottime64(&boottime);
+ /* shift boot timestamp according to the timens offset */
+ timens_sub_boottime(&boottime);
+
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ nice += cpustat[CPUTIME_NICE];
+ system += cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(&kcpustat, i);
+ iowait += get_iowait_time(&kcpustat, i);
+ irq += cpustat[CPUTIME_IRQ];
+ softirq += cpustat[CPUTIME_SOFTIRQ];
+ steal += cpustat[CPUTIME_STEAL];
+ guest += cpustat[CPUTIME_GUEST];
+ guest_nice += cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
+ sum += arch_irq_stat_cpu(i);
+
+ for (j = 0; j < NR_SOFTIRQS; j++) {
+ unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
+
+ per_softirq_sums[j] += softirq_stat;
+ sum_softirq += softirq_stat;
+ }
+ }
+ sum += arch_irq_stat();
+
+ seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
+
+ for_each_online_cpu(i) {
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
+ user = cpustat[CPUTIME_USER];
+ nice = cpustat[CPUTIME_NICE];
+ system = cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(&kcpustat, i);
+ iowait = get_iowait_time(&kcpustat, i);
+ irq = cpustat[CPUTIME_IRQ];
+ softirq = cpustat[CPUTIME_SOFTIRQ];
+ steal = cpustat[CPUTIME_STEAL];
+ guest = cpustat[CPUTIME_GUEST];
+ guest_nice = cpustat[CPUTIME_GUEST_NICE];
+ seq_printf(p, "cpu%d", i);
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
+ }
+ seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
+
+ show_all_irqs(p);
+
+ seq_printf(p,
+ "\nctxt %llu\n"
+ "btime %llu\n"
+ "processes %lu\n"
+ "procs_running %u\n"
+ "procs_blocked %u\n",
+ nr_context_switches(),
+ (unsigned long long)boottime.tv_sec,
+ total_forks,
+ nr_running(),
+ nr_iowait());
+
+ seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
+
+ for (i = 0; i < NR_SOFTIRQS; i++)
+ seq_put_decimal_ull(p, " ", per_softirq_sums[i]);
+ seq_putc(p, '\n');
+
+ return 0;
+}
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+ unsigned int size = 1024 + 128 * num_online_cpus();
+
+ /* minimum size to display an interrupt count : 2 bytes */
+ size += 2 * nr_irqs;
+ return single_open_size(file, show_stat, NULL, size);
+}
+
+static const struct proc_ops stat_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = stat_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+};
+
+static int __init proc_stat_init(void)
+{
+ proc_create("stat", 0, NULL, &stat_proc_ops);
+ return 0;
+}
+fs_initcall(proc_stat_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
new file mode 100644
index 000000000..a954305fb
--- /dev/null
+++ b/fs/proc/task_mmu.c
@@ -0,0 +1,2026 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pagewalk.h>
+#include <linux/mm_inline.h>
+#include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/sched/mm.h>
+#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
+#include <linux/uaccess.h>
+#include <linux/pkeys.h>
+
+#include <asm/elf.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+#define SEQ_PUT_DEC(str, val) \
+ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
+void task_mem(struct seq_file *m, struct mm_struct *mm)
+{
+ unsigned long text, lib, swap, anon, file, shmem;
+ unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+ anon = get_mm_counter(mm, MM_ANONPAGES);
+ file = get_mm_counter(mm, MM_FILEPAGES);
+ shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+
+ /*
+ * Note: to minimize their overhead, mm maintains hiwater_vm and
+ * hiwater_rss only when about to *lower* total_vm or rss. Any
+ * collector of these hiwater stats must therefore get total_vm
+ * and rss too, which will usually be the higher. Barriers? not
+ * worth the effort, such snapshots can always be inconsistent.
+ */
+ hiwater_vm = total_vm = mm->total_vm;
+ if (hiwater_vm < mm->hiwater_vm)
+ hiwater_vm = mm->hiwater_vm;
+ hiwater_rss = total_rss = anon + file + shmem;
+ if (hiwater_rss < mm->hiwater_rss)
+ hiwater_rss = mm->hiwater_rss;
+
+ /* split executable areas between text and lib */
+ text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
+ text = min(text, mm->exec_vm << PAGE_SHIFT);
+ lib = (mm->exec_vm << PAGE_SHIFT) - text;
+
+ swap = get_mm_counter(mm, MM_SWAPENTS);
+ SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
+ SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
+ SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
+ SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
+ SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
+ SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
+ SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
+ SEQ_PUT_DEC(" kB\nRssFile:\t", file);
+ SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
+ SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
+ SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmExe:\t", text >> 10, 8);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmLib:\t", lib >> 10, 8);
+ seq_put_decimal_ull_width(m,
+ " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
+ SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
+ seq_puts(m, " kB\n");
+ hugetlb_report_usage(m, mm);
+}
+#undef SEQ_PUT_DEC
+
+unsigned long task_vsize(struct mm_struct *mm)
+{
+ return PAGE_SIZE * mm->total_vm;
+}
+
+unsigned long task_statm(struct mm_struct *mm,
+ unsigned long *shared, unsigned long *text,
+ unsigned long *data, unsigned long *resident)
+{
+ *shared = get_mm_counter(mm, MM_FILEPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
+ >> PAGE_SHIFT;
+ *data = mm->data_vm + mm->stack_vm;
+ *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+ return mm->total_vm;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Save get_task_policy() for show_numa_map().
+ */
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+ struct task_struct *task = priv->task;
+
+ task_lock(task);
+ priv->task_mempolicy = get_task_policy(task);
+ mpol_get(priv->task_mempolicy);
+ task_unlock(task);
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+ mpol_put(priv->task_mempolicy);
+}
+#else
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+#endif
+
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -2UL;
+ vma = get_gate_vma(priv->mm);
+ }
+
+ return vma;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
+{
+ struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
+ struct mm_struct *mm;
+
+ /* See m_next(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
+ return NULL;
+
+ priv->task = get_proc_task(priv->inode);
+ if (!priv->task)
+ return ERR_PTR(-ESRCH);
+
+ mm = priv->mm;
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
+ return NULL;
+ }
+
+ if (mmap_read_lock_killable(mm)) {
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
+ return ERR_PTR(-EINTR);
+ }
+
+ vma_iter_init(&priv->iter, mm, last_addr);
+ hold_task_mempolicy(priv);
+ if (last_addr == -2UL)
+ return get_gate_vma(mm);
+
+ return proc_get_vma(priv, ppos);
+}
+
+static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+ if (*ppos == -2UL) {
+ *ppos = -1UL;
+ return NULL;
+ }
+ return proc_get_vma(m->private, ppos);
+}
+
+static void m_stop(struct seq_file *m, void *v)
+{
+ struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
+
+ if (!priv->task)
+ return;
+
+ release_task_mempolicy(priv);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
+}
+
+static int proc_maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops, int psize)
+{
+ struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
+
+ if (!priv)
+ return -ENOMEM;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
+
+ seq_release_private(inode, file);
+ return err;
+ }
+
+ return 0;
+}
+
+static int proc_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ return seq_release_private(inode, file);
+}
+
+static int do_maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
+{
+ return proc_maps_open(inode, file, ops,
+ sizeof(struct proc_maps_private));
+}
+
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct vm_area_struct *vma)
+{
+ /*
+ * We make no effort to guess what a given thread considers to be
+ * its "stack". It's not even well-defined for programs written
+ * languages like Go.
+ */
+ return vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
+}
+
+static void show_vma_header_prefix(struct seq_file *m,
+ unsigned long start, unsigned long end,
+ vm_flags_t flags, unsigned long long pgoff,
+ dev_t dev, unsigned long ino)
+{
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_put_hex_ll(m, NULL, start, 8);
+ seq_put_hex_ll(m, "-", end, 8);
+ seq_putc(m, ' ');
+ seq_putc(m, flags & VM_READ ? 'r' : '-');
+ seq_putc(m, flags & VM_WRITE ? 'w' : '-');
+ seq_putc(m, flags & VM_EXEC ? 'x' : '-');
+ seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
+ seq_put_hex_ll(m, " ", pgoff, 8);
+ seq_put_hex_ll(m, " ", MAJOR(dev), 2);
+ seq_put_hex_ll(m, ":", MINOR(dev), 2);
+ seq_put_decimal_ull(m, " ", ino);
+ seq_putc(m, ' ');
+}
+
+static void
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct file *file = vma->vm_file;
+ vm_flags_t flags = vma->vm_flags;
+ unsigned long ino = 0;
+ unsigned long long pgoff = 0;
+ unsigned long start, end;
+ dev_t dev = 0;
+ const char *name = NULL;
+
+ if (file) {
+ struct inode *inode = file_inode(vma->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
+ }
+
+ start = vma->vm_start;
+ end = vma->vm_end;
+ show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
+
+ /*
+ * Print the dentry name for named mappings, and a
+ * special [heap] marker for the heap:
+ */
+ if (file) {
+ seq_pad(m, ' ');
+ seq_file_path(m, file, "\n");
+ goto done;
+ }
+
+ if (vma->vm_ops && vma->vm_ops->name) {
+ name = vma->vm_ops->name(vma);
+ if (name)
+ goto done;
+ }
+
+ name = arch_vma_name(vma);
+ if (!name) {
+ struct anon_vma_name *anon_name;
+
+ if (!mm) {
+ name = "[vdso]";
+ goto done;
+ }
+
+ if (vma->vm_start <= mm->brk &&
+ vma->vm_end >= mm->start_brk) {
+ name = "[heap]";
+ goto done;
+ }
+
+ if (is_stack(vma)) {
+ name = "[stack]";
+ goto done;
+ }
+
+ anon_name = anon_vma_name(vma);
+ if (anon_name) {
+ seq_pad(m, ' ');
+ seq_printf(m, "[anon:%s]", anon_name->name);
+ }
+ }
+
+done:
+ if (name) {
+ seq_pad(m, ' ');
+ seq_puts(m, name);
+ }
+ seq_putc(m, '\n');
+}
+
+static int show_map(struct seq_file *m, void *v)
+{
+ show_map_vma(m, v);
+ return 0;
+}
+
+static const struct seq_operations proc_pid_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_map
+};
+
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_map_release,
+};
+
+/*
+ * Proportional Set Size(PSS): my share of RSS.
+ *
+ * PSS of a process is the count of pages it has in memory, where each
+ * page is divided by the number of processes sharing it. So if a
+ * process has 1000 pages all to itself, and 1000 shared with one other
+ * process, its PSS will be 1500.
+ *
+ * To keep (accumulated) division errors low, we adopt a 64bit
+ * fixed-point pss counter to minimize division errors. So (pss >>
+ * PSS_SHIFT) would be the real byte count.
+ *
+ * A shift of 12 before division means (assuming 4K page size):
+ * - 1M 3-user-pages add up to 8KB errors;
+ * - supports mapcount up to 2^24, or 16M;
+ * - supports PSS up to 2^52 bytes, or 4PB.
+ */
+#define PSS_SHIFT 12
+
+#ifdef CONFIG_PROC_PAGE_MONITOR
+struct mem_size_stats {
+ unsigned long resident;
+ unsigned long shared_clean;
+ unsigned long shared_dirty;
+ unsigned long private_clean;
+ unsigned long private_dirty;
+ unsigned long referenced;
+ unsigned long anonymous;
+ unsigned long lazyfree;
+ unsigned long anonymous_thp;
+ unsigned long shmem_thp;
+ unsigned long file_thp;
+ unsigned long swap;
+ unsigned long shared_hugetlb;
+ unsigned long private_hugetlb;
+ u64 pss;
+ u64 pss_anon;
+ u64 pss_file;
+ u64 pss_shmem;
+ u64 pss_dirty;
+ u64 pss_locked;
+ u64 swap_pss;
+};
+
+static void smaps_page_accumulate(struct mem_size_stats *mss,
+ struct page *page, unsigned long size, unsigned long pss,
+ bool dirty, bool locked, bool private)
+{
+ mss->pss += pss;
+
+ if (PageAnon(page))
+ mss->pss_anon += pss;
+ else if (PageSwapBacked(page))
+ mss->pss_shmem += pss;
+ else
+ mss->pss_file += pss;
+
+ if (locked)
+ mss->pss_locked += pss;
+
+ if (dirty || PageDirty(page)) {
+ mss->pss_dirty += pss;
+ if (private)
+ mss->private_dirty += size;
+ else
+ mss->shared_dirty += size;
+ } else {
+ if (private)
+ mss->private_clean += size;
+ else
+ mss->shared_clean += size;
+ }
+}
+
+static void smaps_account(struct mem_size_stats *mss, struct page *page,
+ bool compound, bool young, bool dirty, bool locked,
+ bool migration)
+{
+ int i, nr = compound ? compound_nr(page) : 1;
+ unsigned long size = nr * PAGE_SIZE;
+
+ /*
+ * First accumulate quantities that depend only on |size| and the type
+ * of the compound page.
+ */
+ if (PageAnon(page)) {
+ mss->anonymous += size;
+ if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ mss->lazyfree += size;
+ }
+
+ mss->resident += size;
+ /* Accumulate the size in pages that have been accessed. */
+ if (young || page_is_young(page) || PageReferenced(page))
+ mss->referenced += size;
+
+ /*
+ * Then accumulate quantities that may depend on sharing, or that may
+ * differ page-by-page.
+ *
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page mapped with PTE it would elevate
+ * page_count().
+ *
+ * The page_mapcount() is called to get a snapshot of the mapcount.
+ * Without holding the page lock this snapshot can be slightly wrong as
+ * we cannot always read the mapcount atomically. It is not safe to
+ * call page_mapcount() even with PTL held if the page is not mapped,
+ * especially for migration entries. Treat regular migration entries
+ * as mapcount == 1.
+ */
+ if ((page_count(page) == 1) || migration) {
+ smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
+ locked, true);
+ return;
+ }
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+ unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+ if (mapcount >= 2)
+ pss /= mapcount;
+ smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
+ mapcount < 2);
+ }
+}
+
+#ifdef CONFIG_SHMEM
+static int smaps_pte_hole(unsigned long addr, unsigned long end,
+ __always_unused int depth, struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+ linear_page_index(vma, addr),
+ linear_page_index(vma, end));
+
+ return 0;
+}
+#else
+#define smaps_pte_hole NULL
+#endif /* CONFIG_SHMEM */
+
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+ if (walk->ops->pte_hole) {
+ /* depth is not used */
+ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+ }
+#endif
+}
+
+static void smaps_pte_entry(pte_t *pte, unsigned long addr,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
+ struct page *page = NULL;
+ bool migration = false, young = false, dirty = false;
+
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ young = pte_young(*pte);
+ dirty = pte_dirty(*pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
+ mss->swap += PAGE_SIZE;
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_pfn_swap_entry(swpent)) {
+ if (is_migration_entry(swpent))
+ migration = true;
+ page = pfn_swap_entry_to_page(swpent);
+ }
+ } else {
+ smaps_pte_hole_lookup(addr, walk);
+ return;
+ }
+
+ if (!page)
+ return;
+
+ smaps_account(mss, page, false, young, dirty, locked, migration);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
+ struct page *page = NULL;
+ bool migration = false;
+
+ if (pmd_present(*pmd)) {
+ /* FOLL_DUMP will return -EFAULT on huge zero page */
+ page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
+ swp_entry_t entry = pmd_to_swp_entry(*pmd);
+
+ if (is_migration_entry(entry)) {
+ migration = true;
+ page = pfn_swap_entry_to_page(entry);
+ }
+ }
+ if (IS_ERR_OR_NULL(page))
+ return;
+ if (PageAnon(page))
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
+ else if (PageSwapBacked(page))
+ mss->shmem_thp += HPAGE_PMD_SIZE;
+ else if (is_zone_device_page(page))
+ /* pass */;
+ else
+ mss->file_thp += HPAGE_PMD_SIZE;
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, migration);
+}
+#else
+static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
+ struct mm_walk *walk)
+{
+}
+#endif
+
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ smaps_pmd_entry(pmd, addr, walk);
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ goto out;
+ /*
+ * The mmap_lock held all the way back in m_start() is what
+ * keeps khugepaged out of here and from collapsing things
+ * in here.
+ */
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE)
+ smaps_pte_entry(pte, addr, walk);
+ pte_unmap_unlock(pte - 1, ptl);
+out:
+ cond_resched();
+ return 0;
+}
+
+static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
+{
+ /*
+ * Don't forget to update Documentation/ on changes.
+ */
+ static const char mnemonics[BITS_PER_LONG][2] = {
+ /*
+ * In case if we meet a flag we don't know about.
+ */
+ [0 ... (BITS_PER_LONG-1)] = "??",
+
+ [ilog2(VM_READ)] = "rd",
+ [ilog2(VM_WRITE)] = "wr",
+ [ilog2(VM_EXEC)] = "ex",
+ [ilog2(VM_SHARED)] = "sh",
+ [ilog2(VM_MAYREAD)] = "mr",
+ [ilog2(VM_MAYWRITE)] = "mw",
+ [ilog2(VM_MAYEXEC)] = "me",
+ [ilog2(VM_MAYSHARE)] = "ms",
+ [ilog2(VM_GROWSDOWN)] = "gd",
+ [ilog2(VM_PFNMAP)] = "pf",
+ [ilog2(VM_LOCKED)] = "lo",
+ [ilog2(VM_IO)] = "io",
+ [ilog2(VM_SEQ_READ)] = "sr",
+ [ilog2(VM_RAND_READ)] = "rr",
+ [ilog2(VM_DONTCOPY)] = "dc",
+ [ilog2(VM_DONTEXPAND)] = "de",
+ [ilog2(VM_ACCOUNT)] = "ac",
+ [ilog2(VM_NORESERVE)] = "nr",
+ [ilog2(VM_HUGETLB)] = "ht",
+ [ilog2(VM_SYNC)] = "sf",
+ [ilog2(VM_ARCH_1)] = "ar",
+ [ilog2(VM_WIPEONFORK)] = "wf",
+ [ilog2(VM_DONTDUMP)] = "dd",
+#ifdef CONFIG_ARM64_BTI
+ [ilog2(VM_ARM64_BTI)] = "bt",
+#endif
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ [ilog2(VM_SOFTDIRTY)] = "sd",
+#endif
+ [ilog2(VM_MIXEDMAP)] = "mm",
+ [ilog2(VM_HUGEPAGE)] = "hg",
+ [ilog2(VM_NOHUGEPAGE)] = "nh",
+ [ilog2(VM_MERGEABLE)] = "mg",
+ [ilog2(VM_UFFD_MISSING)]= "um",
+ [ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+#endif
+#ifdef CONFIG_ARCH_HAS_PKEYS
+ /* These come out via ProtectionKey: */
+ [ilog2(VM_PKEY_BIT0)] = "",
+ [ilog2(VM_PKEY_BIT1)] = "",
+ [ilog2(VM_PKEY_BIT2)] = "",
+ [ilog2(VM_PKEY_BIT3)] = "",
+#if VM_PKEY_BIT4
+ [ilog2(VM_PKEY_BIT4)] = "",
+#endif
+#endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+ [ilog2(VM_UFFD_MINOR)] = "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+ };
+ size_t i;
+
+ seq_puts(m, "VmFlags: ");
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ if (!mnemonics[i][0])
+ continue;
+ if (vma->vm_flags & (1UL << i)) {
+ seq_putc(m, mnemonics[i][0]);
+ seq_putc(m, mnemonics[i][1]);
+ seq_putc(m, ' ');
+ }
+ }
+ seq_putc(m, '\n');
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page = NULL;
+
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+ if (is_pfn_swap_entry(swpent))
+ page = pfn_swap_entry_to_page(swpent);
+ }
+ if (page) {
+ if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
+ mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
+ return 0;
+}
+#else
+#define smaps_hugetlb_range NULL
+#endif /* HUGETLB_PAGE */
+
+static const struct mm_walk_ops smaps_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+};
+
+static const struct mm_walk_ops smaps_shmem_walk_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .pte_hole = smaps_pte_hole,
+};
+
+/*
+ * Gather mem stats from @vma with the indicated beginning
+ * address @start, and keep them in @mss.
+ *
+ * Use vm_start of @vma as the beginning address if @start is 0.
+ */
+static void smap_gather_stats(struct vm_area_struct *vma,
+ struct mem_size_stats *mss, unsigned long start)
+{
+ const struct mm_walk_ops *ops = &smaps_walk_ops;
+
+ /* Invalid start */
+ if (start >= vma->vm_end)
+ return;
+
+#ifdef CONFIG_SHMEM
+ if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
+ /*
+ * For shared or readonly shmem mappings we know that all
+ * swapped out pages belong to the shmem object, and we can
+ * obtain the swap value much more efficiently. For private
+ * writable mappings, we might have COW pages that are
+ * not affected by the parent swapped out pages of the shmem
+ * object, so we have to distinguish them during the page walk.
+ * Unless we know that the shmem object (or the part mapped by
+ * our VMA) has no swapped out pages at all.
+ */
+ unsigned long shmem_swapped = shmem_swap_usage(vma);
+
+ if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE))) {
+ mss->swap += shmem_swapped;
+ } else {
+ ops = &smaps_shmem_walk_ops;
+ }
+ }
+#endif
+ /* mmap_lock is held in m_start */
+ if (!start)
+ walk_page_vma(vma, ops, mss);
+ else
+ walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
+}
+
+#define SEQ_PUT_DEC(str, val) \
+ seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
+
+/* Show the contents common for smaps and smaps_rollup */
+static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
+ bool rollup_mode)
+{
+ SEQ_PUT_DEC("Rss: ", mss->resident);
+ SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
+ if (rollup_mode) {
+ /*
+ * These are meaningful only for smaps_rollup, otherwise two of
+ * them are zero, and the other one is the same as Pss.
+ */
+ SEQ_PUT_DEC(" kB\nPss_Anon: ",
+ mss->pss_anon >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_File: ",
+ mss->pss_file >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Shmem: ",
+ mss->pss_shmem >> PSS_SHIFT);
+ }
+ SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
+ SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
+ SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
+ SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
+ SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
+ SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
+ SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
+ SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
+ SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
+ SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
+ seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
+ mss->private_hugetlb >> 10, 7);
+ SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
+ SEQ_PUT_DEC(" kB\nSwapPss: ",
+ mss->swap_pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nLocked: ",
+ mss->pss_locked >> PSS_SHIFT);
+ seq_puts(m, " kB\n");
+}
+
+static int show_smap(struct seq_file *m, void *v)
+{
+ struct vm_area_struct *vma = v;
+ struct mem_size_stats mss;
+
+ memset(&mss, 0, sizeof(mss));
+
+ smap_gather_stats(vma, &mss, 0);
+
+ show_map_vma(m, vma);
+
+ SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
+ SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
+ SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
+ seq_puts(m, " kB\n");
+
+ __show_smap(m, &mss, false);
+
+ seq_printf(m, "THPeligible: %d\n",
+ hugepage_vma_check(vma, vma->vm_flags, true, false, true));
+
+ if (arch_pkeys_enabled())
+ seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
+ show_smap_vma_flags(m, vma);
+
+ return 0;
+}
+
+static int show_smaps_rollup(struct seq_file *m, void *v)
+{
+ struct proc_maps_private *priv = m->private;
+ struct mem_size_stats mss;
+ struct mm_struct *mm = priv->mm;
+ struct vm_area_struct *vma;
+ unsigned long vma_start = 0, last_vma_end = 0;
+ int ret = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+
+ priv->task = get_proc_task(priv->inode);
+ if (!priv->task)
+ return -ESRCH;
+
+ if (!mm || !mmget_not_zero(mm)) {
+ ret = -ESRCH;
+ goto out_put_task;
+ }
+
+ memset(&mss, 0, sizeof(mss));
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_put_mm;
+
+ hold_task_mempolicy(priv);
+ vma = mas_find(&mas, ULONG_MAX);
+
+ if (unlikely(!vma))
+ goto empty_set;
+
+ vma_start = vma->vm_start;
+ do {
+ smap_gather_stats(vma, &mss, 0);
+ last_vma_end = vma->vm_end;
+
+ /*
+ * Release mmap_lock temporarily if someone wants to
+ * access it for write request.
+ */
+ if (mmap_lock_is_contended(mm)) {
+ mas_pause(&mas);
+ mmap_read_unlock(mm);
+ ret = mmap_read_lock_killable(mm);
+ if (ret) {
+ release_task_mempolicy(priv);
+ goto out_put_mm;
+ }
+
+ /*
+ * After dropping the lock, there are four cases to
+ * consider. See the following example for explanation.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * Suppose we drop the lock after reading VMA2 due to
+ * contention, then we get:
+ *
+ * last_vma_end = 16k
+ *
+ * 1) VMA2 is freed, but VMA3 exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA3.
+ * In this case, just continue from VMA3.
+ *
+ * 2) VMA2 still exists:
+ *
+ * find_vma(mm, 16k - 1) will return VMA2.
+ * Iterate the loop like the original one.
+ *
+ * 3) No more VMAs can be found:
+ *
+ * find_vma(mm, 16k - 1) will return NULL.
+ * No more things to do, just break.
+ *
+ * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+ *
+ * find_vma(mm, 16k - 1) will return VMA' whose range
+ * contains last_vma_end.
+ * Iterate VMA' from last_vma_end.
+ */
+ vma = mas_find(&mas, ULONG_MAX);
+ /* Case 3 above */
+ if (!vma)
+ break;
+
+ /* Case 1 above */
+ if (vma->vm_start >= last_vma_end)
+ continue;
+
+ /* Case 4 above */
+ if (vma->vm_end > last_vma_end)
+ smap_gather_stats(vma, &mss, last_vma_end);
+ }
+ /* Case 2 above */
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+
+empty_set:
+ show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
+ seq_pad(m, ' ');
+ seq_puts(m, "[rollup]\n");
+
+ __show_smap(m, &mss, true);
+
+ release_task_mempolicy(priv);
+ mmap_read_unlock(mm);
+
+out_put_mm:
+ mmput(mm);
+out_put_task:
+ put_task_struct(priv->task);
+ priv->task = NULL;
+
+ return ret;
+}
+#undef SEQ_PUT_DEC
+
+static const struct seq_operations proc_pid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_smap
+};
+
+static int pid_smaps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+
+static int smaps_rollup_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct proc_maps_private *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
+ if (!priv)
+ return -ENOMEM;
+
+ ret = single_open(file, show_smaps_rollup, priv);
+ if (ret)
+ goto out_free;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ ret = PTR_ERR(priv->mm);
+
+ single_release(inode, file);
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ kfree(priv);
+ return ret;
+}
+
+static int smaps_rollup_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ kfree(priv);
+ return single_release(inode, file);
+}
+
+const struct file_operations proc_pid_smaps_operations = {
+ .open = pid_smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_map_release,
+};
+
+const struct file_operations proc_pid_smaps_rollup_operations = {
+ .open = smaps_rollup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = smaps_rollup_release,
+};
+
+enum clear_refs_types {
+ CLEAR_REFS_ALL = 1,
+ CLEAR_REFS_ANON,
+ CLEAR_REFS_MAPPED,
+ CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_MM_HIWATER_RSS,
+ CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+ enum clear_refs_types type;
+};
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+
+static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ if (!pte_write(pte))
+ return false;
+ if (!is_cow_mapping(vma->vm_flags))
+ return false;
+ if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
+ return false;
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ return false;
+ return page_maybe_dma_pinned(page);
+}
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+ /*
+ * The soft-dirty tracker uses #PF-s to catch writes
+ * to pages, so write-protect the pte as well. See the
+ * Documentation/admin-guide/mm/soft-dirty.rst for full description
+ * of how soft-dirty works.
+ */
+ pte_t ptent = *pte;
+
+ if (pte_present(ptent)) {
+ pte_t old_pte;
+
+ if (pte_is_pinned(vma, addr, ptent))
+ return;
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
+ ptent = pte_clear_soft_dirty(ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else if (is_swap_pte(ptent)) {
+ ptent = pte_swp_clear_soft_dirty(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+ }
+}
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+#endif
+
+#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ /* See comment in change_huge_pmd() */
+ old = pmdp_invalidate(vma, addr, pmdp);
+ if (pmd_dirty(old))
+ pmd = pmd_mkdirty(pmd);
+ if (pmd_young(old))
+ pmd = pmd_mkyoung(pmd);
+
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_clear_soft_dirty(pmd);
+
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ pmd = pmd_swp_clear_soft_dirty(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#else
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+}
+#endif
+
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty_pmd(vma, addr, pmd);
+ goto out;
+ }
+
+ if (!pmd_present(*pmd))
+ goto out;
+
+ page = pmd_page(*pmd);
+
+ /* Clear accessed and referenced bits. */
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ test_and_clear_page_young(page);
+ ClearPageReferenced(page);
+out:
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty(vma, addr, pte);
+ continue;
+ }
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /* Clear accessed and referenced bits. */
+ ptep_test_and_clear_young(vma, addr, pte);
+ test_and_clear_page_young(page);
+ ClearPageReferenced(page);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ return 0;
+}
+
+static int clear_refs_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+ * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
+ * Writing 4 to /proc/pid/clear_refs affects all pages.
+ */
+ if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
+ return 1;
+ if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ return 1;
+ return 0;
+}
+
+static const struct mm_walk_ops clear_refs_walk_ops = {
+ .pmd_entry = clear_refs_pte_range,
+ .test_walk = clear_refs_test_walk,
+};
+
+static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF];
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ enum clear_refs_types type;
+ int itype;
+ int rv;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+ rv = kstrtoint(strstrip(buffer), 10, &itype);
+ if (rv < 0)
+ return rv;
+ type = (enum clear_refs_types)itype;
+ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
+ return -EINVAL;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+ mm = get_task_mm(task);
+ if (mm) {
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct mmu_notifier_range range;
+ struct clear_refs_private cp = {
+ .type = type,
+ };
+
+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
+ if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ /*
+ * Writing 5 to /proc/pid/clear_refs resets the peak
+ * resident set size to this mm's current rss value.
+ */
+ reset_mm_hiwater_rss(mm);
+ goto out_unlock;
+ }
+
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
+ if (!(vma->vm_flags & VM_SOFTDIRTY))
+ continue;
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+ }
+
+ inc_tlb_flush_pending(mm);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+ 0, NULL, mm, 0, -1UL);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+ walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ mmu_notifier_invalidate_range_end(&range);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
+ }
+out_unlock:
+ mmap_write_unlock(mm);
+out_mm:
+ mmput(mm);
+ }
+ put_task_struct(task);
+
+ return count;
+}
+
+const struct file_operations proc_clear_refs_operations = {
+ .write = clear_refs_write,
+ .llseek = noop_llseek,
+};
+
+typedef struct {
+ u64 pme;
+} pagemap_entry_t;
+
+struct pagemapread {
+ int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
+ pagemap_entry_t *buffer;
+ bool show_pfn;
+};
+
+#define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
+
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
+#define PM_END_OF_BUFFER 1
+
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
+{
+ return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
+}
+
+static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
+ struct pagemapread *pm)
+{
+ pm->buffer[pm->pos++] = *pme;
+ if (pm->pos >= pm->len)
+ return PM_END_OF_BUFFER;
+ return 0;
+}
+
+static int pagemap_pte_hole(unsigned long start, unsigned long end,
+ __always_unused int depth, struct mm_walk *walk)
+{
+ struct pagemapread *pm = walk->private;
+ unsigned long addr = start;
+ int err = 0;
+
+ while (addr < end) {
+ struct vm_area_struct *vma = find_vma(walk->mm, addr);
+ pagemap_entry_t pme = make_pme(0, 0);
+ /* End of address space hole, which we mark as non-present. */
+ unsigned long hole_end;
+
+ if (vma)
+ hole_end = min(end, vma->vm_start);
+ else
+ hole_end = end;
+
+ for (; addr < hole_end; addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ goto out;
+ }
+
+ if (!vma)
+ break;
+
+ /* Addresses in the VMA. */
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ pme = make_pme(0, PM_SOFT_DIRTY);
+ for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ goto out;
+ }
+ }
+out:
+ return err;
+}
+
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
+ struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+{
+ u64 frame = 0, flags = 0;
+ struct page *page = NULL;
+ bool migration = false;
+
+ if (pte_present(pte)) {
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
+ page = vm_normal_page(vma, addr, pte);
+ if (pte_soft_dirty(pte))
+ flags |= PM_SOFT_DIRTY;
+ if (pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+ } else if (is_swap_pte(pte)) {
+ swp_entry_t entry;
+ if (pte_swp_soft_dirty(pte))
+ flags |= PM_SOFT_DIRTY;
+ if (pte_swp_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+ entry = pte_to_swp_entry(pte);
+ if (pm->show_pfn) {
+ pgoff_t offset;
+ /*
+ * For PFN swap offsets, keeping the offset field
+ * to be PFN only to be compatible with old smaps.
+ */
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
+ flags |= PM_SWAP;
+ migration = is_migration_entry(entry);
+ if (is_pfn_swap_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ if (pte_marker_entry_uffd_wp(entry))
+ flags |= PM_UFFD_WP;
+ }
+
+ if (page && !PageAnon(page))
+ flags |= PM_FILE;
+ if (page && !migration && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
+
+ return make_pme(frame, flags);
+}
+
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct pagemapread *pm = walk->private;
+ spinlock_t *ptl;
+ pte_t *pte, *orig_pte;
+ int err = 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool migration = false;
+
+ ptl = pmd_trans_huge_lock(pmdp, vma);
+ if (ptl) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
+ struct page *page = NULL;
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
+
+ if (pmd_present(pmd)) {
+ page = pmd_page(pmd);
+
+ flags |= PM_PRESENT;
+ if (pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ else if (is_swap_pmd(pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+ unsigned long offset;
+
+ if (pm->show_pfn) {
+ if (is_pfn_swap_entry(entry))
+ offset = swp_offset_pfn(entry);
+ else
+ offset = swp_offset(entry);
+ offset = offset +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
+ flags |= PM_SWAP;
+ if (pmd_swp_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
+ VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ migration = is_migration_entry(entry);
+ page = pfn_swap_entry_to_page(entry);
+ }
+#endif
+
+ if (page && !migration && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ for (; addr != end; addr += PAGE_SIZE) {
+ pagemap_entry_t pme = make_pme(frame, flags);
+
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ break;
+ if (pm->show_pfn) {
+ if (flags & PM_PRESENT)
+ frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
+ }
+ }
+ spin_unlock(ptl);
+ return err;
+ }
+
+ if (pmd_trans_unstable(pmdp))
+ return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ /*
+ * We can assume that @vma always points to a valid one and @end never
+ * goes beyond vma->vm_end.
+ */
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ pagemap_entry_t pme;
+
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ cond_resched();
+
+ return err;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemapread *pm = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ u64 flags = 0, frame = 0;
+ int err = 0;
+ pte_t pte;
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
+
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+
+ if (!PageAnon(page))
+ flags |= PM_FILE;
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ if (huge_pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte) +
+ ((addr & ~hmask) >> PAGE_SHIFT);
+ } else if (pte_swp_uffd_wp_any(pte)) {
+ flags |= PM_UFFD_WP;
+ }
+
+ for (; addr != end; addr += PAGE_SIZE) {
+ pagemap_entry_t pme = make_pme(frame, flags);
+
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ return err;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
+ }
+
+ cond_resched();
+
+ return err;
+}
+#else
+#define pagemap_hugetlb_range NULL
+#endif /* HUGETLB_PAGE */
+
+static const struct mm_walk_ops pagemap_ops = {
+ .pmd_entry = pagemap_pmd_range,
+ .pte_hole = pagemap_pte_hole,
+ .hugetlb_entry = pagemap_hugetlb_range,
+};
+
+/*
+ * /proc/pid/pagemap - an array mapping virtual pages to pfns
+ *
+ * For each page in the address space, this file contains one 64-bit entry
+ * consisting of the following:
+ *
+ * Bits 0-54 page frame number (PFN) if present
+ * Bits 0-4 swap type if swapped
+ * Bits 5-54 swap offset if swapped
+ * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
+ * Bit 56 page exclusively mapped
+ * Bit 57 pte is uffd-wp write-protected
+ * Bits 58-60 zero
+ * Bit 61 page is file-page or shared-anon
+ * Bit 62 page swapped
+ * Bit 63 page present
+ *
+ * If the page is not present but in swap, then the PFN contains an
+ * encoding of the swap file number and the page's offset into the
+ * swap. Unmapped pages return a null PFN. This allows determining
+ * precisely which pages are mapped (or in swap) and comparing mapped
+ * pages between processes.
+ *
+ * Efficient users of this interface will use /proc/pid/maps to
+ * determine which areas of memory are actually mapped and llseek to
+ * skip over unmapped regions.
+ */
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct mm_struct *mm = file->private_data;
+ struct pagemapread pm;
+ unsigned long src;
+ unsigned long svpfn;
+ unsigned long start_vaddr;
+ unsigned long end_vaddr;
+ int ret = 0, copied = 0;
+
+ if (!mm || !mmget_not_zero(mm))
+ goto out;
+
+ ret = -EINVAL;
+ /* file position must be aligned */
+ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
+ goto out_mm;
+
+ ret = 0;
+ if (!count)
+ goto out_mm;
+
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
+
+ pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+ pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!pm.buffer)
+ goto out_mm;
+
+ src = *ppos;
+ svpfn = src / PM_ENTRY_BYTES;
+ end_vaddr = mm->task_size;
+
+ /* watch out for wraparound */
+ start_vaddr = end_vaddr;
+ if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
+ start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+
+ /* Ensure the address is inside the task */
+ if (start_vaddr > mm->task_size)
+ start_vaddr = end_vaddr;
+
+ /*
+ * The odds are that this will stop walking way
+ * before end_vaddr, because the length of the
+ * user buffer is tracked in "pm", and the walk
+ * will stop when we hit the end of the buffer.
+ */
+ ret = 0;
+ while (count && (start_vaddr < end_vaddr)) {
+ int len;
+ unsigned long end;
+
+ pm.pos = 0;
+ end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
+ /* overflow ? */
+ if (end < start_vaddr || end > end_vaddr)
+ end = end_vaddr;
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+ ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
+ mmap_read_unlock(mm);
+ start_vaddr = end;
+
+ len = min(count, PM_ENTRY_BYTES * pm.pos);
+ if (copy_to_user(buf, pm.buffer, len)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+ copied += len;
+ buf += len;
+ count -= len;
+ }
+ *ppos += copied;
+ if (!ret || ret == PM_END_OF_BUFFER)
+ ret = copied;
+
+out_free:
+ kfree(pm.buffer);
+out_mm:
+ mmput(mm);
+out:
+ return ret;
+}
+
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
+const struct file_operations proc_pagemap_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = pagemap_read,
+ .open = pagemap_open,
+ .release = pagemap_release,
+};
+#endif /* CONFIG_PROC_PAGE_MONITOR */
+
+#ifdef CONFIG_NUMA
+
+struct numa_maps {
+ unsigned long pages;
+ unsigned long anon;
+ unsigned long active;
+ unsigned long writeback;
+ unsigned long mapcount_max;
+ unsigned long dirty;
+ unsigned long swapcache;
+ unsigned long node[MAX_NUMNODES];
+};
+
+struct numa_maps_private {
+ struct proc_maps_private proc_maps;
+ struct numa_maps md;
+};
+
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
+ unsigned long nr_pages)
+{
+ int count = page_mapcount(page);
+
+ md->pages += nr_pages;
+ if (pte_dirty || PageDirty(page))
+ md->dirty += nr_pages;
+
+ if (PageSwapCache(page))
+ md->swapcache += nr_pages;
+
+ if (PageActive(page) || PageUnevictable(page))
+ md->active += nr_pages;
+
+ if (PageWriteback(page))
+ md->writeback += nr_pages;
+
+ if (PageAnon(page))
+ md->anon += nr_pages;
+
+ if (count > md->mapcount_max)
+ md->mapcount_max = count;
+
+ md->node[page_to_nid(page)] += nr_pages;
+}
+
+static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ int nid;
+
+ if (!pte_present(pte))
+ return NULL;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page || is_zone_device_page(page))
+ return NULL;
+
+ if (PageReserved(page))
+ return NULL;
+
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_MEMORY]))
+ return NULL;
+
+ return page;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+ int nid;
+
+ if (!pmd_present(pmd))
+ return NULL;
+
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ return NULL;
+
+ if (PageReserved(page))
+ return NULL;
+
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_MEMORY]))
+ return NULL;
+
+ return page;
+}
+#endif
+
+static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct numa_maps *md = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ spinlock_t *ptl;
+ pte_t *orig_pte;
+ pte_t *pte;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ struct page *page;
+
+ page = can_gather_numa_stats_pmd(*pmd, vma, addr);
+ if (page)
+ gather_stats(page, md, pmd_dirty(*pmd),
+ HPAGE_PMD_SIZE/PAGE_SIZE);
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+#endif
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ do {
+ struct page *page = can_gather_numa_stats(*pte, vma, addr);
+ if (!page)
+ continue;
+ gather_stats(page, md, pte_dirty(*pte), 1);
+
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+ return 0;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+ pte_t huge_pte = huge_ptep_get(pte);
+ struct numa_maps *md;
+ struct page *page;
+
+ if (!pte_present(huge_pte))
+ return 0;
+
+ page = pte_page(huge_pte);
+
+ md = walk->private;
+ gather_stats(page, md, pte_dirty(huge_pte), 1);
+ return 0;
+}
+
+#else
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+ return 0;
+}
+#endif
+
+static const struct mm_walk_ops show_numa_ops = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+};
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+static int show_numa_map(struct seq_file *m, void *v)
+{
+ struct numa_maps_private *numa_priv = m->private;
+ struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
+ struct vm_area_struct *vma = v;
+ struct numa_maps *md = &numa_priv->md;
+ struct file *file = vma->vm_file;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mempolicy *pol;
+ char buffer[64];
+ int nid;
+
+ if (!mm)
+ return 0;
+
+ /* Ensure we start with an empty set of numa_maps statistics. */
+ memset(md, 0, sizeof(*md));
+
+ pol = __get_vma_policy(vma, vma->vm_start);
+ if (pol) {
+ mpol_to_str(buffer, sizeof(buffer), pol);
+ mpol_cond_put(pol);
+ } else {
+ mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
+ }
+
+ seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+ if (file) {
+ seq_puts(m, " file=");
+ seq_file_path(m, file, "\n\t= ");
+ } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_puts(m, " heap");
+ } else if (is_stack(vma)) {
+ seq_puts(m, " stack");
+ }
+
+ if (is_vm_hugetlb_page(vma))
+ seq_puts(m, " huge");
+
+ /* mmap_lock is held by m_start */
+ walk_page_vma(vma, &show_numa_ops, md);
+
+ if (!md->pages)
+ goto out;
+
+ if (md->anon)
+ seq_printf(m, " anon=%lu", md->anon);
+
+ if (md->dirty)
+ seq_printf(m, " dirty=%lu", md->dirty);
+
+ if (md->pages != md->anon && md->pages != md->dirty)
+ seq_printf(m, " mapped=%lu", md->pages);
+
+ if (md->mapcount_max > 1)
+ seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+ if (md->swapcache)
+ seq_printf(m, " swapcache=%lu", md->swapcache);
+
+ if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+ seq_printf(m, " active=%lu", md->active);
+
+ if (md->writeback)
+ seq_printf(m, " writeback=%lu", md->writeback);
+
+ for_each_node_state(nid, N_MEMORY)
+ if (md->node[nid])
+ seq_printf(m, " N%d=%lu", nid, md->node[nid]);
+
+ seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
+out:
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static const struct seq_operations proc_pid_numa_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_numa_map,
+};
+
+static int pid_numa_maps_open(struct inode *inode, struct file *file)
+{
+ return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
+ sizeof(struct numa_maps_private));
+}
+
+const struct file_operations proc_pid_numa_maps_operations = {
+ .open = pid_numa_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_map_release,
+};
+
+#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
new file mode 100644
index 000000000..dc05780f9
--- /dev/null
+++ b/fs/proc/task_nommu.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/fs_struct.h>
+#include <linux/mount.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/sched/mm.h>
+
+#include "internal.h"
+
+/*
+ * Logic: we've got two memory sums for each process, "shared", and
+ * "non-shared". Shared memory may get counted more than once, for
+ * each process that owns it. Non-shared memory is counted
+ * accurately.
+ */
+void task_mem(struct seq_file *m, struct mm_struct *mm)
+{
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma;
+ struct vm_region *region;
+ unsigned long bytes = 0, sbytes = 0, slack = 0, size;
+
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ bytes += kobjsize(vma);
+
+ region = vma->vm_region;
+ if (region) {
+ size = kobjsize(region);
+ size += region->vm_end - region->vm_start;
+ } else {
+ size = vma->vm_end - vma->vm_start;
+ }
+
+ if (atomic_read(&mm->mm_count) > 1 ||
+ vma->vm_flags & VM_MAYSHARE) {
+ sbytes += size;
+ } else {
+ bytes += size;
+ if (region)
+ slack = region->vm_end - vma->vm_end;
+ }
+ }
+
+ if (atomic_read(&mm->mm_count) > 1)
+ sbytes += kobjsize(mm);
+ else
+ bytes += kobjsize(mm);
+
+ if (current->fs && current->fs->users > 1)
+ sbytes += kobjsize(current->fs);
+ else
+ bytes += kobjsize(current->fs);
+
+ if (current->files && atomic_read(&current->files->count) > 1)
+ sbytes += kobjsize(current->files);
+ else
+ bytes += kobjsize(current->files);
+
+ if (current->sighand && refcount_read(&current->sighand->count) > 1)
+ sbytes += kobjsize(current->sighand);
+ else
+ bytes += kobjsize(current->sighand);
+
+ bytes += kobjsize(current); /* includes kernel stack */
+
+ seq_printf(m,
+ "Mem:\t%8lu bytes\n"
+ "Slack:\t%8lu bytes\n"
+ "Shared:\t%8lu bytes\n",
+ bytes, slack, sbytes);
+
+ mmap_read_unlock(mm);
+}
+
+unsigned long task_vsize(struct mm_struct *mm)
+{
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma;
+ unsigned long vsize = 0;
+
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
+ vsize += vma->vm_end - vma->vm_start;
+ mmap_read_unlock(mm);
+ return vsize;
+}
+
+unsigned long task_statm(struct mm_struct *mm,
+ unsigned long *shared, unsigned long *text,
+ unsigned long *data, unsigned long *resident)
+{
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma;
+ struct vm_region *region;
+ unsigned long size = kobjsize(mm);
+
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ size += kobjsize(vma);
+ region = vma->vm_region;
+ if (region) {
+ size += kobjsize(region);
+ size += region->vm_end - region->vm_start;
+ }
+ }
+
+ *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
+ >> PAGE_SHIFT;
+ *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK))
+ >> PAGE_SHIFT;
+ mmap_read_unlock(mm);
+ size >>= PAGE_SHIFT;
+ size += *text + *data;
+ *resident = size;
+ return size;
+}
+
+static int is_stack(struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * We make no effort to guess what a given thread considers to be
+ * its "stack". It's not even well-defined for programs written
+ * languages like Go.
+ */
+ return vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack;
+}
+
+/*
+ * display a single VMA to a sequenced file
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long ino = 0;
+ struct file *file;
+ dev_t dev = 0;
+ int flags;
+ unsigned long long pgoff = 0;
+
+ flags = vma->vm_flags;
+ file = vma->vm_file;
+
+ if (file) {
+ struct inode *inode = file_inode(vma->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ }
+
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_printf(m,
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+ vma->vm_start,
+ vma->vm_end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+ pgoff,
+ MAJOR(dev), MINOR(dev), ino);
+
+ if (file) {
+ seq_pad(m, ' ');
+ seq_file_path(m, file, "");
+ } else if (mm && is_stack(vma)) {
+ seq_pad(m, ' ');
+ seq_puts(m, "[stack]");
+ }
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+/*
+ * display mapping lines for a particular process's /proc/pid/maps
+ */
+static int show_map(struct seq_file *m, void *_p)
+{
+ return nommu_vma_show(m, _p);
+}
+
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -1UL;
+ }
+
+ return vma;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
+{
+ struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
+ struct mm_struct *mm;
+
+ /* See proc_get_vma(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
+ return NULL;
+
+ /* pin the task and mm whilst we play with them */
+ priv->task = get_proc_task(priv->inode);
+ if (!priv->task)
+ return ERR_PTR(-ESRCH);
+
+ mm = priv->mm;
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
+ return NULL;
+ }
+
+ if (mmap_read_lock_killable(mm)) {
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
+ return ERR_PTR(-EINTR);
+ }
+
+ vma_iter_init(&priv->iter, mm, last_addr);
+
+ return proc_get_vma(priv, ppos);
+}
+
+static void m_stop(struct seq_file *m, void *v)
+{
+ struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
+
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
+}
+
+static void *m_next(struct seq_file *m, void *_p, loff_t *ppos)
+{
+ return proc_get_vma(m->private, ppos);
+}
+
+static const struct seq_operations proc_pid_maps_ops = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_map
+};
+
+static int maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
+{
+ struct proc_maps_private *priv;
+
+ priv = __seq_open_private(file, ops, sizeof(*priv));
+ if (!priv)
+ return -ENOMEM;
+
+ priv->inode = inode;
+ priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(priv->mm)) {
+ int err = PTR_ERR(priv->mm);
+
+ seq_release_private(inode, file);
+ return err;
+ }
+
+ return 0;
+}
+
+
+static int map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ if (priv->mm)
+ mmdrop(priv->mm);
+
+ return seq_release_private(inode, file);
+}
+
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+ return maps_open(inode, file, &proc_pid_maps_ops);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = map_release,
+};
+
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
new file mode 100644
index 000000000..a553273fb
--- /dev/null
+++ b/fs/proc/thread_self.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+/*
+ * /proc/thread_self:
+ */
+static const char *proc_thread_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
+ pid_t tgid = task_tgid_nr_ns(current, ns);
+ pid_t pid = task_pid_nr_ns(current, ns);
+ char *name;
+
+ if (!pid)
+ return ERR_PTR(-ENOENT);
+ name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
+ sprintf(name, "%u/task/%u", tgid, pid);
+ set_delayed_call(done, kfree_link, name);
+ return name;
+}
+
+static const struct inode_operations proc_thread_self_inode_operations = {
+ .get_link = proc_thread_self_get_link,
+};
+
+static unsigned thread_self_inum __ro_after_init;
+
+int proc_setup_thread_self(struct super_block *s)
+{
+ struct inode *root_inode = d_inode(s->s_root);
+ struct proc_fs_info *fs_info = proc_sb_info(s);
+ struct dentry *thread_self;
+ int ret = -ENOMEM;
+
+ inode_lock(root_inode);
+ thread_self = d_alloc_name(s->s_root, "thread-self");
+ if (thread_self) {
+ struct inode *inode = new_inode(s);
+ if (inode) {
+ inode->i_ino = thread_self_inum;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_op = &proc_thread_self_inode_operations;
+ d_add(thread_self, inode);
+ ret = 0;
+ } else {
+ dput(thread_self);
+ }
+ }
+ inode_unlock(root_inode);
+
+ if (ret)
+ pr_err("proc_fill_super: can't allocate /proc/thread-self\n");
+ else
+ fs_info->proc_thread_self = thread_self;
+
+ return ret;
+}
+
+void __init proc_thread_self_init(void)
+{
+ proc_alloc_inum(&thread_self_inum);
+}
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
new file mode 100644
index 000000000..b5343d209
--- /dev/null
+++ b/fs/proc/uptime.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
+#include <linux/time_namespace.h>
+#include <linux/kernel_stat.h>
+#include "internal.h"
+
+static int uptime_proc_show(struct seq_file *m, void *v)
+{
+ struct timespec64 uptime;
+ struct timespec64 idle;
+ u64 idle_nsec;
+ u32 rem;
+ int i;
+
+ idle_nsec = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcs;
+
+ kcpustat_cpu_fetch(&kcs, i);
+ idle_nsec += get_idle_time(&kcs, i);
+ }
+
+ ktime_get_boottime_ts64(&uptime);
+ timens_add_boottime(&uptime);
+
+ idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem);
+ idle.tv_nsec = rem;
+ seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+ (unsigned long) uptime.tv_sec,
+ (uptime.tv_nsec / (NSEC_PER_SEC / 100)),
+ (unsigned long) idle.tv_sec,
+ (idle.tv_nsec / (NSEC_PER_SEC / 100)));
+ return 0;
+}
+
+static int __init proc_uptime_init(void)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("uptime", 0, NULL, uptime_proc_show);
+ pde_make_permanent(pde);
+ return 0;
+}
+fs_initcall(proc_uptime_init);
diff --git a/fs/proc/util.c b/fs/proc/util.c
new file mode 100644
index 000000000..98f8adc17
--- /dev/null
+++ b/fs/proc/util.c
@@ -0,0 +1,24 @@
+#include <linux/dcache.h>
+#include "internal.h"
+
+unsigned name_to_int(const struct qstr *qstr)
+{
+ const char *name = qstr->name;
+ int len = qstr->len;
+ unsigned n = 0;
+
+ if (len > 1 && *name == '0')
+ goto out;
+ do {
+ unsigned c = *name++ - '0';
+ if (c > 9)
+ goto out;
+ if (n >= (~0U-9)/10)
+ goto out;
+ n *= 10;
+ n += c;
+ } while (--len > 0);
+ return n;
+out:
+ return ~0U;
+}
diff --git a/fs/proc/version.c b/fs/proc/version.c
new file mode 100644
index 000000000..02e3c3cd4
--- /dev/null
+++ b/fs/proc/version.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/utsname.h>
+#include "internal.h"
+
+static int version_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, linux_proc_banner,
+ utsname()->sysname,
+ utsname()->release,
+ utsname()->version);
+ return 0;
+}
+
+static int __init proc_version_init(void)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("version", 0, NULL, version_proc_show);
+ pde_make_permanent(pde);
+ return 0;
+}
+fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
new file mode 100644
index 000000000..1ec0647a2
--- /dev/null
+++ b/fs/proc/vmcore.c
@@ -0,0 +1,1603 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fs/proc/vmcore.c Interface for accessing the crash
+ * dump from the system's previous life.
+ * Heavily borrowed from fs/proc/kcore.c
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/kcore.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/crash_dump.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/cc_platform.h>
+#include <asm/io.h>
+#include "internal.h"
+
+/* List representing chunks of contiguous memory areas and their offsets in
+ * vmcore file.
+ */
+static LIST_HEAD(vmcore_list);
+
+/* Stores the pointer to the buffer containing kernel elf core headers. */
+static char *elfcorebuf;
+static size_t elfcorebuf_sz;
+static size_t elfcorebuf_sz_orig;
+
+static char *elfnotes_buf;
+static size_t elfnotes_sz;
+/* Size of all notes minus the device dump notes */
+static size_t elfnotes_orig_sz;
+
+/* Total size of vmcore file. */
+static u64 vmcore_size;
+
+static struct proc_dir_entry *proc_vmcore;
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/* Device Dump list and mutex to synchronize access to list */
+static LIST_HEAD(vmcoredd_list);
+static DEFINE_MUTEX(vmcoredd_mutex);
+
+static bool vmcoredd_disabled;
+core_param(novmcoredd, vmcoredd_disabled, bool, 0);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Device Dump Size */
+static size_t vmcoredd_orig_sz;
+
+static DEFINE_SPINLOCK(vmcore_cb_lock);
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
+/* List of registered vmcore callbacks. */
+static LIST_HEAD(vmcore_cb_list);
+/* Whether the vmcore has been opened once. */
+static bool vmcore_opened;
+
+void register_vmcore_cb(struct vmcore_cb *cb)
+{
+ INIT_LIST_HEAD(&cb->next);
+ spin_lock(&vmcore_cb_lock);
+ list_add_tail(&cb->next, &vmcore_cb_list);
+ /*
+ * Registering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., manual driver loading).
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback registration\n");
+ spin_unlock(&vmcore_cb_lock);
+}
+EXPORT_SYMBOL_GPL(register_vmcore_cb);
+
+void unregister_vmcore_cb(struct vmcore_cb *cb)
+{
+ spin_lock(&vmcore_cb_lock);
+ list_del_rcu(&cb->next);
+ /*
+ * Unregistering a vmcore callback after the vmcore was opened is
+ * very unusual (e.g., forced driver removal), but we cannot stop
+ * unregistering.
+ */
+ if (vmcore_opened)
+ pr_warn_once("Unexpected vmcore callback unregistration\n");
+ spin_unlock(&vmcore_cb_lock);
+
+ synchronize_srcu(&vmcore_cb_srcu);
+}
+EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
+
+static bool pfn_is_ram(unsigned long pfn)
+{
+ struct vmcore_cb *cb;
+ bool ret = true;
+
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
+ if (unlikely(!cb->pfn_is_ram))
+ continue;
+ ret = cb->pfn_is_ram(cb, pfn);
+ if (!ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int open_vmcore(struct inode *inode, struct file *file)
+{
+ spin_lock(&vmcore_cb_lock);
+ vmcore_opened = true;
+ spin_unlock(&vmcore_cb_lock);
+
+ return 0;
+}
+
+/* Reads a page from the oldmem device from given offset. */
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+ u64 *ppos, bool encrypted)
+{
+ unsigned long pfn, offset;
+ ssize_t nr_bytes;
+ ssize_t read = 0, tmp;
+ int idx;
+
+ if (!count)
+ return 0;
+
+ offset = (unsigned long)(*ppos % PAGE_SIZE);
+ pfn = (unsigned long)(*ppos / PAGE_SIZE);
+
+ idx = srcu_read_lock(&vmcore_cb_srcu);
+ do {
+ if (count > (PAGE_SIZE - offset))
+ nr_bytes = PAGE_SIZE - offset;
+ else
+ nr_bytes = count;
+
+ /* If pfn is not ram, return zeros for sparse dump files */
+ if (!pfn_is_ram(pfn)) {
+ tmp = iov_iter_zero(nr_bytes, iter);
+ } else {
+ if (encrypted)
+ tmp = copy_oldmem_page_encrypted(iter, pfn,
+ nr_bytes,
+ offset);
+ else
+ tmp = copy_oldmem_page(iter, pfn, nr_bytes,
+ offset);
+ }
+ if (tmp < nr_bytes) {
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return -EFAULT;
+ }
+
+ *ppos += nr_bytes;
+ count -= nr_bytes;
+ read += nr_bytes;
+ ++pfn;
+ offset = 0;
+ } while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+
+ return read;
+}
+
+/*
+ * Architectures may override this function to allocate ELF header in 2nd kernel
+ */
+int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
+{
+ return 0;
+}
+
+/*
+ * Architectures may override this function to free header
+ */
+void __weak elfcorehdr_free(unsigned long long addr)
+{}
+
+/*
+ * Architectures may override this function to read from ELF header
+ */
+ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos, false);
+}
+
+/*
+ * Architectures may override this function to read from notes sections
+ */
+ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+}
+
+/*
+ * Architectures may override this function to map oldmem
+ */
+int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
+ unsigned long from, unsigned long pfn,
+ unsigned long size, pgprot_t prot)
+{
+ prot = pgprot_encrypted(prot);
+ return remap_pfn_range(vma, from, pfn, size, prot);
+}
+
+/*
+ * Architectures which support memory encryption override this.
+ */
+ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter,
+ unsigned long pfn, size_t csize, unsigned long offset)
+{
+ return copy_oldmem_page(iter, pfn, csize, offset);
+}
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size)
+{
+ struct vmcoredd_node *dump;
+ u64 offset = 0;
+ int ret = 0;
+ size_t tsz;
+ char *buf;
+
+ mutex_lock(&vmcoredd_mutex);
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) {
+ tsz = min(offset + (u64)dump->size - start, (u64)size);
+ buf = dump->buf + start - offset;
+ if (copy_to_iter(buf, tsz, iter) < tsz) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ size -= tsz;
+ start += tsz;
+
+ /* Leave now if buffer filled already */
+ if (!size)
+ goto out_unlock;
+ }
+ offset += dump->size;
+ }
+
+out_unlock:
+ mutex_unlock(&vmcoredd_mutex);
+ return ret;
+}
+
+#ifdef CONFIG_MMU
+static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
+ u64 start, size_t size)
+{
+ struct vmcoredd_node *dump;
+ u64 offset = 0;
+ int ret = 0;
+ size_t tsz;
+ char *buf;
+
+ mutex_lock(&vmcoredd_mutex);
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) {
+ tsz = min(offset + (u64)dump->size - start, (u64)size);
+ buf = dump->buf + start - offset;
+ if (remap_vmalloc_range_partial(vma, dst, buf, 0,
+ tsz)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ size -= tsz;
+ start += tsz;
+ dst += tsz;
+
+ /* Leave now if buffer filled already */
+ if (!size)
+ goto out_unlock;
+ }
+ offset += dump->size;
+ }
+
+out_unlock:
+ mutex_unlock(&vmcoredd_mutex);
+ return ret;
+}
+#endif /* CONFIG_MMU */
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Read from the ELF header and then the crash dump. On error, negative value is
+ * returned otherwise number of bytes read are returned.
+ */
+static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
+{
+ ssize_t acc = 0, tmp;
+ size_t tsz;
+ u64 start;
+ struct vmcore *m = NULL;
+
+ if (!iov_iter_count(iter) || *fpos >= vmcore_size)
+ return 0;
+
+ iov_iter_truncate(iter, vmcore_size - *fpos);
+
+ /* Read ELF core header */
+ if (*fpos < elfcorebuf_sz) {
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter));
+ if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz)
+ return -EFAULT;
+ *fpos += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (!iov_iter_count(iter))
+ return acc;
+ }
+
+ /* Read Elf note segment */
+ if (*fpos < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
+
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensure that zero-filled data can be
+ * avoided.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)*fpos, iov_iter_count(iter));
+ start = *fpos - elfcorebuf_sz;
+ if (vmcoredd_copy_dumps(iter, start, tsz))
+ return -EFAULT;
+
+ *fpos += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (!iov_iter_count(iter))
+ return acc;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos,
+ iov_iter_count(iter));
+ kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
+ if (copy_to_iter(kaddr, tsz, iter) < tsz)
+ return -EFAULT;
+
+ *fpos += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (!iov_iter_count(iter))
+ return acc;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (*fpos < m->offset + m->size) {
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - *fpos,
+ iov_iter_count(iter));
+ start = m->paddr + *fpos - m->offset;
+ tmp = read_from_oldmem(iter, tsz, &start,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+ if (tmp < 0)
+ return tmp;
+ *fpos += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (!iov_iter_count(iter))
+ return acc;
+ }
+ }
+
+ return acc;
+}
+
+static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter)
+{
+ return __read_vmcore(iter, &iocb->ki_pos);
+}
+
+/*
+ * The vmcore fault handler uses the page cache and fills data using the
+ * standard __read_vmcore() function.
+ *
+ * On s390 the fault handler is used for memory regions that can't be mapped
+ * directly with remap_pfn_range().
+ */
+static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
+{
+#ifdef CONFIG_S390
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ pgoff_t index = vmf->pgoff;
+ struct iov_iter iter;
+ struct kvec kvec;
+ struct page *page;
+ loff_t offset;
+ int rc;
+
+ page = find_or_create_page(mapping, index, GFP_KERNEL);
+ if (!page)
+ return VM_FAULT_OOM;
+ if (!PageUptodate(page)) {
+ offset = (loff_t) index << PAGE_SHIFT;
+ kvec.iov_base = page_address(page);
+ kvec.iov_len = PAGE_SIZE;
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE);
+
+ rc = __read_vmcore(&iter, &offset);
+ if (rc < 0) {
+ unlock_page(page);
+ put_page(page);
+ return vmf_error(rc);
+ }
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ vmf->page = page;
+ return 0;
+#else
+ return VM_FAULT_SIGBUS;
+#endif
+}
+
+static const struct vm_operations_struct vmcore_mmap_ops = {
+ .fault = mmap_vmcore_fault,
+};
+
+/**
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @size: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *vmcore_alloc_buf(size_t size)
+{
+#ifdef CONFIG_MMU
+ return vmalloc_user(size);
+#else
+ return vzalloc(size);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+/*
+ * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
+ * reported as not being ram with the zero page.
+ *
+ * @vma: vm_area_struct describing requested mapping
+ * @from: start remapping from
+ * @pfn: page frame number to start remapping to
+ * @size: remapping size
+ * @prot: protection bits
+ *
+ * Returns zero on success, -EAGAIN on failure.
+ */
+static int remap_oldmem_pfn_checked(struct vm_area_struct *vma,
+ unsigned long from, unsigned long pfn,
+ unsigned long size, pgprot_t prot)
+{
+ unsigned long map_size;
+ unsigned long pos_start, pos_end, pos;
+ unsigned long zeropage_pfn = my_zero_pfn(0);
+ size_t len = 0;
+
+ pos_start = pfn;
+ pos_end = pfn + (size >> PAGE_SHIFT);
+
+ for (pos = pos_start; pos < pos_end; ++pos) {
+ if (!pfn_is_ram(pos)) {
+ /*
+ * We hit a page which is not ram. Remap the continuous
+ * region between pos_start and pos-1 and replace
+ * the non-ram page at pos with the zero page.
+ */
+ if (pos > pos_start) {
+ /* Remap continuous region */
+ map_size = (pos - pos_start) << PAGE_SHIFT;
+ if (remap_oldmem_pfn_range(vma, from + len,
+ pos_start, map_size,
+ prot))
+ goto fail;
+ len += map_size;
+ }
+ /* Remap the zero page */
+ if (remap_oldmem_pfn_range(vma, from + len,
+ zeropage_pfn,
+ PAGE_SIZE, prot))
+ goto fail;
+ len += PAGE_SIZE;
+ pos_start = pos + 1;
+ }
+ }
+ if (pos > pos_start) {
+ /* Remap the rest */
+ map_size = (pos - pos_start) << PAGE_SHIFT;
+ if (remap_oldmem_pfn_range(vma, from + len, pos_start,
+ map_size, prot))
+ goto fail;
+ }
+ return 0;
+fail:
+ do_munmap(vma->vm_mm, from, len, NULL);
+ return -EAGAIN;
+}
+
+static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
+ unsigned long from, unsigned long pfn,
+ unsigned long size, pgprot_t prot)
+{
+ int ret, idx;
+
+ /*
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
+ */
+ idx = srcu_read_lock(&vmcore_cb_srcu);
+ if (!list_empty(&vmcore_cb_list))
+ ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
+ else
+ ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
+ return ret;
+}
+
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+ u64 start, end, len, tsz;
+ struct vmcore *m;
+
+ start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ end = start + size;
+
+ if (size > vmcore_size || end > vmcore_size)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ return -EPERM;
+
+ vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+ vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_ops = &vmcore_mmap_ops;
+
+ len = 0;
+
+ if (start < elfcorebuf_sz) {
+ u64 pfn;
+
+ tsz = min(elfcorebuf_sz - (size_t)start, size);
+ pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
+ if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
+ vma->vm_page_prot))
+ return -EAGAIN;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+
+ if (start < elfcorebuf_sz + elfnotes_sz) {
+ void *kaddr;
+
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensure that zero-filled data can be
+ * avoided. This also ensures that the device dumps and
+ * other elf notes can be properly mmaped at page aligned
+ * address.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (start < elfcorebuf_sz + vmcoredd_orig_sz) {
+ u64 start_off;
+
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)start, size);
+ start_off = start - elfcorebuf_sz;
+ if (vmcoredd_mmap_dumps(vma, vma->vm_start + len,
+ start_off, tsz))
+ goto fail;
+
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ /* leave now if filled buffer already */
+ if (!size)
+ return 0;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
+ kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
+ if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
+ kaddr, 0, tsz))
+ goto fail;
+
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+
+ list_for_each_entry(m, &vmcore_list, list) {
+ if (start < m->offset + m->size) {
+ u64 paddr = 0;
+
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - start, size);
+ paddr = m->paddr + start - m->offset;
+ if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
+ paddr >> PAGE_SHIFT, tsz,
+ vma->vm_page_prot))
+ goto fail;
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ if (size == 0)
+ return 0;
+ }
+ }
+
+ return 0;
+fail:
+ do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
+ return -EAGAIN;
+}
+#else
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+ return -ENOSYS;
+}
+#endif
+
+static const struct proc_ops vmcore_proc_ops = {
+ .proc_open = open_vmcore,
+ .proc_read_iter = read_vmcore,
+ .proc_lseek = default_llseek,
+ .proc_mmap = mmap_vmcore,
+};
+
+static struct vmcore* __init get_new_element(void)
+{
+ return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
+}
+
+static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+ struct list_head *vc_list)
+{
+ u64 size;
+ struct vmcore *m;
+
+ size = elfsz + elfnotesegsz;
+ list_for_each_entry(m, vc_list, list) {
+ size += m->size;
+ }
+ return size;
+}
+
+/**
+ * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
+{
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
+ Elf64_Nhdr *nhdr_ptr;
+
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ void *notes_section;
+ u64 offset, max_sz, sz, real_sz = 0;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ max_sz = phdr_ptr->p_memsz;
+ offset = phdr_ptr->p_offset;
+ notes_section = kmalloc(max_sz, GFP_KERNEL);
+ if (!notes_section)
+ return -ENOMEM;
+ rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
+ if (rc < 0) {
+ kfree(notes_section);
+ return rc;
+ }
+ nhdr_ptr = notes_section;
+ while (nhdr_ptr->n_namesz != 0) {
+ sz = sizeof(Elf64_Nhdr) +
+ (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+ (((u64)nhdr_ptr->n_descsz + 3) & ~3);
+ if ((real_sz + sz) > max_sz) {
+ pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+ nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+ break;
+ }
+ real_sz += sz;
+ nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
+ }
+ kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ if (real_sz == 0) {
+ pr_warn("Warning: Zero PT_NOTE entries found\n");
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf64 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf64_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf64 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf64_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
+ &offset);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf64_Ehdr *ehdr_ptr;
+ Elf64_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf64(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
+ /* Prepare merged PT_NOTE program header. */
+ phdr.p_type = PT_NOTE;
+ phdr.p_flags = 0;
+ note_off = sizeof(Elf64_Ehdr) +
+ (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
+ phdr.p_vaddr = phdr.p_paddr = 0;
+ phdr.p_filesz = phdr.p_memsz = phdr_sz;
+ phdr.p_align = 0;
+
+ /* Add merged PT_NOTE program header*/
+ tmp = elfptr + sizeof(Elf64_Ehdr);
+ memcpy(tmp, &phdr, sizeof(phdr));
+ tmp += sizeof(phdr);
+
+ /* Remove unwanted PT_NOTE program headers. */
+ i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
+ *elfsz = *elfsz - i;
+ memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
+
+ /* Modify e_phnum to reflect merged headers. */
+ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
+ return 0;
+}
+
+/**
+ * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
+{
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
+ Elf32_Nhdr *nhdr_ptr;
+
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ void *notes_section;
+ u64 offset, max_sz, sz, real_sz = 0;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ max_sz = phdr_ptr->p_memsz;
+ offset = phdr_ptr->p_offset;
+ notes_section = kmalloc(max_sz, GFP_KERNEL);
+ if (!notes_section)
+ return -ENOMEM;
+ rc = elfcorehdr_read_notes(notes_section, max_sz, &offset);
+ if (rc < 0) {
+ kfree(notes_section);
+ return rc;
+ }
+ nhdr_ptr = notes_section;
+ while (nhdr_ptr->n_namesz != 0) {
+ sz = sizeof(Elf32_Nhdr) +
+ (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+ (((u64)nhdr_ptr->n_descsz + 3) & ~3);
+ if ((real_sz + sz) > max_sz) {
+ pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+ nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+ break;
+ }
+ real_sz += sz;
+ nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
+ }
+ kfree(notes_section);
+ phdr_ptr->p_memsz = real_sz;
+ if (real_sz == 0) {
+ pr_warn("Warning: Zero PT_NOTE entries found\n");
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * get_note_number_and_size_elf32 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
+ int *nr_ptnote, u64 *sz_ptnote)
+{
+ int i;
+ Elf32_Phdr *phdr_ptr;
+
+ *nr_ptnote = *sz_ptnote = 0;
+
+ phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ *nr_ptnote += 1;
+ *sz_ptnote += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/**
+ * copy_notes_elf32 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
+{
+ int i, rc=0;
+ Elf32_Phdr *phdr_ptr;
+
+ phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 offset;
+ if (phdr_ptr->p_type != PT_NOTE)
+ continue;
+ offset = phdr_ptr->p_offset;
+ rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz,
+ &offset);
+ if (rc < 0)
+ return rc;
+ notes_buf += phdr_ptr->p_memsz;
+ }
+
+ return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
+ char **notes_buf, size_t *notes_sz)
+{
+ int i, nr_ptnote=0, rc=0;
+ char *tmp;
+ Elf32_Ehdr *ehdr_ptr;
+ Elf32_Phdr phdr;
+ u64 phdr_sz = 0, note_off;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+ rc = update_note_header_size_elf32(ehdr_ptr);
+ if (rc < 0)
+ return rc;
+
+ rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
+ if (rc < 0)
+ return rc;
+
+ *notes_sz = roundup(phdr_sz, PAGE_SIZE);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
+ if (!*notes_buf)
+ return -ENOMEM;
+
+ rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
+ if (rc < 0)
+ return rc;
+
+ /* Prepare merged PT_NOTE program header. */
+ phdr.p_type = PT_NOTE;
+ phdr.p_flags = 0;
+ note_off = sizeof(Elf32_Ehdr) +
+ (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
+ phdr.p_offset = roundup(note_off, PAGE_SIZE);
+ phdr.p_vaddr = phdr.p_paddr = 0;
+ phdr.p_filesz = phdr.p_memsz = phdr_sz;
+ phdr.p_align = 0;
+
+ /* Add merged PT_NOTE program header*/
+ tmp = elfptr + sizeof(Elf32_Ehdr);
+ memcpy(tmp, &phdr, sizeof(phdr));
+ tmp += sizeof(phdr);
+
+ /* Remove unwanted PT_NOTE program headers. */
+ i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
+ *elfsz = *elfsz - i;
+ memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
+ memset(elfptr + *elfsz, 0, i);
+ *elfsz = roundup(*elfsz, PAGE_SIZE);
+
+ /* Modify e_phnum to reflect merged headers. */
+ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
+ return 0;
+}
+
+/* Add memory chunks represented by program headers to vmcore list. Also update
+ * the new offset fields of exported program headers. */
+static int __init process_ptload_program_headers_elf64(char *elfptr,
+ size_t elfsz,
+ size_t elfnotes_sz,
+ struct list_head *vc_list)
+{
+ int i;
+ Elf64_Ehdr *ehdr_ptr;
+ Elf64_Phdr *phdr_ptr;
+ loff_t vmcore_off;
+ struct vmcore *new;
+
+ ehdr_ptr = (Elf64_Ehdr *)elfptr;
+ phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
+
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
+ if (phdr_ptr->p_type != PT_LOAD)
+ continue;
+
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
+ /* Add this contiguous chunk of memory to vmcore list.*/
+ new = get_new_element();
+ if (!new)
+ return -ENOMEM;
+ new->paddr = start;
+ new->size = size;
+ list_add_tail(&new->list, vc_list);
+
+ /* Update the program header offset. */
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
+ }
+ return 0;
+}
+
+static int __init process_ptload_program_headers_elf32(char *elfptr,
+ size_t elfsz,
+ size_t elfnotes_sz,
+ struct list_head *vc_list)
+{
+ int i;
+ Elf32_Ehdr *ehdr_ptr;
+ Elf32_Phdr *phdr_ptr;
+ loff_t vmcore_off;
+ struct vmcore *new;
+
+ ehdr_ptr = (Elf32_Ehdr *)elfptr;
+ phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
+
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
+
+ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+ u64 paddr, start, end, size;
+
+ if (phdr_ptr->p_type != PT_LOAD)
+ continue;
+
+ paddr = phdr_ptr->p_offset;
+ start = rounddown(paddr, PAGE_SIZE);
+ end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+ size = end - start;
+
+ /* Add this contiguous chunk of memory to vmcore list.*/
+ new = get_new_element();
+ if (!new)
+ return -ENOMEM;
+ new->paddr = start;
+ new->size = size;
+ list_add_tail(&new->list, vc_list);
+
+ /* Update the program header offset */
+ phdr_ptr->p_offset = vmcore_off + (paddr - start);
+ vmcore_off = vmcore_off + size;
+ }
+ return 0;
+}
+
+/* Sets offset fields of vmcore elements. */
+static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+ struct list_head *vc_list)
+{
+ loff_t vmcore_off;
+ struct vmcore *m;
+
+ /* Skip Elf header, program headers and Elf note segment. */
+ vmcore_off = elfsz + elfnotes_sz;
+
+ list_for_each_entry(m, vc_list, list) {
+ m->offset = vmcore_off;
+ vmcore_off += m->size;
+ }
+}
+
+static void free_elfcorebuf(void)
+{
+ free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+ elfcorebuf = NULL;
+ vfree(elfnotes_buf);
+ elfnotes_buf = NULL;
+}
+
+static int __init parse_crash_elf64_headers(void)
+{
+ int rc=0;
+ Elf64_Ehdr ehdr;
+ u64 addr;
+
+ addr = elfcorehdr_addr;
+
+ /* Read Elf header */
+ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
+ if (rc < 0)
+ return rc;
+
+ /* Do some basic Verification. */
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+ (ehdr.e_type != ET_CORE) ||
+ !vmcore_elf64_check_arch(&ehdr) ||
+ ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
+ ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr.e_version != EV_CURRENT ||
+ ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
+ ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
+ ehdr.e_phnum == 0) {
+ pr_warn("Warning: Core image elf header is not sane\n");
+ return -EINVAL;
+ }
+
+ /* Read in all elf headers. */
+ elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
+ ehdr.e_phnum * sizeof(Elf64_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
+ if (!elfcorebuf)
+ return -ENOMEM;
+ addr = elfcorehdr_addr;
+ rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
+ if (rc < 0)
+ goto fail;
+
+ /* Merge all PT_NOTE headers into one. */
+ rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
+ rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+ return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
+}
+
+static int __init parse_crash_elf32_headers(void)
+{
+ int rc=0;
+ Elf32_Ehdr ehdr;
+ u64 addr;
+
+ addr = elfcorehdr_addr;
+
+ /* Read Elf header */
+ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
+ if (rc < 0)
+ return rc;
+
+ /* Do some basic Verification. */
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+ (ehdr.e_type != ET_CORE) ||
+ !vmcore_elf32_check_arch(&ehdr) ||
+ ehdr.e_ident[EI_CLASS] != ELFCLASS32||
+ ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr.e_version != EV_CURRENT ||
+ ehdr.e_ehsize != sizeof(Elf32_Ehdr) ||
+ ehdr.e_phentsize != sizeof(Elf32_Phdr) ||
+ ehdr.e_phnum == 0) {
+ pr_warn("Warning: Core image elf header is not sane\n");
+ return -EINVAL;
+ }
+
+ /* Read in all elf headers. */
+ elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+ elfcorebuf_sz = elfcorebuf_sz_orig;
+ elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(elfcorebuf_sz_orig));
+ if (!elfcorebuf)
+ return -ENOMEM;
+ addr = elfcorehdr_addr;
+ rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr);
+ if (rc < 0)
+ goto fail;
+
+ /* Merge all PT_NOTE headers into one. */
+ rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
+ &elfnotes_buf, &elfnotes_sz);
+ if (rc)
+ goto fail;
+ rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
+ elfnotes_sz, &vmcore_list);
+ if (rc)
+ goto fail;
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+ return 0;
+fail:
+ free_elfcorebuf();
+ return rc;
+}
+
+static int __init parse_crash_elf_headers(void)
+{
+ unsigned char e_ident[EI_NIDENT];
+ u64 addr;
+ int rc=0;
+
+ addr = elfcorehdr_addr;
+ rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr);
+ if (rc < 0)
+ return rc;
+ if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
+ pr_warn("Warning: Core image elf header not found\n");
+ return -EINVAL;
+ }
+
+ if (e_ident[EI_CLASS] == ELFCLASS64) {
+ rc = parse_crash_elf64_headers();
+ if (rc)
+ return rc;
+ } else if (e_ident[EI_CLASS] == ELFCLASS32) {
+ rc = parse_crash_elf32_headers();
+ if (rc)
+ return rc;
+ } else {
+ pr_warn("Warning: Core image elf header is not sane\n");
+ return -EINVAL;
+ }
+
+ /* Determine vmcore size. */
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+
+ return 0;
+}
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_write_header - Write vmcore device dump header at the
+ * beginning of the dump's buffer.
+ * @buf: Output buffer where the note is written
+ * @data: Dump info
+ * @size: Size of the dump
+ *
+ * Fills beginning of the dump's buffer with vmcore device dump header.
+ */
+static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
+ u32 size)
+{
+ struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf;
+
+ vdd_hdr->n_namesz = sizeof(vdd_hdr->name);
+ vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
+ vdd_hdr->n_type = NT_VMCOREDD;
+
+ strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
+ sizeof(vdd_hdr->name));
+ memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+}
+
+/**
+ * vmcoredd_update_program_headers - Update all Elf program headers
+ * @elfptr: Pointer to elf header
+ * @elfnotesz: Size of elf notes aligned to page size
+ * @vmcoreddsz: Size of device dumps to be added to elf note header
+ *
+ * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Also update the offsets of all the program headers after the elf note header.
+ */
+static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
+ size_t vmcoreddsz)
+{
+ unsigned char *e_ident = (unsigned char *)elfptr;
+ u64 start, end, size;
+ loff_t vmcore_off;
+ u32 i;
+
+ vmcore_off = elfcorebuf_sz + elfnotesz;
+
+ if (e_ident[EI_CLASS] == ELFCLASS64) {
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr;
+ Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr));
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Update note size */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off += size;
+ }
+ } else {
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr;
+ Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr));
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Update note size */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off += size;
+ }
+ }
+}
+
+/**
+ * vmcoredd_update_size - Update the total size of the device dumps and update
+ * Elf header
+ * @dump_size: Size of the current device dump to be added to total size
+ *
+ * Update the total size of all the device dumps and update the Elf program
+ * headers. Calculate the new offsets for the vmcore list and update the
+ * total vmcore size.
+ */
+static void vmcoredd_update_size(size_t dump_size)
+{
+ vmcoredd_orig_sz += dump_size;
+ elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz;
+ vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz,
+ vmcoredd_orig_sz);
+
+ /* Update vmcore list offsets */
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+ proc_vmcore->size = vmcore_size;
+}
+
+/**
+ * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
+ * @data: dump info.
+ *
+ * Allocate a buffer and invoke the calling driver's dump collect routine.
+ * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * dump and add the dump to global list.
+ */
+int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+ struct vmcoredd_node *dump;
+ void *buf = NULL;
+ size_t data_size;
+ int ret;
+
+ if (vmcoredd_disabled) {
+ pr_err_once("Device dump is disabled\n");
+ return -EINVAL;
+ }
+
+ if (!data || !strlen(data->dump_name) ||
+ !data->vmcoredd_callback || !data->size)
+ return -EINVAL;
+
+ dump = vzalloc(sizeof(*dump));
+ if (!dump) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ /* Keep size of the buffer page aligned so that it can be mmaped */
+ data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
+ PAGE_SIZE);
+
+ /* Allocate buffer for driver's to write their dumps */
+ buf = vmcore_alloc_buf(data_size);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ vmcoredd_write_header(buf, data, data_size -
+ sizeof(struct vmcoredd_header));
+
+ /* Invoke the driver's dump collection routing */
+ ret = data->vmcoredd_callback(data, buf +
+ sizeof(struct vmcoredd_header));
+ if (ret)
+ goto out_err;
+
+ dump->buf = buf;
+ dump->size = data_size;
+
+ /* Add the dump to driver sysfs list */
+ mutex_lock(&vmcoredd_mutex);
+ list_add_tail(&dump->list, &vmcoredd_list);
+ mutex_unlock(&vmcoredd_mutex);
+
+ vmcoredd_update_size(data_size);
+ return 0;
+
+out_err:
+ vfree(buf);
+ vfree(dump);
+
+ return ret;
+}
+EXPORT_SYMBOL(vmcore_add_device_dump);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Free all dumps in vmcore device dump list */
+static void vmcore_free_device_dumps(void)
+{
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ mutex_lock(&vmcoredd_mutex);
+ while (!list_empty(&vmcoredd_list)) {
+ struct vmcoredd_node *dump;
+
+ dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
+ list);
+ list_del(&dump->list);
+ vfree(dump->buf);
+ vfree(dump);
+ }
+ mutex_unlock(&vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+}
+
+/* Init function for vmcore module. */
+static int __init vmcore_init(void)
+{
+ int rc = 0;
+
+ /* Allow architectures to allocate ELF header in 2nd kernel */
+ rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size);
+ if (rc)
+ return rc;
+ /*
+ * If elfcorehdr= has been passed in cmdline or created in 2nd kernel,
+ * then capture the dump.
+ */
+ if (!(is_vmcore_usable()))
+ return rc;
+ rc = parse_crash_elf_headers();
+ if (rc) {
+ pr_warn("Kdump: vmcore not initialized\n");
+ return rc;
+ }
+ elfcorehdr_free(elfcorehdr_addr);
+ elfcorehdr_addr = ELFCORE_ADDR_ERR;
+
+ proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops);
+ if (proc_vmcore)
+ proc_vmcore->size = vmcore_size;
+ return 0;
+}
+fs_initcall(vmcore_init);
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+ if (proc_vmcore) {
+ proc_remove(proc_vmcore);
+ proc_vmcore = NULL;
+ }
+
+ /* clear the vmcore list. */
+ while (!list_empty(&vmcore_list)) {
+ struct vmcore *m;
+
+ m = list_first_entry(&vmcore_list, struct vmcore, list);
+ list_del(&m->list);
+ kfree(m);
+ }
+ free_elfcorebuf();
+
+ /* clear vmcore device dump list */
+ vmcore_free_device_dumps();
+}