diff options
Diffstat (limited to '')
37 files changed, 16812 insertions, 0 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 000000000..32b1116ae --- /dev/null +++ b/fs/proc/Kconfig @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: GPL-2.0-only +config PROC_FS + bool "/proc file system support" if EXPERT + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + select CRASH_CORE + help + Provides a virtual ELF core file of the live kernel. This can + be read with gdb and other ELF tools. No modifications can be + made using this mechanism. + +config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_VMCORE_DEVICE_DUMP + bool "Device Hardware/Firmware Log Collection" + depends on PROC_VMCORE + default n + help + After kernel panic, device drivers can collect the device + specific snapshot of their hardware or firmware before the + underlying devices are initialized in crash recovery kernel. + Note that the device driver must be present in the crash + recovery kernel's initramfs to collect its underlying device + snapshot. + + If you say Y here, the collected device dumps will be added + as ELF notes to /proc/vmcore. You can still disable device + dump using the kernel command line option 'novmcoredd'. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EXPERT + depends on PROC_FS + select SYSCTL + default y + help + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in <file:Documentation/admin-guide/sysctl/>. Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. + +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EXPERT + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. + +config PROC_CHILDREN + bool "Include /proc/<pid>/task/<tid>/children file" + depends on PROC_FS + default n + help + Provides a fast way to retrieve first level children pids of a task. See + <file:Documentation/filesystems/proc.rst> for more information. + + Say Y if you are running any user-space software which takes benefit from + this interface. For example, rkt is such a piece of software. + +config PROC_PID_ARCH_STATUS + def_bool n + depends on PROC_FS + +config PROC_CPU_RESCTRL + def_bool n + depends on PROC_FS diff --git a/fs/proc/Makefile b/fs/proc/Makefile new file mode 100644 index 000000000..bd08616ed --- /dev/null +++ b/fs/proc/Makefile @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux proc filesystem routines. +# + +obj-y += proc.o + +CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,) +proc-y := nommu.o task_nommu.o +proc-$(CONFIG_MMU) := task_mmu.o + +proc-y += inode.o root.o base.o generic.o array.o \ + fd.o +proc-$(CONFIG_TTY) += proc_tty.o +proc-y += cmdline.o +proc-y += consoles.o +proc-y += cpuinfo.o +proc-y += devices.o +proc-y += interrupts.o +proc-y += loadavg.o +proc-y += meminfo.o +proc-y += stat.o +proc-y += uptime.o +proc-y += util.o +proc-y += version.o +proc-y += softirqs.o +proc-y += namespaces.o +proc-y += self.o +proc-y += thread_self.o +proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o +proc-$(CONFIG_NET) += proc_net.o +proc-$(CONFIG_PROC_KCORE) += kcore.o +proc-$(CONFIG_PROC_VMCORE) += vmcore.o +proc-$(CONFIG_PRINTK) += kmsg.o +proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o +proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o diff --git a/fs/proc/array.c b/fs/proc/array.c new file mode 100644 index 000000000..49283b810 --- /dev/null +++ b/fs/proc/array.c @@ -0,0 +1,803 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/proc/array.c + * + * Copyright (C) 1992 by Linus Torvalds + * based on ideas by Darren Senn + * + * Fixes: + * Michael. K. Johnson: stat,statm extensions. + * <johnsonm@stolaf.edu> + * + * Pauline Middelink : Made cmdline,envline only break at '\0's, to + * make sure SET_PROCTITLE works. Also removed + * bad '!' which forced address recalculation for + * EVERY character on the current page. + * <middelin@polyware.iaf.nl> + * + * Danny ter Haar : added cpuinfo + * <dth@cistron.nl> + * + * Alessandro Rubini : profile extension. + * <rubini@ipvvis.unipv.it> + * + * Jeff Tranter : added BogoMips field to cpuinfo + * <Jeff_Tranter@Mitel.COM> + * + * Bruno Haible : remove 4K limit for the maps file + * <haible@ma2s2.mathematik.uni-karlsruhe.de> + * + * Yves Arrouye : remove removal of trailing spaces in get_array. + * <Yves.Arrouye@marin.fdn.fr> + * + * Jerome Forissier : added per-CPU time information to /proc/stat + * and /proc/<pid>/cpu extension + * <forissier@isia.cma.fr> + * - Incorporation and non-SMP safe operation + * of forissier patch in 2.1.78 by + * Hans Marcus <crowbar@concepts.nl> + * + * aeb@cwi.nl : /proc/partitions + * + * + * Alan Cox : security fixes. + * <alan@lxorguk.ukuu.org.uk> + * + * Al Viro : safe handling of mm_struct + * + * Gerhard Wichert : added BIGMEM support + * Siemens AG <Gerhard.Wichert@pdb.siemens.de> + * + * Al Viro & Jeff Garzik : moved most of the thing into base.c and + * : proc_misc.c. The rest may eventually go into + * : base.c too. + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/time_namespace.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/tty.h> +#include <linux/string.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/task_stack.h> +#include <linux/sched/task.h> +#include <linux/sched/cputime.h> +#include <linux/proc_fs.h> +#include <linux/ioport.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/signal.h> +#include <linux/highmem.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/times.h> +#include <linux/cpuset.h> +#include <linux/rcupdate.h> +#include <linux/delayacct.h> +#include <linux/seq_file.h> +#include <linux/pid_namespace.h> +#include <linux/prctl.h> +#include <linux/ptrace.h> +#include <linux/string_helpers.h> +#include <linux/user_namespace.h> +#include <linux/fs_struct.h> +#include <linux/kthread.h> + +#include <asm/processor.h> +#include "internal.h" + +void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) +{ + char tcomm[64]; + + /* + * Test before PF_KTHREAD because all workqueue worker threads are + * kernel threads. + */ + if (p->flags & PF_WQ_WORKER) + wq_worker_comm(tcomm, sizeof(tcomm), p); + else if (p->flags & PF_KTHREAD) + get_kthread_comm(tcomm, sizeof(tcomm), p); + else + __get_task_comm(tcomm, sizeof(tcomm), p); + + if (escape) + seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + else + seq_printf(m, "%.64s", tcomm); +} + +/* + * The task state array is a strange "bitmap" of + * reasons to sleep. Thus "running" is zero, and + * you can test for combinations of others with + * simple bit tests. + */ +static const char * const task_state_array[] = { + + /* states in TASK_REPORT: */ + "R (running)", /* 0x00 */ + "S (sleeping)", /* 0x01 */ + "D (disk sleep)", /* 0x02 */ + "T (stopped)", /* 0x04 */ + "t (tracing stop)", /* 0x08 */ + "X (dead)", /* 0x10 */ + "Z (zombie)", /* 0x20 */ + "P (parked)", /* 0x40 */ + + /* states beyond TASK_REPORT: */ + "I (idle)", /* 0x80 */ +}; + +static inline const char *get_task_state(struct task_struct *tsk) +{ + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); + return task_state_array[task_state_index(tsk)]; +} + +static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *p) +{ + struct user_namespace *user_ns = seq_user_ns(m); + struct group_info *group_info; + int g, umask = -1; + struct task_struct *tracer; + const struct cred *cred; + pid_t ppid, tpid = 0, tgid, ngid; + unsigned int max_fds = 0; + + rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; + + tracer = ptrace_parent(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + + tgid = task_tgid_nr_ns(p, ns); + ngid = task_numa_group_id(p); + cred = get_task_cred(p); + + task_lock(p); + if (p->fs) + umask = p->fs->umask; + if (p->files) + max_fds = files_fdtable(p->files)->max_fds; + task_unlock(p); + rcu_read_unlock(); + + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); + seq_puts(m, "State:\t"); + seq_puts(m, get_task_state(p)); + + seq_put_decimal_ull(m, "\nTgid:\t", tgid); + seq_put_decimal_ull(m, "\nNgid:\t", ngid); + seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns)); + seq_put_decimal_ull(m, "\nPPid:\t", ppid); + seq_put_decimal_ull(m, "\nTracerPid:\t", tpid); + seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid)); + seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid)); + seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); + + seq_puts(m, "\nGroups:\t"); + group_info = cred->group_info; + for (g = 0; g < group_info->ngroups; g++) + seq_put_decimal_ull(m, g ? " " : "", + from_kgid_munged(user_ns, group_info->gid[g])); + put_cred(cred); + /* Trailing space shouldn't have been added in the first place. */ + seq_putc(m, ' '); + +#ifdef CONFIG_PID_NS + seq_puts(m, "\nNStgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpid:"); + for (g = ns->level; g <= pid->level; g++) + seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSsid:"); + for (g = ns->level; g <= pid->level; g++) + seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); +#endif + seq_putc(m, '\n'); +} + +void render_sigset_t(struct seq_file *m, const char *header, + sigset_t *set) +{ + int i; + + seq_puts(m, header); + + i = _NSIG; + do { + int x = 0; + + i -= 4; + if (sigismember(set, i+1)) x |= 1; + if (sigismember(set, i+2)) x |= 2; + if (sigismember(set, i+3)) x |= 4; + if (sigismember(set, i+4)) x |= 8; + seq_putc(m, hex_asc[x]); + } while (i >= 4); + + seq_putc(m, '\n'); +} + +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign, + sigset_t *sigcatch) +{ + struct k_sigaction *k; + int i; + + k = p->sighand->action; + for (i = 1; i <= _NSIG; ++i, ++k) { + if (k->sa.sa_handler == SIG_IGN) + sigaddset(sigign, i); + else if (k->sa.sa_handler != SIG_DFL) + sigaddset(sigcatch, i); + } +} + +static inline void task_sig(struct seq_file *m, struct task_struct *p) +{ + unsigned long flags; + sigset_t pending, shpending, blocked, ignored, caught; + int num_threads = 0; + unsigned int qsize = 0; + unsigned long qlim = 0; + + sigemptyset(&pending); + sigemptyset(&shpending); + sigemptyset(&blocked); + sigemptyset(&ignored); + sigemptyset(&caught); + + if (lock_task_sighand(p, &flags)) { + pending = p->pending.signal; + shpending = p->signal->shared_pending.signal; + blocked = p->blocked; + collect_sigign_sigcatch(p, &ignored, &caught); + num_threads = get_nr_threads(p); + rcu_read_lock(); /* FIXME: is this correct? */ + qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); + rcu_read_unlock(); + qlim = task_rlimit(p, RLIMIT_SIGPENDING); + unlock_task_sighand(p, &flags); + } + + seq_put_decimal_ull(m, "Threads:\t", num_threads); + seq_put_decimal_ull(m, "\nSigQ:\t", qsize); + seq_put_decimal_ull(m, "/", qlim); + + /* render them all */ + render_sigset_t(m, "\nSigPnd:\t", &pending); + render_sigset_t(m, "ShdPnd:\t", &shpending); + render_sigset_t(m, "SigBlk:\t", &blocked); + render_sigset_t(m, "SigIgn:\t", &ignored); + render_sigset_t(m, "SigCgt:\t", &caught); +} + +static void render_cap_t(struct seq_file *m, const char *header, + kernel_cap_t *a) +{ + unsigned __capi; + + seq_puts(m, header); + CAP_FOR_EACH_U32(__capi) { + seq_put_hex_ll(m, NULL, + a->cap[CAP_LAST_U32 - __capi], 8); + } + seq_putc(m, '\n'); +} + +static inline void task_cap(struct seq_file *m, struct task_struct *p) +{ + const struct cred *cred; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; + + rcu_read_lock(); + cred = __task_cred(p); + cap_inheritable = cred->cap_inheritable; + cap_permitted = cred->cap_permitted; + cap_effective = cred->cap_effective; + cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; + rcu_read_unlock(); + + render_cap_t(m, "CapInh:\t", &cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cap_permitted); + render_cap_t(m, "CapEff:\t", &cap_effective); + render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); +} + +static inline void task_seccomp(struct seq_file *m, struct task_struct *p) +{ + seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); +#ifdef CONFIG_SECCOMP + seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); +#ifdef CONFIG_SECCOMP_FILTER + seq_put_decimal_ull(m, "\nSeccomp_filters:\t", + atomic_read(&p->seccomp.filter_count)); +#endif +#endif + seq_puts(m, "\nSpeculation_Store_Bypass:\t"); + switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { + case -EINVAL: + seq_puts(m, "unknown"); + break; + case PR_SPEC_NOT_AFFECTED: + seq_puts(m, "not vulnerable"); + break; + case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: + seq_puts(m, "thread force mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_DISABLE: + seq_puts(m, "thread mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_ENABLE: + seq_puts(m, "thread vulnerable"); + break; + case PR_SPEC_DISABLE: + seq_puts(m, "globally mitigated"); + break; + default: + seq_puts(m, "vulnerable"); + break; + } + + seq_puts(m, "\nSpeculationIndirectBranch:\t"); + switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) { + case -EINVAL: + seq_puts(m, "unsupported"); + break; + case PR_SPEC_NOT_AFFECTED: + seq_puts(m, "not affected"); + break; + case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: + seq_puts(m, "conditional force disabled"); + break; + case PR_SPEC_PRCTL | PR_SPEC_DISABLE: + seq_puts(m, "conditional disabled"); + break; + case PR_SPEC_PRCTL | PR_SPEC_ENABLE: + seq_puts(m, "conditional enabled"); + break; + case PR_SPEC_ENABLE: + seq_puts(m, "always enabled"); + break; + case PR_SPEC_DISABLE: + seq_puts(m, "always disabled"); + break; + default: + seq_puts(m, "unknown"); + break; + } + seq_putc(m, '\n'); +} + +static inline void task_context_switch_counts(struct seq_file *m, + struct task_struct *p) +{ + seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw); + seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw); + seq_putc(m, '\n'); +} + +static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_allowed:\t%*pb\n", + cpumask_pr_args(&task->cpus_mask)); + seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", + cpumask_pr_args(&task->cpus_mask)); +} + +static inline void task_core_dumping(struct seq_file *m, struct task_struct *task) +{ + seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state); + seq_putc(m, '\n'); +} + +static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) +{ + bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); + + if (thp_enabled) + thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags); + seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); +} + +int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm = get_task_mm(task); + + seq_puts(m, "Name:\t"); + proc_task_name(m, task, true); + seq_putc(m, '\n'); + + task_state(m, ns, pid, task); + + if (mm) { + task_mem(m, mm); + task_core_dumping(m, task); + task_thp_status(m, mm); + mmput(mm); + } + task_sig(m, task); + task_cap(m, task); + task_seccomp(m, task); + task_cpus_allowed(m, task); + cpuset_task_status_allowed(m, task); + task_context_switch_counts(m, task); + return 0; +} + +static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task, int whole) +{ + unsigned long vsize, eip, esp, wchan = 0; + int priority, nice; + int tty_pgrp = -1, tty_nr = 0; + sigset_t sigign, sigcatch; + char state; + pid_t ppid = 0, pgid = -1, sid = -1; + int num_threads = 0; + int permitted; + struct mm_struct *mm; + unsigned long long start_time; + unsigned long cmin_flt = 0, cmaj_flt = 0; + unsigned long min_flt = 0, maj_flt = 0; + u64 cutime, cstime, utime, stime; + u64 cgtime, gtime; + unsigned long rsslim = 0; + unsigned long flags; + int exit_code = task->exit_code; + + state = *get_task_state(task); + vsize = eip = esp = 0; + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); + mm = get_task_mm(task); + if (mm) { + vsize = task_vsize(mm); + /* + * esp and eip are intentionally zeroed out. There is no + * non-racy way to read them without freezing the task. + * Programs that need reliable values can use ptrace(2). + * + * The only exception is if the task is core dumping because + * a program is not able to use ptrace(2) in that case. It is + * safe because the task has stopped executing permanently. + */ + if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { + if (try_get_task_stack(task)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + put_task_stack(task); + } + } + } + + sigemptyset(&sigign); + sigemptyset(&sigcatch); + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + if (sig->tty) { + struct pid *pgrp = tty_get_pgrp(sig->tty); + tty_pgrp = pid_nr_ns(pgrp, ns); + put_pid(pgrp); + tty_nr = new_encode_dev(tty_devnum(sig->tty)); + } + + num_threads = get_nr_threads(task); + collect_sigign_sigcatch(task, &sigign, &sigcatch); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + + /* add up live thread stats at the group level */ + if (whole) { + struct task_struct *t = task; + do { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += task_gtime(t); + } while_each_thread(task, t); + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + thread_group_cputime_adjusted(task, &utime, &stime); + gtime += sig->gtime; + + if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED)) + exit_code = sig->group_exit_code; + } + + sid = task_session_nr_ns(task, ns); + ppid = task_tgid_nr_ns(task->real_parent, ns); + pgid = task_pgrp_nr_ns(task, ns); + + unlock_task_sighand(task, &flags); + } + + if (permitted && (!whole || num_threads < 2)) + wchan = !task_is_running(task); + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; + task_cputime_adjusted(task, &utime, &stime); + gtime = task_gtime(task); + } + + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task_prio(task); + nice = task_nice(task); + + /* apply timens offset for boottime and convert nsec -> ticks */ + start_time = + nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime)); + + seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); + seq_puts(m, " ("); + proc_task_name(m, task, false); + seq_puts(m, ") "); + seq_putc(m, state); + seq_put_decimal_ll(m, " ", ppid); + seq_put_decimal_ll(m, " ", pgid); + seq_put_decimal_ll(m, " ", sid); + seq_put_decimal_ll(m, " ", tty_nr); + seq_put_decimal_ll(m, " ", tty_pgrp); + seq_put_decimal_ull(m, " ", task->flags); + seq_put_decimal_ull(m, " ", min_flt); + seq_put_decimal_ull(m, " ", cmin_flt); + seq_put_decimal_ull(m, " ", maj_flt); + seq_put_decimal_ull(m, " ", cmaj_flt); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime)); + seq_put_decimal_ll(m, " ", priority); + seq_put_decimal_ll(m, " ", nice); + seq_put_decimal_ll(m, " ", num_threads); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", start_time); + seq_put_decimal_ull(m, " ", vsize); + seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, " ", rsslim); + seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, " ", esp); + seq_put_decimal_ull(m, " ", eip); + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL); + + /* + * We used to output the absolute kernel address, but that's an + * information leak - so instead we show a 0/1 flag here, to signal + * to user-space whether there's a wchan field in /proc/PID/wchan. + * + * This works with older implementations of procps as well. + */ + seq_put_decimal_ull(m, " ", wchan); + + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ll(m, " ", task->exit_signal); + seq_put_decimal_ll(m, " ", task_cpu(task)); + seq_put_decimal_ull(m, " ", task->rt_priority); + seq_put_decimal_ull(m, " ", task->policy); + seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime)); + seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime)); + + if (mm && permitted) { + seq_put_decimal_ull(m, " ", mm->start_data); + seq_put_decimal_ull(m, " ", mm->end_data); + seq_put_decimal_ull(m, " ", mm->start_brk); + seq_put_decimal_ull(m, " ", mm->arg_start); + seq_put_decimal_ull(m, " ", mm->arg_end); + seq_put_decimal_ull(m, " ", mm->env_start); + seq_put_decimal_ull(m, " ", mm->env_end); + } else + seq_puts(m, " 0 0 0 0 0 0 0"); + + if (permitted) + seq_put_decimal_ll(m, " ", exit_code); + else + seq_puts(m, " 0"); + + seq_putc(m, '\n'); + if (mm) + mmput(mm); + return 0; +} + +int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 0); +} + +int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 1); +} + +int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm = get_task_mm(task); + + if (mm) { + unsigned long size; + unsigned long resident = 0; + unsigned long shared = 0; + unsigned long text = 0; + unsigned long data = 0; + + size = task_statm(mm, &shared, &text, &data, &resident); + mmput(mm); + + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, "", size); + seq_put_decimal_ull(m, " ", resident); + seq_put_decimal_ull(m, " ", shared); + seq_put_decimal_ull(m, " ", text); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", data); + seq_put_decimal_ull(m, " ", 0); + seq_putc(m, '\n'); + } else { + seq_write(m, "0 0 0 0 0 0 0\n", 14); + } + return 0; +} + +#ifdef CONFIG_PROC_CHILDREN +static struct pid * +get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) +{ + struct task_struct *start, *task; + struct pid *pid = NULL; + + read_lock(&tasklist_lock); + + start = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!start) + goto out; + + /* + * Lets try to continue searching first, this gives + * us significant speedup on children-rich processes. + */ + if (pid_prev) { + task = pid_task(pid_prev, PIDTYPE_PID); + if (task && task->real_parent == start && + !(list_empty(&task->sibling))) { + if (list_is_last(&task->sibling, &start->children)) + goto out; + task = list_first_entry(&task->sibling, + struct task_struct, sibling); + pid = get_pid(task_pid(task)); + goto out; + } + } + + /* + * Slow search case. + * + * We might miss some children here if children + * are exited while we were not holding the lock, + * but it was never promised to be accurate that + * much. + * + * "Just suppose that the parent sleeps, but N children + * exit after we printed their tids. Now the slow paths + * skips N extra children, we miss N tasks." (c) + * + * So one need to stop or freeze the leader and all + * its children to get a precise result. + */ + list_for_each_entry(task, &start->children, sibling) { + if (pos-- == 0) { + pid = get_pid(task_pid(task)); + break; + } + } + +out: + read_unlock(&tasklist_lock); + return pid; +} + +static int children_seq_show(struct seq_file *seq, void *v) +{ + struct inode *inode = file_inode(seq->file); + + seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb))); + return 0; +} + +static void *children_seq_start(struct seq_file *seq, loff_t *pos) +{ + return get_children_pid(file_inode(seq->file), NULL, *pos); +} + +static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct pid *pid; + + pid = get_children_pid(file_inode(seq->file), v, *pos + 1); + put_pid(v); + + ++*pos; + return pid; +} + +static void children_seq_stop(struct seq_file *seq, void *v) +{ + put_pid(v); +} + +static const struct seq_operations children_seq_ops = { + .start = children_seq_start, + .next = children_seq_next, + .stop = children_seq_stop, + .show = children_seq_show, +}; + +static int children_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &children_seq_ops); +} + +const struct file_operations proc_tid_children_operations = { + .open = children_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_CHILDREN */ diff --git a/fs/proc/base.c b/fs/proc/base.c new file mode 100644 index 000000000..74442e017 --- /dev/null +++ b/fs/proc/base.c @@ -0,0 +1,3928 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/proc/base.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc base directory handling functions + * + * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. + * Instead of using magical inumbers to determine the kind of object + * we allocate and fill in-core inodes upon lookup. They don't even + * go into icache. We cache the reference to task_struct upon lookup too. + * Eventually it should become a filesystem in its own. We don't use the + * rest of procfs anymore. + * + * + * Changelog: + * 17-Jan-2005 + * Allan Bezerra + * Bruna Moreira <bruna.moreira@indt.org.br> + * Edjard Mota <edjard.mota@indt.org.br> + * Ilias Biris <ilias.biris@indt.org.br> + * Mauricio Lin <mauricio.lin@indt.org.br> + * + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * + * A new process specific entry (smaps) included in /proc. It shows the + * size of rss for each memory area. The maps entry lacks information + * about physical memory size (rss) for each mapped file, i.e., + * rss information for executables and library files. + * This additional information is useful for any tools that need to know + * about physical memory consumption for a process specific library. + * + * Changelog: + * 21-Feb-2005 + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * Pud inclusion in the page table walking. + * + * ChangeLog: + * 10-Mar-2005 + * 10LE Instituto Nokia de Tecnologia - INdT: + * A better way to walks through the page table as suggested by Hugh Dickins. + * + * Simo Piiroinen <simo.piiroinen@nokia.com>: + * Smaps information related to shared, private, clean and dirty pages. + * + * Paul Mundt <paul.mundt@nokia.com>: + * Overall revision about smaps. + */ + +#include <linux/uaccess.h> + +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/init.h> +#include <linux/capability.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/generic-radix-tree.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/namei.h> +#include <linux/mnt_namespace.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/rcupdate.h> +#include <linux/kallsyms.h> +#include <linux/stacktrace.h> +#include <linux/resource.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/security.h> +#include <linux/ptrace.h> +#include <linux/printk.h> +#include <linux/cache.h> +#include <linux/cgroup.h> +#include <linux/cpuset.h> +#include <linux/audit.h> +#include <linux/poll.h> +#include <linux/nsproxy.h> +#include <linux/oom.h> +#include <linux/elf.h> +#include <linux/pid_namespace.h> +#include <linux/user_namespace.h> +#include <linux/fs_struct.h> +#include <linux/slab.h> +#include <linux/sched/autogroup.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/debug.h> +#include <linux/sched/stat.h> +#include <linux/posix-timers.h> +#include <linux/time_namespace.h> +#include <linux/resctrl.h> +#include <linux/cn_proc.h> +#include <trace/events/oom.h> +#include "internal.h" +#include "fd.h" + +#include "../../lib/kstrtox.h" + +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + +static u8 nlink_tid __ro_after_init; +static u8 nlink_tgid __ro_after_init; + +struct pid_entry { + const char *name; + unsigned int len; + umode_t mode; + const struct inode_operations *iop; + const struct file_operations *fop; + union proc_op op; +}; + +#define NOD(NAME, MODE, IOP, FOP, OP) { \ + .name = (NAME), \ + .len = sizeof(NAME) - 1, \ + .mode = MODE, \ + .iop = IOP, \ + .fop = FOP, \ + .op = OP, \ +} + +#define DIR(NAME, MODE, iops, fops) \ + NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) +#define LNK(NAME, get_link) \ + NOD(NAME, (S_IFLNK|S_IRWXUGO), \ + &proc_pid_link_inode_operations, NULL, \ + { .proc_get_link = get_link } ) +#define REG(NAME, MODE, fops) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) +#define ONE(NAME, MODE, show) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_single_file_operations, \ + { .proc_show = show } ) +#define ATTR(LSM, NAME, MODE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_pid_attr_operations, \ + { .lsm = LSM }) + +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. + */ +static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 2; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + +static int get_task_root(struct task_struct *task, struct path *root) +{ + int result = -ENOENT; + + task_lock(task); + if (task->fs) { + get_fs_root(task->fs, root); + result = 0; + } + task_unlock(task); + return result; +} + +static int proc_cwd_link(struct dentry *dentry, struct path *path) +{ + struct task_struct *task = get_proc_task(d_inode(dentry)); + int result = -ENOENT; + + if (task) { + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; + } + task_unlock(task); + put_task_struct(task); + } + return result; +} + +static int proc_root_link(struct dentry *dentry, struct path *path) +{ + struct task_struct *task = get_proc_task(d_inode(dentry)); + int result = -ENOENT; + + if (task) { + result = get_task_root(task, path); + put_task_struct(task); + } + return result; +} + +/* + * If the user used setproctitle(), we just get the string from + * user space at arg_start, and limit it to a maximum of one page. + */ +static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf, + size_t count, unsigned long pos, + unsigned long arg_start) +{ + char *page; + int ret, got; + + if (pos >= PAGE_SIZE) + return 0; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + ret = 0; + got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON); + if (got > 0) { + int len = strnlen(page, got); + + /* Include the NUL character if it was found */ + if (len < got) + len++; + + if (len > pos) { + len -= pos; + if (len > count) + len = count; + len -= copy_to_user(buf, page+pos, len); + if (!len) + len = -EFAULT; + ret = len; + } + } + free_page((unsigned long)page); + return ret; +} + +static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, + size_t count, loff_t *ppos) +{ + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long pos, len; + char *page, c; + + /* Check if process spawned far enough to have cmdline. */ + if (!mm->env_end) + return 0; + + spin_lock(&mm->arg_lock); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + spin_unlock(&mm->arg_lock); + + if (arg_start >= arg_end) + return 0; + + /* + * We allow setproctitle() to overwrite the argument + * strings, and overflow past the original end. But + * only when it overflows into the environment area. + */ + if (env_start != arg_end || env_end < env_start) + env_start = env_end = arg_end; + len = env_end - arg_start; + + /* We're not going to care if "*ppos" has high bits set */ + pos = *ppos; + if (pos >= len) + return 0; + if (count > len - pos) + count = len - pos; + if (!count) + return 0; + + /* + * Magical special case: if the argv[] end byte is not + * zero, the user has overwritten it with setproctitle(3). + * + * Possible future enhancement: do this only once when + * pos is 0, and set a flag in the 'struct file'. + */ + if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) + return get_mm_proctitle(mm, buf, count, pos, arg_start); + + /* + * For the non-setproctitle() case we limit things strictly + * to the [arg_start, arg_end[ range. + */ + pos += arg_start; + if (pos < arg_start || pos >= arg_end) + return 0; + if (count > arg_end - pos) + count = arg_end - pos; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + len = 0; + while (count) { + int got; + size_t size = min_t(size_t, PAGE_SIZE, count); + + got = access_remote_vm(mm, pos, page, size, FOLL_ANON); + if (got <= 0) + break; + got -= copy_to_user(buf, page, got); + if (unlikely(!got)) { + if (!len) + len = -EFAULT; + break; + } + pos += got; + buf += got; + len += got; + count -= got; + } + + free_page((unsigned long)page); + return len; +} + +static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, + size_t count, loff_t *pos) +{ + struct mm_struct *mm; + ssize_t ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = get_mm_cmdline(mm, buf, count, pos); + mmput(mm); + return ret; +} + +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct task_struct *tsk; + ssize_t ret; + + BUG_ON(*pos < 0); + + tsk = get_proc_task(file_inode(file)); + if (!tsk) + return -ESRCH; + ret = get_task_cmdline(tsk, buf, count, pos); + put_task_struct(tsk); + if (ret > 0) + *pos += ret; + return ret; +} + +static const struct file_operations proc_pid_cmdline_ops = { + .read = proc_pid_cmdline_read, + .llseek = generic_file_llseek, +}; + +#ifdef CONFIG_KALLSYMS +/* + * Provides a wchan file via kallsyms in a proper one-value-per-file format. + * Returns the resolved symbol. If that fails, simply return the address. + */ +static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long wchan; + char symname[KSYM_NAME_LEN]; + + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; + + wchan = get_wchan(task); + if (wchan && !lookup_symbol_name(wchan, symname)) { + seq_puts(m, symname); + return 0; + } + +print0: + seq_putc(m, '0'); + return 0; +} +#endif /* CONFIG_KALLSYMS */ + +static int lock_trace(struct task_struct *task) +{ + int err = down_read_killable(&task->signal->exec_update_lock); + if (err) + return err; + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { + up_read(&task->signal->exec_update_lock); + return -EPERM; + } + return 0; +} + +static void unlock_trace(struct task_struct *task) +{ + up_read(&task->signal->exec_update_lock); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_STACK_TRACE_DEPTH 64 + +static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long *entries; + int err; + + /* + * The ability to racily run the kernel stack unwinder on a running task + * and then observe the unwinder output is scary; while it is useful for + * debugging kernel issues, it can also allow an attacker to leak kernel + * stack contents. + * Doing this in a manner that is at least safe from races would require + * some work to ensure that the remote task can not be scheduled; and + * even then, this would still expose the unwinder as local attack + * surface. + * Therefore, this interface is restricted to root. + */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + + entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), + GFP_KERNEL); + if (!entries) + return -ENOMEM; + + err = lock_trace(task); + if (!err) { + unsigned int i, nr_entries; + + nr_entries = stack_trace_save_tsk(task, entries, + MAX_STACK_TRACE_DEPTH, 0); + + for (i = 0; i < nr_entries; i++) { + seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); + } + + unlock_trace(task); + } + kfree(entries); + + return err; +} +#endif + +#ifdef CONFIG_SCHED_INFO +/* + * Provides /proc/PID/schedstat + */ +static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + if (unlikely(!sched_info_on())) + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + + return 0; +} +#endif + +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct inode *inode = m->private; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + seq_puts(m, "Latency Top version : v0.1\n"); + for (i = 0; i < LT_SAVECOUNT; i++) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { + int q; + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long bt = lr->backtrace[q]; + + if (!bt) + break; + seq_printf(m, " %ps", (void *)bt); + } + seq_putc(m, '\n'); + } + + } + put_task_struct(task); + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lstats_show_proc, inode); +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + + if (!task) + return -ESRCH; + clear_tsk_latency_tracing(task); + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long totalpages = totalram_pages() + total_swap_pages; + unsigned long points = 0; + long badness; + + badness = oom_badness(task, totalpages); + /* + * Special case OOM_SCORE_ADJ_MIN for all others scale the + * badness value into [0, 2000] range which we have been + * exporting for a long time so userspace might depend on it. + */ + if (badness != LONG_MIN) + points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3; + + seq_printf(m, "%lu\n", points); + + return 0; +} + +struct limit_names { + const char *name; + const char *unit; +}; + +static const struct limit_names lnames[RLIM_NLIMITS] = { + [RLIMIT_CPU] = {"Max cpu time", "seconds"}, + [RLIMIT_FSIZE] = {"Max file size", "bytes"}, + [RLIMIT_DATA] = {"Max data size", "bytes"}, + [RLIMIT_STACK] = {"Max stack size", "bytes"}, + [RLIMIT_CORE] = {"Max core file size", "bytes"}, + [RLIMIT_RSS] = {"Max resident set", "bytes"}, + [RLIMIT_NPROC] = {"Max processes", "processes"}, + [RLIMIT_NOFILE] = {"Max open files", "files"}, + [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, + [RLIMIT_AS] = {"Max address space", "bytes"}, + [RLIMIT_LOCKS] = {"Max file locks", "locks"}, + [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, + [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, + [RLIMIT_NICE] = {"Max nice priority", NULL}, + [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, + [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, +}; + +/* Display limits for a process */ +static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned int i; + unsigned long flags; + + struct rlimit rlim[RLIM_NLIMITS]; + + if (!lock_task_sighand(task, &flags)) + return 0; + memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); + unlock_task_sighand(task, &flags); + + /* + * print the file header + */ + seq_puts(m, "Limit " + "Soft Limit " + "Hard Limit " + "Units \n"); + + for (i = 0; i < RLIM_NLIMITS; i++) { + if (rlim[i].rlim_cur == RLIM_INFINITY) + seq_printf(m, "%-25s %-20s ", + lnames[i].name, "unlimited"); + else + seq_printf(m, "%-25s %-20lu ", + lnames[i].name, rlim[i].rlim_cur); + + if (rlim[i].rlim_max == RLIM_INFINITY) + seq_printf(m, "%-20s ", "unlimited"); + else + seq_printf(m, "%-20lu ", rlim[i].rlim_max); + + if (lnames[i].unit) + seq_printf(m, "%-10s\n", lnames[i].unit); + else + seq_putc(m, '\n'); + } + + return 0; +} + +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK +static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct syscall_info info; + u64 *args = &info.data.args[0]; + int res; + + res = lock_trace(task); + if (res) + return res; + + if (task_current_syscall(task, &info)) + seq_puts(m, "running\n"); + else if (info.data.nr < 0) + seq_printf(m, "%d 0x%llx 0x%llx\n", + info.data.nr, info.sp, info.data.instruction_pointer); + else + seq_printf(m, + "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", + info.data.nr, + args[0], args[1], args[2], args[3], args[4], args[5], + info.sp, info.data.instruction_pointer); + unlock_trace(task); + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ + +/************************************************************************/ +/* Here the fs part begins */ +/************************************************************************/ + +/* permission checks */ +static bool proc_fd_access_allowed(struct inode *inode) +{ + struct task_struct *task; + bool allowed = false; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. + */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); + put_task_struct(task); + } + return allowed; +} + +int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) +{ + int error; + struct inode *inode = d_inode(dentry); + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = setattr_prepare(&init_user_ns, dentry, attr); + if (error) + return error; + + setattr_copy(&init_user_ns, inode, attr); + mark_inode_dirty(inode); + return 0; +} + +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct proc_fs_info *fs_info, + struct task_struct *task, + enum proc_hidepid hide_pid_min) +{ + /* + * If 'hidpid' mount option is set force a ptrace check, + * we indicate that we are using a filesystem syscall + * by passing PTRACE_MODE_READ_FSCREDS + */ + if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); + + if (fs_info->hide_pid < hide_pid_min) + return true; + if (in_group_p(fs_info->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); +} + + +static int proc_pid_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS); + put_task_struct(task); + + if (!has_perms) { + if (fs_info->hide_pid == HIDEPID_INVISIBLE) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(&init_user_ns, inode, mask); +} + + + +static const struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + +static int proc_single_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); + struct pid *pid = proc_pid(inode); + struct task_struct *task; + int ret; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); + + put_task_struct(task); + return ret; +} + +static int proc_single_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, proc_single_show, inode); +} + +static const struct file_operations proc_single_file_operations = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) +{ + struct task_struct *task = get_proc_task(inode); + struct mm_struct *mm = ERR_PTR(-ESRCH); + + if (task) { + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); + put_task_struct(task); + + if (!IS_ERR_OR_NULL(mm)) { + /* ensure this mm_struct can't be freed */ + mmgrab(mm); + /* but do not pin its memory */ + mmput(mm); + } + } + + return mm; +} + +static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) +{ + struct mm_struct *mm = proc_mem_open(inode, mode); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + return 0; +} + +static int mem_open(struct inode *inode, struct file *file) +{ + int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); + + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + + return ret; +} + +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct mm_struct *mm = file->private_data; + unsigned long addr = *ppos; + ssize_t copied; + char *page; + unsigned int flags; + + if (!mm) + return 0; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + copied = 0; + if (!mmget_not_zero(mm)) + goto free; + + flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); + + while (count > 0) { + size_t this_len = min_t(size_t, count, PAGE_SIZE); + + if (write && copy_from_user(page, buf, this_len)) { + copied = -EFAULT; + break; + } + + this_len = access_remote_vm(mm, addr, page, this_len, flags); + if (!this_len) { + if (!copied) + copied = -EIO; + break; + } + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; + } + *ppos = addr; + + mmput(mm); +free: + free_page((unsigned long) page); + return copied; +} + +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} + +loff_t mem_lseek(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + break; + case 1: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + force_successful_syscall_return(); + return file->f_pos; +} + +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) + mmdrop(mm); + return 0; +} + +static const struct file_operations proc_mem_operations = { + .llseek = mem_lseek, + .read = mem_read, + .write = mem_write, + .open = mem_open, + .release = mem_release, +}; + +static int environ_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ); +} + +static ssize_t environ_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *page; + unsigned long src = *ppos; + int ret = 0; + struct mm_struct *mm = file->private_data; + unsigned long env_start, env_end; + + /* Ensure the process spawned far enough to have an environment. */ + if (!mm || !mm->env_end) + return 0; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + ret = 0; + if (!mmget_not_zero(mm)) + goto free; + + spin_lock(&mm->arg_lock); + env_start = mm->env_start; + env_end = mm->env_end; + spin_unlock(&mm->arg_lock); + + while (count > 0) { + size_t this_len, max_len; + int retval; + + if (src >= (env_end - env_start)) + break; + + this_len = env_end - (env_start + src); + + max_len = min_t(size_t, PAGE_SIZE, count); + this_len = min(max_len, this_len); + + retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); + + if (retval <= 0) { + ret = retval; + break; + } + + if (copy_to_user(buf, page, retval)) { + ret = -EFAULT; + break; + } + + ret += retval; + src += retval; + buf += retval; + count -= retval; + } + *ppos = src; + mmput(mm); + +free: + free_page((unsigned long) page); + return ret; +} + +static const struct file_operations proc_environ_operations = { + .open = environ_open, + .read = environ_read, + .llseek = generic_file_llseek, + .release = mem_release, +}; + +static int auxv_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); +} + +static ssize_t auxv_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned int nwords = 0; + + if (!mm) + return 0; + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, + nwords * sizeof(mm->saved_auxv[0])); +} + +static const struct file_operations proc_auxv_operations = { + .open = auxv_open, + .read = auxv_read, + .llseek = generic_file_llseek, + .release = mem_release, +}; + +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + + if (!task) + return -ESRCH; + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + put_task_struct(task); + if (oom_adj > OOM_ADJUST_MAX) + oom_adj = OOM_ADJUST_MAX; + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) +{ + struct mm_struct *mm = NULL; + struct task_struct *task; + int err = 0; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + + mutex_lock(&oom_adj_mutex); + if (legacy) { + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + } else { + if ((short)oom_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + } + + /* + * Make sure we will check other processes sharing the mm if this is + * not vfrok which wants its own oom_score_adj. + * pin the mm so it doesn't go away and get reused after task_unlock + */ + if (!task->vfork_done) { + struct task_struct *p = find_lock_task_mm(task); + + if (p) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { + mm = p->mm; + mmgrab(mm); + } + task_unlock(p); + } + } + + task->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = (short)oom_adj; + trace_oom_score_adj_update(task); + + if (mm) { + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) { + if (same_thread_group(task, p)) + continue; + + /* do not touch kernel threads or the global init */ + if (p->flags & PF_KTHREAD || is_global_init(p)) + continue; + + task_lock(p); + if (!p->vfork_done && process_shares_mm(p, mm)) { + p->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + p->signal->oom_score_adj_min = (short)oom_adj; + } + task_unlock(p); + } + rcu_read_unlock(); + mmdrop(mm); + } +err_unlock: + mutex_unlock(&oom_adj_mutex); + put_task_struct(task); + return err; +} + +/* + * /proc/pid/oom_adj exists solely for backwards compatibility with previous + * kernels. The effective policy is defined by oom_score_adj, which has a + * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. + * Values written to oom_adj are simply mapped linearly to oom_score_adj. + * Processes that become oom disabled via oom_adj will still be oom disabled + * with this implementation. + * + * oom_adj cannot be removed since existing userspace binaries use it. + */ +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char buffer[PROC_NUMBUF]; + int oom_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + err = __set_oom_adj(file, oom_adj, true); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + char buffer[PROC_NUMBUF]; + short oom_score_adj = OOM_SCORE_ADJ_MIN; + size_t len; + + if (!task) + return -ESRCH; + oom_score_adj = task->signal->oom_score_adj; + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char buffer[PROC_NUMBUF]; + int oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); + if (err) + goto out; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } + + err = __set_oom_adj(file, oom_score_adj, false); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_AUDIT +#define TMPBUFLEN 11 +static ssize_t proc_loginuid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + from_kuid(file->f_cred->user_ns, + audit_get_loginuid(task))); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + uid_t loginuid; + kuid_t kloginuid; + int rv; + + /* Don't let kthreads write their own loginuid */ + if (current->flags & PF_KTHREAD) + return -EPERM; + + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + if (*ppos != 0) { + /* No partial writes. */ + return -EINVAL; + } + + rv = kstrtou32_from_user(buf, count, 10, &loginuid); + if (rv < 0) + return rv; + + /* is userspace tring to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) + return -EINVAL; + } + + rv = audit_set_loginuid(kloginuid); + if (rv < 0) + return rv; + return count; +} + +static const struct file_operations proc_loginuid_operations = { + .read = proc_loginuid_read, + .write = proc_loginuid_write, + .llseek = generic_file_llseek, +}; + +static ssize_t proc_sessionid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_sessionid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static const struct file_operations proc_sessionid_operations = { + .read = proc_sessionid_read, + .llseek = generic_file_llseek, +}; +#endif + +#ifdef CONFIG_FAULT_INJECTION +static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + char buffer[PROC_NUMBUF]; + size_t len; + int make_it_fail; + + if (!task) + return -ESRCH; + make_it_fail = task->make_it_fail; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t proc_fault_inject_write(struct file * file, + const char __user * buf, size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int make_it_fail; + int rv; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); + if (rv < 0) + return rv; + if (make_it_fail < 0 || make_it_fail > 1) + return -EINVAL; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + task->make_it_fail = make_it_fail; + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_fault_inject_operations = { + .read = proc_fault_inject_read, + .write = proc_fault_inject_write, + .llseek = generic_file_llseek, +}; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + unsigned int n; + + err = kstrtouint_from_user(buf, count, 0, &n); + if (err) + return err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + task->fail_nth = n; + put_task_struct(task); + + return count; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char numbuf[PROC_NUMBUF]; + ssize_t len; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, numbuf, len); +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; +#endif + + +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, ns, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_show, inode); +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = proc_sched_autogroup_set_nice(p, nice); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + +#ifdef CONFIG_TIME_NS +static int timens_offsets_show(struct seq_file *m, void *v) +{ + struct task_struct *p; + + p = get_proc_task(file_inode(m->file)); + if (!p) + return -ESRCH; + proc_timens_show_offsets(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t timens_offsets_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(file); + struct proc_timens_offset offsets[2]; + char *kbuf = NULL, *pos, *next_line; + struct task_struct *p; + int ret, noffsets; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Parse the user data */ + ret = -EINVAL; + noffsets = 0; + for (pos = kbuf; pos; pos = next_line) { + struct proc_timens_offset *off = &offsets[noffsets]; + char clock[10]; + int err; + + /* Find the end of line and ensure we don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; + } + + err = sscanf(pos, "%9s %lld %lu", clock, + &off->val.tv_sec, &off->val.tv_nsec); + if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) + goto out; + + clock[sizeof(clock) - 1] = 0; + if (strcmp(clock, "monotonic") == 0 || + strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0) + off->clockid = CLOCK_MONOTONIC; + else if (strcmp(clock, "boottime") == 0 || + strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0) + off->clockid = CLOCK_BOOTTIME; + else + goto out; + + noffsets++; + if (noffsets == ARRAY_SIZE(offsets)) { + if (next_line) + count = next_line - kbuf; + break; + } + } + + ret = -ESRCH; + p = get_proc_task(inode); + if (!p) + goto out; + ret = proc_timens_set_offset(file, p, offsets, noffsets); + put_task_struct(p); + if (ret) + goto out; + + ret = count; +out: + kfree(kbuf); + return ret; +} + +static int timens_offsets_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timens_offsets_show, inode); +} + +static const struct file_operations proc_timens_offsets_operations = { + .open = timens_offsets_open, + .read = seq_read, + .write = timens_offsets_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_TIME_NS */ + +static ssize_t comm_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; + + memset(buffer, 0, sizeof(buffer)); + if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (same_thread_group(current, p)) { + set_task_comm(p, buffer); + proc_comm_connector(p); + } + else + count = -EINVAL; + + put_task_struct(p); + + return count; +} + +static int comm_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + proc_task_name(m, p, false); + seq_putc(m, '\n'); + + put_task_struct(p); + + return 0; +} + +static int comm_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, comm_show, inode); +} + +static const struct file_operations proc_pid_set_comm_operations = { + .open = comm_open, + .read = seq_read, + .write = comm_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) +{ + struct task_struct *task; + struct file *exe_file; + + task = get_proc_task(d_inode(dentry)); + if (!task) + return -ENOENT; + exe_file = get_task_exe_file(task); + put_task_struct(task); + if (exe_file) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + fput(exe_file); + return 0; + } else + return -ENOENT; +} + +static const char *proc_pid_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct path path; + int error = -EACCES; + + if (!dentry) + return ERR_PTR(-ECHILD); + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + error = nd_jump_link(&path); +out: + return ERR_PTR(error); +} + +static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen) +{ + char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); + char *pathname; + int len; + + if (!tmp) + return -ENOMEM; + + pathname = d_path(path, tmp, PATH_MAX); + len = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + len = tmp + PATH_MAX - 1 - pathname; + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, pathname, len)) + len = -EFAULT; + out: + kfree(tmp); + return len; +} + +static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) +{ + int error = -EACCES; + struct inode *inode = d_inode(dentry); + struct path path; + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); +out: + return error; +} + +const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .get_link = proc_pid_get_link, + .setattr = proc_setattr, +}; + + +/* building an inode */ + +void task_dump_owner(struct task_struct *task, umode_t mode, + kuid_t *ruid, kgid_t *rgid) +{ + /* Depending on the state of dumpable compute who should own a + * proc file for a task. + */ + const struct cred *cred; + kuid_t uid; + kgid_t gid; + + if (unlikely(task->flags & PF_KTHREAD)) { + *ruid = GLOBAL_ROOT_UID; + *rgid = GLOBAL_ROOT_GID; + return; + } + + /* Default to the tasks effective ownership */ + rcu_read_lock(); + cred = __task_cred(task); + uid = cred->euid; + gid = cred->egid; + rcu_read_unlock(); + + /* + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. + */ + if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { + struct mm_struct *mm; + task_lock(task); + mm = task->mm; + /* Make non-dumpable tasks owned by some root */ + if (mm) { + if (get_dumpable(mm) != SUID_DUMP_USER) { + struct user_namespace *user_ns = mm->user_ns; + + uid = make_kuid(user_ns, 0); + if (!uid_valid(uid)) + uid = GLOBAL_ROOT_UID; + + gid = make_kgid(user_ns, 0); + if (!gid_valid(gid)) + gid = GLOBAL_ROOT_GID; + } + } else { + uid = GLOBAL_ROOT_UID; + gid = GLOBAL_ROOT_GID; + } + task_unlock(task); + } + *ruid = uid; + *rgid = gid; +} + +void proc_pid_evict_inode(struct proc_inode *ei) +{ + struct pid *pid = ei->pid; + + if (S_ISDIR(ei->vfs_inode.i_mode)) { + spin_lock(&pid->lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(&pid->lock); + } + + put_pid(pid); +} + +struct inode *proc_pid_make_inode(struct super_block *sb, + struct task_struct *task, umode_t mode) +{ + struct inode * inode; + struct proc_inode *ei; + struct pid *pid; + + /* We need a new inode */ + + inode = new_inode(sb); + if (!inode) + goto out; + + /* Common stuff */ + ei = PROC_I(inode); + inode->i_mode = mode; + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_op = &proc_def_inode_operations; + + /* + * grab the reference to task. + */ + pid = get_task_pid(task, PIDTYPE_PID); + if (!pid) + goto out_unlock; + + /* Let the pid remember us for quick removal */ + ei->pid = pid; + + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + security_task_to_inode(task, inode); + +out: + return inode; + +out_unlock: + iput(inode); + return NULL; +} + +/* + * Generating an inode and adding it into @pid->inodes, so that task will + * invalidate inode's dentry before being released. + * + * This helper is used for creating dir-type entries under '/proc' and + * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' + * can be released by invalidating '/proc/<tgid>' dentry. + * In theory, dentries under '/proc/<tgid>/task' can also be released by + * invalidating '/proc/<tgid>' dentry, we reserve it to handle single + * thread exiting situation: Any one of threads should invalidate its + * '/proc/<tgid>/task/<pid>' dentry before released. + */ +static struct inode *proc_pid_make_base_inode(struct super_block *sb, + struct task_struct *task, umode_t mode) +{ + struct inode *inode; + struct proc_inode *ei; + struct pid *pid; + + inode = proc_pid_make_inode(sb, task, mode); + if (!inode) + return NULL; + + /* Let proc_flush_pid find this directory inode */ + ei = PROC_I(inode); + pid = ei->pid; + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + + return inode; +} + +int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); + struct task_struct *task; + + generic_fillattr(&init_user_ns, inode, stat); + + stat->uid = GLOBAL_ROOT_UID; + stat->gid = GLOBAL_ROOT_GID; + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } + task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid); + } + rcu_read_unlock(); + return 0; +} + +/* dentry stuff */ + +/* + * Set <pid>/... inode ownership (can change due to setuid(), etc.) + */ +void pid_update_inode(struct task_struct *task, struct inode *inode) +{ + task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid); + + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); +} + +/* + * Rewrite the inode's ownerships here because the owning task may have + * performed a setuid(), etc. + * + */ +static int pid_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *inode; + struct task_struct *task; + int ret = 0; + + rcu_read_lock(); + inode = d_inode_rcu(dentry); + if (!inode) + goto out; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + + if (task) { + pid_update_inode(task, inode); + ret = 1; + } +out: + rcu_read_unlock(); + return ret; +} + +static inline bool proc_inode_is_dead(struct inode *inode) +{ + return !proc_pid(inode)->tasks[PIDTYPE_PID].first; +} + +int pid_delete_dentry(const struct dentry *dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return proc_inode_is_dead(d_inode(dentry)); +} + +const struct dentry_operations pid_dentry_operations = +{ + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + +/* Lookups */ + +/* + * Fill a directory entry. + * + * If possible create the dcache entry and derive our inode number and + * file type from dcache entry. + * + * Since all of the proc inode numbers are dynamically generated, the inode + * numbers do not exist until the inode is cache. This means creating + * the dcache entry in readdir is necessary to keep the inode numbers + * reported by readdir in sync with the inode numbers reported + * by stat. + */ +bool proc_fill_cache(struct file *file, struct dir_context *ctx, + const char *name, unsigned int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr) +{ + struct dentry *child, *dir = file->f_path.dentry; + struct qstr qname = QSTR_INIT(name, len); + struct inode *inode; + unsigned type = DT_UNKNOWN; + ino_t ino = 1; + + child = d_hash_and_lookup(dir, &qname); + if (!child) { + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + goto end_instantiate; + if (d_in_lookup(child)) { + struct dentry *res; + res = instantiate(child, task, ptr); + d_lookup_done(child); + if (unlikely(res)) { + dput(child); + child = res; + if (IS_ERR(child)) + goto end_instantiate; + } + } + } + inode = d_inode(child); + ino = inode->i_ino; + type = inode->i_mode >> 12; + dput(child); +end_instantiate: + return dir_emit(ctx, name, len, ino, type); +} + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + const char *str = dentry->d_name.name; + unsigned long long sval, eval; + unsigned int len; + + if (str[0] == '0' && str[1] != '-') + return -EINVAL; + len = _parse_integer(str, 16, &sval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (sval != (unsigned long)sval) + return -EINVAL; + str += len; + + if (*str != '-') + return -EINVAL; + str++; + + if (str[0] == '0' && str[1]) + return -EINVAL; + len = _parse_integer(str, 16, &eval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (eval != (unsigned long)eval) + return -EINVAL; + str += len; + + if (*str != '\0') + return -EINVAL; + + *start = sval; + *end = eval; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + struct inode *inode; + int status = 0; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(dentry); + task = get_proc_task(inode); + if (!task) + goto out_notask; + + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + status = mmap_read_lock_killable(mm); + if (!status) { + exact_vma_exists = !!find_exact_vma(mm, vm_start, + vm_end); + mmap_read_unlock(mm); + } + } + + mmput(mm); + + if (exact_vma_exists) { + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(d_inode(dentry)); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + rc = mmap_read_lock_killable(mm); + if (rc) + goto out_mmput; + + rc = -ENOENT; + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + mmap_read_unlock(mm); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + unsigned long start; + unsigned long end; + fmode_t mode; +}; + +/* + * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due + * to concerns about how the symlinks may be used to bypass permissions on + * ancestor directories in the path to the file in question. + */ +static const char * +proc_map_files_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + if (!checkpoint_restore_ns_capable(&init_user_ns)) + return ERR_PTR(-EPERM); + + return proc_pid_get_link(dentry, inode, done); +} + +/* + * Identical to proc_pid_link_inode_operations except for get_link() + */ +static const struct inode_operations proc_map_files_link_inode_operations = { + .readlink = proc_pid_readlink, + .get_link = proc_map_files_get_link, + .setattr = proc_setattr, +}; + +static struct dentry * +proc_map_files_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + fmode_t mode = (fmode_t)(unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | + ((mode & FMODE_READ ) ? S_IRUSR : 0) | + ((mode & FMODE_WRITE) ? S_IWUSR : 0)); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = map_files_get_link; + + inode->i_op = &proc_map_files_link_inode_operations; + inode->i_size = 64; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + return d_splice_alias(inode, dentry); +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_put_task; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + + result = ERR_PTR(-EINTR); + if (mmap_read_lock_killable(mm)) + goto out_put_mm; + + result = ERR_PTR(-ENOENT); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + if (vma->vm_file) + result = proc_map_files_instantiate(dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); + +out_no_vma: + mmap_read_unlock(mm); +out_put_mm: + mmput(mm); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *file, struct dir_context *ctx) +{ + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + unsigned long nr_files, pos, i; + GENRADIX(struct map_files_info) fa; + struct map_files_info *p; + int ret; + struct vma_iterator vmi; + + genradix_init(&fa); + + ret = -ENOENT; + task = get_proc_task(file_inode(file)); + if (!task) + goto out; + + ret = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto out_put_task; + + ret = 0; + if (!dir_emit_dots(file, ctx)) + goto out_put_task; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + + ret = mmap_read_lock_killable(mm); + if (ret) { + mmput(mm); + goto out_put_task; + } + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_lock taken + * 2) Release mmap_lock and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_lock taken in might_fault(). + */ + + pos = 2; + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; + + p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + mmap_read_unlock(mm); + mmput(mm); + goto out_put_task; + } + + p->start = vma->vm_start; + p->end = vma->vm_end; + p->mode = vma->vm_file->f_mode; + } + mmap_read_unlock(mm); + mmput(mm); + + for (i = 0; i < nr_files; i++) { + char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ + unsigned int len; + + p = genradix_ptr(&fa, i); + len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); + if (!proc_fill_cache(file, ctx, + buf, len, + proc_map_files_instantiate, + task, + (void *)(unsigned long)p->mode)) + break; + ctx->pos++; + } + +out_put_task: + put_task_struct(task); +out: + genradix_free(&fa); + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .iterate_shared = proc_map_files_readdir, + .llseek = generic_file_llseek, +}; + +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) +struct timers_private { + struct pid *pid; + struct task_struct *task; + struct sighand_struct *sighand; + struct pid_namespace *ns; + unsigned long flags; +}; + +static void *timers_start(struct seq_file *m, loff_t *pos) +{ + struct timers_private *tp = m->private; + + tp->task = get_pid_task(tp->pid, PIDTYPE_PID); + if (!tp->task) + return ERR_PTR(-ESRCH); + + tp->sighand = lock_task_sighand(tp->task, &tp->flags); + if (!tp->sighand) + return ERR_PTR(-ESRCH); + + return seq_list_start(&tp->task->signal->posix_timers, *pos); +} + +static void *timers_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct timers_private *tp = m->private; + return seq_list_next(v, &tp->task->signal->posix_timers, pos); +} + +static void timers_stop(struct seq_file *m, void *v) +{ + struct timers_private *tp = m->private; + + if (tp->sighand) { + unlock_task_sighand(tp->task, &tp->flags); + tp->sighand = NULL; + } + + if (tp->task) { + put_task_struct(tp->task); + tp->task = NULL; + } +} + +static int show_timer(struct seq_file *m, void *v) +{ + struct k_itimer *timer; + struct timers_private *tp = m->private; + int notify; + static const char * const nstr[] = { + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", + }; + + timer = list_entry((struct list_head *)v, struct k_itimer, list); + notify = timer->it_sigev_notify; + + seq_printf(m, "ID: %d\n", timer->it_id); + seq_printf(m, "signal: %d/%px\n", + timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); + seq_printf(m, "ClockID: %d\n", timer->it_clock); + + return 0; +} + +static const struct seq_operations proc_timers_seq_ops = { + .start = timers_start, + .next = timers_next, + .stop = timers_stop, + .show = show_timer, +}; + +static int proc_timers_open(struct inode *inode, struct file *file) +{ + struct timers_private *tp; + + tp = __seq_open_private(file, &proc_timers_seq_ops, + sizeof(struct timers_private)); + if (!tp) + return -ENOMEM; + + tp->pid = proc_pid(inode); + tp->ns = proc_pid_ns(inode->i_sb); + return 0; +} + +static const struct file_operations proc_timers_operations = { + .open = proc_timers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + u64 slack_ns; + int err; + + err = kstrtoull_from_user(buf, count, 10, &slack_ns); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p != current) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + count = -EPERM; + goto out; + } + rcu_read_unlock(); + + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } + } + + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + +out: + put_task_struct(p); + + return count; +} + +static int timerslack_ns_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + int err = 0; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (p != current) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + err = -EPERM; + goto out; + } + rcu_read_unlock(); + + err = security_task_getscheduler(p); + if (err) + goto out; + } + + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + +out: + put_task_struct(p); + + return err; +} + +static int timerslack_ns_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timerslack_ns_show, inode); +} + +static const struct file_operations proc_pid_set_timerslack_ns_operations = { + .open = timerslack_ns_open, + .read = seq_read, + .write = timerslack_ns_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *proc_pident_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + + inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + if (S_ISDIR(inode->i_mode)) + set_nlink(inode, 2); /* Use getattr to fix if necessary */ + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + pid_update_inode(task, inode); + d_set_d_op(dentry, &pid_dentry_operations); + return d_splice_alias(inode, dentry); +} + +static struct dentry *proc_pident_lookup(struct inode *dir, + struct dentry *dentry, + const struct pid_entry *p, + const struct pid_entry *end) +{ + struct task_struct *task = get_proc_task(dir); + struct dentry *res = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc/<tgid>/ without very good reasons. + */ + for (; p < end; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) { + res = proc_pident_instantiate(dentry, task, p); + break; + } + } + put_task_struct(task); +out_no_task: + return res; +} + +static int proc_pident_readdir(struct file *file, struct dir_context *ctx, + const struct pid_entry *ents, unsigned int nents) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + const struct pid_entry *p; + + if (!task) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + + if (ctx->pos >= nents + 2) + goto out; + + for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { + if (!proc_fill_cache(file, ctx, p->name, p->len, + proc_pident_instantiate, task, p)) + break; + ctx->pos++; + } +out: + put_task_struct(task); + return 0; +} + +#ifdef CONFIG_SECURITY +static int proc_pid_attr_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); + return 0; +} + +static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + char *p = NULL; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + length = security_getprocattr(task, PROC_I(inode)->op.lsm, + file->f_path.dentry->d_name.name, + &p); + put_task_struct(task); + if (length > 0) + length = simple_read_from_buffer(buf, count, ppos, p, length); + kfree(p); + return length; +} + +static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file_inode(file); + struct task_struct *task; + void *page; + int rv; + + /* A task may only write when it was the opener. */ + if (file->private_data != current->mm) + return -EPERM; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } + /* A task may only write its own attributes. */ + if (current != task) { + rcu_read_unlock(); + return -EACCES; + } + /* Prevent changes to overridden credentials. */ + if (current_cred() != current_real_cred()) { + rcu_read_unlock(); + return -EBUSY; + } + rcu_read_unlock(); + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + /* No partial writes. */ + if (*ppos != 0) + return -EINVAL; + + page = memdup_user(buf, count); + if (IS_ERR(page)) { + rv = PTR_ERR(page); + goto out; + } + + /* Guard against adverse ptrace interaction */ + rv = mutex_lock_interruptible(¤t->signal->cred_guard_mutex); + if (rv < 0) + goto out_free; + + rv = security_setprocattr(PROC_I(inode)->op.lsm, + file->f_path.dentry->d_name.name, page, + count); + mutex_unlock(¤t->signal->cred_guard_mutex); +out_free: + kfree(page); +out: + return rv; +} + +static const struct file_operations proc_pid_attr_operations = { + .open = proc_pid_attr_open, + .read = proc_pid_attr_read, + .write = proc_pid_attr_write, + .llseek = generic_file_llseek, + .release = mem_release, +}; + +#define LSM_DIR_OPS(LSM) \ +static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ + struct dir_context *ctx) \ +{ \ + return proc_pident_readdir(filp, ctx, \ + LSM##_attr_dir_stuff, \ + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct file_operations proc_##LSM##_attr_dir_ops = { \ + .read = generic_read_dir, \ + .iterate = proc_##LSM##_attr_dir_iterate, \ + .llseek = default_llseek, \ +}; \ +\ +static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ + struct dentry *dentry, unsigned int flags) \ +{ \ + return proc_pident_lookup(dir, dentry, \ + LSM##_attr_dir_stuff, \ + LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ + .lookup = proc_##LSM##_attr_dir_lookup, \ + .getattr = pid_getattr, \ + .setattr = proc_setattr, \ +} + +#ifdef CONFIG_SECURITY_SMACK +static const struct pid_entry smack_attr_dir_stuff[] = { + ATTR("smack", "current", 0666), +}; +LSM_DIR_OPS(smack); +#endif + +#ifdef CONFIG_SECURITY_APPARMOR +static const struct pid_entry apparmor_attr_dir_stuff[] = { + ATTR("apparmor", "current", 0666), + ATTR("apparmor", "prev", 0444), + ATTR("apparmor", "exec", 0666), +}; +LSM_DIR_OPS(apparmor); +#endif + +static const struct pid_entry attr_dir_stuff[] = { + ATTR(NULL, "current", 0666), + ATTR(NULL, "prev", 0444), + ATTR(NULL, "exec", 0666), + ATTR(NULL, "fscreate", 0666), + ATTR(NULL, "keycreate", 0666), + ATTR(NULL, "sockcreate", 0666), +#ifdef CONFIG_SECURITY_SMACK + DIR("smack", 0555, + proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), +#endif +#ifdef CONFIG_SECURITY_APPARMOR + DIR("apparmor", 0555, + proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops), +#endif +}; + +static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) +{ + return proc_pident_readdir(file, ctx, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct file_operations proc_attr_dir_operations = { + .read = generic_read_dir, + .iterate_shared = proc_attr_dir_readdir, + .llseek = generic_file_llseek, +}; + +static struct dentry *proc_attr_dir_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + return proc_pident_lookup(dir, dentry, + attr_dir_stuff, + attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct inode_operations proc_attr_dir_inode_operations = { + .lookup = proc_attr_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +#endif + +#ifdef CONFIG_ELF_CORE +static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + char buffer[PROC_NUMBUF]; + size_t len; + int ret; + + if (!task) + return -ESRCH; + + ret = 0; + mm = get_task_mm(task); + if (mm) { + len = snprintf(buffer, sizeof(buffer), "%08lx\n", + ((mm->flags & MMF_DUMP_FILTER_MASK) >> + MMF_DUMP_FILTER_SHIFT)); + mmput(mm); + ret = simple_read_from_buffer(buf, count, ppos, buffer, len); + } + + put_task_struct(task); + + return ret; +} + +static ssize_t proc_coredump_filter_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + struct task_struct *task; + struct mm_struct *mm; + unsigned int val; + int ret; + int i; + unsigned long mask; + + ret = kstrtouint_from_user(buf, count, 0, &val); + if (ret < 0) + return ret; + + ret = -ESRCH; + task = get_proc_task(file_inode(file)); + if (!task) + goto out_no_task; + + mm = get_task_mm(task); + if (!mm) + goto out_no_mm; + ret = 0; + + for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { + if (val & mask) + set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + else + clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + } + + mmput(mm); + out_no_mm: + put_task_struct(task); + out_no_task: + if (ret < 0) + return ret; + return count; +} + +static const struct file_operations proc_coredump_filter_operations = { + .read = proc_coredump_filter_read, + .write = proc_coredump_filter_write, + .llseek = generic_file_llseek, +}; +#endif + +#ifdef CONFIG_TASK_IO_ACCOUNTING +static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) +{ + struct task_io_accounting acct = task->ioac; + unsigned long flags; + int result; + + result = down_read_killable(&task->signal->exec_update_lock); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { + result = -EACCES; + goto out_unlock; + } + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); + } + seq_printf(m, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); + result = 0; + +out_unlock: + up_read(&task->signal->exec_update_lock); + return result; +} + +static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_io_accounting(task, m, 0); +} + +static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_io_accounting(task, m, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +#ifdef CONFIG_USER_NS +static int proc_id_map_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + struct seq_file *seq; + int ret = -EINVAL; + + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + ret = seq_open(file, seq_ops); + if (ret) + goto err_put_ns; + + seq = file->private_data; + seq->private = ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_id_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + put_user_ns(ns); + return seq_release(inode, file); +} + +static int proc_uid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_uid_seq_operations); +} + +static int proc_gid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_gid_seq_operations); +} + +static int proc_projid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_projid_seq_operations); +} + +static const struct file_operations proc_uid_map_operations = { + .open = proc_uid_map_open, + .write = proc_uid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_gid_map_operations = { + .open = proc_gid_map_open, + .write = proc_gid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_projid_map_operations = { + .open = proc_projid_map_open, + .write = proc_projid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static int proc_setgroups_open(struct inode *inode, struct file *file) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + int ret; + + ret = -ESRCH; + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + if (file->f_mode & FMODE_WRITE) { + ret = -EACCES; + if (!ns_capable(ns, CAP_SYS_ADMIN)) + goto err_put_ns; + } + + ret = single_open(file, &proc_setgroups_show, ns); + if (ret) + goto err_put_ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_setgroups_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + int ret = single_release(inode, file); + put_user_ns(ns); + return ret; +} + +static const struct file_operations proc_setgroups_operations = { + .open = proc_setgroups_open, + .write = proc_setgroups_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_setgroups_release, +}; +#endif /* CONFIG_USER_NS */ + +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + int err = lock_trace(task); + if (!err) { + seq_printf(m, "%08x\n", task->personality); + unlock_trace(task); + } + return err; +} + +#ifdef CONFIG_LIVEPATCH +static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%d\n", task->patch_state); + return 0; +} +#endif /* CONFIG_LIVEPATCH */ + +#ifdef CONFIG_KSM +static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "%lu\n", mm->ksm_merging_pages); + mmput(mm); + } + + return 0; +} +static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + mmput(mm); + } + + return 0; +} +#endif /* CONFIG_KSM */ + +#ifdef CONFIG_STACKLEAK_METRICS +static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long prev_depth = THREAD_SIZE - + (task->prev_lowest_stack & (THREAD_SIZE - 1)); + unsigned long depth = THREAD_SIZE - + (task->lowest_stack & (THREAD_SIZE - 1)); + + seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n", + prev_depth, depth); + return 0; +} +#endif /* CONFIG_STACKLEAK_METRICS */ + +/* + * Thread groups + */ +static const struct file_operations proc_task_operations; +static const struct inode_operations proc_task_inode_operations; + +static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), + DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif + REG("environ", S_IRUSR, proc_environ_operations), + REG("auxv", S_IRUSR, proc_auxv_operations), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif +#ifdef CONFIG_TIME_NS + REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + ONE("syscall", S_IRUSR, proc_pid_syscall), +#endif + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_pid_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), + REG("mountstats", S_IRUSR, proc_mountstats_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + ONE("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), +#endif +#ifdef CONFIG_SCHED_INFO + ONE("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + ONE("cpuset", S_IRUGO, proc_cpuset_show), +#endif +#ifdef CONFIG_CGROUPS + ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), +#endif + ONE("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDIT + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + REG("fail-nth", 0644, proc_fail_nth_operations), +#endif +#ifdef CONFIG_ELF_CORE + REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + ONE("io", S_IRUSR, proc_tgid_io_accounting), +#endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), +#endif +#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) + REG("timers", S_IRUGO, proc_timers_operations), +#endif + REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif +#ifdef CONFIG_STACKLEAK_METRICS + ONE("stack_depth", S_IRUGO, proc_stack_depth), +#endif +#ifdef CONFIG_PROC_PID_ARCH_STATUS + ONE("arch_status", S_IRUGO, proc_pid_arch_status), +#endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), +#endif +}; + +static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) +{ + return proc_pident_readdir(file, ctx, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct file_operations proc_tgid_base_operations = { + .read = generic_read_dir, + .iterate_shared = proc_tgid_base_readdir, + .llseek = generic_file_llseek, +}; + +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (file->f_op != &proc_tgid_base_operations) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_pident_lookup(dir, dentry, + tgid_base_stuff, + tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct inode_operations proc_tgid_base_inode_operations = { + .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, + .permission = proc_pid_permission, +}; + +/** + * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. + * @pid: pid that should be flushed. + * + * This function walks a list of inodes (that belong to any proc + * filesystem) that are attached to the pid and flushes them from + * the dentry cache. + * + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is provided to flush those useless + * dcache entries when a process is reaped. + * + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist after a process is reaped + * it just makes it very unlikely that any will persist. + */ + +void proc_flush_pid(struct pid *pid) +{ + proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); +} + +static struct dentry *proc_pid_instantiate(struct dentry * dentry, + struct task_struct *task, const void *ptr) +{ + struct inode *inode; + + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); + if (!inode) + return ERR_PTR(-ENOENT); + + inode->i_op = &proc_tgid_base_inode_operations; + inode->i_fop = &proc_tgid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + set_nlink(inode, nlink_tgid); + pid_update_inode(task, inode); + + d_set_d_op(dentry, &pid_dentry_operations); + return d_splice_alias(inode, dentry); +} + +struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) +{ + struct task_struct *task; + unsigned tgid; + struct proc_fs_info *fs_info; + struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); + + tgid = name_to_int(&dentry->d_name); + if (tgid == ~0U) + goto out; + + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; + rcu_read_lock(); + task = find_task_by_pid_ns(tgid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + + /* Limit procfs to only ptraceable tasks */ + if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) { + if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS)) + goto out_put_task; + } + + result = proc_pid_instantiate(dentry, task, NULL); +out_put_task: + put_task_struct(task); +out: + return result; +} + +/* + * Find the first task with tgid >= tgid + * + */ +struct tgid_iter { + unsigned int tgid; + struct task_struct *task; +}; +static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) +{ + struct pid *pid; + + if (iter.task) + put_task_struct(iter.task); + rcu_read_lock(); +retry: + iter.task = NULL; + pid = find_ge_pid(iter.tgid, ns); + if (pid) { + iter.tgid = pid_nr_ns(pid, ns); + iter.task = pid_task(pid, PIDTYPE_TGID); + if (!iter.task) { + iter.tgid += 1; + goto retry; + } + get_task_struct(iter.task); + } + rcu_read_unlock(); + return iter; +} + +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) + +/* for the /proc/ directory itself, after non-process stuff has been done */ +int proc_pid_readdir(struct file *file, struct dir_context *ctx) +{ + struct tgid_iter iter; + struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb); + struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb); + loff_t pos = ctx->pos; + + if (pos >= PID_MAX_LIMIT + TGID_OFFSET) + return 0; + + if (pos == TGID_OFFSET - 2) { + struct inode *inode = d_inode(fs_info->proc_self); + if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + return 0; + ctx->pos = pos = pos + 1; + } + if (pos == TGID_OFFSET - 1) { + struct inode *inode = d_inode(fs_info->proc_thread_self); + if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + return 0; + ctx->pos = pos = pos + 1; + } + iter.tgid = pos - TGID_OFFSET; + iter.task = NULL; + for (iter = next_tgid(ns, iter); + iter.task; + iter.tgid += 1, iter = next_tgid(ns, iter)) { + char name[10 + 1]; + unsigned int len; + + cond_resched(); + if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) + continue; + + len = snprintf(name, sizeof(name), "%u", iter.tgid); + ctx->pos = iter.tgid + TGID_OFFSET; + if (!proc_fill_cache(file, ctx, name, len, + proc_pid_instantiate, iter.task, NULL)) { + put_task_struct(iter.task); + return 0; + } + } + ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; + return 0; +} + +/* + * proc_tid_comm_permission is a special permission function exclusively + * used for the node /proc/<pid>/task/<tid>/comm. + * It bypasses generic permission checks in the case where a task of the same + * task group attempts to access the node. + * The rationale behind this is that glibc and bionic access this node for + * cross thread naming (pthread_set/getname_np(!self)). However, if + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, + * which locks out the cross thread naming implementation. + * This function makes sure that the node is always accessible for members of + * same thread group. + */ +static int proc_tid_comm_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + bool is_same_tgroup; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + is_same_tgroup = same_thread_group(current, task); + put_task_struct(task); + + if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { + /* This file (/proc/<pid>/task/<tid>/comm) can always be + * read or written by the members of the corresponding + * thread group. + */ + return 0; + } + + return generic_permission(&init_user_ns, inode, mask); +} + +static const struct inode_operations proc_tid_comm_inode_operations = { + .setattr = proc_setattr, + .permission = proc_tid_comm_permission, +}; + +/* + * Tasks + */ +static const struct pid_entry tid_base_stuff[] = { + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif + REG("environ", S_IRUSR, proc_environ_operations), + REG("auxv", S_IRUSR, proc_auxv_operations), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif + NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, + &proc_tid_comm_inode_operations, + &proc_pid_set_comm_operations, {}), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + ONE("syscall", S_IRUSR, proc_pid_syscall), +#endif + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_pid_maps_operations), +#ifdef CONFIG_PROC_CHILDREN + REG("children", S_IRUGO, proc_tid_children_operations), +#endif +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + ONE("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), +#endif +#ifdef CONFIG_SCHED_INFO + ONE("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + ONE("cpuset", S_IRUGO, proc_cpuset_show), +#endif +#ifdef CONFIG_CGROUPS + ONE("cgroup", S_IRUGO, proc_cgroup_show), +#endif +#ifdef CONFIG_PROC_CPU_RESCTRL + ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show), +#endif + ONE("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDIT + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + REG("fail-nth", 0644, proc_fail_nth_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + ONE("io", S_IRUSR, proc_tid_io_accounting), +#endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), +#endif +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif +#ifdef CONFIG_PROC_PID_ARCH_STATUS + ONE("arch_status", S_IRUGO, proc_pid_arch_status), +#endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif +#ifdef CONFIG_KSM + ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), +#endif +}; + +static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) +{ + return proc_pident_readdir(file, ctx, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_pident_lookup(dir, dentry, + tid_base_stuff, + tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); +} + +static const struct file_operations proc_tid_base_operations = { + .read = generic_read_dir, + .iterate_shared = proc_tid_base_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations proc_tid_base_inode_operations = { + .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +static struct dentry *proc_task_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + struct inode *inode; + inode = proc_pid_make_base_inode(dentry->d_sb, task, + S_IFDIR | S_IRUGO | S_IXUGO); + if (!inode) + return ERR_PTR(-ENOENT); + + inode->i_op = &proc_tid_base_inode_operations; + inode->i_fop = &proc_tid_base_operations; + inode->i_flags |= S_IMMUTABLE; + + set_nlink(inode, nlink_tid); + pid_update_inode(task, inode); + + d_set_d_op(dentry, &pid_dentry_operations); + return d_splice_alias(inode, dentry); +} + +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +{ + struct task_struct *task; + struct task_struct *leader = get_proc_task(dir); + unsigned tid; + struct proc_fs_info *fs_info; + struct pid_namespace *ns; + struct dentry *result = ERR_PTR(-ENOENT); + + if (!leader) + goto out_no_task; + + tid = name_to_int(&dentry->d_name); + if (tid == ~0U) + goto out; + + fs_info = proc_sb_info(dentry->d_sb); + ns = fs_info->pid_ns; + rcu_read_lock(); + task = find_task_by_pid_ns(tid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + if (!same_thread_group(leader, task)) + goto out_drop_task; + + result = proc_task_instantiate(dentry, task, NULL); +out_drop_task: + put_task_struct(task); +out: + put_task_struct(leader); +out_no_task: + return result; +} + +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the users + * buffer was too small or there was a seek into the middle of the + * directory we have more work todo. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns) +{ + struct task_struct *pos, *task; + unsigned long nr = f_pos; + + if (nr != f_pos) /* 32bit overflow? */ + return NULL; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + if (!task) + goto fail; + + /* Attempt to start with the tid of a thread */ + if (tid && nr) { + pos = find_task_by_pid_ns(tid, ns); + if (pos && same_thread_group(pos, task)) + goto found; + } + + /* If nr exceeds the number of threads there is nothing todo */ + if (nr >= get_nr_threads(task)) + goto fail; + + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward. + */ + pos = task = task->group_leader; + do { + if (!nr--) + goto found; + } while_each_thread(task, pos); +fail: + pos = NULL; + goto out; +found: + get_task_struct(pos); +out: + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; +} + +/* for the /proc/TGID/task/ directories */ +static int proc_task_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct task_struct *task; + struct pid_namespace *ns; + int tid; + + if (proc_inode_is_dead(inode)) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + return 0; + + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. + */ + ns = proc_pid_ns(inode->i_sb); + tid = (int)file->f_version; + file->f_version = 0; + for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); + task; + task = next_tid(task), ctx->pos++) { + char name[10 + 1]; + unsigned int len; + + tid = task_pid_nr_ns(task, ns); + if (!tid) + continue; /* The task has just exited. */ + len = snprintf(name, sizeof(name), "%u", tid); + if (!proc_fill_cache(file, ctx, name, len, + proc_task_instantiate, task, NULL)) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + file->f_version = (u64)tid; + put_task_struct(task); + break; + } + } + + return 0; +} + +static int proc_task_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct task_struct *p = get_proc_task(inode); + generic_fillattr(&init_user_ns, inode, stat); + + if (p) { + stat->nlink += get_nr_threads(p); + put_task_struct(p); + } + + return 0; +} + +static const struct inode_operations proc_task_inode_operations = { + .lookup = proc_task_lookup, + .getattr = proc_task_getattr, + .setattr = proc_setattr, + .permission = proc_pid_permission, +}; + +static const struct file_operations proc_task_operations = { + .read = generic_read_dir, + .iterate_shared = proc_task_readdir, + .llseek = generic_file_llseek, +}; + +void __init set_proc_pid_nlink(void) +{ + nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c new file mode 100644 index 000000000..2e244ada1 --- /dev/null +++ b/fs/proc/bootconfig.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/bootconfig - Extra boot configuration + */ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/bootconfig.h> +#include <linux/slab.h> + +static char *saved_boot_config; + +static int boot_config_proc_show(struct seq_file *m, void *v) +{ + if (saved_boot_config) + seq_puts(m, saved_boot_config); + return 0; +} + +/* Rest size of buffer */ +#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0) + +/* Return the needed total length if @size is 0 */ +static int __init copy_xbc_key_value_list(char *dst, size_t size) +{ + struct xbc_node *leaf, *vnode; + char *key, *end = dst + size; + const char *val; + char q; + int ret = 0; + + key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); + if (!key) + return -ENOMEM; + + xbc_for_each_key_value(leaf, val) { + ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX); + if (ret < 0) + break; + ret = snprintf(dst, rest(dst, end), "%s = ", key); + if (ret < 0) + break; + dst += ret; + vnode = xbc_node_get_child(leaf); + if (vnode) { + xbc_array_for_each_value(vnode, val) { + if (strchr(val, '"')) + q = '\''; + else + q = '"'; + ret = snprintf(dst, rest(dst, end), "%c%s%c%s", + q, val, q, xbc_node_is_array(vnode) ? ", " : "\n"); + if (ret < 0) + goto out; + dst += ret; + } + } else { + ret = snprintf(dst, rest(dst, end), "\"\"\n"); + if (ret < 0) + break; + dst += ret; + } + } +out: + kfree(key); + + return ret < 0 ? ret : dst - (end - size); +} + +static int __init proc_boot_config_init(void) +{ + int len; + + len = copy_xbc_key_value_list(NULL, 0); + if (len < 0) + return len; + + if (len > 0) { + saved_boot_config = kzalloc(len + 1, GFP_KERNEL); + if (!saved_boot_config) + return -ENOMEM; + + len = copy_xbc_key_value_list(saved_boot_config, len + 1); + if (len < 0) { + kfree(saved_boot_config); + return len; + } + } + + proc_create_single("bootconfig", 0, NULL, boot_config_proc_show); + + return 0; +} +fs_initcall(proc_boot_config_init); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c new file mode 100644 index 000000000..fa762c5fb --- /dev/null +++ b/fs/proc/cmdline.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +static int cmdline_proc_show(struct seq_file *m, void *v) +{ + seq_puts(m, saved_command_line); + seq_putc(m, '\n'); + return 0; +} + +static int __init proc_cmdline_init(void) +{ + proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + return 0; +} +fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c new file mode 100644 index 000000000..dfe6ce350 --- /dev/null +++ b/fs/proc/consoles.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + */ + +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/tty_driver.h> + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_setwidth(m, 21 - 1); + seq_printf(m, "%s%d", con->name, con->index); + seq_pad(m, ' '); + seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_putc(m, '\n'); + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + console_lock(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + console_unlock(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int __init proc_consoles_init(void) +{ + proc_create_seq("consoles", 0, NULL, &consoles_op); + return 0; +} +fs_initcall(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c new file mode 100644 index 000000000..f38bda5b8 --- /dev/null +++ b/fs/proc/cpuinfo.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpufreq.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +extern const struct seq_operations cpuinfo_op; + +static int cpuinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpuinfo_op); +} + +static const struct proc_ops cpuinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = cpuinfo_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int __init proc_cpuinfo_init(void) +{ + proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops); + return 0; +} +fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c new file mode 100644 index 000000000..fe7bfcb7d --- /dev/null +++ b/fs/proc/devices.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> +#include "internal.h" + +static int devinfo_show(struct seq_file *f, void *v) +{ + int i = *(loff_t *) v; + + if (i < CHRDEV_MAJOR_MAX) { + if (i == 0) + seq_puts(f, "Character devices:\n"); + chrdev_show(f, i); + } +#ifdef CONFIG_BLOCK + else { + i -= CHRDEV_MAJOR_MAX; + if (i == 0) + seq_puts(f, "\nBlock devices:\n"); + blkdev_show(f, i); + } +#endif + return 0; +} + +static void *devinfo_start(struct seq_file *f, loff_t *pos) +{ + if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) + return pos; + return NULL; +} + +static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) + return NULL; + return pos; +} + +static void devinfo_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations devinfo_ops = { + .start = devinfo_start, + .next = devinfo_next, + .stop = devinfo_stop, + .show = devinfo_show +}; + +static int __init proc_devices_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); + return 0; +} +fs_initcall(proc_devices_init); diff --git a/fs/proc/fd.c b/fs/proc/fd.c new file mode 100644 index 000000000..913bef0d2 --- /dev/null +++ b/fs/proc/fd.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched/signal.h> +#include <linux/errno.h> +#include <linux/dcache.h> +#include <linux/path.h> +#include <linux/fdtable.h> +#include <linux/namei.h> +#include <linux/pid.h> +#include <linux/ptrace.h> +#include <linux/security.h> +#include <linux/file.h> +#include <linux/seq_file.h> +#include <linux/fs.h> + +#include <linux/proc_fs.h> + +#include "../mount.h" +#include "internal.h" +#include "fd.h" + +static int seq_show(struct seq_file *m, void *v) +{ + struct files_struct *files = NULL; + int f_flags = 0, ret = -ENOENT; + struct file *file = NULL; + struct task_struct *task; + + task = get_proc_task(m->private); + if (!task) + return -ENOENT; + + task_lock(task); + files = task->files; + if (files) { + unsigned int fd = proc_fd(m->private); + + spin_lock(&files->file_lock); + file = files_lookup_fd_locked(files, fd); + if (file) { + struct fdtable *fdt = files_fdtable(files); + + f_flags = file->f_flags; + if (close_on_exec(fd, fdt)) + f_flags |= O_CLOEXEC; + + get_file(file); + ret = 0; + } + spin_unlock(&files->file_lock); + } + task_unlock(task); + put_task_struct(task); + + if (ret) + return ret; + + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id, + file_inode(file)->i_ino); + + /* show_fd_locks() never deferences files so a stale value is safe */ + show_fd_locks(m, file, files); + if (seq_has_overflowed(m)) + goto out; + + if (file->f_op->show_fdinfo) + file->f_op->show_fdinfo(m, file); + +out: + fput(file); + return 0; +} + +static int proc_fdinfo_access_allowed(struct inode *inode) +{ + bool allowed = false; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); + put_task_struct(task); + + if (!allowed) + return -EACCES; + + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return single_open(file, seq_show, inode); +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = seq_fdinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) +{ + struct file *file; + + rcu_read_lock(); + file = task_lookup_fd_rcu(task, fd); + if (file) + *mode = file->f_mode; + rcu_read_unlock(); + return !!file; +} + +static void tid_fd_update_inode(struct task_struct *task, struct inode *inode, + fmode_t f_mode) +{ + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + security_task_to_inode(task, inode); +} + +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct task_struct *task; + struct inode *inode; + unsigned int fd; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = d_inode(dentry); + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + fmode_t f_mode; + if (tid_fd_mode(task, fd, &f_mode)) { + tid_fd_update_inode(task, inode, f_mode); + put_task_struct(task); + return 1; + } + put_task_struct(task); + } + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = { + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_fd_link(struct dentry *dentry, struct path *path) +{ + struct task_struct *task; + int ret = -ENOENT; + + task = get_proc_task(d_inode(dentry)); + if (task) { + unsigned int fd = proc_fd(d_inode(dentry)); + struct file *fd_file; + + fd_file = fget_task(task, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + fput(fd_file); + } + put_task_struct(task); + } + + return ret; +} + +struct fd_data { + fmode_t mode; + unsigned fd; +}; + +static struct dentry *proc_fd_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct fd_data *data = ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->fd = data->fd; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + + ei->op.proc_get_link = proc_fd_link; + tid_fd_update_inode(task, inode, data->mode); + + d_set_d_op(dentry, &tid_fd_dentry_operations); + return d_splice_alias(inode, dentry); +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + struct fd_data data = {.fd = name_to_int(&dentry->d_name)}; + struct dentry *result = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + if (data.fd == ~0U) + goto out; + if (!tid_fd_mode(task, data.fd, &data.mode)) + goto out; + + result = instantiate(dentry, task, &data); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_readfd_common(struct file *file, struct dir_context *ctx, + instantiate_t instantiate) +{ + struct task_struct *p = get_proc_task(file_inode(file)); + unsigned int fd; + + if (!p) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + + rcu_read_lock(); + for (fd = ctx->pos - 2;; fd++) { + struct file *f; + struct fd_data data; + char name[10 + 1]; + unsigned int len; + + f = task_lookup_next_fd_rcu(p, &fd); + ctx->pos = fd + 2LL; + if (!f) + break; + data.mode = f->f_mode; + rcu_read_unlock(); + data.fd = fd; + + len = snprintf(name, sizeof(name), "%u", fd); + if (!proc_fill_cache(file, ctx, + name, len, instantiate, p, + &data)) + goto out; + cond_resched(); + rcu_read_lock(); + } + rcu_read_unlock(); +out: + put_task_struct(p); + return 0; +} + +static int proc_readfd(struct file *file, struct dir_context *ctx) +{ + return proc_readfd_common(file, ctx, proc_fd_instantiate); +} + +const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .iterate_shared = proc_readfd, + .llseek = generic_file_llseek, +}; + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). + */ +int proc_fd_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + struct task_struct *p; + int rv; + + rv = generic_permission(&init_user_ns, inode, mask); + if (rv == 0) + return rv; + + rcu_read_lock(); + p = pid_task(proc_pid(inode), PIDTYPE_PID); + if (p && same_thread_group(p, current)) + rv = 0; + rcu_read_unlock(); + + return rv; +} + +const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct fd_data *data = ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->fd = data->fd; + + inode->i_fop = &proc_fdinfo_file_operations; + tid_fd_update_inode(task, inode, 0); + + d_set_d_op(dentry, &tid_fd_dentry_operations); + return d_splice_alias(inode, dentry); +} + +static struct dentry * +proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *file, struct dir_context *ctx) +{ + return proc_readfd_common(file, ctx, + proc_fdinfo_instantiate); +} + +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + +const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + +const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, + .read = generic_read_dir, + .iterate_shared = proc_readfdinfo, + .llseek = generic_file_llseek, +}; diff --git a/fs/proc/fd.h b/fs/proc/fd.h new file mode 100644 index 000000000..c5a921a06 --- /dev/null +++ b/fs/proc/fd.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PROCFS_FD_H__ +#define __PROCFS_FD_H__ + +#include <linux/fs.h> + +extern const struct file_operations proc_fd_operations; +extern const struct inode_operations proc_fd_inode_operations; + +extern const struct file_operations proc_fdinfo_operations; +extern const struct inode_operations proc_fdinfo_inode_operations; + +extern int proc_fd_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); + +static inline unsigned int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + +#endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c new file mode 100644 index 000000000..587b91d9d --- /dev/null +++ b/fs/proc/generic.c @@ -0,0 +1,819 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * proc/fs/generic.c --- generic routines for the proc-fs + * + * This file contains generic proc-fs routines for handling + * directories and files. + * + * Copyright (C) 1991, 1992 Linus Torvalds. + * Copyright (C) 1997 Theodore Ts'o + */ + +#include <linux/cache.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/printk.h> +#include <linux/mount.h> +#include <linux/init.h> +#include <linux/idr.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/uaccess.h> +#include <linux/seq_file.h> + +#include "internal.h" + +static DEFINE_RWLOCK(proc_subdir_lock); + +struct kmem_cache *proc_dir_entry_cache __ro_after_init; + +void pde_free(struct proc_dir_entry *pde) +{ + if (S_ISLNK(pde->mode)) + kfree(pde->data); + if (pde->name != pde->inline_name) + kfree(pde->name); + kmem_cache_free(proc_dir_entry_cache, pde); +} + +static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) +{ + if (len < de->namelen) + return -1; + if (len > de->namelen) + return 1; + + return memcmp(name, de->name, len); +} + +static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) +{ + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); +} + +static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) +{ + return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, + subdir_node); +} + +static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, + const char *name, + unsigned int len) +{ + struct rb_node *node = dir->subdir.rb_node; + + while (node) { + struct proc_dir_entry *de = rb_entry(node, + struct proc_dir_entry, + subdir_node); + int result = proc_match(name, de, len); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return de; + } + return NULL; +} + +static bool pde_subdir_insert(struct proc_dir_entry *dir, + struct proc_dir_entry *de) +{ + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct proc_dir_entry *this = rb_entry(*new, + struct proc_dir_entry, + subdir_node); + int result = proc_match(de->name, this, de->namelen); + + parent = *new; + if (result < 0) + new = &(*new)->rb_left; + else if (result > 0) + new = &(*new)->rb_right; + else + return false; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&de->subdir_node, parent, new); + rb_insert_color(&de->subdir_node, root); + return true; +} + +static int proc_notify_change(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + struct proc_dir_entry *de = PDE(inode); + int error; + + error = setattr_prepare(&init_user_ns, dentry, iattr); + if (error) + return error; + + setattr_copy(&init_user_ns, inode, iattr); + mark_inode_dirty(inode); + + proc_set_user(de, inode->i_uid, inode->i_gid); + de->mode = inode->i_mode; + return 0; +} + +static int proc_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct proc_dir_entry *de = PDE(inode); + if (de) { + nlink_t nlink = READ_ONCE(de->nlink); + if (nlink > 0) { + set_nlink(inode, nlink); + } + } + + generic_fillattr(&init_user_ns, inode, stat); + return 0; +} + +static const struct inode_operations proc_file_inode_operations = { + .setattr = proc_notify_change, +}; + +/* + * This function parses a name such as "tty/driver/serial", and + * returns the struct proc_dir_entry for "/proc/tty/driver", and + * returns "serial" in residual. + */ +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + const char *cp = name, *next; + struct proc_dir_entry *de; + + de = *ret ?: &proc_root; + while ((next = strchr(cp, '/')) != NULL) { + de = pde_subdir_find(de, cp, next - cp); + if (!de) { + WARN(1, "name '%s'\n", name); + return -ENOENT; + } + cp = next + 1; + } + *residual = cp; + *ret = de; + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + read_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); + read_unlock(&proc_subdir_lock); + return rv; +} + +static DEFINE_IDA(proc_inum_ida); + +#define PROC_DYNAMIC_FIRST 0xF0000000U + +/* + * Return an inode number between PROC_DYNAMIC_FIRST and + * 0xffffffff, or zero on failure. + */ +int proc_alloc_inum(unsigned int *inum) +{ + int i; + + i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, + GFP_KERNEL); + if (i < 0) + return i; + + *inum = PROC_DYNAMIC_FIRST + (unsigned int)i; + return 0; +} + +void proc_free_inum(unsigned int inum) +{ + ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); +} + +static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0) + return 0; /* revalidate */ + return 1; +} + +static int proc_misc_d_delete(const struct dentry *dentry) +{ + return atomic_read(&PDE(d_inode(dentry))->in_use) < 0; +} + +static const struct dentry_operations proc_misc_dentry_ops = { + .d_revalidate = proc_misc_d_revalidate, + .d_delete = proc_misc_d_delete, +}; + +/* + * Don't create negative dentries here, return -ENOENT by hand + * instead. + */ +struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, + struct proc_dir_entry *de) +{ + struct inode *inode; + + read_lock(&proc_subdir_lock); + de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); + if (de) { + pde_get(de); + read_unlock(&proc_subdir_lock); + inode = proc_get_inode(dir->i_sb, de); + if (!inode) + return ERR_PTR(-ENOMEM); + d_set_d_op(dentry, de->proc_dops); + return d_splice_alias(inode, dentry); + } + read_unlock(&proc_subdir_lock); + return ERR_PTR(-ENOENT); +} + +struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return ERR_PTR(-ENOENT); + + return proc_lookup_de(dir, dentry, PDE(dir)); +} + +/* + * This returns non-zero if at EOF, so that the /proc + * root directory can use this and check if it should + * continue with the <pid> entries.. + * + * Note that the VFS-layer doesn't care about the return + * value of the readdir() call, as long as it's non-negative + * for success.. + */ +int proc_readdir_de(struct file *file, struct dir_context *ctx, + struct proc_dir_entry *de) +{ + int i; + + if (!dir_emit_dots(file, ctx)) + return 0; + + i = ctx->pos - 2; + read_lock(&proc_subdir_lock); + de = pde_subdir_first(de); + for (;;) { + if (!de) { + read_unlock(&proc_subdir_lock); + return 0; + } + if (!i) + break; + de = pde_subdir_next(de); + i--; + } + + do { + struct proc_dir_entry *next; + pde_get(de); + read_unlock(&proc_subdir_lock); + if (!dir_emit(ctx, de->name, de->namelen, + de->low_ino, de->mode >> 12)) { + pde_put(de); + return 0; + } + ctx->pos++; + read_lock(&proc_subdir_lock); + next = pde_subdir_next(de); + pde_put(de); + de = next; + } while (de); + read_unlock(&proc_subdir_lock); + return 1; +} + +int proc_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); + + if (fs_info->pidonly == PROC_PIDONLY_ON) + return 1; + + return proc_readdir_de(file, ctx, PDE(inode)); +} + +/* + * These are the generic /proc directory operations. They + * use the in-memory "struct proc_dir_entry" tree to parse + * the /proc directory. + */ +static const struct file_operations proc_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = proc_readdir, +}; + +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_dir_inode_operations = { + .lookup = proc_lookup, + .getattr = proc_getattr, + .setattr = proc_notify_change, +}; + +/* returns the registered entry, or frees dp and returns NULL on failure */ +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp) +{ + if (proc_alloc_inum(&dp->low_ino)) + goto out_free_entry; + + write_lock(&proc_subdir_lock); + dp->parent = dir; + if (pde_subdir_insert(dir, dp) == false) { + WARN(1, "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); + write_unlock(&proc_subdir_lock); + goto out_free_inum; + } + dir->nlink++; + write_unlock(&proc_subdir_lock); + + return dp; +out_free_inum: + proc_free_inum(dp->low_ino); +out_free_entry: + pde_free(dp); + return NULL; +} + +static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, + const char *name, + umode_t mode, + nlink_t nlink) +{ + struct proc_dir_entry *ent = NULL; + const char *fn; + struct qstr qstr; + + if (xlate_proc_name(name, parent, &fn) != 0) + goto out; + qstr.name = fn; + qstr.len = strlen(fn); + if (qstr.len == 0 || qstr.len >= 256) { + WARN(1, "name len %u\n", qstr.len); + return NULL; + } + if (qstr.len == 1 && fn[0] == '.') { + WARN(1, "name '.'\n"); + return NULL; + } + if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') { + WARN(1, "name '..'\n"); + return NULL; + } + if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { + WARN(1, "create '/proc/%s' by hand\n", qstr.name); + return NULL; + } + if (is_empty_pde(*parent)) { + WARN(1, "attempt to add to permanently empty directory"); + return NULL; + } + + ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); + if (!ent) + goto out; + + if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) { + ent->name = ent->inline_name; + } else { + ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); + if (!ent->name) { + pde_free(ent); + return NULL; + } + } + + memcpy(ent->name, fn, qstr.len + 1); + ent->namelen = qstr.len; + ent->mode = mode; + ent->nlink = nlink; + ent->subdir = RB_ROOT; + refcount_set(&ent->refcnt, 1); + spin_lock_init(&ent->pde_unload_lock); + INIT_LIST_HEAD(&ent->pde_openers); + proc_set_user(ent, (*parent)->uid, (*parent)->gid); + + ent->proc_dops = &proc_misc_dentry_ops; + /* Revalidate everything under /proc/${pid}/net */ + if ((*parent)->proc_dops == &proc_net_dentry_ops) + pde_force_lookup(ent); + +out: + return ent; +} + +struct proc_dir_entry *proc_symlink(const char *name, + struct proc_dir_entry *parent, const char *dest) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, + (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); + + if (ent) { + ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + if (ent->data) { + strcpy((char*)ent->data,dest); + ent->proc_iops = &proc_link_inode_operations; + ent = proc_register(parent, ent); + } else { + pde_free(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_symlink); + +struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data, bool force_lookup) +{ + struct proc_dir_entry *ent; + + if (mode == 0) + mode = S_IRUGO | S_IXUGO; + + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); + if (ent) { + ent->data = data; + ent->proc_dir_ops = &proc_dir_operations; + ent->proc_iops = &proc_dir_inode_operations; + if (force_lookup) { + pde_force_lookup(ent); + } + ent = proc_register(parent, ent); + } + return ent; +} +EXPORT_SYMBOL_GPL(_proc_mkdir); + +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) +{ + return _proc_mkdir(name, mode, parent, data, false); +} +EXPORT_SYMBOL_GPL(proc_mkdir_data); + +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, + struct proc_dir_entry *parent) +{ + return proc_mkdir_data(name, mode, parent, NULL); +} +EXPORT_SYMBOL(proc_mkdir_mode); + +struct proc_dir_entry *proc_mkdir(const char *name, + struct proc_dir_entry *parent) +{ + return proc_mkdir_data(name, 0, parent, NULL); +} +EXPORT_SYMBOL(proc_mkdir); + +struct proc_dir_entry *proc_create_mount_point(const char *name) +{ + umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO; + struct proc_dir_entry *ent, *parent = NULL; + + ent = __proc_create(&parent, name, mode, 2); + if (ent) { + ent->data = NULL; + ent->proc_dir_ops = NULL; + ent->proc_iops = NULL; + ent = proc_register(parent, ent); + } + return ent; +} +EXPORT_SYMBOL(proc_create_mount_point); + +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data) +{ + struct proc_dir_entry *p; + + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + if (WARN_ON_ONCE(!S_ISREG(mode))) + return NULL; + + p = __proc_create(parent, name, mode, 1); + if (p) { + p->proc_iops = &proc_file_inode_operations; + p->data = data; + } + return p; +} + +static inline void pde_set_flags(struct proc_dir_entry *pde) +{ + if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; +} + +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct proc_ops *proc_ops, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_ops = proc_ops; + pde_set_flags(p); + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_data); + +struct proc_dir_entry *proc_create(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct proc_ops *proc_ops) +{ + return proc_create_data(name, mode, parent, proc_ops, NULL); +} +EXPORT_SYMBOL(proc_create); + +static int proc_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_open_private(file, de->seq_ops, de->state_size); + return seq_open(file, de->seq_ops); +} + +static int proc_seq_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_release_private(inode, file); + return seq_release(inode, file); +} + +static const struct proc_ops proc_seq_ops = { + /* not permanent -- can call into arbitrary seq_operations */ + .proc_open = proc_seq_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = proc_seq_release, +}; + +struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_ops = &proc_seq_ops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_seq_private); + +static int proc_single_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + return single_open(file, de->single_show, de->data); +} + +static const struct proc_ops proc_single_ops = { + /* not permanent -- can call into arbitrary ->single_show */ + .proc_open = proc_single_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_ops = &proc_single_ops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_single_data); + +void proc_set_size(struct proc_dir_entry *de, loff_t size) +{ + de->size = size; +} +EXPORT_SYMBOL(proc_set_size); + +void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) +{ + de->uid = uid; + de->gid = gid; +} +EXPORT_SYMBOL(proc_set_user); + +void pde_put(struct proc_dir_entry *pde) +{ + if (refcount_dec_and_test(&pde->refcnt)) { + proc_free_inum(pde->low_ino); + pde_free(pde); + } +} + +/* + * Remove a /proc entry and free it if it's not currently in use. + */ +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry *de = NULL; + const char *fn = name; + unsigned int len; + + write_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + write_unlock(&proc_subdir_lock); + return; + } + len = strlen(fn); + + de = pde_subdir_find(parent, fn, len); + if (de) { + if (unlikely(pde_is_permanent(de))) { + WARN(1, "removing permanent /proc entry '%s'", de->name); + de = NULL; + } else { + rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) + parent->nlink--; + } + } + write_unlock(&proc_subdir_lock); + if (!de) { + WARN(1, "name '%s'\n", name); + return; + } + + proc_entry_rundown(de); + + WARN(pde_subdir_first(de), + "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", + __func__, de->parent->name, de->name, pde_subdir_first(de)->name); + pde_put(de); +} +EXPORT_SYMBOL(remove_proc_entry); + +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry *root = NULL, *de, *next; + const char *fn = name; + unsigned int len; + + write_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + write_unlock(&proc_subdir_lock); + return -ENOENT; + } + len = strlen(fn); + + root = pde_subdir_find(parent, fn, len); + if (!root) { + write_unlock(&proc_subdir_lock); + return -ENOENT; + } + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + root->parent->name, root->name); + return -EINVAL; + } + rb_erase(&root->subdir_node, &parent->subdir); + + de = root; + while (1) { + next = pde_subdir_first(de); + if (next) { + if (unlikely(pde_is_permanent(next))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + next->parent->name, next->name); + return -EINVAL; + } + rb_erase(&next->subdir_node, &de->subdir); + de = next; + continue; + } + next = de->parent; + if (S_ISDIR(de->mode)) + next->nlink--; + write_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); + if (de == root) + break; + pde_put(de); + + write_lock(&proc_subdir_lock); + de = next; + } + pde_put(root); + return 0; +} +EXPORT_SYMBOL(remove_proc_subtree); + +void *proc_get_parent_data(const struct inode *inode) +{ + struct proc_dir_entry *de = PDE(inode); + return de->parent->data; +} +EXPORT_SYMBOL_GPL(proc_get_parent_data); + +void proc_remove(struct proc_dir_entry *de) +{ + if (de) + remove_proc_subtree(de->name, de->parent); +} +EXPORT_SYMBOL(proc_remove); + +/* + * Pull a user buffer into memory and pass it to the file's write handler if + * one is supplied. The ->write() method is permitted to modify the + * kernel-side buffer. + */ +ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, + loff_t *_pos) +{ + struct proc_dir_entry *pde = PDE(file_inode(f)); + char *buf; + int ret; + + if (!pde->write) + return -EACCES; + if (size == 0 || size > PAGE_SIZE - 1) + return -EINVAL; + buf = memdup_user_nul(ubuf, size); + if (IS_ERR(buf)) + return PTR_ERR(buf); + ret = pde->write(f, buf, size); + kfree(buf); + return ret == 0 ? size : ret; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c new file mode 100644 index 000000000..f495fdb39 --- /dev/null +++ b/fs/proc/inode.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/proc/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/cache.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/kernel.h> +#include <linux/pid_namespace.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/completion.h> +#include <linux/poll.h> +#include <linux/printk.h> +#include <linux/file.h> +#include <linux/limits.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/bug.h> + +#include "internal.h" + +static void proc_evict_inode(struct inode *inode) +{ + struct proc_dir_entry *de; + struct ctl_table_header *head; + struct proc_inode *ei = PROC_I(inode); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + + /* Stop tracking associated processes */ + if (ei->pid) { + proc_pid_evict_inode(ei); + ei->pid = NULL; + } + + /* Let go of any associated proc directory entry */ + de = ei->pde; + if (de) { + pde_put(de); + ei->pde = NULL; + } + + head = ei->sysctl; + if (head) { + RCU_INIT_POINTER(ei->sysctl, NULL); + proc_sys_evict_inode(inode, head); + } +} + +static struct kmem_cache *proc_inode_cachep __ro_after_init; +static struct kmem_cache *pde_opener_cache __ro_after_init; + +static struct inode *proc_alloc_inode(struct super_block *sb) +{ + struct proc_inode *ei; + + ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->pid = NULL; + ei->fd = 0; + ei->op.proc_get_link = NULL; + ei->pde = NULL; + ei->sysctl = NULL; + ei->sysctl_entry = NULL; + INIT_HLIST_NODE(&ei->sibling_inodes); + ei->ns_ops = NULL; + return &ei->vfs_inode; +} + +static void proc_free_inode(struct inode *inode) +{ + kmem_cache_free(proc_inode_cachep, PROC_I(inode)); +} + +static void init_once(void *foo) +{ + struct proc_inode *ei = (struct proc_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void __init proc_init_kmemcache(void) +{ + proc_inode_cachep = kmem_cache_create("proc_inode_cache", + sizeof(struct proc_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_PANIC), + init_once); + pde_opener_cache = + kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, + SLAB_ACCOUNT|SLAB_PANIC, NULL); + proc_dir_entry_cache = kmem_cache_create_usercopy( + "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + SIZEOF_PDE_INLINE_NAME, NULL); + BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); +} + +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) +{ + struct inode *inode; + struct proc_inode *ei; + struct hlist_node *node; + struct super_block *old_sb = NULL; + + rcu_read_lock(); + for (;;) { + struct super_block *sb; + node = hlist_first_rcu(inodes); + if (!node) + break; + ei = hlist_entry(node, struct proc_inode, sibling_inodes); + spin_lock(lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(lock); + + inode = &ei->vfs_inode; + sb = inode->i_sb; + if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) + continue; + inode = igrab(inode); + rcu_read_unlock(); + if (sb != old_sb) { + if (old_sb) + deactivate_super(old_sb); + old_sb = sb; + } + if (unlikely(!inode)) { + rcu_read_lock(); + continue; + } + + if (S_ISDIR(inode->i_mode)) { + struct dentry *dir = d_find_any_alias(inode); + if (dir) { + d_invalidate(dir); + dput(dir); + } + } else { + struct dentry *dentry; + while ((dentry = d_find_alias(inode))) { + d_invalidate(dentry); + dput(dentry); + } + } + iput(inode); + + rcu_read_lock(); + } + rcu_read_unlock(); + if (old_sb) + deactivate_super(old_sb); +} + +static inline const char *hidepid2str(enum proc_hidepid v) +{ + switch (v) { + case HIDEPID_OFF: return "off"; + case HIDEPID_NO_ACCESS: return "noaccess"; + case HIDEPID_INVISIBLE: return "invisible"; + case HIDEPID_NOT_PTRACEABLE: return "ptraceable"; + } + WARN_ONCE(1, "bad hide_pid value: %d\n", v); + return "unknown"; +} + +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + struct proc_fs_info *fs_info = proc_sb_info(root->d_sb); + + if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, fs_info->pid_gid)); + if (fs_info->hide_pid != HIDEPID_OFF) + seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid)); + if (fs_info->pidonly != PROC_PIDONLY_OFF) + seq_printf(seq, ",subset=pid"); + + return 0; +} + +const struct super_operations proc_sops = { + .alloc_inode = proc_alloc_inode, + .free_inode = proc_free_inode, + .drop_inode = generic_delete_inode, + .evict_inode = proc_evict_inode, + .statfs = simple_statfs, + .show_options = proc_show_options, +}; + +enum {BIAS = -1U<<31}; + +static inline int use_pde(struct proc_dir_entry *pde) +{ + return likely(atomic_inc_unless_negative(&pde->in_use)); +} + +static void unuse_pde(struct proc_dir_entry *pde) +{ + if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) + complete(pde->pde_unload_completion); +} + +/* + * At most 2 contexts can enter this function: the one doing the last + * close on the descriptor and whoever is deleting PDE itself. + * + * First to enter calls ->proc_release hook and signals its completion + * to the second one which waits and then does nothing. + * + * PDE is locked on entry, unlocked on exit. + */ +static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) + __releases(&pde->pde_unload_lock) +{ + /* + * close() (proc_reg_release()) can't delete an entry and proceed: + * ->release hook needs to be available at the right moment. + * + * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: + * "struct file" needs to be available at the right moment. + */ + if (pdeo->closing) { + /* somebody else is doing that, just wait */ + DECLARE_COMPLETION_ONSTACK(c); + pdeo->c = &c; + spin_unlock(&pde->pde_unload_lock); + wait_for_completion(&c); + } else { + struct file *file; + struct completion *c; + + pdeo->closing = true; + spin_unlock(&pde->pde_unload_lock); + + file = pdeo->file; + pde->proc_ops->proc_release(file_inode(file), file); + + spin_lock(&pde->pde_unload_lock); + /* Strictly after ->proc_release, see above. */ + list_del(&pdeo->lh); + c = pdeo->c; + spin_unlock(&pde->pde_unload_lock); + if (unlikely(c)) + complete(c); + kmem_cache_free(pde_opener_cache, pdeo); + } +} + +void proc_entry_rundown(struct proc_dir_entry *de) +{ + DECLARE_COMPLETION_ONSTACK(c); + /* Wait until all existing callers into module are done. */ + de->pde_unload_completion = &c; + if (atomic_add_return(BIAS, &de->in_use) != BIAS) + wait_for_completion(&c); + + /* ->pde_openers list can't grow from now on. */ + + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + close_pdeo(de, pdeo); + spin_lock(&de->pde_unload_lock); + } + spin_unlock(&de->pde_unload_lock); +} + +static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + loff_t rv = -EINVAL; + + if (pde_is_permanent(pde)) { + return pde->proc_ops->proc_lseek(file, offset, whence); + } else if (use_pde(pde)) { + rv = pde->proc_ops->proc_lseek(file, offset, whence); + unuse_pde(pde); + } + return rv; +} + +static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp)); + ssize_t ret; + + if (pde_is_permanent(pde)) + return pde->proc_ops->proc_read_iter(iocb, iter); + + if (!use_pde(pde)) + return -EIO; + ret = pde->proc_ops->proc_read_iter(iocb, iter); + unuse_pde(pde); + return ret; +} + +static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_read) read; + + read = pde->proc_ops->proc_read; + if (read) + return read(file, buf, count, ppos); + return -EIO; +} + +static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + ssize_t rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_read(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_read(pde, file, buf, count, ppos); + unuse_pde(pde); + } + return rv; +} + +static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_write) write; + + write = pde->proc_ops->proc_write; + if (write) + return write(file, buf, count, ppos); + return -EIO; +} + +static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + ssize_t rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_write(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_write(pde, file, buf, count, ppos); + unuse_pde(pde); + } + return rv; +} + +static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) +{ + typeof_member(struct proc_ops, proc_poll) poll; + + poll = pde->proc_ops->proc_poll; + if (poll) + return poll(file, pts); + return DEFAULT_POLLMASK; +} + +static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + __poll_t rv = DEFAULT_POLLMASK; + + if (pde_is_permanent(pde)) { + return pde_poll(pde, file, pts); + } else if (use_pde(pde)) { + rv = pde_poll(pde, file, pts); + unuse_pde(pde); + } + return rv; +} + +static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_ioctl) ioctl; + + ioctl = pde->proc_ops->proc_ioctl; + if (ioctl) + return ioctl(file, cmd, arg); + return -ENOTTY; +} + +static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + long rv = -ENOTTY; + + if (pde_is_permanent(pde)) { + return pde_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_ioctl(pde, file, cmd, arg); + unuse_pde(pde); + } + return rv; +} + +#ifdef CONFIG_COMPAT +static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; + + compat_ioctl = pde->proc_ops->proc_compat_ioctl; + if (compat_ioctl) + return compat_ioctl(file, cmd, arg); + return -ENOTTY; +} + +static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + long rv = -ENOTTY; + if (pde_is_permanent(pde)) { + return pde_compat_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_compat_ioctl(pde, file, cmd, arg); + unuse_pde(pde); + } + return rv; +} +#endif + +static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) +{ + typeof_member(struct proc_ops, proc_mmap) mmap; + + mmap = pde->proc_ops->proc_mmap; + if (mmap) + return mmap(file, vma); + return -EIO; +} + +static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + int rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_mmap(pde, file, vma); + } else if (use_pde(pde)) { + rv = pde_mmap(pde, file, vma); + unuse_pde(pde); + } + return rv; +} + +static unsigned long +pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; + + get_area = pde->proc_ops->proc_get_unmapped_area; +#ifdef CONFIG_MMU + if (!get_area) + get_area = current->mm->get_unmapped_area; +#endif + if (get_area) + return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; +} + +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + } else if (use_pde(pde)) { + rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + unuse_pde(pde); + } + return rv; +} + +static int proc_reg_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + typeof_member(struct proc_ops, proc_open) open; + typeof_member(struct proc_ops, proc_release) release; + struct pde_opener *pdeo; + + if (!pde->proc_ops->proc_lseek) + file->f_mode &= ~FMODE_LSEEK; + + if (pde_is_permanent(pde)) { + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + return rv; + } + + /* + * Ensure that + * 1) PDE's ->release hook will be called no matter what + * either normally by close()/->release, or forcefully by + * rmmod/remove_proc_entry. + * + * 2) rmmod isn't blocked by opening file in /proc and sitting on + * the descriptor (including "rmmod foo </proc/foo" scenario). + * + * Save every "struct file" with custom ->release hook. + */ + if (!use_pde(pde)) + return -ENOENT; + + release = pde->proc_ops->proc_release; + if (release) { + pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); + if (!pdeo) { + rv = -ENOMEM; + goto out_unuse; + } + } + + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + + if (release) { + if (rv == 0) { + /* To know what to release. */ + pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kmem_cache_free(pde_opener_cache, pdeo); + } + +out_unuse: + unuse_pde(pde); + return rv; +} + +static int proc_reg_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct pde_opener *pdeo; + + if (pde_is_permanent(pde)) { + typeof_member(struct proc_ops, proc_release) release; + + release = pde->proc_ops->proc_release; + if (release) { + return release(inode, file); + } + return 0; + } + + spin_lock(&pde->pde_unload_lock); + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->file == file) { + close_pdeo(pde, pdeo); + return 0; + } + } + spin_unlock(&pde->pde_unload_lock); + return 0; +} + +static const struct file_operations proc_reg_file_ops = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +static const struct file_operations proc_iter_file_ops = { + .llseek = proc_reg_llseek, + .read_iter = proc_reg_read_iter, + .write = proc_reg_write, + .splice_read = generic_file_splice_read, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +#ifdef CONFIG_COMPAT +static const struct file_operations proc_reg_file_ops_compat = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .compat_ioctl = proc_reg_compat_ioctl, + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +static const struct file_operations proc_iter_file_ops_compat = { + .llseek = proc_reg_llseek, + .read_iter = proc_reg_read_iter, + .splice_read = generic_file_splice_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .compat_ioctl = proc_reg_compat_ioctl, + .mmap = proc_reg_mmap, + .get_unmapped_area = proc_reg_get_unmapped_area, + .open = proc_reg_open, + .release = proc_reg_release, +}; +#endif + +static void proc_put_link(void *p) +{ + unuse_pde(p); +} + +static const char *proc_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct proc_dir_entry *pde = PDE(inode); + if (!use_pde(pde)) + return ERR_PTR(-EINVAL); + set_delayed_call(done, proc_put_link, pde); + return pde->data; +} + +const struct inode_operations proc_link_inode_operations = { + .get_link = proc_get_link, +}; + +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) +{ + struct inode *inode = new_inode(sb); + + if (!inode) { + pde_put(de); + return NULL; + } + + inode->i_private = de->data; + inode->i_ino = de->low_ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + PROC_I(inode)->pde = de; + if (is_empty_pde(de)) { + make_empty_dir_inode(inode); + return inode; + } + + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + set_nlink(inode, de->nlink); + + if (S_ISREG(inode->i_mode)) { + inode->i_op = de->proc_iops; + if (de->proc_ops->proc_read_iter) + inode->i_fop = &proc_iter_file_ops; + else + inode->i_fop = &proc_reg_file_ops; +#ifdef CONFIG_COMPAT + if (de->proc_ops->proc_compat_ioctl) { + if (de->proc_ops->proc_read_iter) + inode->i_fop = &proc_iter_file_ops_compat; + else + inode->i_fop = &proc_reg_file_ops_compat; + } +#endif + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = de->proc_dir_ops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = NULL; + } else { + BUG(); + } + return inode; +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h new file mode 100644 index 000000000..6b921826d --- /dev/null +++ b/fs/proc/internal.h @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Internal procfs definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/refcount.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <linux/binfmts.h> +#include <linux/sched/coredump.h> +#include <linux/sched/task.h> + +struct ctl_table_header; +struct mempolicy; + +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. + * + * parent/subdir are used for the directory structure (every /proc file has a + * parent, but "subdir" is empty for all non-directory entries). + * subdir_node is used to build the rb tree "subdir" of the parent. + */ +struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + refcount_t refcnt; + struct list_head pde_openers; /* who did ->open, but not ->release */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + union { + const struct proc_ops *proc_ops; + const struct file_operations *proc_dir_ops; + }; + const struct dentry_operations *proc_dops; + union { + const struct seq_operations *seq_ops; + int (*single_show)(struct seq_file *, void *); + }; + proc_write_t write; + void *data; + unsigned int state_size; + unsigned int low_ino; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + struct proc_dir_entry *parent; + struct rb_root subdir; + struct rb_node subdir_node; + char *name; + umode_t mode; + u8 flags; + u8 namelen; + char inline_name[]; +} __randomize_layout; + +#define SIZEOF_PDE ( \ + sizeof(struct proc_dir_entry) < 128 ? 128 : \ + sizeof(struct proc_dir_entry) < 192 ? 192 : \ + sizeof(struct proc_dir_entry) < 256 ? 256 : \ + sizeof(struct proc_dir_entry) < 512 ? 512 : \ + 0) +#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) + +static inline bool pde_is_permanent(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_PERMANENT; +} + +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); + const char *lsm; +}; + +struct proc_inode { + struct pid *pid; + unsigned int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + struct hlist_node sibling_inodes; + const struct proc_ns_operations *ns_ops; + struct inode vfs_inode; +} __randomize_layout; + +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} + +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} + +static inline struct pid *proc_pid(const struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(const struct inode *inode) +{ + return get_pid_task(proc_pid(inode), PIDTYPE_PID); +} + +void task_dump_owner(struct task_struct *task, umode_t mode, + kuid_t *ruid, kgid_t *rgid); + +unsigned name_to_int(const struct qstr *qstr); +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 + +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern void proc_task_name(struct seq_file *m, struct task_struct *p, + bool escape); +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); + +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); +extern int proc_setattr(struct user_namespace *, struct dentry *, + struct iattr *); +extern void proc_pid_evict_inode(struct proc_inode *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); +extern void pid_update_inode(struct task_struct *, struct inode *); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, struct dir_context *); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); + +/* Lookups */ +typedef struct dentry *instantiate_t(struct dentry *, + struct task_struct *, const void *); +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, + instantiate_t, struct task_struct *, const void *); + +/* + * generic.c + */ +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data); +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); +extern int proc_readdir(struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); + +static inline void pde_get(struct proc_dir_entry *pde) +{ + refcount_inc(&pde->refcnt); +} +extern void pde_put(struct proc_dir_entry *); + +static inline bool is_empty_pde(const struct proc_dir_entry *pde) +{ + return S_ISDIR(pde->mode) && !pde->proc_iops; +} +extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); + +/* + * inode.c + */ +struct pde_opener { + struct list_head lh; + struct file *file; + bool closing; + struct completion *c; +} __randomize_layout; +extern const struct inode_operations proc_link_inode_operations; +extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; + +void proc_init_kmemcache(void); +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); +void set_proc_pid_nlink(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern void proc_entry_rundown(struct proc_dir_entry *); + +/* + * proc_namespaces.c + */ +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_thread_self.c + */ +extern int proc_setup_thread_self(struct super_block *); +extern void proc_thread_self_init(void); + +/* + * proc_sysctl.c + */ +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head); +#else +static inline void proc_sys_init(void) { } +static inline void proc_sys_evict_inode(struct inode *inode, + struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); + +/* + * task_[no]mmu.c + */ +struct mem_size_stats; +struct proc_maps_private { + struct inode *inode; + struct task_struct *task; + struct mm_struct *mm; + struct vma_iterator iter; +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +} __randomize_layout; + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_pid_smaps_rollup_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void task_mem(struct seq_file *, struct mm_struct *); + +extern const struct dentry_operations proc_net_dentry_ops; +static inline void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c new file mode 100644 index 000000000..cb0edc7cb --- /dev/null +++ b/fs/proc/interrupts.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/irqnr.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* + * /proc/interrupts + */ +static void *int_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos <= nr_irqs) ? pos : NULL; +} + +static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos > nr_irqs) + return NULL; + return pos; +} + +static void int_seq_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations int_seq_ops = { + .start = int_seq_start, + .next = int_seq_next, + .stop = int_seq_stop, + .show = show_interrupts +}; + +static int __init proc_interrupts_init(void) +{ + proc_create_seq("interrupts", 0, NULL, &int_seq_ops); + return 0; +} +fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c new file mode 100644 index 000000000..dff921f7c --- /dev/null +++ b/fs/proc/kcore.c @@ -0,0 +1,701 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/proc/kcore.c kernel ELF core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge <jeremy@sw.oz.au> + * ELF version written by David Howells <David.Howells@nexor.co.uk> + * Modified and incorporated into 2.3.x by Tigran Aivazian <tigran@veritas.com> + * Support to dump vmalloc'd areas (ELF only), Tigran Aivazian <tigran@veritas.com> + * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com> + */ + +#include <linux/crash_core.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/kcore.h> +#include <linux/user.h> +#include <linux/capability.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <asm/io.h> +#include <linux/list.h> +#include <linux/ioport.h> +#include <linux/memory.h> +#include <linux/sched/task.h> +#include <linux/security.h> +#include <asm/sections.h> +#include "internal.h" + +#define CORE_STR "CORE" + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +static struct proc_dir_entry *proc_root_kcore; + + +#ifndef kc_vaddr_to_offset +#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) +#endif +#ifndef kc_offset_to_vaddr +#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) +#endif + +static LIST_HEAD(kclist_head); +static DECLARE_RWSEM(kclist_lock); +static int kcore_need_update = 1; + +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + +/* This doesn't grab kclist_lock, so it should only be used at init time. */ +void __init kclist_add(struct kcore_list *new, void *addr, size_t size, + int type) +{ + new->addr = (unsigned long)addr; + new->size = size; + new->type = type; + + list_add_tail(&new->list, &kclist_head); +} + +static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, + size_t *data_offset) +{ + size_t try, size; + struct kcore_list *m; + + *nphdr = 1; /* PT_NOTE */ + size = 0; + + list_for_each_entry(m, &kclist_head, list) { + try = kc_vaddr_to_offset((size_t)m->addr + m->size); + if (try > size) + size = try; + *nphdr = *nphdr + 1; + } + + *phdrs_len = *nphdr * sizeof(struct elf_phdr); + *notes_len = (4 * sizeof(struct elf_note) + + 3 * ALIGN(sizeof(CORE_STR), 4) + + VMCOREINFO_NOTE_NAME_BYTES + + ALIGN(sizeof(struct elf_prstatus), 4) + + ALIGN(sizeof(struct elf_prpsinfo), 4) + + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); + *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + + *notes_len); + return *data_offset + size; +} + +#ifdef CONFIG_HIGHMEM +/* + * If no highmem, we can assume [0...max_low_pfn) continuous range of memory + * because memory hole is not as big as !HIGHMEM case. + * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) + */ +static int kcore_ram_list(struct list_head *head) +{ + struct kcore_list *ent; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va(0); + ent->size = max_low_pfn << PAGE_SHIFT; + ent->type = KCORE_RAM; + list_add(&ent->list, head); + return 0; +} + +#else /* !CONFIG_HIGHMEM */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* calculate vmemmap's address from given system ram pfn and register it */ +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; + unsigned long nr_pages = ent->size >> PAGE_SHIFT; + unsigned long start, end; + struct kcore_list *vmm, *tmp; + + + start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; + end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; + end = PAGE_ALIGN(end); + /* overlap check (because we have to align page */ + list_for_each_entry(tmp, head, list) { + if (tmp->type != KCORE_VMEMMAP) + continue; + if (start < tmp->addr + tmp->size) + if (end > tmp->addr) + end = tmp->addr; + } + if (start < end) { + vmm = kmalloc(sizeof(*vmm), GFP_KERNEL); + if (!vmm) + return 0; + vmm->addr = start; + vmm->size = end - start; + vmm->type = KCORE_VMEMMAP; + list_add_tail(&vmm->list, head); + } + return 1; + +} +#else +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + return 1; +} + +#endif + +static int +kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + struct list_head *head = (struct list_head *)arg; + struct kcore_list *ent; + struct page *p; + + if (!pfn_valid(pfn)) + return 1; + + p = pfn_to_page(pfn); + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)page_to_virt(p); + ent->size = nr_pages << PAGE_SHIFT; + + if (!virt_addr_valid(ent->addr)) + goto free_out; + + /* cut not-mapped area. ....from ppc-32 code. */ + if (ULONG_MAX - ent->addr < ent->size) + ent->size = ULONG_MAX - ent->addr; + + /* + * We've already checked virt_addr_valid so we know this address + * is a valid pointer, therefore we can check against it to determine + * if we need to trim + */ + if (VMALLOC_START > ent->addr) { + if (VMALLOC_START - ent->addr < ent->size) + ent->size = VMALLOC_START - ent->addr; + } + + ent->type = KCORE_RAM; + list_add_tail(&ent->list, head); + + if (!get_sparsemem_vmemmap_info(ent, head)) { + list_del(&ent->list); + goto free_out; + } + + return 0; +free_out: + kfree(ent); + return 1; +} + +static int kcore_ram_list(struct list_head *list) +{ + int nid, ret; + unsigned long end_pfn; + + /* Not inialized....update now */ + /* find out "max pfn" */ + end_pfn = 0; + for_each_node_state(nid, N_MEMORY) { + unsigned long node_end; + node_end = node_end_pfn(nid); + if (end_pfn < node_end) + end_pfn = node_end; + } + /* scan 0 to max_pfn */ + ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private); + if (ret) + return -ENOMEM; + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +static int kcore_update_ram(void) +{ + LIST_HEAD(list); + LIST_HEAD(garbage); + int nphdr; + size_t phdrs_len, notes_len, data_offset; + struct kcore_list *tmp, *pos; + int ret = 0; + + down_write(&kclist_lock); + if (!xchg(&kcore_need_update, 0)) + goto out; + + ret = kcore_ram_list(&list); + if (ret) { + /* Couldn't get the RAM list, try again next time. */ + WRITE_ONCE(kcore_need_update, 1); + list_splice_tail(&list, &garbage); + goto out; + } + + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(&list, &kclist_head); + + proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, ¬es_len, + &data_offset); + +out: + up_write(&kclist_lock); + list_for_each_entry_safe(pos, tmp, &garbage, list) { + list_del(&pos->list); + kfree(pos); + } + return ret; +} + +static void append_kcore_note(char *notes, size_t *i, const char *name, + unsigned int type, const void *desc, + size_t descsz) +{ + struct elf_note *note = (struct elf_note *)¬es[*i]; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = descsz; + note->n_type = type; + *i += sizeof(*note); + memcpy(¬es[*i], name, note->n_namesz); + *i = ALIGN(*i + note->n_namesz, 4); + memcpy(¬es[*i], desc, descsz); + *i = ALIGN(*i + descsz, 4); +} + +static ssize_t +read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) +{ + char *buf = file->private_data; + size_t phdrs_offset, notes_offset, data_offset; + size_t page_offline_frozen = 1; + size_t phdrs_len, notes_len; + struct kcore_list *m; + size_t tsz; + int nphdr; + unsigned long start; + size_t orig_buflen = buflen; + int ret = 0; + + down_read(&kclist_lock); + /* + * Don't race against drivers that set PageOffline() and expect no + * further page access. + */ + page_offline_freeze(); + + get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); + phdrs_offset = sizeof(struct elfhdr); + notes_offset = phdrs_offset + phdrs_len; + + /* ELF file header. */ + if (buflen && *fpos < sizeof(struct elfhdr)) { + struct elfhdr ehdr = { + .e_ident = { + [EI_MAG0] = ELFMAG0, + [EI_MAG1] = ELFMAG1, + [EI_MAG2] = ELFMAG2, + [EI_MAG3] = ELFMAG3, + [EI_CLASS] = ELF_CLASS, + [EI_DATA] = ELF_DATA, + [EI_VERSION] = EV_CURRENT, + [EI_OSABI] = ELF_OSABI, + }, + .e_type = ET_CORE, + .e_machine = ELF_ARCH, + .e_version = EV_CURRENT, + .e_phoff = sizeof(struct elfhdr), + .e_flags = ELF_CORE_EFLAGS, + .e_ehsize = sizeof(struct elfhdr), + .e_phentsize = sizeof(struct elf_phdr), + .e_phnum = nphdr, + }; + + tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); + if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) { + ret = -EFAULT; + goto out; + } + + buffer += tsz; + buflen -= tsz; + *fpos += tsz; + } + + /* ELF program headers. */ + if (buflen && *fpos < phdrs_offset + phdrs_len) { + struct elf_phdr *phdrs, *phdr; + + phdrs = kzalloc(phdrs_len, GFP_KERNEL); + if (!phdrs) { + ret = -ENOMEM; + goto out; + } + + phdrs[0].p_type = PT_NOTE; + phdrs[0].p_offset = notes_offset; + phdrs[0].p_filesz = notes_len; + + phdr = &phdrs[1]; + list_for_each_entry(m, &kclist_head, list) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; + phdr->p_vaddr = (size_t)m->addr; + if (m->type == KCORE_RAM) + phdr->p_paddr = __pa(m->addr); + else if (m->type == KCORE_TEXT) + phdr->p_paddr = __pa_symbol(m->addr); + else + phdr->p_paddr = (elf_addr_t)-1; + phdr->p_filesz = phdr->p_memsz = m->size; + phdr->p_align = PAGE_SIZE; + phdr++; + } + + tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); + if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset, + tsz)) { + kfree(phdrs); + ret = -EFAULT; + goto out; + } + kfree(phdrs); + + buffer += tsz; + buflen -= tsz; + *fpos += tsz; + } + + /* ELF note segment. */ + if (buflen && *fpos < notes_offset + notes_len) { + struct elf_prstatus prstatus = {}; + struct elf_prpsinfo prpsinfo = { + .pr_sname = 'R', + .pr_fname = "vmlinux", + }; + char *notes; + size_t i = 0; + + strlcpy(prpsinfo.pr_psargs, saved_command_line, + sizeof(prpsinfo.pr_psargs)); + + notes = kzalloc(notes_len, GFP_KERNEL); + if (!notes) { + ret = -ENOMEM; + goto out; + } + + append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + sizeof(prpsinfo)); + append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + arch_task_struct_size); + /* + * vmcoreinfo_size is mostly constant after init time, but it + * can be changed by crash_save_vmcoreinfo(). Racing here with a + * panic on another CPU before the machine goes down is insanely + * unlikely, but it's better to not leave potential buffer + * overflows lying around, regardless. + */ + append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, + vmcoreinfo_data, + min(vmcoreinfo_size, notes_len - i)); + + tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); + if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { + kfree(notes); + ret = -EFAULT; + goto out; + } + kfree(notes); + + buffer += tsz; + buflen -= tsz; + *fpos += tsz; + } + + /* + * Check to see if our file offset matches with any of + * the addresses in the elf_phdr on our list. + */ + start = kc_offset_to_vaddr(*fpos - data_offset); + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + m = NULL; + while (buflen) { + struct page *page; + unsigned long pfn; + + /* + * If this is the first iteration or the address is not within + * the previous entry, search for a matching entry. + */ + if (!m || start < m->addr || start >= m->addr + m->size) { + struct kcore_list *iter; + + m = NULL; + list_for_each_entry(iter, &kclist_head, list) { + if (start >= iter->addr && + start < iter->addr + iter->size) { + m = iter; + break; + } + } + } + + if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) { + page_offline_thaw(); + cond_resched(); + page_offline_freeze(); + } + + if (!m) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } + goto skip; + } + + switch (m->type) { + case KCORE_VMALLOC: + vread(buf, (char *)start, tsz); + /* we have to zero-fill user buffer even if no read */ + if (copy_to_user(buffer, buf, tsz)) { + ret = -EFAULT; + goto out; + } + break; + case KCORE_USER: + /* User page is handled prior to normal kernel page: */ + if (copy_to_user(buffer, (char *)start, tsz)) { + ret = -EFAULT; + goto out; + } + break; + case KCORE_RAM: + pfn = __pa(start) >> PAGE_SHIFT; + page = pfn_to_online_page(pfn); + + /* + * Don't read offline sections, logically offline pages + * (e.g., inflated in a balloon), hwpoisoned pages, + * and explicitly excluded physical ranges. + */ + if (!page || PageOffline(page) || + is_page_hwpoison(page) || !pfn_is_ram(pfn)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } + break; + } + fallthrough; + case KCORE_VMEMMAP: + case KCORE_TEXT: + if (kern_addr_valid(start)) { + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + if (copy_from_kernel_nofault(buf, (void *)start, + tsz)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } + } else { + if (copy_to_user(buffer, buf, tsz)) { + ret = -EFAULT; + goto out; + } + } + } else { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } + } + break; + default: + pr_warn_once("Unhandled KCORE type: %d\n", m->type); + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } + } +skip: + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + start += tsz; + tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); + } + +out: + page_offline_thaw(); + up_read(&kclist_lock); + if (ret) + return ret; + return orig_buflen - buflen; +} + +static int open_kcore(struct inode *inode, struct file *filp) +{ + int ret = security_locked_down(LOCKDOWN_KCORE); + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (ret) + return ret; + + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!filp->private_data) + return -ENOMEM; + + if (kcore_need_update) + kcore_update_ram(); + if (i_size_read(inode) != proc_root_kcore->size) { + inode_lock(inode); + i_size_write(inode, proc_root_kcore->size); + inode_unlock(inode); + } + return 0; +} + +static int release_kcore(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct proc_ops kcore_proc_ops = { + .proc_read = read_kcore, + .proc_open = open_kcore, + .proc_release = release_kcore, + .proc_lseek = default_llseek, +}; + +/* just remember that we have to update kcore */ +static int __meminit kcore_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + kcore_need_update = 1; + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kcore_callback_nb __meminitdata = { + .notifier_call = kcore_callback, + .priority = 0, +}; + +static struct kcore_list kcore_vmalloc; + +#ifdef CONFIG_ARCH_PROC_KCORE_TEXT +static struct kcore_list kcore_text; +/* + * If defined, special segment is used for mapping kernel text instead of + * direct-map area. We need to create special TEXT section. + */ +static void __init proc_kcore_text_init(void) +{ + kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT); +} +#else +static void __init proc_kcore_text_init(void) +{ +} +#endif + +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) +/* + * MODULES_VADDR has no intersection with VMALLOC_ADDR. + */ +static struct kcore_list kcore_modules; +static void __init add_modules_range(void) +{ + if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { + kclist_add(&kcore_modules, (void *)MODULES_VADDR, + MODULES_END - MODULES_VADDR, KCORE_VMALLOC); + } +} +#else +static void __init add_modules_range(void) +{ +} +#endif + +static int __init proc_kcore_init(void) +{ + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops); + if (!proc_root_kcore) { + pr_err("couldn't create /proc/kcore\n"); + return 0; /* Always returns 0. */ + } + /* Store text area if it's special */ + proc_kcore_text_init(); + /* Store vmalloc area */ + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END - VMALLOC_START, KCORE_VMALLOC); + add_modules_range(); + /* Store direct-map area from physical memory map */ + kcore_update_ram(); + register_hotmemory_notifier(&kcore_callback_nb); + + return 0; +} +fs_initcall(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c new file mode 100644 index 000000000..2fc92a13f --- /dev/null +++ b/fs/proc/kmsg.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/proc/kmsg.c + * + * Copyright (C) 1992 by Linus Torvalds + * + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/fs.h> +#include <linux/syslog.h> + +#include <asm/io.h> + +static int kmsg_open(struct inode * inode, struct file * file) +{ + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); +} + +static int kmsg_release(struct inode * inode, struct file * file) +{ + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); + return 0; +} + +static ssize_t kmsg_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) + return -EAGAIN; + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); +} + +static __poll_t kmsg_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &log_wait, wait); + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + + +static const struct proc_ops kmsg_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_read = kmsg_read, + .proc_poll = kmsg_poll, + .proc_open = kmsg_open, + .proc_release = kmsg_release, + .proc_lseek = generic_file_llseek, +}; + +static int __init proc_kmsg_init(void) +{ + proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops); + return 0; +} +fs_initcall(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c new file mode 100644 index 000000000..817981e57 --- /dev/null +++ b/fs/proc/loadavg.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/pid_namespace.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/stat.h> +#include <linux/seq_file.h> +#include <linux/seqlock.h> +#include <linux/time.h> +#include "internal.h" + +static int loadavg_proc_show(struct seq_file *m, void *v) +{ + unsigned long avnrun[3]; + + get_avenrun(avnrun, FIXED_1/200, 0); + + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), + nr_running(), nr_threads, + idr_get_cursor(&task_active_pid_ns(current)->idr) - 1); + return 0; +} + +static int __init proc_loadavg_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); + return 0; +} +fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c new file mode 100644 index 000000000..440960110 --- /dev/null +++ b/fs/proc/meminfo.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/mmzone.h> +#include <linux/proc_fs.h> +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/swap.h> +#include <linux/vmstat.h> +#include <linux/atomic.h> +#include <linux/vmalloc.h> +#ifdef CONFIG_CMA +#include <linux/cma.h> +#endif +#include <asm/page.h> +#include "internal.h" + +void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) +{ +} + +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) +{ + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); + seq_write(m, " kB\n", 4); +} + +static int meminfo_proc_show(struct seq_file *m, void *v) +{ + struct sysinfo i; + unsigned long committed; + long cached; + long available; + unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; + int lru; + + si_meminfo(&i); + si_swapinfo(&i); + committed = vm_memory_committed(); + + cached = global_node_page_state(NR_FILE_PAGES) - + total_swapcache_pages() - i.bufferram; + if (cached < 0) + cached = 0; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + available = si_mem_available(); + sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); + sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); + + show_val_kb(m, "MemTotal: ", i.totalram); + show_val_kb(m, "MemFree: ", i.freeram); + show_val_kb(m, "MemAvailable: ", available); + show_val_kb(m, "Buffers: ", i.bufferram); + show_val_kb(m, "Cached: ", cached); + show_val_kb(m, "SwapCached: ", total_swapcache_pages()); + show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + + pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + + pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); + show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); + show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); + show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); + +#ifdef CONFIG_HIGHMEM + show_val_kb(m, "HighTotal: ", i.totalhigh); + show_val_kb(m, "HighFree: ", i.freehigh); + show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); + show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); +#endif + +#ifndef CONFIG_MMU + show_val_kb(m, "MmapCopy: ", + (unsigned long)atomic_long_read(&mmap_pages_allocated)); +#endif + + show_val_kb(m, "SwapTotal: ", i.totalswap); + show_val_kb(m, "SwapFree: ", i.freeswap); +#ifdef CONFIG_ZSWAP + seq_printf(m, "Zswap: %8lu kB\n", + (unsigned long)(zswap_pool_total_size >> 10)); + seq_printf(m, "Zswapped: %8lu kB\n", + (unsigned long)atomic_read(&zswap_stored_pages) << + (PAGE_SHIFT - 10)); +#endif + show_val_kb(m, "Dirty: ", + global_node_page_state(NR_FILE_DIRTY)); + show_val_kb(m, "Writeback: ", + global_node_page_state(NR_WRITEBACK)); + show_val_kb(m, "AnonPages: ", + global_node_page_state(NR_ANON_MAPPED)); + show_val_kb(m, "Mapped: ", + global_node_page_state(NR_FILE_MAPPED)); + show_val_kb(m, "Shmem: ", i.sharedram); + show_val_kb(m, "KReclaimable: ", sreclaimable + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); + show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); + show_val_kb(m, "SReclaimable: ", sreclaimable); + show_val_kb(m, "SUnreclaim: ", sunreclaim); + seq_printf(m, "KernelStack: %8lu kB\n", + global_node_page_state(NR_KERNEL_STACK_KB)); +#ifdef CONFIG_SHADOW_CALL_STACK + seq_printf(m, "ShadowCallStack:%8lu kB\n", + global_node_page_state(NR_KERNEL_SCS_KB)); +#endif + show_val_kb(m, "PageTables: ", + global_node_page_state(NR_PAGETABLE)); + show_val_kb(m, "SecPageTables: ", + global_node_page_state(NR_SECONDARY_PAGETABLE)); + + show_val_kb(m, "NFS_Unstable: ", 0); + show_val_kb(m, "Bounce: ", + global_zone_page_state(NR_BOUNCE)); + show_val_kb(m, "WritebackTmp: ", + global_node_page_state(NR_WRITEBACK_TEMP)); + show_val_kb(m, "CommitLimit: ", vm_commit_limit()); + show_val_kb(m, "Committed_AS: ", committed); + seq_printf(m, "VmallocTotal: %8lu kB\n", + (unsigned long)VMALLOC_TOTAL >> 10); + show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); + show_val_kb(m, "VmallocChunk: ", 0ul); + show_val_kb(m, "Percpu: ", pcpu_nr_pages()); + +#ifdef CONFIG_MEMORY_FAILURE + seq_printf(m, "HardwareCorrupted: %5lu kB\n", + atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + show_val_kb(m, "AnonHugePages: ", + global_node_page_state(NR_ANON_THPS)); + show_val_kb(m, "ShmemHugePages: ", + global_node_page_state(NR_SHMEM_THPS)); + show_val_kb(m, "ShmemPmdMapped: ", + global_node_page_state(NR_SHMEM_PMDMAPPED)); + show_val_kb(m, "FileHugePages: ", + global_node_page_state(NR_FILE_THPS)); + show_val_kb(m, "FilePmdMapped: ", + global_node_page_state(NR_FILE_PMDMAPPED)); +#endif + +#ifdef CONFIG_CMA + show_val_kb(m, "CmaTotal: ", totalcma_pages); + show_val_kb(m, "CmaFree: ", + global_zone_page_state(NR_FREE_CMA_PAGES)); +#endif + + hugetlb_report_meminfo(m); + + arch_report_meminfo(m); + + return 0; +} + +static int __init proc_meminfo_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); + return 0; +} +fs_initcall(proc_meminfo_init); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 000000000..8e159fc78 --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/proc_fs.h> +#include <linux/nsproxy.h> +#include <linux/ptrace.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <net/net_namespace.h> +#include <linux/ipc_namespace.h> +#include <linux/pid_namespace.h> +#include <linux/user_namespace.h> +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif +#ifdef CONFIG_PID_NS + &pidns_operations, + &pidns_for_children_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, +#endif + &mntns_operations, +#ifdef CONFIG_CGROUPS + &cgroupns_operations, +#endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif +}; + +static const char *proc_ns_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + struct task_struct *task; + struct path ns_path; + int error = -EACCES; + + if (!dentry) + return ERR_PTR(-ECHILD); + + task = get_proc_task(inode); + if (!task) + return ERR_PTR(-EACCES); + + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto out; + + error = ns_get_path(&ns_path, task, ns_ops); + if (error) + goto out; + + error = nd_jump_link(&ns_path); +out: + put_task_struct(task); + return ERR_PTR(error); +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = d_inode(dentry); + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + struct task_struct *task; + char name[50]; + int res = -EACCES; + + task = get_proc_task(inode); + if (!task) + return res; + + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { + res = ns_get_name(name, sizeof(name), task, ns_ops); + if (res >= 0) + res = readlink_copy(buffer, buflen, name); + } + put_task_struct(task); + return res; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .get_link = proc_ns_get_link, + .setattr = proc_setattr, +}; + +static struct dentry *proc_ns_instantiate(struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + inode->i_op = &proc_ns_link_inode_operations; + ei->ns_ops = ns_ops; + pid_update_inode(task, inode); + + d_set_d_op(dentry, &pid_dentry_operations); + return d_splice_alias(inode, dentry); +} + +static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + const struct proc_ns_operations **entry, **last; + + if (!task) + return -ENOENT; + + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) + goto out; + entry = ns_entries + (ctx->pos - 2); + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + const struct proc_ns_operations *ops = *entry; + if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops)) + break; + ctx->pos++; + entry++; + } +out: + put_task_struct(task); + return 0; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .iterate_shared = proc_ns_dir_readdir, + .llseek = generic_file_llseek, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + struct dentry *res = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + if (entry == last) + goto out; + + res = proc_ns_instantiate(dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return res; +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c new file mode 100644 index 000000000..4d3493579 --- /dev/null +++ b/fs/proc/nommu.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* nommu.c: mmu-less memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/mman.h> +#include <linux/proc_fs.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/seq_file.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <asm/tlb.h> +#include <asm/div64.h> +#include "internal.h" + +/* + * display a single region to a sequenced file + */ +static int nommu_region_show(struct seq_file *m, struct vm_region *region) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags; + + flags = region->vm_flags; + file = region->vm_file; + + if (file) { + struct inode *inode = file_inode(region->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + region->vm_start, + region->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino); + + if (file) { + seq_pad(m, ' '); + seq_file_path(m, file, ""); + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display a list of all the REGIONs the kernel knows about + * - nommu kernels have a single flat list + */ +static int nommu_region_list_show(struct seq_file *m, void *_p) +{ + struct rb_node *p = _p; + + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); +} + +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) +{ + struct rb_node *p; + loff_t pos = *_pos; + + down_read(&nommu_region_sem); + + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; +} + +static void nommu_region_list_stop(struct seq_file *m, void *v) +{ + up_read(&nommu_region_sem); +} + +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return rb_next((struct rb_node *) v); +} + +static const struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show +}; + +static int __init proc_nommu_init(void) +{ + proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop); + return 0; +} + +fs_initcall(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c new file mode 100644 index 000000000..f2273b164 --- /dev/null +++ b/fs/proc/page.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/memblock.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/ksm.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/huge_mm.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/hugetlb.h> +#include <linux/memremap.h> +#include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> +#include <linux/kernel-page-flags.h> +#include <linux/uaccess.h> +#include "internal.h" + +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) +#define KPMBITS (KPMSIZE * BITS_PER_BYTE) + +static inline unsigned long get_max_dump_pfn(void) +{ +#ifdef CONFIG_SPARSEMEM + /* + * The memmap of early sections is completely populated and marked + * online even if max_pfn does not fall on a section boundary - + * pfn_to_online_page() will succeed on all pages. Allow inspecting + * these memmaps. + */ + return round_up(max_pfn, PAGES_PER_SECTION); +#else + return max_pfn; +#endif +} + +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page count. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + const unsigned long max_dump_pfn = get_max_dump_pfn(); + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); + + while (count > 0) { + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); + + if (!ppage || PageSlab(ppage) || page_has_type(ppage)) + pcount = 0; + else + pcount = page_mapcount(ppage); + + if (put_user(pcount, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + + cond_resched(); + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_lseek = mem_lseek, + .proc_read = kpagecount_read, +}; + +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} + +u64 stable_page_flags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + if (PageKsm(page)) + u |= 1 << KPF_KSM; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * to make sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + if (PageLRU(head) || PageAnon(head)) + u |= 1 << KPF_THP; + else if (is_huge_zero_page(head)) { + u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } + } else if (is_zero_pfn(page_to_pfn(page))) + u |= 1 << KPF_ZERO_PAGE; + + + /* + * Caveats on high order pages: page->_refcount will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. + */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + else if (page_count(page) == 0 && is_free_buddy_page(page)) + u |= 1 << KPF_BUDDY; + + if (PageOffline(page)) + u |= 1 << KPF_OFFLINE; + if (PageTable(page)) + u |= 1 << KPF_PGTABLE; + + if (page_is_idle(page)) + u |= 1 << KPF_IDLE; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + if (PageTail(page) && PageSlab(compound_head(page))) + u |= 1 << KPF_SLAB; + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + if (PageSwapCache(page)) + u |= 1 << KPF_SWAPCACHE; + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); + +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); +#ifdef CONFIG_64BIT + u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif + + return u; +}; + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + const unsigned long max_dump_pfn = get_max_dump_pfn(); + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + + pfn = src / KPMSIZE; + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); + + while (count > 0) { + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); + + if (put_user(stable_page_flags(ppage), out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + + cond_resched(); + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_lseek = mem_lseek, + .proc_read = kpageflags_read, +}; + +#ifdef CONFIG_MEMCG +static ssize_t kpagecgroup_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + const unsigned long max_dump_pfn = get_max_dump_pfn(); + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 ino; + + pfn = src / KPMSIZE; + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); + + while (count > 0) { + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); + + if (ppage) + ino = page_cgroup_ino(ppage); + else + ino = 0; + + if (put_user(ino, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + + cond_resched(); + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_lseek = mem_lseek, + .proc_read = kpagecgroup_read, +}; +#endif /* CONFIG_MEMCG */ + +static int __init proc_page_init(void) +{ + proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops); + proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops); +#ifdef CONFIG_MEMCG + proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops); +#endif + return 0; +} +fs_initcall(proc_page_init); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c new file mode 100644 index 000000000..856839b8a --- /dev/null +++ b/fs/proc/proc_net.c @@ -0,0 +1,416 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/fs/proc/net.c + * + * Copyright (C) 2007 + * + * Author: Eric Biederman <ebiederm@xmission.com> + * + * proc net directory handling functions + */ +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/module.h> +#include <linux/bitops.h> +#include <linux/mount.h> +#include <linux/nsproxy.h> +#include <linux/uidgid.h> +#include <net/net_namespace.h> +#include <linux/seq_file.h> + +#include "internal.h" + +static inline struct net *PDE_NET(struct proc_dir_entry *pde) +{ + return pde->parent->data; +} + +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + +static int seq_open_net(struct inode *inode, struct file *file) +{ + unsigned int state_size = PDE(inode)->state_size; + struct seq_net_private *p; + struct net *net; + + WARN_ON_ONCE(state_size < sizeof(*p)); + + if (file->f_mode & FMODE_WRITE && !PDE(inode)->write) + return -EACCES; + + net = get_proc_net(inode); + if (!net) + return -ENXIO; + + p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); + if (!p) { + put_net(net); + return -ENOMEM; + } +#ifdef CONFIG_NET_NS + p->net = net; + netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); +#endif + return 0; +} + +static void seq_file_net_put_net(struct seq_file *seq) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *priv = seq->private; + + put_net_track(priv->net, &priv->ns_tracker); +#else + put_net(&init_net); +#endif +} + +static int seq_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + + seq_file_net_put_net(seq); + seq_release_private(ino, f); + return 0; +} + +static const struct proc_ops proc_net_seq_ops = { + .proc_open = seq_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release_net, +}; + +int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, + GFP_KERNEL); +#endif + return 0; +} + +void bpf_iter_fini_seq_net(void *priv_data) +{ +#ifdef CONFIG_NET_NS + struct seq_net_private *p = priv_data; + + put_net_track(p->net, &p->ns_tracker); +#endif +} + +struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_seq_ops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data); + +/** + * proc_create_net_data_write - Create a writable net_ns-specific proc file + * @name: The name of the file. + * @mode: The file's access mode. + * @parent: The parent directory in which to create. + * @ops: The seq_file ops with which to read the file. + * @write: The write method with which to 'modify' the file. + * @data: Data for retrieval by pde_data(). + * + * Create a network namespaced proc file in the @parent directory with the + * specified @name and @mode that allows reading of a file that displays a + * series of elements and also provides for the file accepting writes that have + * some arbitrary effect. + * + * The functions in the @ops table are used to iterate over items to be + * presented and extract the readable content using the seq_file interface. + * + * The @write function is called with the data copied into a kernel space + * scratch buffer and has a NUL appended for convenience. The buffer may be + * modified by the @write function. @write should return 0 on success. + * + * The @data value is accessible from the @show and @write functions by calling + * pde_data() on the file inode. The network namespace must be accessed by + * calling seq_file_net() on the seq_file struct. + */ +struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *ops, + proc_write_t write, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_seq_ops; + p->seq_ops = ops; + p->state_size = state_size; + p->write = write; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data_write); + +static int single_open_net(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + struct net *net; + int err; + + net = get_proc_net(inode); + if (!net) + return -ENXIO; + + err = single_open(file, de->single_show, net); + if (err) + put_net(net); + return err; +} + +static int single_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); +} + +static const struct proc_ops proc_net_single_ops = { + .proc_open = single_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = single_release_net, +}; + +struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_single_ops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single); + +/** + * proc_create_net_single_write - Create a writable net_ns-specific proc file + * @name: The name of the file. + * @mode: The file's access mode. + * @parent: The parent directory in which to create. + * @show: The seqfile show method with which to read the file. + * @write: The write method with which to 'modify' the file. + * @data: Data for retrieval by pde_data(). + * + * Create a network-namespaced proc file in the @parent directory with the + * specified @name and @mode that allows reading of a file that displays a + * single element rather than a series and also provides for the file accepting + * writes that have some arbitrary effect. + * + * The @show function is called to extract the readable content via the + * seq_file interface. + * + * The @write function is called with the data copied into a kernel space + * scratch buffer and has a NUL appended for convenience. The buffer may be + * modified by the @write function. @write should return 0 on success. + * + * The @data value is accessible from the @show and @write functions by calling + * pde_data() on the file inode. The network namespace must be accessed by + * calling seq_file_single_net() on the seq_file struct. + */ +struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), + proc_write_t write, + void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + pde_force_lookup(p); + p->proc_ops = &proc_net_single_ops; + p->single_show = show; + p->write = write; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single_write); + +static struct net *get_proc_task_net(struct inode *dir) +{ + struct task_struct *task; + struct nsproxy *ns; + struct net *net = NULL; + + rcu_read_lock(); + task = pid_task(proc_pid(dir), PIDTYPE_PID); + if (task != NULL) { + task_lock(task); + ns = task->nsproxy; + if (ns != NULL) + net = get_net(ns->net_ns); + task_unlock(task); + } + rcu_read_unlock(); + + return net; +} + +static struct dentry *proc_tgid_net_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + struct dentry *de; + struct net *net; + + de = ERR_PTR(-ENOENT); + net = get_proc_task_net(dir); + if (net != NULL) { + de = proc_lookup_de(dir, dentry, net->proc_net); + put_net(net); + } + return de; +} + +static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct net *net; + + net = get_proc_task_net(inode); + + generic_fillattr(&init_user_ns, inode, stat); + + if (net != NULL) { + stat->nlink = net->proc_net->nlink; + put_net(net); + } + + return 0; +} + +const struct inode_operations proc_net_inode_operations = { + .lookup = proc_tgid_net_lookup, + .getattr = proc_tgid_net_getattr, +}; + +static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) +{ + int ret; + struct net *net; + + ret = -EINVAL; + net = get_proc_task_net(file_inode(file)); + if (net != NULL) { + ret = proc_readdir_de(file, ctx, net->proc_net); + put_net(net); + } + return ret; +} + +const struct file_operations proc_net_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = proc_tgid_net_readdir, +}; + +static __net_init int proc_net_ns_init(struct net *net) +{ + struct proc_dir_entry *netd, *net_statd; + kuid_t uid; + kgid_t gid; + int err; + + /* + * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. + * Corresponding inode (PDE(inode) == net->proc_net) is never + * instantiated therefore blanket zeroing is fine. + * net->proc_net_stat inode is instantiated normally. + */ + err = -ENOMEM; + netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); + if (!netd) + goto out; + + netd->subdir = RB_ROOT; + netd->data = net; + netd->nlink = 2; + netd->namelen = 3; + netd->parent = &proc_root; + netd->name = netd->inline_name; + memcpy(netd->name, "net", 4); + + uid = make_kuid(net->user_ns, 0); + if (!uid_valid(uid)) + uid = netd->uid; + + gid = make_kgid(net->user_ns, 0); + if (!gid_valid(gid)) + gid = netd->gid; + + proc_set_user(netd, uid, gid); + + /* Seed dentry revalidation for /proc/${pid}/net */ + pde_force_lookup(netd); + + err = -EEXIST; + net_statd = proc_net_mkdir(net, "stat", netd); + if (!net_statd) + goto free_net; + + net->proc_net = netd; + net->proc_net_stat = net_statd; + return 0; + +free_net: + pde_free(netd); +out: + return err; +} + +static __net_exit void proc_net_ns_exit(struct net *net) +{ + remove_proc_entry("stat", net->proc_net); + pde_free(net->proc_net); +} + +static struct pernet_operations __net_initdata proc_net_ns_ops = { + .init = proc_net_ns_init, + .exit = proc_net_ns_exit, +}; + +int __init proc_net_init(void) +{ + proc_symlink("net", NULL, "self/net"); + + return register_pernet_subsys(&proc_net_ns_ops); +} diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c new file mode 100644 index 000000000..4a4c04a3b --- /dev/null +++ b/fs/proc/proc_sysctl.c @@ -0,0 +1,1951 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/sys support + */ +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/printk.h> +#include <linux/security.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/namei.h> +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/module.h> +#include <linux/bpf-cgroup.h> +#include <linux/mount.h> +#include <linux/kmemleak.h> +#include "internal.h" + +#define list_for_each_table_entry(entry, table) \ + for ((entry) = (table); (entry)->procname; (entry)++) + +static const struct dentry_operations proc_sys_dentry_operations; +static const struct file_operations proc_sys_file_operations; +static const struct inode_operations proc_sys_inode_operations; +static const struct file_operations proc_sys_dir_file_operations; +static const struct inode_operations proc_sys_dir_operations; + +/* Support for permanently empty directories */ + +struct ctl_table sysctl_mount_point[] = { + { } +}; + +/** + * register_sysctl_mount_point() - registers a sysctl mount point + * @path: path for the mount point + * + * Used to create a permanently empty directory to serve as mount point. + * There are some subtle but important permission checks this allows in the + * case of unprivileged mounts. + */ +struct ctl_table_header *register_sysctl_mount_point(const char *path) +{ + return register_sysctl(path, sysctl_mount_point); +} +EXPORT_SYMBOL(register_sysctl_mount_point); + +static bool is_empty_dir(struct ctl_table_header *head) +{ + return head->ctl_table[0].child == sysctl_mount_point; +} + +static void set_empty_dir(struct ctl_dir *dir) +{ + dir->header.ctl_table[0].child = sysctl_mount_point; +} + +static void clear_empty_dir(struct ctl_dir *dir) + +{ + dir->header.ctl_table[0].child = NULL; +} + +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + +static struct ctl_table root_table[] = { + { + .procname = "", + .mode = S_IFDIR|S_IRUGO|S_IXUGO, + }, + { } +}; +static struct ctl_table_root sysctl_table_root = { + .default_set.dir.header = { + {{.count = 1, + .nreg = 1, + .ctl_table = root_table }}, + .ctl_table_arg = root_table, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, + }, +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +static void drop_sysctl_table(struct ctl_table_header *header); +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry); +static int insert_links(struct ctl_table_header *head); +static void put_links(struct ctl_table_header *header); + +static void sysctl_print_dir(struct ctl_dir *dir) +{ + if (dir->header.parent) + sysctl_print_dir(dir->header.parent); + pr_cont("%s/", dir->header.ctl_table[0].procname); +} + +static int namecmp(const char *name1, int len1, const char *name2, int len2) +{ + int cmp; + + cmp = memcmp(name1, name2, min(len1, len2)); + if (cmp == 0) + cmp = len1 - len2; + return cmp; +} + +/* Called under sysctl_lock */ +static struct ctl_table *find_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + struct rb_node *node = dir->root.rb_node; + + while (node) + { + struct ctl_node *ctl_node; + const char *procname; + int cmp; + + ctl_node = rb_entry(node, struct ctl_node, node); + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + procname = entry->procname; + + cmp = namecmp(name, namelen, procname, strlen(procname)); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else { + *phead = head; + return entry; + } + } + return NULL; +} + +static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + struct rb_node **p = &head->parent->root.rb_node; + struct rb_node *parent = NULL; + const char *name = entry->procname; + int namelen = strlen(name); + + while (*p) { + struct ctl_table_header *parent_head; + struct ctl_table *parent_entry; + struct ctl_node *parent_node; + const char *parent_name; + int cmp; + + parent = *p; + parent_node = rb_entry(parent, struct ctl_node, node); + parent_head = parent_node->header; + parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; + parent_name = parent_entry->procname; + + cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + pr_err("sysctl duplicate entry: "); + sysctl_print_dir(head->parent); + pr_cont("%s\n", entry->procname); + return -EEXIST; + } + } + + rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); + return 0; +} + +static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + + rb_erase(node, &head->parent->root); +} + +static void init_header(struct ctl_table_header *head, + struct ctl_table_root *root, struct ctl_table_set *set, + struct ctl_node *node, struct ctl_table *table) +{ + head->ctl_table = table; + head->ctl_table_arg = table; + head->used = 0; + head->count = 1; + head->nreg = 1; + head->unregistering = NULL; + head->root = root; + head->set = set; + head->parent = NULL; + head->node = node; + INIT_HLIST_HEAD(&head->inodes); + if (node) { + struct ctl_table *entry; + + list_for_each_table_entry(entry, table) { + node->header = head; + node++; + } + } +} + +static void erase_header(struct ctl_table_header *head) +{ + struct ctl_table *entry; + + list_for_each_table_entry(entry, head->ctl_table) + erase_entry(head, entry); +} + +static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) +{ + struct ctl_table *entry; + int err; + + /* Is this a permanently empty directory? */ + if (is_empty_dir(&dir->header)) + return -EROFS; + + /* Am I creating a permanently empty directory? */ + if (header->ctl_table == sysctl_mount_point) { + if (!RB_EMPTY_ROOT(&dir->root)) + return -EINVAL; + set_empty_dir(dir); + } + + dir->header.nreg++; + header->parent = dir; + err = insert_links(header); + if (err) + goto fail_links; + list_for_each_table_entry(entry, header->ctl_table) { + err = insert_entry(header, entry); + if (err) + goto fail; + } + return 0; +fail: + erase_header(header); + put_links(header); +fail_links: + if (header->ctl_table == sysctl_mount_point) + clear_empty_dir(dir); + header->parent = NULL; + drop_sysctl_table(&dir->header); + return err; +} + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +static void proc_sys_invalidate_dcache(struct ctl_table_header *head) +{ + proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + spin_unlock(&sysctl_lock); + } + /* + * Invalidate dentries for unregistered sysctls: namespaced sysctls + * can have duplicate names and contaminate dcache very badly. + */ + proc_sys_invalidate_dcache(p); + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + spin_lock(&sysctl_lock); + erase_header(p); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + BUG_ON(!head); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root); + return set; +} + +static struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + spin_lock(&sysctl_lock); + entry = find_entry(&head, dir, name, namelen); + if (entry && use_table(head)) + *phead = head; + else + entry = NULL; + spin_unlock(&sysctl_lock); + return entry; +} + +static struct ctl_node *first_usable_entry(struct rb_node *node) +{ + struct ctl_node *ctl_node; + + for (;node; node = rb_next(node)) { + ctl_node = rb_entry(node, struct ctl_node, node); + if (use_table(ctl_node->header)) + return ctl_node; + } + return NULL; +} + +static void first_entry(struct ctl_dir *dir, + struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = NULL; + struct ctl_table *entry = NULL; + struct ctl_node *ctl_node; + + spin_lock(&sysctl_lock); + ctl_node = first_usable_entry(rb_first(&dir->root)); + spin_unlock(&sysctl_lock); + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = *phead; + struct ctl_table *entry = *pentry; + struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; + + spin_lock(&sysctl_lock); + unuse_table(head); + + ctl_node = first_usable_entry(rb_next(&ctl_node->node)); + spin_unlock(&sysctl_lock); + head = NULL; + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) + mode >>= 6; + else if (in_egroup_p(GLOBAL_ROOT_GID)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) +{ + struct ctl_table_root *root = head->root; + int mode; + + if (root->permissions) + mode = root->permissions(head, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + +static struct inode *proc_sys_make_inode(struct super_block *sb, + struct ctl_table_header *head, struct ctl_table *table) +{ + struct ctl_table_root *root = head->root; + struct inode *inode; + struct proc_inode *ei; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_ino = get_next_ino(); + + ei = PROC_I(inode); + + spin_lock(&sysctl_lock); + if (unlikely(head->unregistering)) { + spin_unlock(&sysctl_lock); + iput(inode); + return ERR_PTR(-ENOENT); + } + ei->sysctl = head; + ei->sysctl_entry = table; + hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); + head->count++; + spin_unlock(&sysctl_lock); + + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mode = table->mode; + if (!S_ISDIR(table->mode)) { + inode->i_mode |= S_IFREG; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + } else { + inode->i_mode |= S_IFDIR; + inode->i_op = &proc_sys_dir_operations; + inode->i_fop = &proc_sys_dir_file_operations; + if (is_empty_dir(head)) + make_empty_dir_inode(inode); + } + + if (root->set_ownership) + root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } + + return inode; +} + +void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *grab_header(struct inode *inode) +{ + struct ctl_table_header *head = PROC_I(inode)->sysctl; + if (!head) + head = &sysctl_table_root.default_set.dir.header; + return sysctl_head_grab(head); +} + +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ctl_table_header *head = grab_header(dir); + struct ctl_table_header *h = NULL; + const struct qstr *name = &dentry->d_name; + struct ctl_table *p; + struct inode *inode; + struct dentry *err = ERR_PTR(-ENOENT); + struct ctl_dir *ctl_dir; + int ret; + + if (IS_ERR(head)) + return ERR_CAST(head); + + ctl_dir = container_of(head, struct ctl_dir, header); + + p = lookup_entry(&h, ctl_dir, name->name, name->len); + if (!p) + goto out; + + if (S_ISLNK(p->mode)) { + ret = sysctl_follow_link(&h, &p); + err = ERR_PTR(ret); + if (ret) + goto out; + } + + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (IS_ERR(inode)) { + err = ERR_CAST(inode); + goto out; + } + + d_set_d_op(dentry, &proc_sys_dentry_operations); + err = d_splice_alias(inode, dentry); + +out: + if (h) + sysctl_head_finish(h); + sysctl_head_finish(head); + return err; +} + +static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, + int write) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + size_t count = iov_iter_count(iter); + char *kbuf; + ssize_t error; + + if (IS_ERR(head)) + return PTR_ERR(head); + + /* + * At this point we know that the sysctl was not unregistered + * and won't be until we finish. + */ + error = -EPERM; + if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) + goto out; + + /* if that can happen at all, it should be -EINVAL, not -EISDIR */ + error = -EINVAL; + if (!table->proc_handler) + goto out; + + /* don't even try if the size is too large */ + error = -ENOMEM; + if (count >= KMALLOC_MAX_SIZE) + goto out; + kbuf = kvzalloc(count + 1, GFP_KERNEL); + if (!kbuf) + goto out; + + if (write) { + error = -EFAULT; + if (!copy_from_iter_full(kbuf, count, iter)) + goto out_free_buf; + kbuf[count] = '\0'; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + &iocb->ki_pos); + if (error) + goto out_free_buf; + + /* careful: calling conventions are nasty here */ + error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_iter(kbuf, count, iter) < count) + goto out_free_buf; + } + + error = count; +out_free_buf: + kvfree(kbuf); +out: + sysctl_head_finish(head); + + return error; +} + +static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) +{ + return proc_sys_call_handler(iocb, iter, 0); +} + +static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) +{ + return proc_sys_call_handler(iocb, iter, 1); +} + +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return PTR_ERR(head); + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + sysctl_head_finish(head); + + return 0; +} + +static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = file_inode(filp); + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + __poll_t ret = DEFAULT_POLLMASK; + unsigned long event; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return EPOLLERR | EPOLLHUP; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + event = (unsigned long)filp->private_data; + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; + } + +out: + sysctl_head_finish(head); + + return ret; +} + +static bool proc_sys_fill_cache(struct file *file, + struct dir_context *ctx, + struct ctl_table_header *head, + struct ctl_table *table) +{ + struct dentry *child, *dir = file->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = table->procname; + qname.len = strlen(table->procname); + qname.hash = full_name_hash(dir, qname.name, qname.len); + + child = d_lookup(dir, &qname); + if (!child) { + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + return false; + if (d_in_lookup(child)) { + struct dentry *res; + inode = proc_sys_make_inode(dir->d_sb, head, table); + if (IS_ERR(inode)) { + d_lookup_done(child); + dput(child); + return false; + } + d_set_d_op(child, &proc_sys_dentry_operations); + res = d_splice_alias(inode, child); + d_lookup_done(child); + if (unlikely(res)) { + if (IS_ERR(res)) { + dput(child); + return false; + } + dput(child); + child = res; + } + } + } + inode = d_inode(child); + ino = inode->i_ino; + type = inode->i_mode >> 12; + dput(child); + return dir_emit(ctx, qname.name, qname.len, ino, type); +} + +static bool proc_sys_link_fill_cache(struct file *file, + struct dir_context *ctx, + struct ctl_table_header *head, + struct ctl_table *table) +{ + bool ret = true; + + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; + + /* It is not an error if we can not follow the link ignore it */ + if (sysctl_follow_link(&head, &table)) + goto out; + + ret = proc_sys_fill_cache(file, ctx, head, table); +out: + sysctl_head_finish(head); + return ret; +} + +static int scan(struct ctl_table_header *head, struct ctl_table *table, + unsigned long *pos, struct file *file, + struct dir_context *ctx) +{ + bool res; + + if ((*pos)++ < ctx->pos) + return true; + + if (unlikely(S_ISLNK(table->mode))) + res = proc_sys_link_fill_cache(file, ctx, head, table); + else + res = proc_sys_fill_cache(file, ctx, head, table); + + if (res) + ctx->pos = *pos; + + return res; +} + +static int proc_sys_readdir(struct file *file, struct dir_context *ctx) +{ + struct ctl_table_header *head = grab_header(file_inode(file)); + struct ctl_table_header *h = NULL; + struct ctl_table *entry; + struct ctl_dir *ctl_dir; + unsigned long pos; + + if (IS_ERR(head)) + return PTR_ERR(head); + + ctl_dir = container_of(head, struct ctl_dir, header); + + if (!dir_emit_dots(file, ctx)) + goto out; + + pos = 2; + + for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { + if (!scan(h, entry, &pos, file, ctx)) { + sysctl_head_finish(h); + break; + } + } +out: + sysctl_head_finish(head); + return 0; +} + +static int proc_sys_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) +{ + /* + * sysctl entries that are not writeable, + * are _NOT_ writeable, capabilities or not. + */ + struct ctl_table_header *head; + struct ctl_table *table; + int error; + + /* Executable files are not allowed under /proc/sys/ */ + if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) + return -EACCES; + + head = grab_header(inode); + if (IS_ERR(head)) + return PTR_ERR(head); + + table = PROC_I(inode)->sysctl_entry; + if (!table) /* global root - r-xr-xr-x */ + error = mask & MAY_WRITE ? -EACCES : 0; + else /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); + + sysctl_head_finish(head); + return error; +} + +static int proc_sys_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + int error; + + if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) + return -EPERM; + + error = setattr_prepare(&init_user_ns, dentry, attr); + if (error) + return error; + + setattr_copy(&init_user_ns, inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int proc_sys_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (IS_ERR(head)) + return PTR_ERR(head); + + generic_fillattr(&init_user_ns, inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; + + sysctl_head_finish(head); + return 0; +} + +static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, + .read_iter = proc_sys_read, + .write_iter = proc_sys_write, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .llseek = default_llseek, +}; + +static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, + .iterate_shared = proc_sys_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations proc_sys_inode_operations = { + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static const struct inode_operations proc_sys_dir_operations = { + .lookup = proc_sys_lookup, + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + return !PROC_I(d_inode(dentry))->sysctl->unregistering; +} + +static int proc_sys_delete(const struct dentry *dentry) +{ + return !!PROC_I(d_inode(dentry))->sysctl->unregistering; +} + +static int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + +static int proc_sys_compare(const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + struct ctl_table_header *head; + struct inode *inode; + + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + /* AV: can it, indeed? */ + inode = d_inode_rcu(dentry); + if (!inode) + return 1; + if (name->len != len) + return 1; + if (memcmp(name->name, str, len)) + return 1; + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); +} + +static const struct dentry_operations proc_sys_dentry_operations = { + .d_revalidate = proc_sys_revalidate, + .d_delete = proc_sys_delete, + .d_compare = proc_sys_compare, +}; + +static struct ctl_dir *find_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + entry = find_entry(&head, dir, name, namelen); + if (!entry) + return ERR_PTR(-ENOENT); + if (!S_ISDIR(entry->mode)) + return ERR_PTR(-ENOTDIR); + return container_of(head, struct ctl_dir, header); +} + +static struct ctl_dir *new_dir(struct ctl_table_set *set, + const char *name, int namelen) +{ + struct ctl_table *table; + struct ctl_dir *new; + struct ctl_node *node; + char *new_name; + + new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + + sizeof(struct ctl_table)*2 + namelen + 1, + GFP_KERNEL); + if (!new) + return NULL; + + node = (struct ctl_node *)(new + 1); + table = (struct ctl_table *)(node + 1); + new_name = (char *)(table + 2); + memcpy(new_name, name, namelen); + table[0].procname = new_name; + table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; + init_header(&new->header, set->dir.header.root, set, node, table); + + return new; +} + +/** + * get_subdir - find or create a subdir with the specified name. + * @dir: Directory to create the subdirectory in + * @name: The name of the subdirectory to find or create + * @namelen: The length of name + * + * Takes a directory with an elevated reference count so we know that + * if we drop the lock the directory will not go away. Upon success + * the reference is moved from @dir to the returned subdirectory. + * Upon error an error code is returned and the reference on @dir is + * simply dropped. + */ +static struct ctl_dir *get_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_set *set = dir->header.set; + struct ctl_dir *subdir, *new = NULL; + int err; + + spin_lock(&sysctl_lock); + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + spin_unlock(&sysctl_lock); + new = new_dir(set, name, namelen); + spin_lock(&sysctl_lock); + subdir = ERR_PTR(-ENOMEM); + if (!new) + goto failed; + + /* Was the subdir added while we dropped the lock? */ + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + /* Nope. Use the our freshly made directory entry. */ + err = insert_header(dir, &new->header); + subdir = ERR_PTR(err); + if (err) + goto failed; + subdir = new; +found: + subdir->header.nreg++; +failed: + if (IS_ERR(subdir)) { + pr_err("sysctl could not get directory: "); + sysctl_print_dir(dir); + pr_cont("%*.*s %ld\n", namelen, namelen, name, + PTR_ERR(subdir)); + } + drop_sysctl_table(&dir->header); + if (new) + drop_sysctl_table(&new->header); + spin_unlock(&sysctl_lock); + return subdir; +} + +static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) +{ + struct ctl_dir *parent; + const char *procname; + if (!dir->header.parent) + return &set->dir; + parent = xlate_dir(set, dir->header.parent); + if (IS_ERR(parent)) + return parent; + procname = dir->header.ctl_table[0].procname; + return find_subdir(parent, procname, strlen(procname)); +} + +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry) +{ + struct ctl_table_header *head; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_table *entry; + struct ctl_dir *dir; + int ret; + + spin_lock(&sysctl_lock); + root = (*pentry)->data; + set = lookup_header_set(root); + dir = xlate_dir(set, (*phead)->parent); + if (IS_ERR(dir)) + ret = PTR_ERR(dir); + else { + const char *procname = (*pentry)->procname; + head = NULL; + entry = find_entry(&head, dir, procname, strlen(procname)); + ret = -ENOENT; + if (entry && use_table(head)) { + unuse_table(*phead); + *phead = head; + *pentry = entry; + ret = 0; + } + } + + spin_unlock(&sysctl_lock); + return ret; +} + +static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); + + va_end(args); + return -EINVAL; +} + +static int sysctl_check_table_array(const char *path, struct ctl_table *table) +{ + int err = 0; + + if ((table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax)) { + if (table->maxlen != sizeof(unsigned int)) + err |= sysctl_err(path, table, "array not allowed"); + } + + if (table->proc_handler == proc_dou8vec_minmax) { + if (table->maxlen != sizeof(u8)) + err |= sysctl_err(path, table, "array not allowed"); + } + + return err; +} + +static int sysctl_check_table(const char *path, struct ctl_table *table) +{ + struct ctl_table *entry; + int err = 0; + list_for_each_table_entry(entry, table) { + if (entry->child) + err |= sysctl_err(path, entry, "Not a file"); + + if ((entry->proc_handler == proc_dostring) || + (entry->proc_handler == proc_dointvec) || + (entry->proc_handler == proc_douintvec) || + (entry->proc_handler == proc_douintvec_minmax) || + (entry->proc_handler == proc_dointvec_minmax) || + (entry->proc_handler == proc_dou8vec_minmax) || + (entry->proc_handler == proc_dointvec_jiffies) || + (entry->proc_handler == proc_dointvec_userhz_jiffies) || + (entry->proc_handler == proc_dointvec_ms_jiffies) || + (entry->proc_handler == proc_doulongvec_minmax) || + (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!entry->data) + err |= sysctl_err(path, entry, "No data"); + if (!entry->maxlen) + err |= sysctl_err(path, entry, "No maxlen"); + else + err |= sysctl_check_table_array(path, entry); + } + if (!entry->proc_handler) + err |= sysctl_err(path, entry, "No proc_handler"); + + if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) + err |= sysctl_err(path, entry, "bogus .mode 0%o", + entry->mode); + } + return err; +} + +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, + struct ctl_table_root *link_root) +{ + struct ctl_table *link_table, *entry, *link; + struct ctl_table_header *links; + struct ctl_node *node; + char *link_name; + int nr_entries, name_bytes; + + name_bytes = 0; + nr_entries = 0; + list_for_each_table_entry(entry, table) { + nr_entries++; + name_bytes += strlen(entry->procname) + 1; + } + + links = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries + + sizeof(struct ctl_table)*(nr_entries + 1) + + name_bytes, + GFP_KERNEL); + + if (!links) + return NULL; + + node = (struct ctl_node *)(links + 1); + link_table = (struct ctl_table *)(node + nr_entries); + link_name = (char *)&link_table[nr_entries + 1]; + link = link_table; + + list_for_each_table_entry(entry, table) { + int len = strlen(entry->procname) + 1; + memcpy(link_name, entry->procname, len); + link->procname = link_name; + link->mode = S_IFLNK|S_IRWXUGO; + link->data = link_root; + link_name += len; + link++; + } + init_header(links, dir->header.root, dir->header.set, node, link_table); + links->nreg = nr_entries; + + return links; +} + +static bool get_links(struct ctl_dir *dir, + struct ctl_table *table, struct ctl_table_root *link_root) +{ + struct ctl_table_header *head; + struct ctl_table *entry, *link; + + /* Are there links available for every entry in table? */ + list_for_each_table_entry(entry, table) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + if (!link) + return false; + if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) + continue; + if (S_ISLNK(link->mode) && (link->data == link_root)) + continue; + return false; + } + + /* The checks passed. Increase the registration count on the links */ + list_for_each_table_entry(entry, table) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + head->nreg++; + } + return true; +} + +static int insert_links(struct ctl_table_header *head) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_dir *core_parent; + struct ctl_table_header *links; + int err; + + if (head->set == root_set) + return 0; + + core_parent = xlate_dir(root_set, head->parent); + if (IS_ERR(core_parent)) + return 0; + + if (get_links(core_parent, head->ctl_table, head->root)) + return 0; + + core_parent->header.nreg++; + spin_unlock(&sysctl_lock); + + links = new_links(core_parent, head->ctl_table, head->root); + + spin_lock(&sysctl_lock); + err = -ENOMEM; + if (!links) + goto out; + + err = 0; + if (get_links(core_parent, head->ctl_table, head->root)) { + kfree(links); + goto out; + } + + err = insert_header(core_parent, links); + if (err) + kfree(links); +out: + drop_sysctl_table(&core_parent->header); + return err; +} + +/** + * __register_sysctl_table - register a leaf sysctl table + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure without any child. This table + * should not be free'd after registration. So it should not be + * used on stack. It can either be a global or dynamically allocated + * by the caller and free'd later after sysctl unregistration. + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file + * + * child - must be %NULL. + * + * proc_handler - the text handler routine (described below) + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * XXX: we should eventually modify these to use long min / max [0] + * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes (where child is not NULL) are not allowed, + * sysctl_check_table() verifies this. + * + * There must be a proc_handler routine for any terminal nodes. + * Several default handlers are available to cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_table( + struct ctl_table_set *set, + const char *path, struct ctl_table *table) +{ + struct ctl_table_root *root = set->dir.header.root; + struct ctl_table_header *header; + const char *name, *nextname; + struct ctl_dir *dir; + struct ctl_table *entry; + struct ctl_node *node; + int nr_entries = 0; + + list_for_each_table_entry(entry, table) + nr_entries++; + + header = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT); + if (!header) + return NULL; + + node = (struct ctl_node *)(header + 1); + init_header(header, root, set, node, table); + if (sysctl_check_table(path, table)) + goto fail; + + spin_lock(&sysctl_lock); + dir = &set->dir; + /* Reference moved down the directory tree get_subdir */ + dir->header.nreg++; + spin_unlock(&sysctl_lock); + + /* Find the directory for the ctl_table */ + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + /* + * namelen ensures if name is "foo/bar/yay" only foo is + * registered first. We traverse as if using mkdir -p and + * return a ctl_dir for the last directory entry. + */ + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + goto fail; + } + + spin_lock(&sysctl_lock); + if (insert_header(dir, header)) + goto fail_put_dir_locked; + + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); + + return header; + +fail_put_dir_locked: + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); +fail: + kfree(header); + dump_stack(); + return NULL; +} + +/** + * register_sysctl - register a sysctl table + * @path: The path to the directory the sysctl table is in. If the path + * doesn't exist we will create it for you. + * @table: the table structure. The calller must ensure the life of the @table + * will be kept during the lifetime use of the syctl. It must not be freed + * until unregister_sysctl_table() is called with the given returned table + * with this registration. If your code is non modular then you don't need + * to call unregister_sysctl_table() and can instead use something like + * register_sysctl_init() which does not care for the result of the syctl + * registration. + * + * Register a sysctl table. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +{ + return __register_sysctl_table(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl); + +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base. If that path doesn't exist we will create + * it for you. + * @table: This is the sysctl table that needs to be registered to the path. + * The caller must ensure the life of the @table will be kept during the + * lifetime use of the sysctl. + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: if your base directory does not exist it will be created for you. + */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + +static char *append_path(const char *path, char *pos, const char *name) +{ + int namelen; + namelen = strlen(name); + if (((pos - path) + namelen + 2) >= PATH_MAX) + return NULL; + memcpy(pos, name, namelen); + pos[namelen] = '/'; + pos[namelen + 1] = '\0'; + pos += namelen + 1; + return pos; +} + +static int count_subheaders(struct ctl_table *table) +{ + int has_files = 0; + int nr_subheaders = 0; + struct ctl_table *entry; + + /* special case: no directory and empty directory */ + if (!table || !table->procname) + return 1; + + list_for_each_table_entry(entry, table) { + if (entry->child) + nr_subheaders += count_subheaders(entry->child); + else + has_files = 1; + } + return nr_subheaders + has_files; +} + +static int register_leaf_sysctl_tables(const char *path, char *pos, + struct ctl_table_header ***subheader, struct ctl_table_set *set, + struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = NULL; + struct ctl_table *entry, *files; + int nr_files = 0; + int nr_dirs = 0; + int err = -ENOMEM; + + list_for_each_table_entry(entry, table) { + if (entry->child) + nr_dirs++; + else + nr_files++; + } + + files = table; + /* If there are mixed files and directories we need a new table */ + if (nr_dirs && nr_files) { + struct ctl_table *new; + files = kcalloc(nr_files + 1, sizeof(struct ctl_table), + GFP_KERNEL); + if (!files) + goto out; + + ctl_table_arg = files; + new = files; + + list_for_each_table_entry(entry, table) { + if (entry->child) + continue; + *new = *entry; + new++; + } + } + + /* Register everything except a directory full of subdirectories */ + if (nr_files || !nr_dirs) { + struct ctl_table_header *header; + header = __register_sysctl_table(set, path, files); + if (!header) { + kfree(ctl_table_arg); + goto out; + } + + /* Remember if we need to free the file table */ + header->ctl_table_arg = ctl_table_arg; + **subheader = header; + (*subheader)++; + } + + /* Recurse into the subdirectories. */ + list_for_each_table_entry(entry, table) { + char *child_pos; + + if (!entry->child) + continue; + + err = -ENAMETOOLONG; + child_pos = append_path(path, pos, entry->procname); + if (!child_pos) + goto out; + + err = register_leaf_sysctl_tables(path, child_pos, subheader, + set, entry->child); + pos[0] = '\0'; + if (err) + goto out; + } + err = 0; +out: + /* On failure our caller will unregister all registered subheaders */ + return err; +} + +/** + * __register_sysctl_paths - register a sysctl table hierarchy + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * We are slowly deprecating this call so avoid its use. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_set *set, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = table; + int nr_subheaders = count_subheaders(table); + struct ctl_table_header *header = NULL, **subheaders, **subheader; + const struct ctl_path *component; + char *new_path, *pos; + + pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!new_path) + return NULL; + + pos[0] = '\0'; + for (component = path; component->procname; component++) { + pos = append_path(new_path, pos, component->procname); + if (!pos) + goto out; + } + while (table->procname && table->child && !table[1].procname) { + pos = append_path(new_path, pos, table->procname); + if (!pos) + goto out; + table = table->child; + } + if (nr_subheaders == 1) { + header = __register_sysctl_table(set, new_path, table); + if (header) + header->ctl_table_arg = ctl_table_arg; + } else { + header = kzalloc(sizeof(*header) + + sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); + if (!header) + goto out; + + subheaders = (struct ctl_table_header **) (header + 1); + subheader = subheaders; + header->ctl_table_arg = ctl_table_arg; + + if (register_leaf_sysctl_tables(new_path, pos, &subheader, + set, table)) + goto err_register_leaves; + } + +out: + kfree(new_path); + return header; + +err_register_leaves: + while (subheader > subheaders) { + struct ctl_table_header *subh = *(--subheader); + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + header = NULL; + goto out; +} + +/** + * register_sysctl_paths - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * We are slowly deprecating this caller so avoid future uses of it. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +int __register_sysctl_base(struct ctl_table *base_table) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(base_table); + kmemleak_not_leak(hdr); + return 0; +} + +static void put_links(struct ctl_table_header *header) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_table_root *root = header->root; + struct ctl_dir *parent = header->parent; + struct ctl_dir *core_parent; + struct ctl_table *entry; + + if (header->set == root_set) + return; + + core_parent = xlate_dir(root_set, parent); + if (IS_ERR(core_parent)) + return; + + list_for_each_table_entry(entry, header->ctl_table) { + struct ctl_table_header *link_head; + struct ctl_table *link; + const char *name = entry->procname; + + link = find_entry(&link_head, core_parent, name, strlen(name)); + if (link && + ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || + (S_ISLNK(link->mode) && (link->data == root)))) { + drop_sysctl_table(link_head); + } + else { + pr_err("sysctl link missing during unregister: "); + sysctl_print_dir(parent); + pr_cont("%s\n", name); + } + } +} + +static void drop_sysctl_table(struct ctl_table_header *header) +{ + struct ctl_dir *parent = header->parent; + + if (--header->nreg) + return; + + if (parent) { + put_links(header); + start_unregistering(header); + } + + if (!--header->count) + kfree_rcu(header, rcu); + + if (parent) + drop_sysctl_table(&parent->header); +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + int nr_subheaders; + might_sleep(); + + if (header == NULL) + return; + + nr_subheaders = count_subheaders(header->ctl_table_arg); + if (unlikely(nr_subheaders > 1)) { + struct ctl_table_header **subheaders; + int i; + + subheaders = (struct ctl_table_header **)(header + 1); + for (i = nr_subheaders -1; i >= 0; i--) { + struct ctl_table_header *subh = subheaders[i]; + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + return; + } + + spin_lock(&sysctl_lock); + drop_sysctl_table(header); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *set, + struct ctl_table_root *root, + int (*is_seen)(struct ctl_table_set *)) +{ + memset(set, 0, sizeof(*set)); + set->is_seen = is_seen; + init_header(&set->dir.header, root, set, NULL, root_table); +} + +void retire_sysctl_set(struct ctl_table_set *set) +{ + WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); +} + +int __init proc_sys_init(void) +{ + struct proc_dir_entry *proc_sys_root; + + proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root->proc_iops = &proc_sys_dir_operations; + proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; + proc_sys_root->nlink = 0; + + return sysctl_init_bases(); +} + +struct sysctl_alias { + const char *kernel_param; + const char *sysctl_param; +}; + +/* + * Historically some settings had both sysctl and a command line parameter. + * With the generic sysctl. parameter support, we can handle them at a single + * place and only keep the historical name for compatibility. This is not meant + * to add brand new aliases. When adding existing aliases, consider whether + * the possibly different moment of changing the value (e.g. from early_param + * to the moment do_sysctl_args() is called) is an issue for the specific + * parameter. + */ +static const struct sysctl_alias sysctl_aliases[] = { + {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, + {"hung_task_panic", "kernel.hung_task_panic" }, + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, + { } +}; + +static const char *sysctl_find_alias(char *param) +{ + const struct sysctl_alias *alias; + + for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { + if (strcmp(alias->kernel_param, param) == 0) + return alias->sysctl_param; + } + + return NULL; +} + +bool sysctl_is_alias(char *param) +{ + const char *alias = sysctl_find_alias(param); + + return alias != NULL; +} + +/* Set sysctl value passed on kernel command line. */ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + } else { + param = (char *) sysctl_find_alias(param); + if (!param) + return 0; + } + + if (!val) + return -EINVAL; + len = strlen(val); + if (len == 0) + return -EINVAL; + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. + */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); +} diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c new file mode 100644 index 000000000..5c6a5ceab --- /dev/null +++ b/fs/proc/proc_tty.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * proc_tty.c -- handles /proc/tty + * + * Copyright 1997, Theodore Ts'o + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/tty.h> +#include <linux/seq_file.h> +#include <linux/bitops.h> +#include "internal.h" + +/* + * The /proc/tty directory inodes... + */ +static struct proc_dir_entry *proc_tty_driver; + +/* + * This is the handler for /proc/tty/drivers + */ +static void show_tty_range(struct seq_file *m, struct tty_driver *p, + dev_t from, int num) +{ + seq_printf(m, "%-20s ", p->driver_name ? p->driver_name : "unknown"); + seq_printf(m, "/dev/%-8s ", p->name); + if (p->num > 1) { + seq_printf(m, "%3d %d-%d ", MAJOR(from), MINOR(from), + MINOR(from) + num - 1); + } else { + seq_printf(m, "%3d %7d ", MAJOR(from), MINOR(from)); + } + switch (p->type) { + case TTY_DRIVER_TYPE_SYSTEM: + seq_puts(m, "system"); + if (p->subtype == SYSTEM_TYPE_TTY) + seq_puts(m, ":/dev/tty"); + else if (p->subtype == SYSTEM_TYPE_SYSCONS) + seq_puts(m, ":console"); + else if (p->subtype == SYSTEM_TYPE_CONSOLE) + seq_puts(m, ":vtmaster"); + break; + case TTY_DRIVER_TYPE_CONSOLE: + seq_puts(m, "console"); + break; + case TTY_DRIVER_TYPE_SERIAL: + seq_puts(m, "serial"); + break; + case TTY_DRIVER_TYPE_PTY: + if (p->subtype == PTY_TYPE_MASTER) + seq_puts(m, "pty:master"); + else if (p->subtype == PTY_TYPE_SLAVE) + seq_puts(m, "pty:slave"); + else + seq_puts(m, "pty"); + break; + default: + seq_printf(m, "type:%d.%d", p->type, p->subtype); + } + seq_putc(m, '\n'); +} + +static int show_tty_driver(struct seq_file *m, void *v) +{ + struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers); + dev_t from = MKDEV(p->major, p->minor_start); + dev_t to = from + p->num; + + if (&p->tty_drivers == tty_drivers.next) { + /* pseudo-drivers first */ + seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); + seq_puts(m, "system:/dev/tty\n"); + seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); + seq_puts(m, "system:console\n"); +#ifdef CONFIG_UNIX98_PTYS + seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); + seq_puts(m, "system\n"); +#endif +#ifdef CONFIG_VT + seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); + seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); + seq_puts(m, "system:vtmaster\n"); +#endif + } + + while (MAJOR(from) < MAJOR(to)) { + dev_t next = MKDEV(MAJOR(from)+1, 0); + show_tty_range(m, p, from, next - from); + from = next; + } + if (from != to) + show_tty_range(m, p, from, to - from); + return 0; +} + +/* iterator */ +static void *t_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tty_mutex); + return seq_list_start(&tty_drivers, *pos); +} + +static void *t_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tty_drivers, pos); +} + +static void t_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tty_mutex); +} + +static const struct seq_operations tty_drivers_op = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = show_tty_driver +}; + +/* + * This function is called by tty_register_driver() to handle + * registering the driver's /proc handler into /proc/tty/driver/<foo> + */ +void proc_tty_register_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_show) + return; + + ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_show, driver); + driver->proc_entry = ent; +} + +/* + * This function is called by tty_unregister_driver() + */ +void proc_tty_unregister_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + ent = driver->proc_entry; + if (!ent) + return; + + remove_proc_entry(ent->name, proc_tty_driver); + + driver->proc_entry = NULL; +} + +/* + * Called by proc_root_init() to initialize the /proc/tty subtree + */ +void __init proc_tty_init(void) +{ + if (!proc_mkdir("tty", NULL)) + return; + proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */ + /* + * /proc/tty/driver/serial reveals the exact character counts for + * serial links which is just too easy to abuse for inferring + * password lengths and inter-keystroke timings during password + * entry. + */ + proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); + proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops); + proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op); +} diff --git a/fs/proc/root.c b/fs/proc/root.c new file mode 100644 index 000000000..3c2ee3eb1 --- /dev/null +++ b/fs/proc/root.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/proc/root.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc root directory handling functions + */ +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/sched/stat.h> +#include <linux/module.h> +#include <linux/bitops.h> +#include <linux/user_namespace.h> +#include <linux/fs_context.h> +#include <linux/mount.h> +#include <linux/pid_namespace.h> +#include <linux/fs_parser.h> +#include <linux/cred.h> +#include <linux/magic.h> +#include <linux/slab.h> + +#include "internal.h" + +struct proc_fs_context { + struct pid_namespace *pid_ns; + unsigned int mask; + enum proc_hidepid hidepid; + int gid; + enum proc_pidonly pidonly; +}; + +enum proc_param { + Opt_gid, + Opt_hidepid, + Opt_subset, +}; + +static const struct fs_parameter_spec proc_fs_parameters[] = { + fsparam_u32("gid", Opt_gid), + fsparam_string("hidepid", Opt_hidepid), + fsparam_string("subset", Opt_subset), + {} +}; + +static inline int valid_hidepid(unsigned int value) +{ + return (value == HIDEPID_OFF || + value == HIDEPID_NO_ACCESS || + value == HIDEPID_INVISIBLE || + value == HIDEPID_NOT_PTRACEABLE); +} + +static int proc_parse_hidepid_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parameter_spec hidepid_u32_spec = fsparam_u32("hidepid", Opt_hidepid); + struct fs_parse_result result; + int base = (unsigned long)hidepid_u32_spec.data; + + if (param->type != fs_value_is_string) + return invalf(fc, "proc: unexpected type of hidepid value\n"); + + if (!kstrtouint(param->string, base, &result.uint_32)) { + if (!valid_hidepid(result.uint_32)) + return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); + ctx->hidepid = result.uint_32; + return 0; + } + + if (!strcmp(param->string, "off")) + ctx->hidepid = HIDEPID_OFF; + else if (!strcmp(param->string, "noaccess")) + ctx->hidepid = HIDEPID_NO_ACCESS; + else if (!strcmp(param->string, "invisible")) + ctx->hidepid = HIDEPID_INVISIBLE; + else if (!strcmp(param->string, "ptraceable")) + ctx->hidepid = HIDEPID_NOT_PTRACEABLE; + else + return invalf(fc, "proc: unknown value of hidepid - %s\n", param->string); + + return 0; +} + +static int proc_parse_subset_param(struct fs_context *fc, char *value) +{ + struct proc_fs_context *ctx = fc->fs_private; + + while (value) { + char *ptr = strchr(value, ','); + + if (ptr != NULL) + *ptr++ = '\0'; + + if (*value != '\0') { + if (!strcmp(value, "pid")) { + ctx->pidonly = PROC_PIDONLY_ON; + } else { + return invalf(fc, "proc: unsupported subset option - %s\n", value); + } + } + value = ptr; + } + + return 0; +} + +static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, proc_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_gid: + ctx->gid = result.uint_32; + break; + + case Opt_hidepid: + if (proc_parse_hidepid_param(fc, param)) + return -EINVAL; + break; + + case Opt_subset: + if (proc_parse_subset_param(fc, param->string) < 0) + return -EINVAL; + break; + + default: + return -EINVAL; + } + + ctx->mask |= 1 << opt; + return 0; +} + +static void proc_apply_options(struct proc_fs_info *fs_info, + struct fs_context *fc, + struct user_namespace *user_ns) +{ + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->mask & (1 << Opt_gid)) + fs_info->pid_gid = make_kgid(user_ns, ctx->gid); + if (ctx->mask & (1 << Opt_hidepid)) + fs_info->hide_pid = ctx->hidepid; + if (ctx->mask & (1 << Opt_subset)) + fs_info->pidonly = ctx->pidonly; +} + +static int proc_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct inode *root_inode; + struct proc_fs_info *fs_info; + int ret; + + fs_info = kzalloc(sizeof(*fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + proc_apply_options(fs_info, fc, current_user_ns()); + + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + s->s_fs_info = fs_info; + + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } + + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); +} + +static int proc_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct proc_fs_info *fs_info = proc_sb_info(sb); + + sync_filesystem(sb); + + proc_apply_options(fs_info, fc, current_user_ns()); + return 0; +} + +static int proc_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, proc_fill_super); +} + +static void proc_fs_context_free(struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + + put_pid_ns(ctx->pid_ns); + kfree(ctx); +} + +static const struct fs_context_operations proc_fs_context_ops = { + .free = proc_fs_context_free, + .parse_param = proc_parse_param, + .get_tree = proc_get_tree, + .reconfigure = proc_reconfigure, +}; + +static int proc_init_fs_context(struct fs_context *fc) +{ + struct proc_fs_context *ctx; + + ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + fc->fs_private = ctx; + fc->ops = &proc_fs_context_ops; + return 0; +} + +static void proc_kill_sb(struct super_block *sb) +{ + struct proc_fs_info *fs_info = proc_sb_info(sb); + + if (!fs_info) { + kill_anon_super(sb); + return; + } + + dput(fs_info->proc_self); + dput(fs_info->proc_thread_self); + + kill_anon_super(sb); + put_pid_ns(fs_info->pid_ns); + kfree(fs_info); +} + +static struct file_system_type proc_fs_type = { + .name = "proc", + .init_fs_context = proc_init_fs_context, + .parameters = proc_fs_parameters, + .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM, +}; + +void __init proc_root_init(void) +{ + proc_init_kmemcache(); + set_proc_pid_nlink(); + proc_self_init(); + proc_thread_self_init(); + proc_symlink("mounts", NULL, "self/mounts"); + + proc_net_init(); + proc_mkdir("fs", NULL); + proc_mkdir("driver", NULL); + proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ +#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) + /* just give it a mountpoint */ + proc_create_mount_point("openprom"); +#endif + proc_tty_init(); + proc_mkdir("bus", NULL); + proc_sys_init(); + + /* + * Last things last. It is not like userspace processes eager + * to open /proc files exist at this point but register last + * anyway. + */ + register_filesystem(&proc_fs_type); +} + +static int proc_root_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); + stat->nlink = proc_root.nlink + nr_processes(); + return 0; +} + +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) +{ + if (!proc_pid_lookup(dentry, flags)) + return NULL; + + return proc_lookup(dir, dentry, flags); +} + +static int proc_root_readdir(struct file *file, struct dir_context *ctx) +{ + if (ctx->pos < FIRST_PROCESS_ENTRY) { + int error = proc_readdir(file, ctx); + if (unlikely(error <= 0)) + return error; + ctx->pos = FIRST_PROCESS_ENTRY; + } + + return proc_pid_readdir(file, ctx); +} + +/* + * The root /proc directory is special, as it has the + * <pid> directories. Thus we don't use the generic + * directory handling functions for that.. + */ +static const struct file_operations proc_root_operations = { + .read = generic_read_dir, + .iterate_shared = proc_root_readdir, + .llseek = generic_file_llseek, +}; + +/* + * proc root can do almost nothing.. + */ +static const struct inode_operations proc_root_inode_operations = { + .lookup = proc_root_lookup, + .getattr = proc_root_getattr, +}; + +/* + * This is the root "inode" in the /proc tree.. + */ +struct proc_dir_entry proc_root = { + .low_ino = PROC_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .refcnt = REFCOUNT_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_dir_ops = &proc_root_operations, + .parent = &proc_root, + .subdir = RB_ROOT, + .name = "/proc", +}; diff --git a/fs/proc/self.c b/fs/proc/self.c new file mode 100644 index 000000000..72cd69bca --- /dev/null +++ b/fs/proc/self.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" + +/* + * /proc/self: + */ +static const char *proc_self_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name; + + if (!tgid) + return ERR_PTR(-ENOENT); + /* max length of unsigned int in decimal + NULL term */ + name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); + sprintf(name, "%u", tgid); + set_delayed_call(done, kfree_link, name); + return name; +} + +static const struct inode_operations proc_self_inode_operations = { + .get_link = proc_self_get_link, +}; + +static unsigned self_inum __ro_after_init; + +int proc_setup_self(struct super_block *s) +{ + struct inode *root_inode = d_inode(s->s_root); + struct proc_fs_info *fs_info = proc_sb_info(s); + struct dentry *self; + int ret = -ENOMEM; + + inode_lock(root_inode); + self = d_alloc_name(s->s_root, "self"); + if (self) { + struct inode *inode = new_inode(s); + if (inode) { + inode->i_ino = self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_self_inode_operations; + d_add(self, inode); + ret = 0; + } else { + dput(self); + } + } + inode_unlock(root_inode); + + if (ret) + pr_err("proc_fill_super: can't allocate /proc/self\n"); + else + fs_info->proc_self = self; + + return ret; +} + +void __init proc_self_init(void) +{ + proc_alloc_inum(&self_inum); +} diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c new file mode 100644 index 000000000..f4616083f --- /dev/null +++ b/fs/proc/softirqs.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "internal.h" + +/* + * /proc/softirqs ... display the number of softirqs + */ +static int show_softirqs(struct seq_file *p, void *v) +{ + int i, j; + + seq_puts(p, " "); + for_each_possible_cpu(i) + seq_printf(p, "CPU%-8d", i); + seq_putc(p, '\n'); + + for (i = 0; i < NR_SOFTIRQS; i++) { + seq_printf(p, "%12s:", softirq_to_name[i]); + for_each_possible_cpu(j) + seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_putc(p, '\n'); + } + return 0; +} + +static int __init proc_softirqs_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); + return 0; +} +fs_initcall(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c new file mode 100644 index 000000000..4fb8729a6 --- /dev/null +++ b/fs/proc/stat.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpumask.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/sched/stat.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/time_namespace.h> +#include <linux/irqnr.h> +#include <linux/sched/cputime.h> +#include <linux/tick.h> + +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + +#ifdef arch_idle_time + +u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) +{ + u64 idle; + + idle = kcs->cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) + idle += arch_idle_time(cpu); + return idle; +} + +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) +{ + u64 iowait; + + iowait = kcs->cpustat[CPUTIME_IOWAIT]; + if (cpu_online(cpu) && nr_iowait_cpu(cpu)) + iowait += arch_idle_time(cpu); + return iowait; +} + +#else + +u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) +{ + u64 idle, idle_usecs = -1ULL; + + if (cpu_online(cpu)) + idle_usecs = get_cpu_idle_time_us(cpu, NULL); + + if (idle_usecs == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ + idle = kcs->cpustat[CPUTIME_IDLE]; + else + idle = idle_usecs * NSEC_PER_USEC; + + return idle; +} + +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) +{ + u64 iowait, iowait_usecs = -1ULL; + + if (cpu_online(cpu)) + iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); + + if (iowait_usecs == -1ULL) + /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ + iowait = kcs->cpustat[CPUTIME_IOWAIT]; + else + iowait = iowait_usecs * NSEC_PER_USEC; + + return iowait; +} + +#endif + +static void show_irq_gap(struct seq_file *p, unsigned int gap) +{ + static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; + + while (gap > 0) { + unsigned int inc; + + inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); + seq_write(p, zeros, 2 * inc); + gap -= inc; + } +} + +static void show_all_irqs(struct seq_file *p) +{ + unsigned int i, next = 0; + + for_each_active_irq(i) { + show_irq_gap(p, i - next); + seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); + next = i + 1; + } + show_irq_gap(p, nr_irqs - next); +} + +static int show_stat(struct seq_file *p, void *v) +{ + int i, j; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; + u64 sum = 0; + u64 sum_softirq = 0; + unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; + struct timespec64 boottime; + + user = nice = system = idle = iowait = + irq = softirq = steal = 0; + guest = guest_nice = 0; + getboottime64(&boottime); + /* shift boot timestamp according to the timens offset */ + timens_sub_boottime(&boottime); + + for_each_possible_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + nice += cpustat[CPUTIME_NICE]; + system += cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(&kcpustat, i); + iowait += get_iowait_time(&kcpustat, i); + irq += cpustat[CPUTIME_IRQ]; + softirq += cpustat[CPUTIME_SOFTIRQ]; + steal += cpustat[CPUTIME_STEAL]; + guest += cpustat[CPUTIME_GUEST]; + guest_nice += cpustat[CPUTIME_GUEST_NICE]; + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); + + for (j = 0; j < NR_SOFTIRQS; j++) { + unsigned int softirq_stat = kstat_softirqs_cpu(j, i); + + per_softirq_sums[j] += softirq_stat; + sum_softirq += softirq_stat; + } + } + sum += arch_irq_stat(); + + seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + + for_each_online_cpu(i) { + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); + + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ + user = cpustat[CPUTIME_USER]; + nice = cpustat[CPUTIME_NICE]; + system = cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(&kcpustat, i); + iowait = get_iowait_time(&kcpustat, i); + irq = cpustat[CPUTIME_IRQ]; + softirq = cpustat[CPUTIME_SOFTIRQ]; + steal = cpustat[CPUTIME_STEAL]; + guest = cpustat[CPUTIME_GUEST]; + guest_nice = cpustat[CPUTIME_GUEST_NICE]; + seq_printf(p, "cpu%d", i); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + } + seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); + + show_all_irqs(p); + + seq_printf(p, + "\nctxt %llu\n" + "btime %llu\n" + "processes %lu\n" + "procs_running %u\n" + "procs_blocked %u\n", + nr_context_switches(), + (unsigned long long)boottime.tv_sec, + total_forks, + nr_running(), + nr_iowait()); + + seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); + + for (i = 0; i < NR_SOFTIRQS; i++) + seq_put_decimal_ull(p, " ", per_softirq_sums[i]); + seq_putc(p, '\n'); + + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + unsigned int size = 1024 + 128 * num_online_cpus(); + + /* minimum size to display an interrupt count : 2 bytes */ + size += 2 * nr_irqs; + return single_open_size(file, show_stat, NULL, size); +} + +static const struct proc_ops stat_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = stat_open, + .proc_read_iter = seq_read_iter, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +static int __init proc_stat_init(void) +{ + proc_create("stat", 0, NULL, &stat_proc_ops); + return 0; +} +fs_initcall(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c new file mode 100644 index 000000000..a954305fb --- /dev/null +++ b/fs/proc/task_mmu.c @@ -0,0 +1,2026 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/pagewalk.h> +#include <linux/mm_inline.h> +#include <linux/hugetlb.h> +#include <linux/huge_mm.h> +#include <linux/mount.h> +#include <linux/seq_file.h> +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/mempolicy.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/sched/mm.h> +#include <linux/swapops.h> +#include <linux/mmu_notifier.h> +#include <linux/page_idle.h> +#include <linux/shmem_fs.h> +#include <linux/uaccess.h> +#include <linux/pkeys.h> + +#include <asm/elf.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include "internal.h" + +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + unsigned long text, lib, swap, anon, file, shmem; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. + */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = anon + file + shmem; + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; + + /* split executable areas between text and lib */ + text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); + text = min(text, mm->exec_vm << PAGE_SHIFT); + lib = (mm->exec_vm << PAGE_SHIFT) - text; + + swap = get_mm_counter(mm, MM_SWAPENTS); + SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); + SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); + SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); + SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); + SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); + SEQ_PUT_DEC(" kB\nRssFile:\t", file); + SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); + SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); + SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); + seq_put_decimal_ull_width(m, + " kB\nVmExe:\t", text >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmLib:\t", lib >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); + SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); + seq_puts(m, " kB\n"); + hugetlb_report_usage(m, mm); +} +#undef SEQ_PUT_DEC + +unsigned long task_vsize(struct mm_struct *mm) +{ + return PAGE_SIZE * mm->total_vm; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = mm->data_vm + mm->stack_vm; + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + return mm->total_vm; +} + +#ifdef CONFIG_NUMA +/* + * Save get_task_policy() for show_numa_map(). + */ +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ + struct task_struct *task = priv->task; + + task_lock(task); + priv->task_mempolicy = get_task_policy(task); + mpol_get(priv->task_mempolicy); + task_unlock(task); +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ + mpol_put(priv->task_mempolicy); +} +#else +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ +} +#endif + +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -2UL; + vma = get_gate_vma(priv->mm); + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) +{ + struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; + struct mm_struct *mm; + + /* See m_next(). Zero at the start or after lseek. */ + if (last_addr == -1UL) + return NULL; + + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = priv->mm; + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; + return NULL; + } + + if (mmap_read_lock_killable(mm)) { + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; + return ERR_PTR(-EINTR); + } + + vma_iter_init(&priv->iter, mm, last_addr); + hold_task_mempolicy(priv); + if (last_addr == -2UL) + return get_gate_vma(mm); + + return proc_get_vma(priv, ppos); +} + +static void *m_next(struct seq_file *m, void *v, loff_t *ppos) +{ + if (*ppos == -2UL) { + *ppos = -1UL; + return NULL; + } + return proc_get_vma(m->private, ppos); +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; + + if (!priv->task) + return; + + release_task_mempolicy(priv); + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; +} + +static int proc_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops, int psize) +{ + struct proc_maps_private *priv = __seq_open_private(file, ops, psize); + + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; + } + + return 0; +} + +static int proc_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); +} + +static int do_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + return proc_maps_open(inode, file, ops, + sizeof(struct proc_maps_private)); +} + +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static int is_stack(struct vm_area_struct *vma) +{ + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; +} + +static void show_vma_header_prefix(struct seq_file *m, + unsigned long start, unsigned long end, + vm_flags_t flags, unsigned long long pgoff, + dev_t dev, unsigned long ino) +{ + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); +} + +static void +show_map_vma(struct seq_file *m, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + vm_flags_t flags = vma->vm_flags; + unsigned long ino = 0; + unsigned long long pgoff = 0; + unsigned long start, end; + dev_t dev = 0; + const char *name = NULL; + + if (file) { + struct inode *inode = file_inode(vma->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; + } + + start = vma->vm_start; + end = vma->vm_end; + show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); + + /* + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: + */ + if (file) { + seq_pad(m, ' '); + seq_file_path(m, file, "\n"); + goto done; + } + + if (vma->vm_ops && vma->vm_ops->name) { + name = vma->vm_ops->name(vma); + if (name) + goto done; + } + + name = arch_vma_name(vma); + if (!name) { + struct anon_vma_name *anon_name; + + if (!mm) { + name = "[vdso]"; + goto done; + } + + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { + name = "[heap]"; + goto done; + } + + if (is_stack(vma)) { + name = "[stack]"; + goto done; + } + + anon_name = anon_vma_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name->name); + } + } + +done: + if (name) { + seq_pad(m, ' '); + seq_puts(m, name); + } + seq_putc(m, '\n'); +} + +static int show_map(struct seq_file *m, void *v) +{ + show_map_vma(m, v); + return 0; +} + +static const struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + +#ifdef CONFIG_PROC_PAGE_MONITOR +struct mem_size_stats { + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + unsigned long anonymous; + unsigned long lazyfree; + unsigned long anonymous_thp; + unsigned long shmem_thp; + unsigned long file_thp; + unsigned long swap; + unsigned long shared_hugetlb; + unsigned long private_hugetlb; + u64 pss; + u64 pss_anon; + u64 pss_file; + u64 pss_shmem; + u64 pss_dirty; + u64 pss_locked; + u64 swap_pss; +}; + +static void smaps_page_accumulate(struct mem_size_stats *mss, + struct page *page, unsigned long size, unsigned long pss, + bool dirty, bool locked, bool private) +{ + mss->pss += pss; + + if (PageAnon(page)) + mss->pss_anon += pss; + else if (PageSwapBacked(page)) + mss->pss_shmem += pss; + else + mss->pss_file += pss; + + if (locked) + mss->pss_locked += pss; + + if (dirty || PageDirty(page)) { + mss->pss_dirty += pss; + if (private) + mss->private_dirty += size; + else + mss->shared_dirty += size; + } else { + if (private) + mss->private_clean += size; + else + mss->shared_clean += size; + } +} + +static void smaps_account(struct mem_size_stats *mss, struct page *page, + bool compound, bool young, bool dirty, bool locked, + bool migration) +{ + int i, nr = compound ? compound_nr(page) : 1; + unsigned long size = nr * PAGE_SIZE; + + /* + * First accumulate quantities that depend only on |size| and the type + * of the compound page. + */ + if (PageAnon(page)) { + mss->anonymous += size; + if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + mss->lazyfree += size; + } + + mss->resident += size; + /* Accumulate the size in pages that have been accessed. */ + if (young || page_is_young(page) || PageReferenced(page)) + mss->referenced += size; + + /* + * Then accumulate quantities that may depend on sharing, or that may + * differ page-by-page. + * + * page_count(page) == 1 guarantees the page is mapped exactly once. + * If any subpage of the compound page mapped with PTE it would elevate + * page_count(). + * + * The page_mapcount() is called to get a snapshot of the mapcount. + * Without holding the page lock this snapshot can be slightly wrong as + * we cannot always read the mapcount atomically. It is not safe to + * call page_mapcount() even with PTL held if the page is not mapped, + * especially for migration entries. Treat regular migration entries + * as mapcount == 1. + */ + if ((page_count(page) == 1) || migration) { + smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, + locked, true); + return; + } + for (i = 0; i < nr; i++, page++) { + int mapcount = page_mapcount(page); + unsigned long pss = PAGE_SIZE << PSS_SHIFT; + if (mapcount >= 2) + pss /= mapcount; + smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked, + mapcount < 2); + } +} + +#ifdef CONFIG_SHMEM +static int smaps_pte_hole(unsigned long addr, unsigned long end, + __always_unused int depth, struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + + mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, + linear_page_index(vma, addr), + linear_page_index(vma, end)); + + return 0; +} +#else +#define smaps_pte_hole NULL +#endif /* CONFIG_SHMEM */ + +static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) +{ +#ifdef CONFIG_SHMEM + if (walk->ops->pte_hole) { + /* depth is not used */ + smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); + } +#endif +} + +static void smaps_pte_entry(pte_t *pte, unsigned long addr, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); + struct page *page = NULL; + bool migration = false, young = false, dirty = false; + + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + young = pte_young(*pte); + dirty = pte_dirty(*pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); + + if (!non_swap_entry(swpent)) { + int mapcount; + + mss->swap += PAGE_SIZE; + mapcount = swp_swapcount(swpent); + if (mapcount >= 2) { + u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; + + do_div(pss_delta, mapcount); + mss->swap_pss += pss_delta; + } else { + mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; + } + } else if (is_pfn_swap_entry(swpent)) { + if (is_migration_entry(swpent)) + migration = true; + page = pfn_swap_entry_to_page(swpent); + } + } else { + smaps_pte_hole_lookup(addr, walk); + return; + } + + if (!page) + return; + + smaps_account(mss, page, false, young, dirty, locked, migration); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); + struct page *page = NULL; + bool migration = false; + + if (pmd_present(*pmd)) { + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + if (is_migration_entry(entry)) { + migration = true; + page = pfn_swap_entry_to_page(entry); + } + } + if (IS_ERR_OR_NULL(page)) + return; + if (PageAnon(page)) + mss->anonymous_thp += HPAGE_PMD_SIZE; + else if (PageSwapBacked(page)) + mss->shmem_thp += HPAGE_PMD_SIZE; + else if (is_zone_device_page(page)) + /* pass */; + else + mss->file_thp += HPAGE_PMD_SIZE; + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, migration); +} +#else +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ +} +#endif + +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + pte_t *pte; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + smaps_pmd_entry(pmd, addr, walk); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmd)) + goto out; + /* + * The mmap_lock held all the way back in m_start() is what + * keeps khugepaged out of here and from collapsing things + * in here. + */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + smaps_pte_entry(pte, addr, walk); + pte_unmap_unlock(pte - 1, ptl); +out: + cond_resched(); + return 0; +} + +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... (BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_WIPEONFORK)] = "wf", + [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_ARM64_BTI + [ilog2(VM_ARM64_BTI)] = "bt", +#endif +#ifdef CONFIG_MEM_SOFT_DIRTY + [ilog2(VM_SOFTDIRTY)] = "sd", +#endif + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + [ilog2(VM_UFFD_MISSING)]= "um", + [ilog2(VM_UFFD_WP)] = "uw", +#ifdef CONFIG_ARM64_MTE + [ilog2(VM_MTE)] = "mt", + [ilog2(VM_MTE_ALLOWED)] = "", +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS + /* These come out via ProtectionKey: */ + [ilog2(VM_PKEY_BIT0)] = "", + [ilog2(VM_PKEY_BIT1)] = "", + [ilog2(VM_PKEY_BIT2)] = "", + [ilog2(VM_PKEY_BIT3)] = "", +#if VM_PKEY_BIT4 + [ilog2(VM_PKEY_BIT4)] = "", +#endif +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + [ilog2(VM_UFFD_MINOR)] = "ui", +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (!mnemonics[i][0]) + continue; + if (vma->vm_flags & (1UL << i)) { + seq_putc(m, mnemonics[i][0]); + seq_putc(m, mnemonics[i][1]); + seq_putc(m, ' '); + } + } + seq_putc(m, '\n'); +} + +#ifdef CONFIG_HUGETLB_PAGE +static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page = NULL; + + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); + + if (is_pfn_swap_entry(swpent)) + page = pfn_swap_entry_to_page(swpent); + } + if (page) { + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) + mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } + return 0; +} +#else +#define smaps_hugetlb_range NULL +#endif /* HUGETLB_PAGE */ + +static const struct mm_walk_ops smaps_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, +}; + +static const struct mm_walk_ops smaps_shmem_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .pte_hole = smaps_pte_hole, +}; + +/* + * Gather mem stats from @vma with the indicated beginning + * address @start, and keep them in @mss. + * + * Use vm_start of @vma as the beginning address if @start is 0. + */ +static void smap_gather_stats(struct vm_area_struct *vma, + struct mem_size_stats *mss, unsigned long start) +{ + const struct mm_walk_ops *ops = &smaps_walk_ops; + + /* Invalid start */ + if (start >= vma->vm_end) + return; + +#ifdef CONFIG_SHMEM + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE))) { + mss->swap += shmem_swapped; + } else { + ops = &smaps_shmem_walk_ops; + } + } +#endif + /* mmap_lock is held in m_start */ + if (!start) + walk_page_vma(vma, ops, mss); + else + walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); +} + +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) + +/* Show the contents common for smaps and smaps_rollup */ +static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, + bool rollup_mode) +{ + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); + if (rollup_mode) { + /* + * These are meaningful only for smaps_rollup, otherwise two of + * them are zero, and the other one is the same as Pss. + */ + SEQ_PUT_DEC(" kB\nPss_Anon: ", + mss->pss_anon >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_File: ", + mss->pss_file >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Shmem: ", + mss->pss_shmem >> PSS_SHIFT); + } + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", + mss->pss_locked >> PSS_SHIFT); + seq_puts(m, " kB\n"); +} + +static int show_smap(struct seq_file *m, void *v) +{ + struct vm_area_struct *vma = v; + struct mem_size_stats mss; + + memset(&mss, 0, sizeof(mss)); + + smap_gather_stats(vma, &mss, 0); + + show_map_vma(m, vma); + + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + + __show_smap(m, &mss, false); + + seq_printf(m, "THPeligible: %d\n", + hugepage_vma_check(vma, vma->vm_flags, true, false, true)); + + if (arch_pkeys_enabled()) + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); + show_smap_vma_flags(m, vma); + + return 0; +} + +static int show_smaps_rollup(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct mem_size_stats mss; + struct mm_struct *mm = priv->mm; + struct vm_area_struct *vma; + unsigned long vma_start = 0, last_vma_end = 0; + int ret = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return -ESRCH; + + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out_put_task; + } + + memset(&mss, 0, sizeof(mss)); + + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_put_mm; + + hold_task_mempolicy(priv); + vma = mas_find(&mas, ULONG_MAX); + + if (unlikely(!vma)) + goto empty_set; + + vma_start = vma->vm_start; + do { + smap_gather_stats(vma, &mss, 0); + last_vma_end = vma->vm_end; + + /* + * Release mmap_lock temporarily if someone wants to + * access it for write request. + */ + if (mmap_lock_is_contended(mm)) { + mas_pause(&mas); + mmap_read_unlock(mm); + ret = mmap_read_lock_killable(mm); + if (ret) { + release_task_mempolicy(priv); + goto out_put_mm; + } + + /* + * After dropping the lock, there are four cases to + * consider. See the following example for explanation. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * Suppose we drop the lock after reading VMA2 due to + * contention, then we get: + * + * last_vma_end = 16k + * + * 1) VMA2 is freed, but VMA3 exists: + * + * find_vma(mm, 16k - 1) will return VMA3. + * In this case, just continue from VMA3. + * + * 2) VMA2 still exists: + * + * find_vma(mm, 16k - 1) will return VMA2. + * Iterate the loop like the original one. + * + * 3) No more VMAs can be found: + * + * find_vma(mm, 16k - 1) will return NULL. + * No more things to do, just break. + * + * 4) (last_vma_end - 1) is the middle of a vma (VMA'): + * + * find_vma(mm, 16k - 1) will return VMA' whose range + * contains last_vma_end. + * Iterate VMA' from last_vma_end. + */ + vma = mas_find(&mas, ULONG_MAX); + /* Case 3 above */ + if (!vma) + break; + + /* Case 1 above */ + if (vma->vm_start >= last_vma_end) + continue; + + /* Case 4 above */ + if (vma->vm_end > last_vma_end) + smap_gather_stats(vma, &mss, last_vma_end); + } + /* Case 2 above */ + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); + seq_pad(m, ' '); + seq_puts(m, "[rollup]\n"); + + __show_smap(m, &mss, true); + + release_task_mempolicy(priv); + mmap_read_unlock(mm); + +out_put_mm: + mmput(mm); +out_put_task: + put_task_struct(priv->task); + priv->task = NULL; + + return ret; +} +#undef SEQ_PUT_DEC + +static const struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_smap +}; + +static int pid_smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +static int smaps_rollup_open(struct inode *inode, struct file *file) +{ + int ret; + struct proc_maps_private *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); + if (!priv) + return -ENOMEM; + + ret = single_open(file, show_smaps_rollup, priv); + if (ret) + goto out_free; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + ret = PTR_ERR(priv->mm); + + single_release(inode, file); + goto out_free; + } + + return 0; + +out_free: + kfree(priv); + return ret; +} + +static int smaps_rollup_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + kfree(priv); + return single_release(inode, file); +} + +const struct file_operations proc_pid_smaps_operations = { + .open = pid_smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +const struct file_operations proc_pid_smaps_rollup_operations = { + .open = smaps_rollup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = smaps_rollup_release, +}; + +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_MM_HIWATER_RSS, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + enum clear_refs_types type; +}; + +#ifdef CONFIG_MEM_SOFT_DIRTY + +static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + struct page *page; + + if (!pte_write(pte)) + return false; + if (!is_cow_mapping(vma->vm_flags)) + return false; + if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) + return false; + page = vm_normal_page(vma, addr, pte); + if (!page) + return false; + return page_maybe_dma_pinned(page); +} + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/admin-guide/mm/soft-dirty.rst for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + + if (pte_present(ptent)) { + pte_t old_pte; + + if (pte_is_pinned(vma, addr, ptent)) + return; + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_wrprotect(old_pte); + ptent = pte_clear_soft_dirty(ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } +} +#else +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} +#endif + +#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old, pmd = *pmdp; + + if (pmd_present(pmd)) { + /* See comment in change_huge_pmd() */ + old = pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(old)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(old)) + pmd = pmd_mkyoung(pmd); + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_soft_dirty(pmd); + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_clear_soft_dirty(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } +} +#else +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ +} +#endif + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + pte_t *pte, ptent; + spinlock_t *ptl; + struct page *page; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty_pmd(vma, addr, pmd); + goto out; + } + + if (!pmd_present(*pmd)) + goto out; + + page = pmd_page(*pmd); + + /* Clear accessed and referenced bits. */ + pmdp_test_and_clear_young(vma, addr, pmd); + test_and_clear_page_young(page); + ClearPageReferenced(page); +out: + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* Clear accessed and referenced bits. */ + ptep_test_and_clear_young(vma, addr, pte); + test_and_clear_page_young(page); + ClearPageReferenced(page); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static int clear_refs_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. + * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. + * Writing 4 to /proc/pid/clear_refs affects all pages. + */ + if (cp->type == CLEAR_REFS_ANON && vma->vm_file) + return 1; + if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) + return 1; + return 0; +} + +static const struct mm_walk_ops clear_refs_walk_ops = { + .pmd_entry = clear_refs_pte_range, + .test_walk = clear_refs_test_walk, +}; + +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + struct mm_struct *mm; + struct vm_area_struct *vma; + enum clear_refs_types type; + int itype; + int rv; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + rv = kstrtoint(strstrip(buffer), 10, &itype); + if (rv < 0) + return rv; + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) + return -EINVAL; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct mmu_notifier_range range; + struct clear_refs_private cp = { + .type = type, + }; + + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. + */ + reset_mm_hiwater_rss(mm); + goto out_unlock; + } + + if (type == CLEAR_REFS_SOFT_DIRTY) { + mas_for_each(&mas, vma, ULONG_MAX) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); + } + + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); + mmu_notifier_invalidate_range_start(&range); + } + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); + if (type == CLEAR_REFS_SOFT_DIRTY) { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); + } +out_unlock: + mmap_write_unlock(mm); +out_mm: + mmput(mm); + } + put_task_struct(task); + + return count; +} + +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, + .llseek = noop_llseek, +}; + +typedef struct { + u64 pme; +} pagemap_entry_t; + +struct pagemapread { + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ + pagemap_entry_t *buffer; + bool show_pfn; +}; + +#define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) + +#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + +#define PM_END_OF_BUFFER 1 + +static inline pagemap_entry_t make_pme(u64 frame, u64 flags) +{ + return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; +} + +static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, + struct pagemapread *pm) +{ + pm->buffer[pm->pos++] = *pme; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + unsigned long addr = start; + int err = 0; + + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(0, 0); + /* End of address space hole, which we mark as non-present. */ + unsigned long hole_end; + + if (vma) + hole_end = min(end, vma->vm_start); + else + hole_end = end; + + for (; addr < hole_end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } + + if (!vma) + break; + + /* Addresses in the VMA. */ + if (vma->vm_flags & VM_SOFTDIRTY) + pme = make_pme(0, PM_SOFT_DIRTY); + for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + goto out; + } + } +out: + return err; +} + +static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, + struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + u64 frame = 0, flags = 0; + struct page *page = NULL; + bool migration = false; + + if (pte_present(pte)) { + if (pm->show_pfn) + frame = pte_pfn(pte); + flags |= PM_PRESENT; + page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags |= PM_SOFT_DIRTY; + if (pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; + } else if (is_swap_pte(pte)) { + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags |= PM_SOFT_DIRTY; + if (pte_swp_uffd_wp(pte)) + flags |= PM_UFFD_WP; + entry = pte_to_swp_entry(pte); + if (pm->show_pfn) { + pgoff_t offset; + /* + * For PFN swap offsets, keeping the offset field + * to be PFN only to be compatible with old smaps. + */ + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } + flags |= PM_SWAP; + migration = is_migration_entry(entry); + if (is_pfn_swap_entry(entry)) + page = pfn_swap_entry_to_page(entry); + if (pte_marker_entry_uffd_wp(entry)) + flags |= PM_UFFD_WP; + } + + if (page && !PageAnon(page)) + flags |= PM_FILE; + if (page && !migration && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; + + return make_pme(frame, flags); +} + +static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct pagemapread *pm = walk->private; + spinlock_t *ptl; + pte_t *pte, *orig_pte; + int err = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool migration = false; + + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; + struct page *page = NULL; + + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; + + if (pmd_present(pmd)) { + page = pmd_page(pmd); + + flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + else if (is_swap_pmd(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned long offset; + + if (pm->show_pfn) { + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); + offset = offset + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_swp_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + migration = is_migration_entry(entry); + page = pfn_swap_entry_to_page(entry); + } +#endif + + if (page && !migration && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + for (; addr != end; addr += PAGE_SIZE) { + pagemap_entry_t pme = make_pme(frame, flags); + + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); + } + } + spin_unlock(ptl); + return err; + } + + if (pmd_trans_unstable(pmdp)) + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + + /* + * We can assume that @vma always points to a valid one and @end never + * goes beyond vma->vm_end. + */ + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + for (; addr < end; pte++, addr += PAGE_SIZE) { + pagemap_entry_t pme; + + pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + } + pte_unmap_unlock(orig_pte, ptl); + + cond_resched(); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* This function walks within one hugetlb entry in the single call */ +static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + struct vm_area_struct *vma = walk->vma; + u64 flags = 0, frame = 0; + int err = 0; + pte_t pte; + + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; + + pte = huge_ptep_get(ptep); + if (pte_present(pte)) { + struct page *page = pte_page(pte); + + if (!PageAnon(page)) + flags |= PM_FILE; + + if (page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + + if (huge_pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; + + flags |= PM_PRESENT; + if (pm->show_pfn) + frame = pte_pfn(pte) + + ((addr & ~hmask) >> PAGE_SHIFT); + } else if (pte_swp_uffd_wp_any(pte)) { + flags |= PM_UFFD_WP; + } + + for (; addr != end; addr += PAGE_SIZE) { + pagemap_entry_t pme = make_pme(frame, flags); + + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; + if (pm->show_pfn && (flags & PM_PRESENT)) + frame++; + } + + cond_resched(); + + return err; +} +#else +#define pagemap_hugetlb_range NULL +#endif /* HUGETLB_PAGE */ + +static const struct mm_walk_ops pagemap_ops = { + .pmd_entry = pagemap_pmd_range, + .pte_hole = pagemap_pte_hole, + .hugetlb_entry = pagemap_hugetlb_range, +}; + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit entry + * consisting of the following: + * + * Bits 0-54 page frame number (PFN) if present + * Bits 0-4 swap type if swapped + * Bits 5-54 swap offset if swapped + * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) + * Bit 56 page exclusively mapped + * Bit 57 pte is uffd-wp write-protected + * Bits 58-60 zero + * Bit 61 page is file-page or shared-anon + * Bit 62 page swapped + * Bit 63 page present + * + * If the page is not present but in swap, then the PFN contains an + * encoding of the swap file number and the page's offset into the + * swap. Unmapped pages return a null PFN. This allows determining + * precisely which pages are mapped (or in swap) and comparing mapped + * pages between processes. + * + * Efficient users of this interface will use /proc/pid/maps to + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. + */ +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + struct pagemapread pm; + unsigned long src; + unsigned long svpfn; + unsigned long start_vaddr; + unsigned long end_vaddr; + int ret = 0, copied = 0; + + if (!mm || !mmget_not_zero(mm)) + goto out; + + ret = -EINVAL; + /* file position must be aligned */ + if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) + goto out_mm; + + ret = 0; + if (!count) + goto out_mm; + + /* do not disclose physical addresses: attack vector */ + pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); + + pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); + ret = -ENOMEM; + if (!pm.buffer) + goto out_mm; + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + end_vaddr = mm->task_size; + + /* watch out for wraparound */ + start_vaddr = end_vaddr; + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) + start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + + /* Ensure the address is inside the task */ + if (start_vaddr > mm->task_size) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = 0; + while (count && (start_vaddr < end_vaddr)) { + int len; + unsigned long end; + + pm.pos = 0; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + /* overflow ? */ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; + ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); + mmap_read_unlock(mm); + start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_free; + } + copied += len; + buf += len; + count -= len; + } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) + ret = copied; + +out_free: + kfree(pm.buffer); +out_mm: + mmput(mm); +out: + return ret; +} + +static int pagemap_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm; + + mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(mm)) + return PTR_ERR(mm); + file->private_data = mm; + return 0; +} + +static int pagemap_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) + mmdrop(mm); + return 0; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, + .open = pagemap_open, + .release = pagemap_release, +}; +#endif /* CONFIG_PROC_PAGE_MONITOR */ + +#ifdef CONFIG_NUMA + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) +{ + int count = page_mapcount(page); + + md->pages += nr_pages; + if (pte_dirty || PageDirty(page)) + md->dirty += nr_pages; + + if (PageSwapCache(page)) + md->swapcache += nr_pages; + + if (PageActive(page) || PageUnevictable(page)) + md->active += nr_pages; + + if (PageWriteback(page)) + md->writeback += nr_pages; + + if (PageAnon(page)) + md->anon += nr_pages; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page || is_zone_device_page(page)) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static struct page *can_gather_numa_stats_pmd(pmd_t pmd, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pmd_present(pmd)) + return NULL; + + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} +#endif + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md = walk->private; + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + struct page *page; + + page = can_gather_numa_stats_pmd(*pmd, vma, addr); + if (page) + gather_stats(page, md, pmd_dirty(*pmd), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +#endif + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page = can_gather_numa_stats(*pte, vma, addr); + if (!page) + continue; + gather_stats(page, md, pte_dirty(*pte), 1); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + pte_t huge_pte = huge_ptep_get(pte); + struct numa_maps *md; + struct page *page; + + if (!pte_present(huge_pte)) + return 0; + + page = pte_page(huge_pte); + + md = walk->private; + gather_stats(page, md, pte_dirty(huge_pte), 1); + return 0; +} + +#else +static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +static const struct mm_walk_ops show_numa_ops = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, +}; + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mempolicy *pol; + char buffer[64]; + int nid; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + pol = __get_vma_policy(vma, vma->vm_start); + if (pol) { + mpol_to_str(buffer, sizeof(buffer), pol); + mpol_cond_put(pol); + } else { + mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); + } + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_puts(m, " file="); + seq_file_path(m, file, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_puts(m, " heap"); + } else if (is_stack(vma)) { + seq_puts(m, " stack"); + } + + if (is_vm_hugetlb_page(vma)) + seq_puts(m, " huge"); + + /* mmap_lock is held by m_start */ + walk_page_vma(vma, &show_numa_ops, md); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(nid, N_MEMORY) + if (md->node[nid]) + seq_printf(m, " N%d=%lu", nid, md->node[nid]); + + seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); +out: + seq_putc(m, '\n'); + return 0; +} + +static const struct seq_operations proc_pid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map, +}; + +static int pid_numa_maps_open(struct inode *inode, struct file *file) +{ + return proc_maps_open(inode, file, &proc_pid_numa_maps_op, + sizeof(struct numa_maps_private)); +} + +const struct file_operations proc_pid_numa_maps_operations = { + .open = pid_numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + +#endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c new file mode 100644 index 000000000..dc05780f9 --- /dev/null +++ b/fs/proc/task_nommu.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/fs_struct.h> +#include <linux/mount.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/sched/mm.h> + +#include "internal.h" + +/* + * Logic: we've got two memory sums for each process, "shared", and + * "non-shared". Shared memory may get counted more than once, for + * each process that owns it. Non-shared memory is counted + * accurately. + */ +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; + struct vm_region *region; + unsigned long bytes = 0, sbytes = 0, slack = 0, size; + + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + bytes += kobjsize(vma); + + region = vma->vm_region; + if (region) { + size = kobjsize(region); + size += region->vm_end - region->vm_start; + } else { + size = vma->vm_end - vma->vm_start; + } + + if (atomic_read(&mm->mm_count) > 1 || + vma->vm_flags & VM_MAYSHARE) { + sbytes += size; + } else { + bytes += size; + if (region) + slack = region->vm_end - vma->vm_end; + } + } + + if (atomic_read(&mm->mm_count) > 1) + sbytes += kobjsize(mm); + else + bytes += kobjsize(mm); + + if (current->fs && current->fs->users > 1) + sbytes += kobjsize(current->fs); + else + bytes += kobjsize(current->fs); + + if (current->files && atomic_read(¤t->files->count) > 1) + sbytes += kobjsize(current->files); + else + bytes += kobjsize(current->files); + + if (current->sighand && refcount_read(¤t->sighand->count) > 1) + sbytes += kobjsize(current->sighand); + else + bytes += kobjsize(current->sighand); + + bytes += kobjsize(current); /* includes kernel stack */ + + seq_printf(m, + "Mem:\t%8lu bytes\n" + "Slack:\t%8lu bytes\n" + "Shared:\t%8lu bytes\n", + bytes, slack, sbytes); + + mmap_read_unlock(mm); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; + unsigned long vsize = 0; + + mmap_read_lock(mm); + for_each_vma(vmi, vma) + vsize += vma->vm_end - vma->vm_start; + mmap_read_unlock(mm); + return vsize; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; + struct vm_region *region; + unsigned long size = kobjsize(mm); + + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + size += kobjsize(vma); + region = vma->vm_region; + if (region) { + size += kobjsize(region); + size += region->vm_end - region->vm_start; + } + } + + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) + >> PAGE_SHIFT; + mmap_read_unlock(mm); + size >>= PAGE_SHIFT; + size += *text + *data; + *resident = size; + return size; +} + +static int is_stack(struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; +} + +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags; + unsigned long long pgoff = 0; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = file_inode(vma->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + } + + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino); + + if (file) { + seq_pad(m, ' '); + seq_file_path(m, file, ""); + } else if (mm && is_stack(vma)) { + seq_pad(m, ' '); + seq_puts(m, "[stack]"); + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display mapping lines for a particular process's /proc/pid/maps + */ +static int show_map(struct seq_file *m, void *_p) +{ + return nommu_vma_show(m, _p); +} + +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) +{ + struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; + struct mm_struct *mm; + + /* See proc_get_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) + return NULL; + + /* pin the task and mm whilst we play with them */ + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = priv->mm; + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; + return NULL; + } + + if (mmap_read_lock_killable(mm)) { + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; + return ERR_PTR(-EINTR); + } + + vma_iter_init(&priv->iter, mm, last_addr); + + return proc_get_vma(priv, ppos); +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; + + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; +} + +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) +{ + return proc_get_vma(m->private, ppos); +} + +static const struct seq_operations proc_pid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct proc_maps_private *priv; + + priv = __seq_open_private(file, ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; + } + + return 0; +} + + +static int map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); +} + +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_pid_maps_ops); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = map_release, +}; + diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c new file mode 100644 index 000000000..a553273fb --- /dev/null +++ b/fs/proc/thread_self.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cache.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" + +/* + * /proc/thread_self: + */ +static const char *proc_thread_self_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct pid_namespace *ns = proc_pid_ns(inode->i_sb); + pid_t tgid = task_tgid_nr_ns(current, ns); + pid_t pid = task_pid_nr_ns(current, ns); + char *name; + + if (!pid) + return ERR_PTR(-ENOENT); + name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); + sprintf(name, "%u/task/%u", tgid, pid); + set_delayed_call(done, kfree_link, name); + return name; +} + +static const struct inode_operations proc_thread_self_inode_operations = { + .get_link = proc_thread_self_get_link, +}; + +static unsigned thread_self_inum __ro_after_init; + +int proc_setup_thread_self(struct super_block *s) +{ + struct inode *root_inode = d_inode(s->s_root); + struct proc_fs_info *fs_info = proc_sb_info(s); + struct dentry *thread_self; + int ret = -ENOMEM; + + inode_lock(root_inode); + thread_self = d_alloc_name(s->s_root, "thread-self"); + if (thread_self) { + struct inode *inode = new_inode(s); + if (inode) { + inode->i_ino = thread_self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_thread_self_inode_operations; + d_add(thread_self, inode); + ret = 0; + } else { + dput(thread_self); + } + } + inode_unlock(root_inode); + + if (ret) + pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); + else + fs_info->proc_thread_self = thread_self; + + return ret; +} + +void __init proc_thread_self_init(void) +{ + proc_alloc_inum(&thread_self_inum); +} diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c new file mode 100644 index 000000000..b5343d209 --- /dev/null +++ b/fs/proc/uptime.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/time.h> +#include <linux/time_namespace.h> +#include <linux/kernel_stat.h> +#include "internal.h" + +static int uptime_proc_show(struct seq_file *m, void *v) +{ + struct timespec64 uptime; + struct timespec64 idle; + u64 idle_nsec; + u32 rem; + int i; + + idle_nsec = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcs; + + kcpustat_cpu_fetch(&kcs, i); + idle_nsec += get_idle_time(&kcs, i); + } + + ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + + idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), + (unsigned long) idle.tv_sec, + (idle.tv_nsec / (NSEC_PER_SEC / 100))); + return 0; +} + +static int __init proc_uptime_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); + return 0; +} +fs_initcall(proc_uptime_init); diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 000000000..98f8adc17 --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,24 @@ +#include <linux/dcache.h> +#include "internal.h" + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + do { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } while (--len > 0); + return n; +out: + return ~0U; +} diff --git a/fs/proc/version.c b/fs/proc/version.c new file mode 100644 index 000000000..02e3c3cd4 --- /dev/null +++ b/fs/proc/version.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/utsname.h> +#include "internal.h" + +static int version_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, linux_proc_banner, + utsname()->sysname, + utsname()->release, + utsname()->version); + return 0; +} + +static int __init proc_version_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); + return 0; +} +fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c new file mode 100644 index 000000000..1ec0647a2 --- /dev/null +++ b/fs/proc/vmcore.c @@ -0,0 +1,1603 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/proc/vmcore.c Interface for accessing the crash + * dump from the system's previous life. + * Heavily borrowed from fs/proc/kcore.c + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + * + */ + +#include <linux/mm.h> +#include <linux/kcore.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/init.h> +#include <linux/crash_dump.h> +#include <linux/list.h> +#include <linux/moduleparam.h> +#include <linux/mutex.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/cc_platform.h> +#include <asm/io.h> +#include "internal.h" + +/* List representing chunks of contiguous memory areas and their offsets in + * vmcore file. + */ +static LIST_HEAD(vmcore_list); + +/* Stores the pointer to the buffer containing kernel elf core headers. */ +static char *elfcorebuf; +static size_t elfcorebuf_sz; +static size_t elfcorebuf_sz_orig; + +static char *elfnotes_buf; +static size_t elfnotes_sz; +/* Size of all notes minus the device dump notes */ +static size_t elfnotes_orig_sz; + +/* Total size of vmcore file. */ +static u64 vmcore_size; + +static struct proc_dir_entry *proc_vmcore; + +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/* Device Dump list and mutex to synchronize access to list */ +static LIST_HEAD(vmcoredd_list); +static DEFINE_MUTEX(vmcoredd_mutex); + +static bool vmcoredd_disabled; +core_param(novmcoredd, vmcoredd_disabled, bool, 0); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Device Dump Size */ +static size_t vmcoredd_orig_sz; + +static DEFINE_SPINLOCK(vmcore_cb_lock); +DEFINE_STATIC_SRCU(vmcore_cb_srcu); +/* List of registered vmcore callbacks. */ +static LIST_HEAD(vmcore_cb_list); +/* Whether the vmcore has been opened once. */ +static bool vmcore_opened; + +void register_vmcore_cb(struct vmcore_cb *cb) +{ + INIT_LIST_HEAD(&cb->next); + spin_lock(&vmcore_cb_lock); + list_add_tail(&cb->next, &vmcore_cb_list); + /* + * Registering a vmcore callback after the vmcore was opened is + * very unusual (e.g., manual driver loading). + */ + if (vmcore_opened) + pr_warn_once("Unexpected vmcore callback registration\n"); + spin_unlock(&vmcore_cb_lock); +} +EXPORT_SYMBOL_GPL(register_vmcore_cb); + +void unregister_vmcore_cb(struct vmcore_cb *cb) +{ + spin_lock(&vmcore_cb_lock); + list_del_rcu(&cb->next); + /* + * Unregistering a vmcore callback after the vmcore was opened is + * very unusual (e.g., forced driver removal), but we cannot stop + * unregistering. + */ + if (vmcore_opened) + pr_warn_once("Unexpected vmcore callback unregistration\n"); + spin_unlock(&vmcore_cb_lock); + + synchronize_srcu(&vmcore_cb_srcu); +} +EXPORT_SYMBOL_GPL(unregister_vmcore_cb); + +static bool pfn_is_ram(unsigned long pfn) +{ + struct vmcore_cb *cb; + bool ret = true; + + list_for_each_entry_srcu(cb, &vmcore_cb_list, next, + srcu_read_lock_held(&vmcore_cb_srcu)) { + if (unlikely(!cb->pfn_is_ram)) + continue; + ret = cb->pfn_is_ram(cb, pfn); + if (!ret) + break; + } + + return ret; +} + +static int open_vmcore(struct inode *inode, struct file *file) +{ + spin_lock(&vmcore_cb_lock); + vmcore_opened = true; + spin_unlock(&vmcore_cb_lock); + + return 0; +} + +/* Reads a page from the oldmem device from given offset. */ +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) +{ + unsigned long pfn, offset; + ssize_t nr_bytes; + ssize_t read = 0, tmp; + int idx; + + if (!count) + return 0; + + offset = (unsigned long)(*ppos % PAGE_SIZE); + pfn = (unsigned long)(*ppos / PAGE_SIZE); + + idx = srcu_read_lock(&vmcore_cb_srcu); + do { + if (count > (PAGE_SIZE - offset)) + nr_bytes = PAGE_SIZE - offset; + else + nr_bytes = count; + + /* If pfn is not ram, return zeros for sparse dump files */ + if (!pfn_is_ram(pfn)) { + tmp = iov_iter_zero(nr_bytes, iter); + } else { + if (encrypted) + tmp = copy_oldmem_page_encrypted(iter, pfn, + nr_bytes, + offset); + else + tmp = copy_oldmem_page(iter, pfn, nr_bytes, + offset); + } + if (tmp < nr_bytes) { + srcu_read_unlock(&vmcore_cb_srcu, idx); + return -EFAULT; + } + + *ppos += nr_bytes; + count -= nr_bytes; + read += nr_bytes; + ++pfn; + offset = 0; + } while (count); + srcu_read_unlock(&vmcore_cb_srcu, idx); + + return read; +} + +/* + * Architectures may override this function to allocate ELF header in 2nd kernel + */ +int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + return 0; +} + +/* + * Architectures may override this function to free header + */ +void __weak elfcorehdr_free(unsigned long long addr) +{} + +/* + * Architectures may override this function to read from ELF header + */ +ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, false); +} + +/* + * Architectures may override this function to read from notes sections + */ +ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); +} + +/* + * Architectures may override this function to map oldmem + */ +int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + prot = pgprot_encrypted(prot); + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Architectures which support memory encryption override this. + */ +ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter, + unsigned long pfn, size_t csize, unsigned long offset) +{ + return copy_oldmem_page(iter, pfn, csize, offset); +} + +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (copy_to_iter(buf, tsz, iter) < tsz) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} + +#ifdef CONFIG_MMU +static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, + u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (remap_vmalloc_range_partial(vma, dst, buf, 0, + tsz)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} +#endif /* CONFIG_MMU */ +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Read from the ELF header and then the crash dump. On error, negative value is + * returned otherwise number of bytes read are returned. + */ +static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) +{ + ssize_t acc = 0, tmp; + size_t tsz; + u64 start; + struct vmcore *m = NULL; + + if (!iov_iter_count(iter) || *fpos >= vmcore_size) + return 0; + + iov_iter_truncate(iter, vmcore_size - *fpos); + + /* Read ELF core header */ + if (*fpos < elfcorebuf_sz) { + tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter)); + if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz) + return -EFAULT; + *fpos += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!iov_iter_count(iter)) + return acc; + } + + /* Read Elf note segment */ + if (*fpos < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)*fpos, iov_iter_count(iter)); + start = *fpos - elfcorebuf_sz; + if (vmcoredd_copy_dumps(iter, start, tsz)) + return -EFAULT; + + *fpos += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!iov_iter_count(iter)) + return acc; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, + iov_iter_count(iter)); + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; + if (copy_to_iter(kaddr, tsz, iter) < tsz) + return -EFAULT; + + *fpos += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!iov_iter_count(iter)) + return acc; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (*fpos < m->offset + m->size) { + tsz = (size_t)min_t(unsigned long long, + m->offset + m->size - *fpos, + iov_iter_count(iter)); + start = m->paddr + *fpos - m->offset; + tmp = read_from_oldmem(iter, tsz, &start, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + if (tmp < 0) + return tmp; + *fpos += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!iov_iter_count(iter)) + return acc; + } + } + + return acc; +} + +static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) +{ + return __read_vmcore(iter, &iocb->ki_pos); +} + +/* + * The vmcore fault handler uses the page cache and fills data using the + * standard __read_vmcore() function. + * + * On s390 the fault handler is used for memory regions that can't be mapped + * directly with remap_pfn_range(). + */ +static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) +{ +#ifdef CONFIG_S390 + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + pgoff_t index = vmf->pgoff; + struct iov_iter iter; + struct kvec kvec; + struct page *page; + loff_t offset; + int rc; + + page = find_or_create_page(mapping, index, GFP_KERNEL); + if (!page) + return VM_FAULT_OOM; + if (!PageUptodate(page)) { + offset = (loff_t) index << PAGE_SHIFT; + kvec.iov_base = page_address(page); + kvec.iov_len = PAGE_SIZE; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE); + + rc = __read_vmcore(&iter, &offset); + if (rc < 0) { + unlock_page(page); + put_page(page); + return vmf_error(rc); + } + SetPageUptodate(page); + } + unlock_page(page); + vmf->page = page; + return 0; +#else + return VM_FAULT_SIGBUS; +#endif +} + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + +/** + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @size: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *vmcore_alloc_buf(size_t size) +{ +#ifdef CONFIG_MMU + return vmalloc_user(size); +#else + return vzalloc(size); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU +/* + * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages + * reported as not being ram with the zero page. + * + * @vma: vm_area_struct describing requested mapping + * @from: start remapping from + * @pfn: page frame number to start remapping to + * @size: remapping size + * @prot: protection bits + * + * Returns zero on success, -EAGAIN on failure. + */ +static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long map_size; + unsigned long pos_start, pos_end, pos; + unsigned long zeropage_pfn = my_zero_pfn(0); + size_t len = 0; + + pos_start = pfn; + pos_end = pfn + (size >> PAGE_SHIFT); + + for (pos = pos_start; pos < pos_end; ++pos) { + if (!pfn_is_ram(pos)) { + /* + * We hit a page which is not ram. Remap the continuous + * region between pos_start and pos-1 and replace + * the non-ram page at pos with the zero page. + */ + if (pos > pos_start) { + /* Remap continuous region */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, + pos_start, map_size, + prot)) + goto fail; + len += map_size; + } + /* Remap the zero page */ + if (remap_oldmem_pfn_range(vma, from + len, + zeropage_pfn, + PAGE_SIZE, prot)) + goto fail; + len += PAGE_SIZE; + pos_start = pos + 1; + } + } + if (pos > pos_start) { + /* Remap the rest */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, pos_start, + map_size, prot)) + goto fail; + } + return 0; +fail: + do_munmap(vma->vm_mm, from, len, NULL); + return -EAGAIN; +} + +static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + int ret, idx; + + /* + * Check if a callback was registered to avoid looping over all + * pages without a reason. + */ + idx = srcu_read_lock(&vmcore_cb_srcu); + if (!list_empty(&vmcore_cb_list)) + ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + else + ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); + srcu_read_unlock(&vmcore_cb_srcu, idx); + return ret; +} + +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + u64 start, end, len, tsz; + struct vmcore *m; + + start = (u64)vma->vm_pgoff << PAGE_SHIFT; + end = start + size; + + if (size > vmcore_size || end > vmcore_size) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &vmcore_mmap_ops; + + len = 0; + + if (start < elfcorebuf_sz) { + u64 pfn; + + tsz = min(elfcorebuf_sz - (size_t)start, size); + pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, pfn, tsz, + vma->vm_page_prot)) + return -EAGAIN; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + if (start < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. This also ensures that the device dumps and + * other elf notes can be properly mmaped at page aligned + * address. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (start < elfcorebuf_sz + vmcoredd_orig_sz) { + u64 start_off; + + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)start, size); + start_off = start - elfcorebuf_sz; + if (vmcoredd_mmap_dumps(vma, vma->vm_start + len, + start_off, tsz)) + goto fail; + + size -= tsz; + start += tsz; + len += tsz; + + /* leave now if filled buffer already */ + if (!size) + return 0; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); + kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; + if (remap_vmalloc_range_partial(vma, vma->vm_start + len, + kaddr, 0, tsz)) + goto fail; + + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (start < m->offset + m->size) { + u64 paddr = 0; + + tsz = (size_t)min_t(unsigned long long, + m->offset + m->size - start, size); + paddr = m->paddr + start - m->offset; + if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + } + + return 0; +fail: + do_munmap(vma->vm_mm, vma->vm_start, len, NULL); + return -EAGAIN; +} +#else +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; +} +#endif + +static const struct proc_ops vmcore_proc_ops = { + .proc_open = open_vmcore, + .proc_read_iter = read_vmcore, + .proc_lseek = default_llseek, + .proc_mmap = mmap_vmcore, +}; + +static struct vmcore* __init get_new_element(void) +{ + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +} + +static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) +{ + u64 size; + struct vmcore *m; + + size = elfsz + elfnotesegsz; + list_for_each_entry(m, vc_list, list) { + size += m->size; + } + return size; +} + +/** + * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + Elf64_Nhdr *nhdr_ptr; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + void *notes_section; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + while (nhdr_ptr->n_namesz != 0) { + sz = sizeof(Elf64_Nhdr) + + (((u64)nhdr_ptr->n_namesz + 3) & ~3) + + (((u64)nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } + real_sz += sz; + nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); + } + kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } + } + + return 0; +} + +/** + * get_note_number_and_size_elf64 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf64_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf64 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + + phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; + } + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + rc = update_note_header_size_elf64(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = vmcore_alloc_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf64(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); + phdr.p_offset = roundup(note_off, PAGE_SIZE); + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf64_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + + return 0; +} + +/** + * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + Elf32_Nhdr *nhdr_ptr; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + void *notes_section; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + while (nhdr_ptr->n_namesz != 0) { + sz = sizeof(Elf32_Nhdr) + + (((u64)nhdr_ptr->n_namesz + 3) & ~3) + + (((u64)nhdr_ptr->n_descsz + 3) & ~3); + if ((real_sz + sz) > max_sz) { + pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", + nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); + break; + } + real_sz += sz; + nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); + } + kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + if (real_sz == 0) { + pr_warn("Warning: Zero PT_NOTE entries found\n"); + } + } + + return 0; +} + +/** + * get_note_number_and_size_elf32 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf32_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf32 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + + phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; + } + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + rc = update_note_header_size_elf32(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = vmcore_alloc_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf32(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); + phdr.p_offset = roundup(note_off, PAGE_SIZE); + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf32_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + + return 0; +} + +/* Add memory chunks represented by program headers to vmcore list. Also update + * the new offset fields of exported program headers. */ +static int __init process_ptload_program_headers_elf64(char *elfptr, + size_t elfsz, + size_t elfnotes_sz, + struct list_head *vc_list) +{ + int i; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ + + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + + if (phdr_ptr->p_type != PT_LOAD) + continue; + + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = start; + new->size = size; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset. */ + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; + } + return 0; +} + +static int __init process_ptload_program_headers_elf32(char *elfptr, + size_t elfsz, + size_t elfnotes_sz, + struct list_head *vc_list) +{ + int i; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ + + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + + if (phdr_ptr->p_type != PT_LOAD) + continue; + + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = start; + new->size = size; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset */ + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; + } + return 0; +} + +/* Sets offset fields of vmcore elements. */ +static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) +{ + loff_t vmcore_off; + struct vmcore *m; + + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +static void free_elfcorebuf(void) +{ + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = NULL; + vfree(elfnotes_buf); + elfnotes_buf = NULL; +} + +static int __init parse_crash_elf64_headers(void) +{ + int rc=0; + Elf64_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !vmcore_elf64_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf64_Ehdr) || + ehdr.e_phentsize != sizeof(Elf64_Phdr) || + ehdr.e_phnum == 0) { + pr_warn("Warning: Core image elf header is not sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) + + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); + if (rc < 0) + goto fail; + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; + rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + return 0; +fail: + free_elfcorebuf(); + return rc; +} + +static int __init parse_crash_elf32_headers(void) +{ + int rc=0; + Elf32_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !vmcore_elf32_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS32|| + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf32_Ehdr) || + ehdr.e_phentsize != sizeof(Elf32_Phdr) || + ehdr.e_phnum == 0) { + pr_warn("Warning: Core image elf header is not sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); + if (rc < 0) + goto fail; + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; + rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + return 0; +fail: + free_elfcorebuf(); + return rc; +} + +static int __init parse_crash_elf_headers(void) +{ + unsigned char e_ident[EI_NIDENT]; + u64 addr; + int rc=0; + + addr = elfcorehdr_addr; + rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr); + if (rc < 0) + return rc; + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + pr_warn("Warning: Core image elf header not found\n"); + return -EINVAL; + } + + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = parse_crash_elf64_headers(); + if (rc) + return rc; + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = parse_crash_elf32_headers(); + if (rc) + return rc; + } else { + pr_warn("Warning: Core image elf header is not sane\n"); + return -EINVAL; + } + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + + return 0; +} + +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/** + * vmcoredd_write_header - Write vmcore device dump header at the + * beginning of the dump's buffer. + * @buf: Output buffer where the note is written + * @data: Dump info + * @size: Size of the dump + * + * Fills beginning of the dump's buffer with vmcore device dump header. + */ +static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, + u32 size) +{ + struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf; + + vdd_hdr->n_namesz = sizeof(vdd_hdr->name); + vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); + vdd_hdr->n_type = NT_VMCOREDD; + + strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, + sizeof(vdd_hdr->name)); + memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); +} + +/** + * vmcoredd_update_program_headers - Update all Elf program headers + * @elfptr: Pointer to elf header + * @elfnotesz: Size of elf notes aligned to page size + * @vmcoreddsz: Size of device dumps to be added to elf note header + * + * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Also update the offsets of all the program headers after the elf note header. + */ +static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, + size_t vmcoreddsz) +{ + unsigned char *e_ident = (unsigned char *)elfptr; + u64 start, end, size; + loff_t vmcore_off; + u32 i; + + vmcore_off = elfcorebuf_sz + elfnotesz; + + if (e_ident[EI_CLASS] == ELFCLASS64) { + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr; + Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } else { + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr; + Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } +} + +/** + * vmcoredd_update_size - Update the total size of the device dumps and update + * Elf header + * @dump_size: Size of the current device dump to be added to total size + * + * Update the total size of all the device dumps and update the Elf program + * headers. Calculate the new offsets for the vmcore list and update the + * total vmcore size. + */ +static void vmcoredd_update_size(size_t dump_size) +{ + vmcoredd_orig_sz += dump_size; + elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz; + vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz, + vmcoredd_orig_sz); + + /* Update vmcore list offsets */ + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; +} + +/** + * vmcore_add_device_dump - Add a buffer containing device dump to vmcore + * @data: dump info. + * + * Allocate a buffer and invoke the calling driver's dump collect routine. + * Write Elf note at the beginning of the buffer to indicate vmcore device + * dump and add the dump to global list. + */ +int vmcore_add_device_dump(struct vmcoredd_data *data) +{ + struct vmcoredd_node *dump; + void *buf = NULL; + size_t data_size; + int ret; + + if (vmcoredd_disabled) { + pr_err_once("Device dump is disabled\n"); + return -EINVAL; + } + + if (!data || !strlen(data->dump_name) || + !data->vmcoredd_callback || !data->size) + return -EINVAL; + + dump = vzalloc(sizeof(*dump)); + if (!dump) { + ret = -ENOMEM; + goto out_err; + } + + /* Keep size of the buffer page aligned so that it can be mmaped */ + data_size = roundup(sizeof(struct vmcoredd_header) + data->size, + PAGE_SIZE); + + /* Allocate buffer for driver's to write their dumps */ + buf = vmcore_alloc_buf(data_size); + if (!buf) { + ret = -ENOMEM; + goto out_err; + } + + vmcoredd_write_header(buf, data, data_size - + sizeof(struct vmcoredd_header)); + + /* Invoke the driver's dump collection routing */ + ret = data->vmcoredd_callback(data, buf + + sizeof(struct vmcoredd_header)); + if (ret) + goto out_err; + + dump->buf = buf; + dump->size = data_size; + + /* Add the dump to driver sysfs list */ + mutex_lock(&vmcoredd_mutex); + list_add_tail(&dump->list, &vmcoredd_list); + mutex_unlock(&vmcoredd_mutex); + + vmcoredd_update_size(data_size); + return 0; + +out_err: + vfree(buf); + vfree(dump); + + return ret; +} +EXPORT_SYMBOL(vmcore_add_device_dump); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Free all dumps in vmcore device dump list */ +static void vmcore_free_device_dumps(void) +{ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + mutex_lock(&vmcoredd_mutex); + while (!list_empty(&vmcoredd_list)) { + struct vmcoredd_node *dump; + + dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node, + list); + list_del(&dump->list); + vfree(dump->buf); + vfree(dump); + } + mutex_unlock(&vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +} + +/* Init function for vmcore module. */ +static int __init vmcore_init(void) +{ + int rc = 0; + + /* Allow architectures to allocate ELF header in 2nd kernel */ + rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size); + if (rc) + return rc; + /* + * If elfcorehdr= has been passed in cmdline or created in 2nd kernel, + * then capture the dump. + */ + if (!(is_vmcore_usable())) + return rc; + rc = parse_crash_elf_headers(); + if (rc) { + pr_warn("Kdump: vmcore not initialized\n"); + return rc; + } + elfcorehdr_free(elfcorehdr_addr); + elfcorehdr_addr = ELFCORE_ADDR_ERR; + + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops); + if (proc_vmcore) + proc_vmcore->size = vmcore_size; + return 0; +} +fs_initcall(vmcore_init); + +/* Cleanup function for vmcore module. */ +void vmcore_cleanup(void) +{ + if (proc_vmcore) { + proc_remove(proc_vmcore); + proc_vmcore = NULL; + } + + /* clear the vmcore list. */ + while (!list_empty(&vmcore_list)) { + struct vmcore *m; + + m = list_first_entry(&vmcore_list, struct vmcore, list); + list_del(&m->list); + kfree(m); + } + free_elfcorebuf(); + + /* clear vmcore device dump list */ + vmcore_free_device_dumps(); +} diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c new file mode 100644 index 000000000..846f9455a --- /dev/null +++ b/fs/proc_namespace.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/proc_namespace.c - handling of /proc/<pid>/{mounts,mountinfo,mountstats} + * + * In fact, that's a piece of procfs; it's *almost* isolated from + * the rest of fs/proc, but has rather close relationships with + * fs/namespace.c, thus here instead of fs/proc + * + */ +#include <linux/mnt_namespace.h> +#include <linux/nsproxy.h> +#include <linux/security.h> +#include <linux/fs_struct.h> +#include <linux/sched/task.h> + +#include "proc/internal.h" /* only for get_proc_task() in ->open() */ + +#include "pnode.h" +#include "internal.h" + +static __poll_t mounts_poll(struct file *file, poll_table *wait) +{ + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; + struct mnt_namespace *ns = p->ns; + __poll_t res = EPOLLIN | EPOLLRDNORM; + int event; + + poll_wait(file, &p->ns->poll, wait); + + event = READ_ONCE(ns->event); + if (m->poll_event != event) { + m->poll_event = event; + res |= EPOLLERR | EPOLLPRI; + } + + return res; +} + +struct proc_fs_opts { + int flag; + const char *str; +}; + +static int show_sb_opts(struct seq_file *m, struct super_block *sb) +{ + static const struct proc_fs_opts fs_opts[] = { + { SB_SYNCHRONOUS, ",sync" }, + { SB_DIRSYNC, ",dirsync" }, + { SB_MANDLOCK, ",mand" }, + { SB_LAZYTIME, ",lazytime" }, + { 0, NULL } + }; + const struct proc_fs_opts *fs_infop; + + for (fs_infop = fs_opts; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } + + return security_sb_show_options(m, sb); +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_opts mnt_opts[] = { + { MNT_NOSUID, ",nosuid" }, + { MNT_NODEV, ",nodev" }, + { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, + { MNT_RELATIME, ",relatime" }, + { MNT_NOSYMFOLLOW, ",nosymfollow" }, + { 0, NULL } + }; + const struct proc_fs_opts *fs_infop; + + for (fs_infop = mnt_opts; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } + + if (is_idmapped_mnt(mnt)) + seq_puts(m, ",idmapped"); +} + +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, " \t\n\\#"); +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = m->private; + struct mount *r = real_mount(mnt); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + int err; + + if (sb->s_op->show_devname) { + err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; + } else { + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + } + seq_putc(m, ' '); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + show_type(m, sb); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + show_mnt_opts(m, mnt); + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt_path.dentry); + seq_puts(m, " 0 0\n"); +out: + return err; +} + +static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = m->private; + struct mount *r = real_mount(mnt); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err; + + seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + if (sb->s_op->show_path) { + err = sb->s_op->show_path(m, mnt->mnt_root); + if (err) + goto out; + } else { + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + } + seq_putc(m, ' '); + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; + + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(r)) + seq_printf(m, " shared:%i", r->mnt_group_id); + if (IS_MNT_SLAVE(r)) { + int master = r->mnt_master->mnt_group_id; + int dom = get_dominating_id(r, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(r)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + if (sb->s_op->show_devname) { + err = sb->s_op->show_devname(m, mnt->mnt_root); + if (err) + goto out; + } else { + mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + } + seq_puts(m, sb_rdonly(sb) ? " ro" : " rw"); + err = show_sb_opts(m, sb); + if (err) + goto out; + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt->mnt_root); + seq_putc(m, '\n'); +out: + return err; +} + +static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) +{ + struct proc_mounts *p = m->private; + struct mount *r = real_mount(mnt); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct super_block *sb = mnt_path.dentry->d_sb; + int err; + + /* device */ + if (sb->s_op->show_devname) { + seq_puts(m, "device "); + err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; + } else { + if (r->mnt_devname) { + seq_puts(m, "device "); + mangle(m, r->mnt_devname); + } else + seq_puts(m, "no device"); + } + + /* mount point */ + seq_puts(m, " mounted on "); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + show_type(m, sb); + + /* optional statistics */ + if (sb->s_op->show_stats) { + seq_putc(m, ' '); + err = sb->s_op->show_stats(m, mnt_path.dentry); + } + + seq_putc(m, '\n'); +out: + return err; +} + +static int mounts_open_common(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, struct vfsmount *)) +{ + struct task_struct *task = get_proc_task(inode); + struct nsproxy *nsp; + struct mnt_namespace *ns = NULL; + struct path root; + struct proc_mounts *p; + struct seq_file *m; + int ret = -EINVAL; + + if (!task) + goto err; + + task_lock(task); + nsp = task->nsproxy; + if (!nsp || !nsp->mnt_ns) { + task_unlock(task); + put_task_struct(task); + goto err; + } + ns = nsp->mnt_ns; + get_mnt_ns(ns); + if (!task->fs) { + task_unlock(task); + put_task_struct(task); + ret = -ENOENT; + goto err_put_ns; + } + get_fs_root(task->fs, &root); + task_unlock(task); + put_task_struct(task); + + ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); + if (ret) + goto err_put_path; + + m = file->private_data; + m->poll_event = ns->event; + + p = m->private; + p->ns = ns; + p->root = root; + p->show = show; + INIT_LIST_HEAD(&p->cursor.mnt_list); + p->cursor.mnt.mnt_flags = MNT_CURSOR; + + return 0; + + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: + return ret; +} + +static int mounts_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; + path_put(&p->root); + mnt_cursor_del(p->ns, &p->cursor); + put_mnt_ns(p->ns); + return seq_release_private(inode, file); +} + +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsmnt); +} + +static int mountinfo_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_mountinfo); +} + +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, show_vfsstat); +} + +const struct file_operations proc_mounts_operations = { + .open = mounts_open, + .read_iter = seq_read_iter, + .splice_read = generic_file_splice_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read_iter = seq_read_iter, + .splice_read = generic_file_splice_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; + +const struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read_iter = seq_read_iter, + .splice_read = generic_file_splice_read, + .llseek = seq_lseek, + .release = mounts_release, +}; |