diff options
Diffstat (limited to 'init')
-rw-r--r-- | init/Kconfig | 2038 | ||||
-rw-r--r-- | init/Makefile | 37 | ||||
-rw-r--r-- | init/calibrate.c | 316 | ||||
-rw-r--r-- | init/do_mounts.c | 637 | ||||
-rw-r--r-- | init/do_mounts.h | 63 | ||||
-rw-r--r-- | init/do_mounts_initrd.c | 128 | ||||
-rw-r--r-- | init/do_mounts_md.c | 304 | ||||
-rw-r--r-- | init/do_mounts_rd.c | 345 | ||||
-rw-r--r-- | init/init_task.c | 192 | ||||
-rw-r--r-- | init/initramfs.c | 655 | ||||
-rw-r--r-- | init/main.c | 1184 | ||||
-rw-r--r-- | init/noinitramfs.c | 52 | ||||
-rw-r--r-- | init/version.c | 54 |
13 files changed, 6005 insertions, 0 deletions
diff --git a/init/Kconfig b/init/Kconfig new file mode 100644 index 000000000..15c02e15d --- /dev/null +++ b/init/Kconfig @@ -0,0 +1,2038 @@ +config DEFCONFIG_LIST + string + depends on !UML + option defconfig_list + default "/lib/modules/$(shell,uname -r)/.config" + default "/etc/kernel-config" + default "/boot/config-$(shell,uname -r)" + default ARCH_DEFCONFIG + default "arch/$(ARCH)/defconfig" + +config CC_IS_GCC + def_bool $(success,$(CC) --version | head -n 1 | grep -q gcc) + +config GCC_VERSION + int + default $(shell,$(srctree)/scripts/gcc-version.sh -p $(CC) | sed 's/^0*//') if CC_IS_GCC + default 0 + +config CC_IS_CLANG + def_bool $(success,$(CC) --version | head -n 1 | grep -q clang) + +config LD_IS_LLD + def_bool $(success,$(LD) -v | head -n 1 | grep -q LLD) + +config CLANG_VERSION + int + default $(shell,$(srctree)/scripts/clang-version.sh $(CC)) + +config CC_HAS_ASM_GOTO + def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) + +config CONSTRUCTORS + bool + depends on !UML + +config IRQ_WORK + bool + +config BUILDTIME_EXTABLE_SORT + bool + +config THREAD_INFO_IN_TASK + bool + help + Select this to move thread_info off the stack into task_struct. To + make this work, an arch will need to remove all thread_info fields + except flags and fix any runtime bugs. + + One subtle change that will be needed is to use try_get_task_stack() + and put_task_stack() in save_thread_stack_tsk() and get_wchan(). + +menu "General setup" + +config BROKEN + bool + +config BROKEN_ON_SMP + bool + depends on BROKEN || !SMP + default y + +config INIT_ENV_ARG_LIMIT + int + default 32 if !UML + default 128 if UML + help + Maximum of each of the number of arguments and environment + variables passed to init from the kernel command line. + +config COMPILE_TEST + bool "Compile also drivers which will not load" + depends on HAS_IOMEM + help + Some drivers can be compiled on a different platform than they are + intended to be run on. Despite they cannot be loaded there (or even + when they load they cannot be used due to missing HW support), + developers still, opposing to distributors, might want to build such + drivers to compile-test them. + + If you are a developer and want to build everything available, say Y + here. If you are a user/distributor, say N here to exclude useless + drivers to be distributed. + +config LOCALVERSION + string "Local version - append to kernel release" + help + Append an extra string to the end of your kernel version. + This will show up when you type uname, for example. + The string you set here will be appended after the contents of + any files with a filename matching localversion* in your + object and source tree, in that order. Your total string can + be a maximum of 64 characters. + +config LOCALVERSION_AUTO + bool "Automatically append version information to the version string" + default y + depends on !COMPILE_TEST + help + This will try to automatically determine if the current tree is a + release tree by looking for git tags that belong to the current + top of tree revision. + + A string of the format -gxxxxxxxx will be added to the localversion + if a git-based tree is found. The string generated by this will be + appended after any matching localversion* files, and after the value + set in CONFIG_LOCALVERSION. + + (The actual string used here is the first eight characters produced + by running the command: + + $ git rev-parse --verify HEAD + + which is done within the script "scripts/setlocalversion".) + +config BUILD_SALT + string "Build ID Salt" + default "" + help + The build ID is used to link binaries and their debug info. Setting + this option will use the value in the calculation of the build id. + This is mostly useful for distributions which want to ensure the + build is unique between builds. It's safe to leave the default. + +config HAVE_KERNEL_GZIP + bool + +config HAVE_KERNEL_BZIP2 + bool + +config HAVE_KERNEL_LZMA + bool + +config HAVE_KERNEL_XZ + bool + +config HAVE_KERNEL_LZO + bool + +config HAVE_KERNEL_LZ4 + bool + +config HAVE_KERNEL_UNCOMPRESSED + bool + +choice + prompt "Kernel compression mode" + default KERNEL_GZIP + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 || HAVE_KERNEL_UNCOMPRESSED + help + The linux kernel is a kind of self-extracting executable. + Several compression algorithms are available, which differ + in efficiency, compression and decompression speed. + Compression speed is only relevant when building a kernel. + Decompression speed is relevant at each boot. + + If you have any problems with bzip2 or lzma compressed + kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older + version of this functionality (bzip2 only), for 2.4, was + supplied by Christian Ludwig) + + High compression options are mostly useful for users, who + are low on disk space (embedded systems), but for whom ram + size matters less. + + If in doubt, select 'gzip' + +config KERNEL_GZIP + bool "Gzip" + depends on HAVE_KERNEL_GZIP + help + The old and tried gzip compression. It provides a good balance + between compression ratio and decompression speed. + +config KERNEL_BZIP2 + bool "Bzip2" + depends on HAVE_KERNEL_BZIP2 + help + Its compression ratio and speed is intermediate. + Decompression speed is slowest among the choices. The kernel + size is about 10% smaller with bzip2, in comparison to gzip. + Bzip2 uses a large amount of memory. For modern kernels you + will need at least 8MB RAM or more for booting. + +config KERNEL_LZMA + bool "LZMA" + depends on HAVE_KERNEL_LZMA + help + This compression algorithm's ratio is best. Decompression speed + is between gzip and bzip2. Compression is slowest. + The kernel size is about 33% smaller with LZMA in comparison to gzip. + +config KERNEL_XZ + bool "XZ" + depends on HAVE_KERNEL_XZ + help + XZ uses the LZMA2 algorithm and instruction set specific + BCJ filters which can improve compression ratio of executable + code. The size of the kernel is about 30% smaller with XZ in + comparison to gzip. On architectures for which there is a BCJ + filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ + will create a few percent smaller kernel than plain LZMA. + + The speed is about the same as with LZMA: The decompression + speed of XZ is better than that of bzip2 but worse than gzip + and LZO. Compression is slow. + +config KERNEL_LZO + bool "LZO" + depends on HAVE_KERNEL_LZO + help + Its compression ratio is the poorest among the choices. The kernel + size is about 10% bigger than gzip; however its speed + (both compression and decompression) is the fastest. + +config KERNEL_LZ4 + bool "LZ4" + depends on HAVE_KERNEL_LZ4 + help + LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding. + A preliminary version of LZ4 de/compression tool is available at + <https://code.google.com/p/lz4/>. + + Its compression ratio is worse than LZO. The size of the kernel + is about 8% bigger than LZO. But the decompression speed is + faster than LZO. + +config KERNEL_UNCOMPRESSED + bool "None" + depends on HAVE_KERNEL_UNCOMPRESSED + help + Produce uncompressed kernel image. This option is usually not what + you want. It is useful for debugging the kernel in slow simulation + environments, where decompressing and moving the kernel is awfully + slow. This option allows early boot code to skip the decompressor + and jump right at uncompressed kernel image. + +endchoice + +config DEFAULT_HOSTNAME + string "Default hostname" + default "(none)" + help + This option determines the default system hostname before userspace + calls sethostname(2). The kernel traditionally uses "(none)" here, + but you may wish to use a different default here to make a minimal + system more usable with less configuration. + +# +# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can +# add proper SWAP support to them, in which case this can be remove. +# +config ARCH_NO_SWAP + bool + +config SWAP + bool "Support for paging of anonymous memory (swap)" + depends on MMU && BLOCK && !ARCH_NO_SWAP + default y + help + This option allows you to choose whether you want to have support + for so called swap devices or swap files in your kernel that are + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + +config SYSVIPC + bool "System V IPC" + ---help--- + Inter Process Communication is a suite of library functions and + system calls which let processes (running programs) synchronize and + exchange information. It is generally considered to be a good thing, + and some programs won't run unless you say Y here. In particular, if + you want to run the DOS emulator dosemu under Linux (read the + DOSEMU-HOWTO, available from <http://www.tldp.org/docs.html#howto>), + you'll need to say Y here. + + You can find documentation about IPC with "info ipc" and also in + section 6.4 of the Linux Programmer's Guide, available from + <http://www.tldp.org/guides.html>. + +config SYSVIPC_SYSCTL + bool + depends on SYSVIPC + depends on SYSCTL + default y + +config POSIX_MQUEUE + bool "POSIX Message Queues" + depends on NET + ---help--- + POSIX variant of message queues is a part of IPC. In POSIX message + queues every message has a priority which decides about succession + of receiving it by a process. If you want to compile and run + programs written e.g. for Solaris with use of its POSIX message + queues (functions mq_*) say Y here. + + POSIX message queues are visible as a filesystem called 'mqueue' + and can be mounted somewhere if you want to do filesystem + operations on message queues. + + If unsure, say Y. + +config POSIX_MQUEUE_SYSCTL + bool + depends on POSIX_MQUEUE + depends on SYSCTL + default y + +config CROSS_MEMORY_ATTACH + bool "Enable process_vm_readv/writev syscalls" + depends on MMU + default y + help + Enabling this option adds the system calls process_vm_readv and + process_vm_writev which allow a process with the correct privileges + to directly read from or write to another process' address space. + See the man page for more details. + +config USELIB + bool "uselib syscall" + def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION + help + This option enables the uselib syscall, a system call used in the + dynamic linker from libc5 and earlier. glibc does not use this + system call. If you intend to run programs built on libc5 or + earlier, you may need to enable this syscall. Current systems + running glibc can safely disable this. + +config AUDIT + bool "Auditing support" + depends on NET + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for + logging of avc messages output). System call auditing is included + on architectures which support it. + +config HAVE_ARCH_AUDITSYSCALL + bool + +config AUDITSYSCALL + def_bool y + depends on AUDIT && HAVE_ARCH_AUDITSYSCALL + +config AUDIT_WATCH + def_bool y + depends on AUDITSYSCALL + select FSNOTIFY + +config AUDIT_TREE + def_bool y + depends on AUDITSYSCALL + select FSNOTIFY + +source "kernel/irq/Kconfig" +source "kernel/time/Kconfig" +source "kernel/Kconfig.preempt" + +menu "CPU/Task time and stats accounting" + +config VIRT_CPU_ACCOUNTING + bool + +choice + prompt "Cputime accounting" + default TICK_CPU_ACCOUNTING if !PPC64 + default VIRT_CPU_ACCOUNTING_NATIVE if PPC64 + +# Kind of a stub config for the pure tick based cputime accounting +config TICK_CPU_ACCOUNTING + bool "Simple tick based cputime accounting" + depends on !S390 && !NO_HZ_FULL + help + This is the basic tick based cputime accounting that maintains + statistics about user, system and idle time spent on per jiffies + granularity. + + If unsure, say Y. + +config VIRT_CPU_ACCOUNTING_NATIVE + bool "Deterministic task and CPU time accounting" + depends on HAVE_VIRT_CPU_ACCOUNTING && !NO_HZ_FULL + select VIRT_CPU_ACCOUNTING + help + Select this option to enable more accurate task and CPU time + accounting. This is done by reading a CPU counter on each + kernel entry and exit and on transitions within the kernel + between system, softirq and hardirq state, so there is a + small performance impact. In the case of s390 or IBM POWER > 5, + this also enables accounting of stolen time on logically-partitioned + systems. + +config VIRT_CPU_ACCOUNTING_GEN + bool "Full dynticks CPU time accounting" + depends on HAVE_CONTEXT_TRACKING + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + select VIRT_CPU_ACCOUNTING + select CONTEXT_TRACKING + help + Select this option to enable task and CPU time accounting on full + dynticks systems. This accounting is implemented by watching every + kernel-user boundaries using the context tracking subsystem. + The accounting is thus performed at the expense of some significant + overhead. + + For now this is only useful if you are working on the full + dynticks subsystem development. + + If unsure, say N. + +endchoice + +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE + help + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + +config HAVE_SCHED_AVG_IRQ + def_bool y + depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING + depends on SMP + +config BSD_PROCESS_ACCT + bool "BSD Process Accounting" + depends on MULTIUSER + help + If you say Y here, a user level program will be able to instruct the + kernel (via a special system call) to write process accounting + information to a file: whenever a process exits, information about + that process will be appended to the file by the kernel. The + information includes things such as creation time, owning user, + command name, memory usage, controlling terminal etc. (the complete + list is in the struct acct in <file:include/linux/acct.h>). It is + up to the user level program to do useful things with this + information. This is generally a good idea, so say Y. + +config BSD_PROCESS_ACCT_V3 + bool "BSD Process Accounting version 3 file format" + depends on BSD_PROCESS_ACCT + default n + help + If you say Y here, the process accounting information is written + in a new file format that also logs the process IDs of each + process and its parent. Note that this file format is incompatible + with previous v0/v1/v2 file formats, so you will need updated tools + for processing it. A preliminary version of these tools is available + at <http://www.gnu.org/software/acct/>. + +config TASKSTATS + bool "Export task/process statistics through netlink" + depends on NET + depends on MULTIUSER + default n + help + Export selected statistics for tasks/processes through the + generic netlink interface. Unlike BSD process accounting, the + statistics are available during the lifetime of tasks/processes as + responses to commands. Like BSD accounting, they are sent to user + space on task exit. + + Say N if unsure. + +config TASK_DELAY_ACCT + bool "Enable per-task delay accounting" + depends on TASKSTATS + select SCHED_INFO + help + Collect information on time spent by a task waiting for system + resources like cpu, synchronous block I/O completion and swapping + in pages. Such statistics can help in setting a task's priorities + relative to other tasks for cpu, io, rss limits etc. + + Say N if unsure. + +config TASK_XACCT + bool "Enable extended accounting over taskstats" + depends on TASKSTATS + help + Collect extended task accounting data and send the data + to userland for processing over the taskstats interface. + + Say N if unsure. + +config TASK_IO_ACCOUNTING + bool "Enable per-task storage I/O accounting" + depends on TASK_XACCT + help + Collect information on the number of bytes of storage I/O which this + task has caused. + + Say N if unsure. + +endmenu # "CPU/Task time and stats accounting" + +config CPU_ISOLATION + bool "CPU isolation" + depends on SMP || COMPILE_TEST + default y + help + Make sure that CPUs running critical tasks are not disturbed by + any source of "noise" such as unbound workqueues, timers, kthreads... + Unbound jobs get offloaded to housekeeping CPUs. This is driven by + the "isolcpus=" boot parameter. + + Say Y if unsure. + +source "kernel/rcu/Kconfig" + +config BUILD_BIN2C + bool + default n + +config IKCONFIG + tristate "Kernel .config support" + select BUILD_BIN2C + ---help--- + This option enables the complete Linux kernel ".config" file + contents to be saved in the kernel. It provides documentation + of which kernel options are used in a running kernel or in an + on-disk kernel. This information can be extracted from the kernel + image file with the script scripts/extract-ikconfig and used as + input to rebuild the current kernel or to build another kernel. + It can also be extracted from a running kernel by reading + /proc/config.gz if enabled (below). + +config IKCONFIG_PROC + bool "Enable access to .config through /proc/config.gz" + depends on IKCONFIG && PROC_FS + ---help--- + This option enables access to the kernel configuration file + through /proc/config.gz. + +config LOG_BUF_SHIFT + int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + range 12 25 if !H8300 + range 12 19 if H8300 + default 17 + depends on PRINTK + help + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + + Examples: + 17 => 128 KB + 16 => 64 KB + 15 => 32 KB + 14 => 16 KB + 13 => 8 KB + 12 => 4 KB + +config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + depends on SMP + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + depends on PRINTK + help + This option allows to increase the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only few + lines however it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than the half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also this option is ignored when "log_buf_len" kernel parameter is + used as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation ignoring + hotplugging making the computation optimal for the worst case + scenario while allowing a simple algorithm to be used from bootup. + + Examples shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + +config PRINTK_SAFE_LOG_BUF_SHIFT + int "Temporary per-CPU printk log buffer size (12 => 4KB, 13 => 8KB)" + range 10 21 + default 13 + depends on PRINTK + help + Select the size of an alternate printk per-CPU buffer where messages + printed from usafe contexts are temporary stored. One example would + be NMI messages, another one - printk recursion. The messages are + copied to the main log buffer in a safe context to avoid a deadlock. + The value defines the size as a power of 2. + + Those messages are rare and limited. The largest one is when + a backtrace is printed. It usually fits into 4KB. Select + 8KB if you want to be on the safe side. + + Examples: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + +# +# Architectures with an unreliable sched_clock() should select this: +# +config HAVE_UNSTABLE_SCHED_CLOCK + bool + +config GENERIC_SCHED_CLOCK + bool + +# +# For architectures that want to enable the support for NUMA-affine scheduler +# balancing logic: +# +config ARCH_SUPPORTS_NUMA_BALANCING + bool + +# +# For architectures that prefer to flush all TLBs after a number of pages +# are unmapped instead of sending one IPI per page to flush. The architecture +# must provide guarantees on what happens if a clean TLB cache entry is +# written after the unmap. Details are in mm/rmap.c near the check for +# should_defer_flush. The architecture should also consider if the full flush +# and the refill costs are offset by the savings of sending fewer IPIs. +config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + bool + +# +# For architectures that know their GCC __int128 support is sound +# +config ARCH_SUPPORTS_INT128 + bool + +# For architectures that (ab)use NUMA to represent different memory regions +# all cpu-local but of different latencies, such as SuperH. +# +config ARCH_WANT_NUMA_VARIABLE_LOCALITY + bool + +config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when + it has references to the node the task is running on. + + This system will be inactive on UMA systems. + +config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y + depends on NUMA_BALANCING + help + If set, automatic NUMA balancing will be enabled if running on a NUMA + machine. + +menuconfig CGROUPS + bool "Control Group support" + select KERNFS + help + This option adds support for grouping sets of processes together, for + use with process control subsystems such as Cpusets, CFS, memory + controls or device isolation. + See + - Documentation/scheduler/sched-design-CFS.txt (CFS) + - Documentation/cgroup-v1/ (features for grouping, isolation + and resource control) + + Say N if unsure. + +if CGROUPS + +config PAGE_COUNTER + bool + +config MEMCG + bool "Memory controller" + select PAGE_COUNTER + select EVENTFD + help + Provides control over the memory footprint of tasks in a cgroup. + +config MEMCG_SWAP + bool "Swap controller" + depends on MEMCG && SWAP + help + Provides control over the swap space consumed by tasks in a cgroup. + +config MEMCG_SWAP_ENABLED + bool "Swap controller enabled by default" + depends on MEMCG_SWAP + default y + help + Memory Resource Controller Swap Extension comes with its price in + a bigger memory consumption. General purpose distribution kernels + which want to enable the feature but keep it disabled by default + and let the user enable it by swapaccount=1 boot command line + parameter should have this option unselected. + For those who want to have the feature enabled by default should + select this option (if, for some reason, they need to disable it + then swapaccount=0 does the trick). + +config MEMCG_KMEM + bool + depends on MEMCG && !SLOB + default y + +config BLK_CGROUP + bool "IO controller" + depends on BLOCK + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. It is also used by bio throttling logic in + block layer to implement upper limit in IO rates on a device. + + This option only enables generic Block IO controller infrastructure. + One needs to also enable actual IO controlling logic/policy. For + enabling proportional weight division of disk bandwidth in CFQ, set + CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set + CONFIG_BLK_DEV_THROTTLING=y. + + See Documentation/cgroup-v1/blkio-controller.txt for more information. + +config DEBUG_BLK_CGROUP + bool "IO controller debugging" + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it exports additional stat + files in a cgroup which can be useful for debugging. + +config CGROUP_WRITEBACK + bool + depends on MEMCG && BLK_CGROUP + default y + +menuconfig CGROUP_SCHED + bool "CPU controller" + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED + +config CFS_BANDWIDTH + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" + depends on FAIR_GROUP_SCHED + default n + help + This option allows users to define CPU bandwidth rates (limits) for + tasks running within the fair group scheduler. Groups with no limit + set are considered to be unconstrained and will run with no + restriction. + See Documentation/scheduler/sched-bwc.txt for more information. + +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on CGROUP_SCHED + default n + help + This feature lets you explicitly allocate real CPU bandwidth + to task groups. If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.txt for more information. + +endif #CGROUP_SCHED + +config CGROUP_PIDS + bool "PIDs controller" + help + Provides enforcement of process number limits in the scope of a + cgroup. Any attempt to fork more processes than is allowed in the + cgroup will fail. PIDs are fundamentally a global resource because it + is fairly trivial to reach PID exhaustion before you reach even a + conservative kmemcg limit. As a result, it is possible to grind a + system to halt without being limited by other cgroup policies. The + PIDs controller is designed to stop this from happening. + + It should be noted that organisational operations (such as attaching + to a cgroup hierarchy will *not* be blocked by the PIDs controller), + since the PIDs limit only affects a process's ability to fork, not to + attach to a cgroup. + +config CGROUP_RDMA + bool "RDMA controller" + help + Provides enforcement of RDMA resources defined by IB stack. + It is fairly easy for consumers to exhaust RDMA resources, which + can result into resource unavailability to other consumers. + RDMA controller is designed to stop this from happening. + Attaching processes with active RDMA resources to the cgroup + hierarchy is allowed even if can cross the hierarchy's limit. + +config CGROUP_FREEZER + bool "Freezer controller" + help + Provides a way to freeze and unfreeze all tasks in a + cgroup. + + This option affects the ORIGINAL cgroup interface. The cgroup2 memory + controller includes important in-kernel memory consumers per default. + + If you're using cgroup2, say N. + +config CGROUP_HUGETLB + bool "HugeTLB controller" + depends on HUGETLB_PAGE + select PAGE_COUNTER + default n + help + Provides a cgroup controller for HugeTLB pages. + When you enable this, you can put a per cgroup limit on HugeTLB usage. + The limit is enforced during page fault. Since HugeTLB doesn't + support page reclaim, enforcing the limit at page fault time implies + that, the application will get SIGBUS signal if it tries to access + HugeTLB pages beyond its limit. This requires the application to know + beforehand how much HugeTLB pages it would require for its use. The + control group is tracked in the third page lru pointer. This means + that we cannot use the controller with huge page less than 3 pages. + +config CPUSETS + bool "Cpuset controller" + depends on SMP + help + This option will let you create and manage CPUSETs which + allow dynamically partitioning a system into sets of CPUs and + Memory Nodes and assigning tasks to run only within those sets. + This is primarily useful on large SMP or NUMA systems. + + Say N if unsure. + +config PROC_PID_CPUSET + bool "Include legacy /proc/<pid>/cpuset file" + depends on CPUSETS + default y + +config CGROUP_DEVICE + bool "Device controller" + help + Provides a cgroup controller implementing whitelists for + devices which a process in the cgroup can mknod or open. + +config CGROUP_CPUACCT + bool "Simple CPU accounting controller" + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. + +config CGROUP_PERF + bool "Perf controller" + depends on PERF_EVENTS + help + This option extends the perf per-cpu mode to restrict monitoring + to threads which belong to the cgroup specified and run on the + designated cpu. + + Say N if unsure. + +config CGROUP_BPF + bool "Support for eBPF programs attached to cgroups" + depends on BPF_SYSCALL + select SOCK_CGROUP_DATA + help + Allow attaching eBPF programs to a cgroup using the bpf(2) + syscall command BPF_PROG_ATTACH. + + In which context these programs are accessed depends on the type + of attachment. For instance, programs that are attached using + BPF_CGROUP_INET_INGRESS will be executed on the ingress path of + inet sockets. + +config CGROUP_DEBUG + bool "Debug controller" + default n + depends on DEBUG_KERNEL + help + This option enables a simple controller that exports + debugging information about the cgroups framework. This + controller is for control cgroup debugging only. Its + interfaces are not stable. + + Say N. + +config SOCK_CGROUP_DATA + bool + default n + +endif # CGROUPS + +menuconfig NAMESPACES + bool "Namespaces support" if EXPERT + depends on MULTIUSER + default !EXPERT + help + Provides the way to make tasks work with different objects using + the same id. For example same IPC id may refer to different objects + or same user id or pid may refer to different tasks when used in + different namespaces. + +if NAMESPACES + +config UTS_NS + bool "UTS namespace" + default y + help + In this namespace tasks see different info provided with the + uname() system call + +config IPC_NS + bool "IPC namespace" + depends on (SYSVIPC || POSIX_MQUEUE) + default y + help + In this namespace tasks work with IPC ids which correspond to + different IPC objects in different namespaces. + +config USER_NS + bool "User namespace" + default n + help + This allows containers, i.e. vservers, to use user namespaces + to provide different user info for different servers. + + When user namespaces are enabled in the kernel it is + recommended that the MEMCG option also be enabled and that + user-space use the memory control groups to limit the amount + of memory a memory unprivileged users can use. + + If unsure, say N. + +config PID_NS + bool "PID Namespaces" + default y + help + Support process id namespaces. This allows having multiple + processes with the same pid as long as they are in different + pid namespaces. This is a building block of containers. + +config NET_NS + bool "Network namespace" + depends on NET + default y + help + Allow user space to create what appear to be multiple instances + of the network stack. + +endif # NAMESPACES + +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" + select PROC_CHILDREN + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. + +config SYSFS_DEPRECATED + bool "Enable deprecated sysfs features to support old userspace tools" + depends on SYSFS + default n + help + This option adds code that switches the layout of the "block" class + devices, to not show up in /sys/class/block/, but only in + /sys/block/. + + This switch is only active when the sysfs.deprecated=1 boot option is + passed or the SYSFS_DEPRECATED_V2 option is set. + + This option allows new kernels to run on old distributions and tools, + which might get confused by /sys/class/block/. Since 2007/2008 all + major distributions and tools handle this just fine. + + Recent distributions and userspace tools after 2009/2010 depend on + the existence of /sys/class/block/, and will not work with this + option enabled. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. + +config SYSFS_DEPRECATED_V2 + bool "Enable deprecated sysfs features by default" + default n + depends on SYSFS + depends on SYSFS_DEPRECATED + help + Enable deprecated sysfs by default. + + See the CONFIG_SYSFS_DEPRECATED option for more details about this + option. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. Even then, odds are you would not need it + enabled, you can always pass the boot option if absolutely necessary. + +config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + select IRQ_WORK + help + This option enables support for relay interface support in + certain file systems (such as debugfs). + It is designed to provide an efficient mechanism for tools and + facilities to relay large amounts of data from kernel space to + user space. + + If unsure, say N. + +config BLK_DEV_INITRD + bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" + help + The initial RAM filesystem is a ramfs which is loaded by the + boot loader (loadlin or lilo) and that is mounted as root + before the normal boot procedure. It is typically used to + load modules needed to mount the "real" root file system, + etc. See <file:Documentation/admin-guide/initrd.rst> for details. + + If RAM disk support (BLK_DEV_RAM) is also included, this + also enables initial RAM disk (initrd) support and adds + 15 Kbytes (more on some other architectures) to the kernel size. + + If unsure say Y. + +if BLK_DEV_INITRD + +source "usr/Kconfig" + +endif + +choice + prompt "Compiler optimization level" + default CC_OPTIMIZE_FOR_PERFORMANCE + +config CC_OPTIMIZE_FOR_PERFORMANCE + bool "Optimize for performance" + help + This is the default optimization level for the kernel, building + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + +config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size" + help + Enabling this option will pass "-Os" instead of "-O2" to + your compiler resulting in a smaller kernel. + + If unsure, say N. + +endchoice + +config HAVE_LD_DEAD_CODE_DATA_ELIMINATION + bool + help + This requires that the arch annotates or otherwise protects + its external entry points from being discarded. Linker scripts + must also merge .text.*, .data.*, and .bss.* correctly into + output sections. Care must be taken not to pull in unrelated + sections (e.g., '.text.init'). Typically '.' in section names + is used to distinguish them from label names / C identifiers. + +config LD_DEAD_CODE_DATA_ELIMINATION + bool "Dead code and data elimination (EXPERIMENTAL)" + depends on HAVE_LD_DEAD_CODE_DATA_ELIMINATION + depends on EXPERT + depends on !(FUNCTION_TRACER && CC_IS_GCC && GCC_VERSION < 40800) + depends on $(cc-option,-ffunction-sections -fdata-sections) + depends on $(ld-option,--gc-sections) + help + Enable this if you want to do dead code and data elimination with + the linker by compiling with -ffunction-sections -fdata-sections, + and linking with --gc-sections. + + This can reduce on disk and in-memory size of the kernel + code and static data, particularly for small configs and + on small systems. This has the possibility of introducing + silently broken kernel if the required annotations are not + present. This option is not well tested yet, so use at your + own risk. + +config SYSCTL + bool + +config ANON_INODES + bool + +config HAVE_UID16 + bool + +config SYSCTL_EXCEPTION_TRACE + bool + help + Enable support for /proc/sys/debug/exception-trace. + +config SYSCTL_ARCH_UNALIGN_NO_WARN + bool + help + Enable support for /proc/sys/kernel/ignore-unaligned-usertrap + Allows arch to define/use @no_unaligned_warning to possibly warn + about unaligned access emulation going on under the hood. + +config SYSCTL_ARCH_UNALIGN_ALLOW + bool + help + Enable support for /proc/sys/kernel/unaligned-trap + Allows arches to define/use @unaligned_enabled to runtime toggle + the unaligned access emulation. + see arch/parisc/kernel/unaligned.c for reference + +config HAVE_PCSPKR_PLATFORM + bool + +# interpreter that classic socket filters depend on +config BPF + bool + +menuconfig EXPERT + bool "Configure standard kernel features (expert users)" + # Unhide debug options, to make the on-by-default options visible + select DEBUG_KERNEL + help + This option allows certain base kernel options and settings + to be disabled or tweaked. This is for specialized + environments which can tolerate a "non-standard" kernel. + Only use this if you really know what you are doing. + +config UID16 + bool "Enable 16-bit UID system calls" if EXPERT + depends on HAVE_UID16 && MULTIUSER + default y + help + This enables the legacy 16-bit UID syscall wrappers. + +config MULTIUSER + bool "Multiple users, groups and capabilities support" if EXPERT + default y + help + This option enables support for non-root users, groups and + capabilities. + + If you say N here, all processes will run with UID 0, GID 0, and all + possible capabilities. Saying N here also compiles out support for + system calls related to UIDs, GIDs, and capabilities, such as setuid, + setgid, and capset. + + If unsure, say Y here. + +config SGETMASK_SYSCALL + bool "sgetmask/ssetmask syscalls support" if EXPERT + def_bool PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH + ---help--- + sys_sgetmask and sys_ssetmask are obsolete system calls + no longer supported in libc but still enabled by default in some + architectures. + + If unsure, leave the default option here. + +config SYSFS_SYSCALL + bool "Sysfs syscall support" if EXPERT + default y + ---help--- + sys_sysfs is an obsolete system call no longer supported in libc. + Note that disabling this option is more secure but might break + compatibility with some systems. + + If unsure say Y here. + +config SYSCTL_SYSCALL + bool "Sysctl syscall support" if EXPERT + depends on PROC_SYSCTL + default n + select SYSCTL + ---help--- + sys_sysctl uses binary paths that have been found challenging + to properly maintain and use. The interface in /proc/sys + using paths with ascii names is now the primary path to this + information. + + Almost nothing using the binary sysctl interface so if you are + trying to save some space it is probably safe to disable this, + making your kernel marginally smaller. + + If unsure say N here. + +config FHANDLE + bool "open by fhandle syscalls" if EXPERT + select EXPORTFS + default y + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + +config POSIX_TIMERS + bool "Posix Clocks & timers" if EXPERT + default y + help + This includes native support for POSIX timers to the kernel. + Some embedded systems have no use for them and therefore they + can be configured out to reduce the size of the kernel image. + + When this option is disabled, the following syscalls won't be + available: timer_create, timer_gettime: timer_getoverrun, + timer_settime, timer_delete, clock_adjtime, getitimer, + setitimer, alarm. Furthermore, the clock_settime, clock_gettime, + clock_getres and clock_nanosleep syscalls will be limited to + CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME only. + + If unsure say y. + +config PRINTK + default y + bool "Enable support for printk" if EXPERT + select IRQ_WORK + help + This option enables normal printk support. Removing it + eliminates most of the message strings from the kernel image + and makes the kernel more or less silent. As this makes it + very difficult to diagnose system problems, saying N here is + strongly discouraged. + +config PRINTK_NMI + def_bool y + depends on PRINTK + depends on HAVE_NMI + +config BUG + bool "BUG() support" if EXPERT + default y + help + Disabling this option eliminates support for BUG and WARN, reducing + the size of your kernel image and potentially quietly ignoring + numerous fatal conditions. You should only consider disabling this + option for embedded systems with no facilities for reporting errors. + Just say Y. + +config ELF_CORE + depends on COREDUMP + default y + bool "Enable ELF core dumps" if EXPERT + help + Enable support for generating core dumps. Disabling saves about 4k. + + +config PCSPKR_PLATFORM + bool "Enable PC-Speaker support" if EXPERT + depends on HAVE_PCSPKR_PLATFORM + select I8253_LOCK + default y + help + This option allows to disable the internal PC-Speaker + support, saving some memory. + +config BASE_FULL + default y + bool "Enable full-sized data structures for core" if EXPERT + help + Disabling this option reduces the size of miscellaneous core + kernel data structures. This saves memory on small machines, + but may reduce performance. + +config FUTEX + bool "Enable futex support" if EXPERT + default y + imply RT_MUTEXES + help + Disabling this option will cause the kernel to be built without + support for "fast userspace mutexes". The resulting kernel may not + run glibc-based applications correctly. + +config FUTEX_PI + bool + depends on FUTEX && RT_MUTEXES + default y + +config HAVE_FUTEX_CMPXCHG + bool + depends on FUTEX + help + Architectures should select this if futex_atomic_cmpxchg_inatomic() + is implemented and always working. This removes a couple of runtime + checks. + +config EPOLL + bool "Enable eventpoll support" if EXPERT + default y + select ANON_INODES + help + Disabling this option will cause the kernel to be built without + support for epoll family of system calls. + +config SIGNALFD + bool "Enable signalfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the signalfd() system call that allows to receive signals + on a file descriptor. + + If unsure, say Y. + +config TIMERFD + bool "Enable timerfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the timerfd() system call that allows to receive timer + events on a file descriptor. + + If unsure, say Y. + +config EVENTFD + bool "Enable eventfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the eventfd() system call that allows to receive both + kernel notification (ie. KAIO) or userspace notifications. + + If unsure, say Y. + +config SHMEM + bool "Use full shmem filesystem" if EXPERT + default y + depends on MMU + help + The shmem is an internal filesystem used to manage shared memory. + It is backed by swap and manages resource limits. It is also exported + to userspace as tmpfs if TMPFS is enabled. Disabling this + option replaces shmem and tmpfs with the much simpler ramfs code, + which may be appropriate on small systems without swap. + +config AIO + bool "Enable AIO support" if EXPERT + default y + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling + this option saves about 7k. + +config ADVISE_SYSCALLS + bool "Enable madvise/fadvise syscalls" if EXPERT + default y + help + This option enables the madvise and fadvise syscalls, used by + applications to advise the kernel about their future memory or file + usage, improving performance. If building an embedded system where no + applications use these syscalls, you can disable this option to save + space. + +config MEMBARRIER + bool "Enable membarrier() system call" if EXPERT + default y + help + Enable the membarrier() system call that allows issuing memory + barriers across all running threads, which can be used to distribute + the cost of user-space memory barriers asymmetrically by transforming + pairs of memory barriers into pairs consisting of membarrier() and a + compiler barrier. + + If unsure, say Y. + +config KALLSYMS + bool "Load all symbols for debugging/ksymoops" if EXPERT + default y + help + Say Y here to let the kernel print out symbolic crash information and + symbolic stack backtraces. This increases the size of the kernel + somewhat, as all symbols have to be loaded into the kernel image. + +config KALLSYMS_ALL + bool "Include all symbols in kallsyms" + depends on DEBUG_KERNEL && KALLSYMS + help + Normally kallsyms only contains the symbols of functions for nicer + OOPS messages and backtraces (i.e., symbols from the text and inittext + sections). This is sufficient for most cases. And only in very rare + cases (e.g., when a debugger is used) all symbols are required (e.g., + names of variables from the data sections, etc). + + This option makes sure that all symbols are loaded into the kernel + image (i.e., symbols from all sections) in cost of increased kernel + size (depending on the kernel configuration, it may be 300KiB or + something like this). + + Say N unless you really need all symbols. + +config KALLSYMS_ABSOLUTE_PERCPU + bool + depends on KALLSYMS + default X86_64 && SMP + +config KALLSYMS_BASE_RELATIVE + bool + depends on KALLSYMS + default !IA64 + help + Instead of emitting them as absolute values in the native word size, + emit the symbol references in the kallsyms table as 32-bit entries, + each containing a relative value in the range [base, base + U32_MAX] + or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either + an absolute value in the range [0, S32_MAX] or a relative value in the + range [base, base + S32_MAX], where base is the lowest relative symbol + address encountered in the image. + + On 64-bit builds, this reduces the size of the address table by 50%, + but more importantly, it results in entries whose values are build + time constants, and no relocation pass is required at runtime to fix + up the entries based on the runtime load address of the kernel. + +# end of the "standard kernel features (expert users)" menu + +# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" + select ANON_INODES + select BPF + select IRQ_WORK + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + +config BPF_UNPRIV_DEFAULT_OFF + bool "Disable unprivileged BPF by default" + depends on BPF_SYSCALL + help + Disables unprivileged BPF by default by setting the corresponding + /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can + still reenable it by setting it to 0 later on, or permanently + disable it by setting it to 1 (from which no other transition to + 0 is possible anymore). + +config USERFAULTFD + bool "Enable userfaultfd() system call" + select ANON_INODES + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + +config ARCH_HAS_MEMBARRIER_CALLBACKS + bool + +config ARCH_HAS_MEMBARRIER_SYNC_CORE + bool + +config RSEQ + bool "Enable rseq() system call" if EXPERT + default y + depends on HAVE_RSEQ + select MEMBARRIER + help + Enable the restartable sequences system call. It provides a + user-space cache for the current CPU number value, which + speeds up getting the current CPU number from user-space, + as well as an ABI to speed up user-space operations on + per-CPU data. + + If unsure, say Y. + +config DEBUG_RSEQ + default n + bool "Enabled debugging of rseq() system call" if EXPERT + depends on RSEQ && DEBUG_KERNEL + help + Enable extra debugging checks for the rseq system call. + + If unsure, say N. + +config EMBEDDED + bool "Embedded system" + option allnoconfig_y + select EXPERT + help + This option should be enabled if compiling the kernel for + an embedded system so certain expert options are available + for configuration. + +config HAVE_PERF_EVENTS + bool + help + See tools/perf/design.txt for details. + +config PERF_USE_VMALLOC + bool + help + See tools/perf/design.txt for details + +config PC104 + bool "PC/104 support" if EXPERT + help + Expose PC/104 form factor device drivers and options available for + selection and configuration. Enable this option if your target + machine has a PC/104 bus. + +menu "Kernel Performance Events And Counters" + +config PERF_EVENTS + bool "Kernel performance events and counters" + default y if PROFILING + depends on HAVE_PERF_EVENTS + select ANON_INODES + select IRQ_WORK + select SRCU + help + Enable kernel support for various performance events provided + by software and hardware. + + Software events are supported either built-in or via the + use of generic tracepoints. + + Most modern CPUs support performance events via performance + counter registers. These registers count the number of certain + types of hw events: such as instructions executed, cachemisses + suffered, or branches mis-predicted - without slowing down the + kernel or applications. These registers can also trigger interrupts + when a threshold number of events have passed - and can thus be + used to profile the code that runs on that CPU. + + The Linux Performance Event subsystem provides an abstraction of + these software and hardware event capabilities, available via a + system call and used by the "perf" utility in tools/perf/. It + provides per task and per CPU counters, and it provides event + capabilities on top of those. + + Say Y if unsure. + +config DEBUG_PERF_USE_VMALLOC + default n + bool "Debug: use vmalloc to back perf mmap() buffers" + depends on PERF_EVENTS && DEBUG_KERNEL && !PPC + select PERF_USE_VMALLOC + help + Use vmalloc memory to back perf mmap() buffers. + + Mostly useful for debugging the vmalloc code on platforms + that don't require it. + + Say N if unsure. + +endmenu + +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. /proc/vmstat will only show page counts + if VM event counters are disabled. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. /sys/slab will not exist and there will be + no support for cache validation etc. + +config SLUB_MEMCG_SYSFS_ON + default n + bool "Enable memcg SLUB sysfs support by default" if EXPERT + depends on SLUB && SYSFS && MEMCG + help + SLUB creates a directory under /sys/kernel/slab for each + allocation cache to host info and debug files. If memory + cgroup is enabled, each cache can have per memory cgroup + caches. SLUB can create the same sysfs directories for these + caches under /sys/kernel/slab/CACHE/cgroup but it can lead + to a very high number of debug files being created. This is + controlled by slub_memcg_sysfs boot parameter and this + config option determines the parameter's default value. + +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +choice + prompt "Choose SLAB allocator" + default SLUB + help + This option allows to select a slab allocator. + +config SLAB + bool "SLAB" + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work + well in all environments. It organizes cache hot objects in + per cpu and per node queues. + +config SLUB + bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + SLUB is a slab allocator that minimizes cache line usage + instead of managing queues of cached objects (SLAB approach). + Per cpu caching is realized using slabs of objects instead + of queues of objects. SLUB can use memory efficiently + and has enhanced diagnostics. SLUB is the default choice for + a slab allocator. + +config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" + help + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but + does not perform as well on large systems. + +endchoice + +config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" + default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. + This carries a risk of kernel heap overflows being able to + overwrite objects from merged caches (and more easily control + cache layout), which makes such heap attacks easier to exploit + by attackers. By keeping caches unmerged, these kinds of exploits + can usually only damage objects in the same cache. To disable + merging at runtime, "slab_nomerge" can be passed on the kernel + command line. + +config SLAB_FREELIST_RANDOM + default n + depends on SLAB || SLUB + bool "SLAB freelist randomization" + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. + +config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifies to harden the kernel slab allocator against common + freelist exploit methods. + +config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP + bool "SLUB per cpu partial cache" + help + Per cpu partial caches accellerate objects allocation and freeing + that is local to a processor at the price of more indeterminism + in the latency of the free. On overflow these caches will be cleared + which requires the taking of locks that may cause latency spikes. + Typically one would choose no for a realtime system. + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has its contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. + + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/nommu-mmap.txt for more information. + +config SYSTEM_DATA_VERIFICATION + def_bool n + select SYSTEM_TRUSTED_KEYRING + select KEYS + select CRYPTO + select CRYPTO_RSA + select ASYMMETRIC_KEY_TYPE + select ASYMMETRIC_PUBLIC_KEY_SUBTYPE + select ASN1 + select OID_REGISTRY + select X509_CERTIFICATE_PARSER + select PKCS7_MESSAGE_PARSER + help + Provide PKCS#7 message verification using the contents of the system + trusted keyring to provide public keys. This then can be used for + module verification, kexec image verification and firmware blob + verification. + +config PROFILING + bool "Profiling support" + help + Say Y here to enable the extended profiling support mechanisms used + by profilers such as OProfile. + +# +# Place an empty function call at each tracepoint site. Can be +# dynamically changed for a probe function. +# +config TRACEPOINTS + bool + +endmenu # General setup + +source "arch/Kconfig" + +config RT_MUTEXES + bool + +config BASE_SMALL + int + default 0 if BASE_FULL + default 1 if !BASE_FULL + +menuconfig MODULES + bool "Enable loadable module support" + option modules + help + Kernel modules are small pieces of compiled code which can + be inserted in the running kernel, rather than being + permanently built into the kernel. You use the "modprobe" + tool to add (and sometimes remove) them. If you say Y here, + many parts of the kernel can be built as modules (by + answering M instead of Y where indicated): this is most + useful for infrequently used options which are not required + for booting. For more information, see the man pages for + modprobe, lsmod, modinfo, insmod and rmmod. + + If you say Y here, you will need to run "make + modules_install" to put the modules under /lib/modules/ + where modprobe can find them (you may need to be root to do + this). + + If unsure, say Y. + +if MODULES + +config MODULE_FORCE_LOAD + bool "Forced module loading" + default n + help + Allow loading of modules without version information (ie. modprobe + --force). Forced module loading sets the 'F' (forced) taint flag and + is usually a really bad idea. + +config MODULE_UNLOAD + bool "Module unloading" + help + Without this option you will not be able to unload any + modules (note that some modules may not be unloadable + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. + +config MODULE_FORCE_UNLOAD + bool "Forced module unloading" + depends on MODULE_UNLOAD + help + This option allows you to force a module to unload, even if the + kernel believes it is unsafe: the kernel will remove the module + without waiting for anyone to stop using it (using the -f option to + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + +config MODVERSIONS + bool "Module versioning support" + help + Usually, you have to use modules compiled with your kernel. + Saying Y here makes it sometimes possible to use modules + compiled for different kernels, by adding enough information + to the modules to (hopefully) spot any changes which would + make them incompatible with the kernel you are running. If + unsure, say N. + +config MODULE_REL_CRCS + bool + depends on MODVERSIONS + +config MODULE_SRCVERSION_ALL + bool "Source checksum for all modules" + help + Modules which contain a MODULE_VERSION get an extra "srcversion" + field inserted into their modinfo section, which contains a + sum of the source files which made it. This helps maintainers + see exactly which source was used to build a module (since + others sometimes change the module source without updating + the version). With this option, such a "srcversion" field + will be created for all modules. If unsure, say N. + +config MODULE_SIG + bool "Module signature verification" + depends on MODULES + select SYSTEM_DATA_VERIFICATION + help + Check modules for valid signatures upon load: the signature + is simply appended to the module. For more information see + <file:Documentation/admin-guide/module-signing.rst>. + + Note that this option adds the OpenSSL development packages as a + kernel build dependency so that the signing tool can use its crypto + library. + + !!!WARNING!!! If you enable this option, you MUST make sure that the + module DOES NOT get stripped after being signed. This includes the + debuginfo strip done by some packagers (such as rpmbuild) and + inclusion into an initramfs that wants the module size reduced. + +config MODULE_SIG_FORCE + bool "Require modules to be validly signed" + depends on MODULE_SIG + help + Reject unsigned modules or signed modules for which we don't have a + key. Without this, such modules will simply taint the kernel. + +config MODULE_SIG_ALL + bool "Automatically sign all modules" + default y + depends on MODULE_SIG + help + Sign all modules during make modules_install. Without this option, + modules must be signed manually, using the scripts/sign-file tool. + +comment "Do not forget to sign required modules with scripts/sign-file" + depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL + +choice + prompt "Which hash algorithm should modules be signed with?" + depends on MODULE_SIG + help + This determines which sort of hashing algorithm will be used during + signature generation. This algorithm _must_ be built into the kernel + directly so that signature verification can take place. It is not + possible to load a signed module containing the algorithm to check + the signature on that module. + +config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + +config MODULE_SIG_SHA224 + bool "Sign modules with SHA-224" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA256 + bool "Sign modules with SHA-256" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA384 + bool "Sign modules with SHA-384" + select CRYPTO_SHA512 + +config MODULE_SIG_SHA512 + bool "Sign modules with SHA-512" + select CRYPTO_SHA512 + +endchoice + +config MODULE_SIG_HASH + string + depends on MODULE_SIG + default "sha1" if MODULE_SIG_SHA1 + default "sha224" if MODULE_SIG_SHA224 + default "sha256" if MODULE_SIG_SHA256 + default "sha384" if MODULE_SIG_SHA384 + default "sha512" if MODULE_SIG_SHA512 + +config MODULE_COMPRESS + bool "Compress modules on installation" + depends on MODULES + help + + Compresses kernel modules when 'make modules_install' is run; gzip or + xz depending on "Compression algorithm" below. + + module-init-tools MAY support gzip, and kmod MAY support gzip and xz. + + Out-of-tree kernel modules installed using Kbuild will also be + compressed upon installation. + + Note: for modules inside an initrd or initramfs, it's more efficient + to compress the whole initrd or initramfs instead. + + Note: This is fully compatible with signed modules. + + If in doubt, say N. + +choice + prompt "Compression algorithm" + depends on MODULE_COMPRESS + default MODULE_COMPRESS_GZIP + help + This determines which sort of compression will be used during + 'make modules_install'. + + GZIP (default) and XZ are supported. + +config MODULE_COMPRESS_GZIP + bool "GZIP" + +config MODULE_COMPRESS_XZ + bool "XZ" + +endchoice + +config TRIM_UNUSED_KSYMS + bool "Trim unused exported kernel symbols" + depends on MODULES && !UNUSED_SYMBOLS + help + The kernel and some modules make many symbols available for + other modules to use via EXPORT_SYMBOL() and variants. Depending + on the set of modules being selected in your kernel configuration, + many of those exported symbols might never be used. + + This option allows for unused exported symbols to be dropped from + the build. In turn, this provides the compiler more opportunities + (especially when using LTO) for optimizing the code and reducing + binary size. This might have some security advantages as well. + + If unsure, or if you need to build out-of-tree modules, say N. + +endif # MODULES + +config MODULES_TREE_LOOKUP + def_bool y + depends on PERF_EVENTS || TRACING + +config INIT_ALL_POSSIBLE + bool + help + Back when each arch used to define their own cpu_online_mask and + cpu_possible_mask, some of them chose to initialize cpu_possible_mask + with all 1s, and others with all 0s. When they were centralised, + it was better to provide this option than to break all the archs + and have several arch maintainers pursuing me down dark alleys. + +source "block/Kconfig" + +config PREEMPT_NOTIFIERS + bool + +config PADATA + depends on SMP + bool + +config ASN1 + tristate + help + Build a simple ASN.1 grammar compiler that produces a bytecode output + that can be interpreted by the ASN.1 stream decoder and used to + inform it as to what tags are to be expected in a stream and what + functions to call on what tags. + +source "kernel/Kconfig.locks" + +config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE + bool + +# It may be useful for an architecture to override the definitions of the +# SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h> +# and the COMPAT_ variants in <linux/compat.h>, in particular to use a +# different calling convention for syscalls. They can also override the +# macros for not-implemented syscalls in kernel/sys_ni.c and +# kernel/time/posix-stubs.c. All these overrides need to be available in +# <asm/syscall_wrapper.h>. +config ARCH_HAS_SYSCALL_WRAPPER + def_bool n diff --git a/init/Makefile b/init/Makefile new file mode 100644 index 000000000..a3e5ce2bc --- /dev/null +++ b/init/Makefile @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +ccflags-y := -fno-function-sections -fno-data-sections + +obj-y := main.o version.o mounts.o +ifneq ($(CONFIG_BLK_DEV_INITRD),y) +obj-y += noinitramfs.o +else +obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o +endif +obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o + +obj-y += init_task.o + +mounts-y := do_mounts.o +mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o +mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o +mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o + +# dependencies on generated files need to be listed explicitly +$(obj)/version.o: include/generated/compile.h + +# compile.h changes depending on hostname, generation number, etc, +# so we regenerate it always. +# mkcompile_h will make sure to only update the +# actual file if its content has changed. + + chk_compile.h = : + quiet_chk_compile.h = echo ' CHK $@' +silent_chk_compile.h = : +include/generated/compile.h: FORCE + @$($(quiet)chk_compile.h) + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" diff --git a/init/calibrate.c b/init/calibrate.c new file mode 100644 index 000000000..f3831272f --- /dev/null +++ b/init/calibrate.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 +/* calibrate.c: default delay calibration + * + * Excised from init/main.c + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/jiffies.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/timex.h> +#include <linux/smp.h> +#include <linux/percpu.h> + +unsigned long lpj_fine; +unsigned long preset_lpj; +static int __init lpj_setup(char *str) +{ + preset_lpj = simple_strtoul(str,NULL,0); + return 1; +} + +__setup("lpj=", lpj_setup); + +#ifdef ARCH_HAS_READ_CURRENT_TIMER + +/* This routine uses the read_current_timer() routine and gets the + * loops per jiffy directly, instead of guessing it using delay(). + * Also, this code tries to handle non-maskable asynchronous events + * (like SMIs) + */ +#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100)) +#define MAX_DIRECT_CALIBRATION_RETRIES 5 + +static unsigned long calibrate_delay_direct(void) +{ + unsigned long pre_start, start, post_start; + unsigned long pre_end, end, post_end; + unsigned long start_jiffies; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; + unsigned long measured_times[MAX_DIRECT_CALIBRATION_RETRIES]; + int max = -1; /* index of measured_times with max/min values or not set */ + int min = -1; + int i; + + if (read_current_timer(&pre_start) < 0 ) + return 0; + + /* + * A simple loop like + * while ( jiffies < start_jiffies+1) + * start = read_current_timer(); + * will not do. As we don't really know whether jiffy switch + * happened first or timer_value was read first. And some asynchronous + * event can happen between these two events introducing errors in lpj. + * + * So, we do + * 1. pre_start <- When we are sure that jiffy switch hasn't happened + * 2. check jiffy switch + * 3. start <- timer value before or after jiffy switch + * 4. post_start <- When we are sure that jiffy switch has happened + * + * Note, we don't know anything about order of 2 and 3. + * Now, by looking at post_start and pre_start difference, we can + * check whether any asynchronous event happened or not + */ + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + pre_start = 0; + read_current_timer(&start); + start_jiffies = jiffies; + while (time_before_eq(jiffies, start_jiffies + 1)) { + pre_start = start; + read_current_timer(&start); + } + read_current_timer(&post_start); + + pre_end = 0; + end = post_start; + while (time_before_eq(jiffies, start_jiffies + 1 + + DELAY_CALIBRATION_TICKS)) { + pre_end = end; + read_current_timer(&end); + } + read_current_timer(&post_end); + + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; + + /* + * If the upper limit and lower limit of the timer_rate is + * >= 12.5% apart, redo calibration. + */ + if (start >= post_end) + printk(KERN_NOTICE "calibrate_delay_direct() ignoring " + "timer_rate as we had a TSC wrap around" + " start=%lu >=post_end=%lu\n", + start, post_end); + if (start < post_end && pre_start != 0 && pre_end != 0 && + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; + measured_times[i] = timer_rate_max; + if (max < 0 || timer_rate_max > measured_times[max]) + max = i; + if (min < 0 || timer_rate_max < measured_times[min]) + min = i; + } else + measured_times[i] = 0; + + } + + /* + * Find the maximum & minimum - if they differ too much throw out the + * one with the largest difference from the mean and try again... + */ + while (good_timer_count > 1) { + unsigned long estimate; + unsigned long maxdiff; + + /* compute the estimate */ + estimate = (good_timer_sum/good_timer_count); + maxdiff = estimate >> 3; + + /* if range is within 12% let's take it */ + if ((measured_times[max] - measured_times[min]) < maxdiff) + return estimate; + + /* ok - drop the worse value and try again... */ + good_timer_sum = 0; + good_timer_count = 0; + if ((measured_times[max] - estimate) < + (estimate - measured_times[min])) { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "min bogoMips estimate %d = %lu\n", + min, measured_times[min]); + measured_times[min] = 0; + min = max; + } else { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "max bogoMips estimate %d = %lu\n", + max, measured_times[max]); + measured_times[max] = 0; + max = min; + } + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + if (measured_times[i] == 0) + continue; + good_timer_count++; + good_timer_sum += measured_times[i]; + if (measured_times[i] < measured_times[min]) + min = i; + if (measured_times[i] > measured_times[max]) + max = i; + } + + } + + printk(KERN_NOTICE "calibrate_delay_direct() failed to get a good " + "estimate for loops_per_jiffy.\nProbably due to long platform " + "interrupts. Consider using \"lpj=\" boot option.\n"); + return 0; +} +#else +static unsigned long calibrate_delay_direct(void) +{ + return 0; +} +#endif + +/* + * This is the number of bits of precision for the loops_per_jiffy. Each + * time we refine our estimate after the first takes 1.5/HZ seconds, so try + * to start with a good estimate. + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as + * the cpu frequency, hence do the calibration for those. + */ +#define LPS_PREC 8 + +static unsigned long calibrate_delay_converge(void) +{ + /* First stage - slowly accelerate to find initial bounds */ + unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit; + int trials = 0, band = 0, trial_in_band = 0; + + lpj = (1<<12); + + /* wait for "start of" clock tick */ + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + /* Go .. */ + ticks = jiffies; + do { + if (++trial_in_band == (1<<band)) { + ++band; + trial_in_band = 0; + } + __delay(lpj * band); + trials += band; + } while (ticks == jiffies); + /* + * We overshot, so retreat to a clear underestimate. Then estimate + * the largest likely undershoot. This defines our chop bounds. + */ + trials -= band; + loopadd_base = lpj * band; + lpj_base = lpj * trials; + +recalibrate: + lpj = lpj_base; + loopadd = loopadd_base; + + /* + * Do a binary approximation to get lpj set to + * equal one clock (up to LPS_PREC bits) + */ + chop_limit = lpj >> LPS_PREC; + while (loopadd > chop_limit) { + lpj += loopadd; + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + ticks = jiffies; + __delay(lpj); + if (jiffies != ticks) /* longer than 1 tick */ + lpj -= loopadd; + loopadd >>= 1; + } + /* + * If we incremented every single time possible, presume we've + * massively underestimated initially, and retry with a higher + * start, and larger range. (Only seen on x86_64, due to SMIs) + */ + if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) { + lpj_base = lpj; + loopadd_base <<= 2; + goto recalibrate; + } + + return lpj; +} + +static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 }; + +/* + * Check if cpu calibration delay is already known. For example, + * some processors with multi-core sockets may have all cores + * with the same calibration delay. + * + * Architectures should override this function if a faster calibration + * method is available. + */ +unsigned long __attribute__((weak)) calibrate_delay_is_known(void) +{ + return 0; +} + +/* + * Indicate the cpu delay calibration is done. This can be used by + * architectures to stop accepting delay timer registrations after this point. + */ + +void __attribute__((weak)) calibration_delay_done(void) +{ +} + +void calibrate_delay(void) +{ + unsigned long lpj; + static bool printed; + int this_cpu = smp_processor_id(); + + if (per_cpu(cpu_loops_per_jiffy, this_cpu)) { + lpj = per_cpu(cpu_loops_per_jiffy, this_cpu); + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "already calibrated this CPU"); + } else if (preset_lpj) { + lpj = preset_lpj; + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "preset value.. "); + } else if ((!printed) && lpj_fine) { + lpj = lpj_fine; + pr_info("Calibrating delay loop (skipped), " + "value calculated using timer frequency.. "); + } else if ((lpj = calibrate_delay_is_known())) { + ; + } else if ((lpj = calibrate_delay_direct()) != 0) { + if (!printed) + pr_info("Calibrating delay using timer " + "specific routine.. "); + } else { + if (!printed) + pr_info("Calibrating delay loop... "); + lpj = calibrate_delay_converge(); + } + per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj; + if (!printed) + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", + lpj/(500000/HZ), + (lpj/(5000/HZ)) % 100, lpj); + + loops_per_jiffy = lpj; + printed = true; + + calibration_delay_done(); +} diff --git a/init/do_mounts.c b/init/do_mounts.c new file mode 100644 index 000000000..e1c9afa9d --- /dev/null +++ b/init/do_mounts.c @@ -0,0 +1,637 @@ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/fd.h> +#include <linux/tty.h> +#include <linux/suspend.h> +#include <linux/root_dev.h> +#include <linux/security.h> +#include <linux/delay.h> +#include <linux/genhd.h> +#include <linux/mount.h> +#include <linux/device.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/initrd.h> +#include <linux/async.h> +#include <linux/fs_struct.h> +#include <linux/slab.h> +#include <linux/ramfs.h> +#include <linux/shmem_fs.h> + +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/nfs_mount.h> + +#include "do_mounts.h" + +int __initdata rd_doload; /* 1 = load RAM disk, 0 = don't load */ + +int root_mountflags = MS_RDONLY | MS_SILENT; +static char * __initdata root_device_name; +static char __initdata saved_root_name[64]; +static int root_wait; + +dev_t ROOT_DEV; + +static int __init load_ramdisk(char *str) +{ + rd_doload = simple_strtol(str,NULL,0) & 3; + return 1; +} +__setup("load_ramdisk=", load_ramdisk); + +static int __init readonly(char *str) +{ + if (*str) + return 0; + root_mountflags |= MS_RDONLY; + return 1; +} + +static int __init readwrite(char *str) +{ + if (*str) + return 0; + root_mountflags &= ~MS_RDONLY; + return 1; +} + +__setup("ro", readonly); +__setup("rw", readwrite); + +#ifdef CONFIG_BLOCK +struct uuidcmp { + const char *uuid; + int len; +}; + +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to the desired struct uuidcmp to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_uuid(struct device *dev, const void *data) +{ + const struct uuidcmp *cmp = data; + struct hd_struct *part = dev_to_part(dev); + + if (!part->info) + goto no_match; + + if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + goto no_match; + + return 1; +no_match: + return 0; +} + + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid_str: char array containing ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static dev_t devt_from_partuuid(const char *uuid_str) +{ + dev_t res = 0; + struct uuidcmp cmp; + struct device *dev = NULL; + struct gendisk *disk; + struct hd_struct *part; + int offset = 0; + bool clear_root_wait = false; + char *slash; + + cmp.uuid = uuid_str; + + slash = strchr(uuid_str, '/'); + /* Check for optional partition number offset attributes. */ + if (slash) { + char c = 0; + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(slash + 1, + "PARTNROFF=%d%c", &offset, &c) != 1) { + clear_root_wait = true; + goto done; + } + cmp.len = slash - uuid_str; + } else { + cmp.len = strlen(uuid_str); + } + + if (!cmp.len) { + clear_root_wait = true; + goto done; + } + + dev = class_find_device(&block_class, NULL, &cmp, + &match_dev_by_uuid); + if (!dev) + goto done; + + res = dev->devt; + + /* Attempt to find the partition by offset. */ + if (!offset) + goto no_offset; + + res = 0; + disk = part_to_disk(dev_to_part(dev)); + part = disk_get_part(disk, dev_to_part(dev)->partno + offset); + if (part) { + res = part_devt(part); + put_device(part_to_dev(part)); + } + +no_offset: + put_device(dev); +done: + if (clear_root_wait) { + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); + if (root_wait) + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + } + return res; +} +#endif + +/* + * Convert a name into device number. We accept the following variants: + * + * 1) <hex_major><hex_minor> device number in hexadecimal represents itself + * no leading 0x, for example b302. + * 2) /dev/nfs represents Root_NFS (0xff) + * 3) /dev/<disk_name> represents the device number of disk + * 4) /dev/<disk_name><decimal> represents the device number + * of partition - device number of disk plus the partition number + * 5) /dev/<disk_name>p<decimal> - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 8) <major>:<minor> major and minor number of the device separated by + * a colon. + * + * If name doesn't have fall into the categories above, we return (0,0). + * block_class is used to check if something is a disk name. If the disk + * name contains slashes, the device name has them replaced with + * bangs. + */ + +dev_t name_to_dev_t(const char *name) +{ + char s[32]; + char *p; + dev_t res = 0; + int part; + +#ifdef CONFIG_BLOCK + if (strncmp(name, "PARTUUID=", 9) == 0) { + name += 9; + res = devt_from_partuuid(name); + if (!res) + goto fail; + goto done; + } +#endif + + if (strncmp(name, "/dev/", 5) != 0) { + unsigned maj, min, offset; + char dummy; + + if ((sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) || + (sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3)) { + res = MKDEV(maj, min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto fail; + } else { + res = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + goto fail; + } + goto done; + } + + name += 5; + res = Root_NFS; + if (strcmp(name, "nfs") == 0) + goto done; + res = Root_RAM0; + if (strcmp(name, "ram") == 0) + goto done; + + if (strlen(name) > 31) + goto fail; + strcpy(s, name); + for (p = s; *p; p++) + if (*p == '/') + *p = '!'; + res = blk_lookup_devt(s, 0); + if (res) + goto done; + + /* + * try non-existent, but valid partition, which may only exist + * after revalidating the disk, like partitioned md devices + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + goto fail; + + /* try disk name without <part number> */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + res = blk_lookup_devt(s, part); + if (res) + goto done; + + /* try disk name without p<part number> */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + goto fail; + p[-1] = '\0'; + res = blk_lookup_devt(s, part); + if (res) + goto done; + +fail: + return 0; +done: + return res; +} +EXPORT_SYMBOL_GPL(name_to_dev_t); + +static int __init root_dev_setup(char *line) +{ + strlcpy(saved_root_name, line, sizeof(saved_root_name)); + return 1; +} + +__setup("root=", root_dev_setup); + +static int __init rootwait_setup(char *str) +{ + if (*str) + return 0; + root_wait = 1; + return 1; +} + +__setup("rootwait", rootwait_setup); + +static char * __initdata root_mount_data; +static int __init root_data_setup(char *str) +{ + root_mount_data = str; + return 1; +} + +static char * __initdata root_fs_names; +static int __init fs_names_setup(char *str) +{ + root_fs_names = str; + return 1; +} + +static unsigned int __initdata root_delay; +static int __init root_delay_setup(char *str) +{ + root_delay = simple_strtoul(str, NULL, 0); + return 1; +} + +__setup("rootflags=", root_data_setup); +__setup("rootfstype=", fs_names_setup); +__setup("rootdelay=", root_delay_setup); + +static void __init get_fs_names(char *page) +{ + char *s = page; + + if (root_fs_names) { + strcpy(page, root_fs_names); + while (*s++) { + if (s[-1] == ',') + s[-1] = '\0'; + } + } else { + int len = get_filesystem_list(page); + char *p, *next; + + page[len] = '\0'; + for (p = page-1; p; p = next) { + next = strchr(++p, '\n'); + if (*p++ != '\t') + continue; + while ((*s++ = *p++) != '\n') + ; + s[-1] = '\0'; + } + } + *s = '\0'; +} + +static int __init do_mount_root(char *name, char *fs, int flags, void *data) +{ + struct super_block *s; + int err = ksys_mount(name, "/root", fs, flags, data); + if (err) + return err; + + ksys_chdir("/root"); + s = current->fs->pwd.dentry->d_sb; + ROOT_DEV = s->s_dev; + printk(KERN_INFO + "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", + s->s_type->name, + sb_rdonly(s) ? " readonly" : "", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); + return 0; +} + +void __init mount_block_root(char *name, int flags) +{ + struct page *page = alloc_page(GFP_KERNEL); + char *fs_names = page_address(page); + char *p; +#ifdef CONFIG_BLOCK + char b[BDEVNAME_SIZE]; +#else + const char *b = name; +#endif + + get_fs_names(fs_names); +retry: + for (p = fs_names; *p; p += strlen(p)+1) { + int err = do_mount_root(name, p, flags, root_mount_data); + switch (err) { + case 0: + goto out; + case -EACCES: + case -EINVAL: + continue; + } + /* + * Allow the user to distinguish between failed sys_open + * and bad superblock on root device. + * and give them a list of the available devices + */ +#ifdef CONFIG_BLOCK + __bdevname(ROOT_DEV, b); +#endif + printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", + root_device_name, b, err); + printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); + + printk_all_partitions(); +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " + "explicit textual name for \"root=\" boot option.\n"); +#endif + panic("VFS: Unable to mount root fs on %s", b); + } + if (!(flags & SB_RDONLY)) { + flags |= SB_RDONLY; + goto retry; + } + + printk("List of all partitions:\n"); + printk_all_partitions(); + printk("No filesystem could mount root, tried: "); + for (p = fs_names; *p; p += strlen(p)+1) + printk(" %s", p); + printk("\n"); +#ifdef CONFIG_BLOCK + __bdevname(ROOT_DEV, b); +#endif + panic("VFS: Unable to mount root fs on %s", b); +out: + put_page(page); +} + +#ifdef CONFIG_ROOT_NFS + +#define NFSROOT_TIMEOUT_MIN 5 +#define NFSROOT_TIMEOUT_MAX 30 +#define NFSROOT_RETRY_MAX 5 + +static int __init mount_nfs_root(void) +{ + char *root_dev, *root_data; + unsigned int timeout; + int try, err; + + err = nfs_root_data(&root_dev, &root_data); + if (err != 0) + return 0; + + /* + * The server or network may not be ready, so try several + * times. Stop after a few tries in case the client wants + * to fall back to other boot methods. + */ + timeout = NFSROOT_TIMEOUT_MIN; + for (try = 1; ; try++) { + err = do_mount_root(root_dev, "nfs", + root_mountflags, root_data); + if (err == 0) + return 1; + if (try > NFSROOT_RETRY_MAX) + break; + + /* Wait, in case the server refused us immediately */ + ssleep(timeout); + timeout <<= 1; + if (timeout > NFSROOT_TIMEOUT_MAX) + timeout = NFSROOT_TIMEOUT_MAX; + } + return 0; +} +#endif + +#if defined(CONFIG_BLK_DEV_RAM) || defined(CONFIG_BLK_DEV_FD) +void __init change_floppy(char *fmt, ...) +{ + struct termios termios; + char buf[80]; + char c; + int fd; + va_list args; + va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); + fd = ksys_open("/dev/root", O_RDWR | O_NDELAY, 0); + if (fd >= 0) { + ksys_ioctl(fd, FDEJECT, 0); + ksys_close(fd); + } + printk(KERN_NOTICE "VFS: Insert %s and press ENTER\n", buf); + fd = ksys_open("/dev/console", O_RDWR, 0); + if (fd >= 0) { + ksys_ioctl(fd, TCGETS, (long)&termios); + termios.c_lflag &= ~ICANON; + ksys_ioctl(fd, TCSETSF, (long)&termios); + ksys_read(fd, &c, 1); + termios.c_lflag |= ICANON; + ksys_ioctl(fd, TCSETSF, (long)&termios); + ksys_close(fd); + } +} +#endif + +void __init mount_root(void) +{ +#ifdef CONFIG_ROOT_NFS + if (ROOT_DEV == Root_NFS) { + if (mount_nfs_root()) + return; + + printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n"); + ROOT_DEV = Root_FD0; + } +#endif +#ifdef CONFIG_BLK_DEV_FD + if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) { + /* rd_doload is 2 for a dual initrd/ramload setup */ + if (rd_doload==2) { + if (rd_load_disk(1)) { + ROOT_DEV = Root_RAM1; + root_device_name = NULL; + } + } else + change_floppy("root floppy"); + } +#endif +#ifdef CONFIG_BLOCK + { + int err = create_dev("/dev/root", ROOT_DEV); + + if (err < 0) + pr_emerg("Failed to create /dev/root: %d\n", err); + mount_block_root("/dev/root", root_mountflags); + } +#endif +} + +/* + * Prepare the namespace - decide what/where to mount, load ramdisks, etc. + */ +void __init prepare_namespace(void) +{ + int is_floppy; + + if (root_delay) { + printk(KERN_INFO "Waiting %d sec before mounting root device...\n", + root_delay); + ssleep(root_delay); + } + + /* + * wait for the known devices to complete their probing + * + * Note: this is a potential source of long boot delays. + * For example, it is not atypical to wait 5 seconds here + * for the touchpad of a laptop to initialize. + */ + wait_for_device_probe(); + + md_run_setup(); + + if (saved_root_name[0]) { + root_device_name = saved_root_name; + if (!strncmp(root_device_name, "mtd", 3) || + !strncmp(root_device_name, "ubi", 3)) { + mount_block_root(root_device_name, root_mountflags); + goto out; + } + ROOT_DEV = name_to_dev_t(root_device_name); + if (strncmp(root_device_name, "/dev/", 5) == 0) + root_device_name += 5; + } + + if (initrd_load()) + goto out; + + /* wait for any asynchronous scanning to complete */ + if ((ROOT_DEV == 0) && root_wait) { + printk(KERN_INFO "Waiting for root device %s...\n", + saved_root_name); + while (driver_probe_done() != 0 || + (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) + msleep(5); + async_synchronize_full(); + } + + is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR; + + if (is_floppy && rd_doload && rd_load_disk(0)) + ROOT_DEV = Root_RAM0; + + mount_root(); +out: + devtmpfs_mount("dev"); + ksys_mount(".", "/", NULL, MS_MOVE, NULL); + ksys_chroot("."); +} + +static bool is_tmpfs; +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static unsigned long once; + void *fill = ramfs_fill_super; + + if (test_and_set_bit(0, &once)) + return ERR_PTR(-ENODEV); + + if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs) + fill = shmem_fill_super; + + return mount_nodev(fs_type, flags, data, fill); +} + +static struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .mount = rootfs_mount, + .kill_sb = kill_litter_super, +}; + +int __init init_rootfs(void) +{ + int err = register_filesystem(&rootfs_fs_type); + + if (err) + return err; + + if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] && + (!root_fs_names || strstr(root_fs_names, "tmpfs"))) { + err = shmem_init(); + is_tmpfs = true; + } else { + err = init_ramfs_fs(); + } + + if (err) + unregister_filesystem(&rootfs_fs_type); + + return err; +} diff --git a/init/do_mounts.h b/init/do_mounts.h new file mode 100644 index 000000000..0bb0806de --- /dev/null +++ b/init/do_mounts.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/major.h> +#include <linux/root_dev.h> + +void change_floppy(char *fmt, ...); +void mount_block_root(char *name, int flags); +void mount_root(void); +extern int root_mountflags; + +static inline int create_dev(char *name, dev_t dev) +{ + ksys_unlink(name); + return ksys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); +} + +static inline u32 bstat(char *name) +{ + struct kstat stat; + if (vfs_stat(name, &stat) != 0) + return 0; + if (!S_ISBLK(stat.mode)) + return 0; + return stat.rdev; +} + +#ifdef CONFIG_BLK_DEV_RAM + +int __init rd_load_disk(int n); +int __init rd_load_image(char *from); + +#else + +static inline int rd_load_disk(int n) { return 0; } +static inline int rd_load_image(char *from) { return 0; } + +#endif + +#ifdef CONFIG_BLK_DEV_INITRD + +bool __init initrd_load(void); + +#else + +static inline bool initrd_load(void) { return false; } + +#endif + +#ifdef CONFIG_BLK_DEV_MD + +void md_run_setup(void); + +#else + +static inline void md_run_setup(void) {} + +#endif diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c new file mode 100644 index 000000000..d1a5d885c --- /dev/null +++ b/init/do_mounts_initrd.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/unistd.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/minix_fs.h> +#include <linux/romfs_fs.h> +#include <linux/initrd.h> +#include <linux/sched.h> +#include <linux/freezer.h> +#include <linux/kmod.h> + +#include "do_mounts.h" + +unsigned long initrd_start, initrd_end; +int initrd_below_start_ok; +unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ +static int __initdata mount_initrd = 1; + +static int __init no_initrd(char *str) +{ + mount_initrd = 0; + return 1; +} + +__setup("noinitrd", no_initrd); + +static int init_linuxrc(struct subprocess_info *info, struct cred *new) +{ + ksys_unshare(CLONE_FS | CLONE_FILES); + /* stdin/stdout/stderr for /linuxrc */ + ksys_open("/dev/console", O_RDWR, 0); + ksys_dup(0); + ksys_dup(0); + /* move initrd over / and chdir/chroot in initrd root */ + ksys_chdir("/root"); + ksys_mount(".", "/", NULL, MS_MOVE, NULL); + ksys_chroot("."); + ksys_setsid(); + return 0; +} + +static void __init handle_initrd(void) +{ + struct subprocess_info *info; + static char *argv[] = { "linuxrc", NULL, }; + extern char *envp_init[]; + int error; + + real_root_dev = new_encode_dev(ROOT_DEV); + create_dev("/dev/root.old", Root_RAM0); + /* mount initrd on rootfs' /root */ + mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); + ksys_mkdir("/old", 0700); + ksys_chdir("/old"); + + /* try loading default modules from initrd */ + load_default_modules(); + + /* + * In case that a resume from disk is carried out by linuxrc or one of + * its children, we need to tell the freezer not to wait for us. + */ + current->flags |= PF_FREEZER_SKIP; + + info = call_usermodehelper_setup("/linuxrc", argv, envp_init, + GFP_KERNEL, init_linuxrc, NULL, NULL); + if (!info) + return; + call_usermodehelper_exec(info, UMH_WAIT_PROC); + + current->flags &= ~PF_FREEZER_SKIP; + + /* move initrd to rootfs' /old */ + ksys_mount("..", ".", NULL, MS_MOVE, NULL); + /* switch root and cwd back to / of rootfs */ + ksys_chroot(".."); + + if (new_decode_dev(real_root_dev) == Root_RAM0) { + ksys_chdir("/old"); + return; + } + + ksys_chdir("/"); + ROOT_DEV = new_decode_dev(real_root_dev); + mount_root(); + + printk(KERN_NOTICE "Trying to move old root to /initrd ... "); + error = ksys_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); + if (!error) + printk("okay\n"); + else { + int fd = ksys_open("/dev/root.old", O_RDWR, 0); + if (error == -ENOENT) + printk("/initrd does not exist. Ignored.\n"); + else + printk("failed\n"); + printk(KERN_NOTICE "Unmounting old root\n"); + ksys_umount("/old", MNT_DETACH); + printk(KERN_NOTICE "Trying to free ramdisk memory ... "); + if (fd < 0) { + error = fd; + } else { + error = ksys_ioctl(fd, BLKFLSBUF, 0); + ksys_close(fd); + } + printk(!error ? "okay\n" : "failed\n"); + } +} + +bool __init initrd_load(void) +{ + if (mount_initrd) { + create_dev("/dev/ram", Root_RAM0); + /* + * Load the initrd data into /dev/ram0. Execute it as initrd + * unless /dev/ram0 is supposed to be our actual root device, + * in that case the ram disk is just set up here, and gets + * mounted in the normal path. + */ + if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { + ksys_unlink("/initrd.image"); + handle_initrd(); + return true; + } + } + ksys_unlink("/initrd.image"); + return false; +} diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c new file mode 100644 index 000000000..b84031528 --- /dev/null +++ b/init/do_mounts_md.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/delay.h> +#include <linux/raid/md_u.h> +#include <linux/raid/md_p.h> + +#include "do_mounts.h" + +/* + * When md (and any require personalities) are compiled into the kernel + * (not a module), arrays can be assembles are boot time using with AUTODETECT + * where specially marked partitions are registered with md_autodetect_dev(), + * and with MD_BOOT where devices to be collected are given on the boot line + * with md=..... + * The code for that is here. + */ + +#ifdef CONFIG_MD_AUTODETECT +static int __initdata raid_noautodetect; +#else +static int __initdata raid_noautodetect=1; +#endif +static int __initdata raid_autopart; + +static struct { + int minor; + int partitioned; + int level; + int chunk; + char *device_names; +} md_setup_args[256] __initdata; + +static int md_setup_ents __initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistent-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + * 2001-06-03: Dave Cinege <dcinege@psychosis.com> + * Shifted name_to_kdev_t() and related operations to md_set_drive() + * for later execution. Rewrote section to make devfs compatible. + */ +static int __init md_setup(char *str) +{ + int minor, level, factor, fault, partitioned = 0; + char *pername = ""; + char *str1; + int ent; + + if (*str == 'd') { + partitioned = 1; + str++; + } + if (get_option(&str, &minor) != 2) { /* MD Number */ + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + str1 = str; + for (ent=0 ; ent< md_setup_ents ; ent++) + if (md_setup_args[ent].minor == minor && + md_setup_args[ent].partitioned == partitioned) { + printk(KERN_WARNING "md: md=%s%d, Specified more than once. " + "Replacing previous definition.\n", partitioned?"d":"", minor); + break; + } + if (ent >= ARRAY_SIZE(md_setup_args)) { + printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor); + return 0; + } + if (ent >= md_setup_ents) + md_setup_ents++; + switch (get_option(&str, &level)) { /* RAID level */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == LEVEL_LINEAR) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args[ent].level = level; + md_setup_args[ent].chunk = 1 << (factor+12); + if (level == LEVEL_LINEAR) + pername = "linear"; + else + pername = "raid0"; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args[ent].level = LEVEL_NONE; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args[ent].device_names = str; + md_setup_args[ent].partitioned = partitioned; + md_setup_args[ent].minor = minor; + + return 1; +} + +static void __init md_setup_drive(void) +{ + int minor, i, ent, partitioned; + dev_t dev; + dev_t devices[MD_SB_DISKS+1]; + + for (ent = 0; ent < md_setup_ents ; ent++) { + int fd; + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + char name[16]; + + minor = md_setup_args[ent].minor; + partitioned = md_setup_args[ent].partitioned; + devname = md_setup_args[ent].device_names; + + sprintf(name, "/dev/md%s%d", partitioned?"_d":"", minor); + if (partitioned) + dev = MKDEV(mdp_major, minor << MdpMinorShift); + else + dev = MKDEV(MD_MAJOR, minor); + create_dev(name, dev); + for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) { + char *p; + char comp_name[64]; + u32 rdev; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_dev_t(devname); + if (strncmp(devname, "/dev/", 5) == 0) + devname += 5; + snprintf(comp_name, 63, "/dev/%s", devname); + rdev = bstat(comp_name); + if (rdev) + dev = new_decode_dev(rdev); + if (!dev) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + + devname = p; + } + devices[i] = 0; + + if (!i) + continue; + + printk(KERN_INFO "md: Loading md%s%d: %s\n", + partitioned ? "_d" : "", minor, + md_setup_args[ent].device_names); + + fd = ksys_open(name, 0, 0); + if (fd < 0) { + printk(KERN_ERR "md: open failed - cannot start " + "array %s\n", name); + continue; + } + if (ksys_ioctl(fd, SET_ARRAY_INFO, 0) == -EBUSY) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n", + minor); + ksys_close(fd); + continue; + } + + if (md_setup_args[ent].level != LEVEL_NONE) { + /* non-persistent */ + mdu_array_info_t ainfo; + ainfo.level = md_setup_args[ent].level; + ainfo.size = 0; + ainfo.nr_disks =0; + ainfo.raid_disks =0; + while (devices[ainfo.raid_disks]) + ainfo.raid_disks++; + ainfo.md_minor =minor; + ainfo.not_persistent = 1; + + ainfo.state = (1 << MD_SB_CLEAN); + ainfo.layout = 0; + ainfo.chunk_size = md_setup_args[ent].chunk; + err = ksys_ioctl(fd, SET_ARRAY_INFO, (long)&ainfo); + for (i = 0; !err && i <= MD_SB_DISKS; i++) { + dev = devices[i]; + if (!dev) + break; + dinfo.number = i; + dinfo.raid_disk = i; + dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC); + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + err = ksys_ioctl(fd, ADD_NEW_DISK, + (long)&dinfo); + } + } else { + /* persistent */ + for (i = 0; i <= MD_SB_DISKS; i++) { + dev = devices[i]; + if (!dev) + break; + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + ksys_ioctl(fd, ADD_NEW_DISK, (long)&dinfo); + } + } + if (!err) + err = ksys_ioctl(fd, RUN_ARRAY, 0); + if (err) + printk(KERN_WARNING "md: starting md%d failed\n", minor); + else { + /* reread the partition table. + * I (neilb) and not sure why this is needed, but I cannot + * boot a kernel with devfs compiled in from partitioned md + * array without it + */ + ksys_close(fd); + fd = ksys_open(name, 0, 0); + ksys_ioctl(fd, BLKRRPART, 0); + } + ksys_close(fd); + } +} + +static int __init raid_setup(char *str) +{ + int len, pos; + + len = strlen(str) + 1; + pos = 0; + + while (pos < len) { + char *comma = strchr(str+pos, ','); + int wlen; + if (comma) + wlen = (comma-str)-pos; + else wlen = (len-1)-pos; + + if (!strncmp(str, "noautodetect", wlen)) + raid_noautodetect = 1; + if (!strncmp(str, "autodetect", wlen)) + raid_noautodetect = 0; + if (strncmp(str, "partitionable", wlen)==0) + raid_autopart = 1; + if (strncmp(str, "part", wlen)==0) + raid_autopart = 1; + pos += wlen+1; + } + return 1; +} + +__setup("raid=", raid_setup); +__setup("md=", md_setup); + +static void __init autodetect_raid(void) +{ + int fd; + + /* + * Since we don't want to detect and use half a raid array, we need to + * wait for the known devices to complete their probing + */ + printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n"); + printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n"); + + wait_for_device_probe(); + + fd = ksys_open("/dev/md0", 0, 0); + if (fd >= 0) { + ksys_ioctl(fd, RAID_AUTORUN, raid_autopart); + ksys_close(fd); + } +} + +void __init md_run_setup(void) +{ + create_dev("/dev/md0", MKDEV(MD_MAJOR, 0)); + + if (raid_noautodetect) + printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n"); + else + autodetect_raid(); + md_setup_drive(); +} diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c new file mode 100644 index 000000000..32fb049d1 --- /dev/null +++ b/init/do_mounts_rd.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/minix_fs.h> +#include <linux/ext2_fs.h> +#include <linux/romfs_fs.h> +#include <uapi/linux/cramfs_fs.h> +#include <linux/initrd.h> +#include <linux/string.h> +#include <linux/slab.h> + +#include "do_mounts.h" +#include "../fs/squashfs/squashfs_fs.h" + +#include <linux/decompress/generic.h> + + +int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */ + +static int __init prompt_ramdisk(char *str) +{ + rd_prompt = simple_strtol(str,NULL,0) & 1; + return 1; +} +__setup("prompt_ramdisk=", prompt_ramdisk); + +int __initdata rd_image_start; /* starting block # of image */ + +static int __init ramdisk_start_setup(char *str) +{ + rd_image_start = simple_strtol(str,NULL,0); + return 1; +} +__setup("ramdisk_start=", ramdisk_start_setup); + +static int __init crd_load(int in_fd, int out_fd, decompress_fn deco); + +/* + * This routine tries to find a RAM disk image to load, and returns the + * number of blocks to read for a non-compressed image, 0 if the image + * is a compressed image, and -1 if an image with the right magic + * numbers could not be found. + * + * We currently check for the following magic numbers: + * minix + * ext2 + * romfs + * cramfs + * squashfs + * gzip + * bzip2 + * lzma + * xz + * lzo + * lz4 + */ +static int __init +identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) +{ + const int size = 512; + struct minix_super_block *minixsb; + struct romfs_super_block *romfsb; + struct cramfs_super *cramfsb; + struct squashfs_super_block *squashfsb; + int nblocks = -1; + unsigned char *buf; + const char *compress_name; + unsigned long n; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + minixsb = (struct minix_super_block *) buf; + romfsb = (struct romfs_super_block *) buf; + cramfsb = (struct cramfs_super *) buf; + squashfsb = (struct squashfs_super_block *) buf; + memset(buf, 0xe5, size); + + /* + * Read block 0 to test for compressed kernel + */ + ksys_lseek(fd, start_block * BLOCK_SIZE, 0); + ksys_read(fd, buf, size); + + *decompressor = decompress_method(buf, size, &compress_name); + if (compress_name) { + printk(KERN_NOTICE "RAMDISK: %s image found at block %d\n", + compress_name, start_block); + if (!*decompressor) + printk(KERN_EMERG + "RAMDISK: %s decompressor not configured!\n", + compress_name); + nblocks = 0; + goto done; + } + + /* romfs is at block zero too */ + if (romfsb->word0 == ROMSB_WORD0 && + romfsb->word1 == ROMSB_WORD1) { + printk(KERN_NOTICE + "RAMDISK: romfs filesystem found at block %d\n", + start_block); + nblocks = (ntohl(romfsb->size)+BLOCK_SIZE-1)>>BLOCK_SIZE_BITS; + goto done; + } + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + + /* squashfs is at block zero too */ + if (le32_to_cpu(squashfsb->s_magic) == SQUASHFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: squashfs filesystem found at block %d\n", + start_block); + nblocks = (le64_to_cpu(squashfsb->bytes_used) + BLOCK_SIZE - 1) + >> BLOCK_SIZE_BITS; + goto done; + } + + /* + * Read 512 bytes further to check if cramfs is padded + */ + ksys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0); + ksys_read(fd, buf, size); + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + + /* + * Read block 1 to test for minix and ext2 superblock + */ + ksys_lseek(fd, (start_block+1) * BLOCK_SIZE, 0); + ksys_read(fd, buf, size); + + /* Try minix */ + if (minixsb->s_magic == MINIX_SUPER_MAGIC || + minixsb->s_magic == MINIX_SUPER_MAGIC2) { + printk(KERN_NOTICE + "RAMDISK: Minix filesystem found at block %d\n", + start_block); + nblocks = minixsb->s_nzones << minixsb->s_log_zone_size; + goto done; + } + + /* Try ext2 */ + n = ext2_image_size(buf); + if (n) { + printk(KERN_NOTICE + "RAMDISK: ext2 filesystem found at block %d\n", + start_block); + nblocks = n; + goto done; + } + + printk(KERN_NOTICE + "RAMDISK: Couldn't find valid RAM disk image starting at %d.\n", + start_block); + +done: + ksys_lseek(fd, start_block * BLOCK_SIZE, 0); + kfree(buf); + return nblocks; +} + +int __init rd_load_image(char *from) +{ + int res = 0; + int in_fd, out_fd; + unsigned long rd_blocks, devblocks; + int nblocks, i, disk; + char *buf = NULL; + unsigned short rotate = 0; + decompress_fn decompressor = NULL; +#if !defined(CONFIG_S390) + char rotator[4] = { '|' , '/' , '-' , '\\' }; +#endif + + out_fd = ksys_open("/dev/ram", O_RDWR, 0); + if (out_fd < 0) + goto out; + + in_fd = ksys_open(from, O_RDONLY, 0); + if (in_fd < 0) + goto noclose_input; + + nblocks = identify_ramdisk_image(in_fd, rd_image_start, &decompressor); + if (nblocks < 0) + goto done; + + if (nblocks == 0) { + if (crd_load(in_fd, out_fd, decompressor) == 0) + goto successful_load; + goto done; + } + + /* + * NOTE NOTE: nblocks is not actually blocks but + * the number of kibibytes of data to load into a ramdisk. + */ + if (ksys_ioctl(out_fd, BLKGETSIZE, (unsigned long)&rd_blocks) < 0) + rd_blocks = 0; + else + rd_blocks >>= 1; + + if (nblocks > rd_blocks) { + printk("RAMDISK: image too big! (%dKiB/%ldKiB)\n", + nblocks, rd_blocks); + goto done; + } + + /* + * OK, time to copy in the data + */ + if (ksys_ioctl(in_fd, BLKGETSIZE, (unsigned long)&devblocks) < 0) + devblocks = 0; + else + devblocks >>= 1; + + if (strcmp(from, "/initrd.image") == 0) + devblocks = nblocks; + + if (devblocks == 0) { + printk(KERN_ERR "RAMDISK: could not determine device size\n"); + goto done; + } + + buf = kmalloc(BLOCK_SIZE, GFP_KERNEL); + if (!buf) { + printk(KERN_ERR "RAMDISK: could not allocate buffer\n"); + goto done; + } + + printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", + nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); + for (i = 0, disk = 1; i < nblocks; i++) { + if (i && (i % devblocks == 0)) { + pr_cont("done disk #%d.\n", disk++); + rotate = 0; + if (ksys_close(in_fd)) { + printk("Error closing the disk.\n"); + goto noclose_input; + } + change_floppy("disk #%d", disk); + in_fd = ksys_open(from, O_RDONLY, 0); + if (in_fd < 0) { + printk("Error opening disk.\n"); + goto noclose_input; + } + printk("Loading disk #%d... ", disk); + } + ksys_read(in_fd, buf, BLOCK_SIZE); + ksys_write(out_fd, buf, BLOCK_SIZE); +#if !defined(CONFIG_S390) + if (!(i % 16)) { + pr_cont("%c\b", rotator[rotate & 0x3]); + rotate++; + } +#endif + } + pr_cont("done.\n"); + +successful_load: + res = 1; +done: + ksys_close(in_fd); +noclose_input: + ksys_close(out_fd); +out: + kfree(buf); + ksys_unlink("/dev/ram"); + return res; +} + +int __init rd_load_disk(int n) +{ + if (rd_prompt) + change_floppy("root floppy disk to be loaded into RAM disk"); + create_dev("/dev/root", ROOT_DEV); + create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, n)); + return rd_load_image("/dev/root"); +} + +static int exit_code; +static int decompress_error; +static int crd_infd, crd_outfd; + +static long __init compr_fill(void *buf, unsigned long len) +{ + long r = ksys_read(crd_infd, buf, len); + if (r < 0) + printk(KERN_ERR "RAMDISK: error while reading compressed data"); + else if (r == 0) + printk(KERN_ERR "RAMDISK: EOF while reading compressed data"); + return r; +} + +static long __init compr_flush(void *window, unsigned long outcnt) +{ + long written = ksys_write(crd_outfd, window, outcnt); + if (written != outcnt) { + if (decompress_error == 0) + printk(KERN_ERR + "RAMDISK: incomplete write (%ld != %ld)\n", + written, outcnt); + decompress_error = 1; + return -1; + } + return outcnt; +} + +static void __init error(char *x) +{ + printk(KERN_ERR "%s\n", x); + exit_code = 1; + decompress_error = 1; +} + +static int __init crd_load(int in_fd, int out_fd, decompress_fn deco) +{ + int result; + crd_infd = in_fd; + crd_outfd = out_fd; + + if (!deco) { + pr_emerg("Invalid ramdisk decompression routine. " + "Select appropriate config option.\n"); + panic("Could not decompress initial ramdisk image."); + } + + result = deco(NULL, 0, compr_fill, compr_flush, NULL, NULL, error); + if (decompress_error) + result = 1; + return result; +} diff --git a/init/init_task.c b/init/init_task.c new file mode 100644 index 000000000..994ffe018 --- /dev/null +++ b/init/init_task.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init_task.h> +#include <linux/export.h> +#include <linux/mqueue.h> +#include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/sched/task.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/audit.h> + +#include <asm/pgtable.h> +#include <linux/uaccess.h> + +static struct signal_struct init_signals = { + .nr_threads = 1, + .thread_head = LIST_HEAD_INIT(init_task.thread_node), + .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit), + .shared_pending = { + .list = LIST_HEAD_INIT(init_signals.shared_pending.list), + .signal = {{0}} + }, + .multiprocess = HLIST_HEAD_INIT, + .rlim = INIT_RLIMITS, + .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), +#ifdef CONFIG_POSIX_TIMERS + .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), + .cputimer = { + .cputime_atomic = INIT_CPUTIME_ATOMIC, + .running = false, + .checking_timer = false, + }, +#endif + INIT_CPU_TIMERS(init_signals) + .pids = { + [PIDTYPE_PID] = &init_struct_pid, + [PIDTYPE_TGID] = &init_struct_pid, + [PIDTYPE_PGID] = &init_struct_pid, + [PIDTYPE_SID] = &init_struct_pid, + }, + INIT_PREV_CPUTIME(init_signals) +}; + +static struct sighand_struct init_sighand = { + .count = ATOMIC_INIT(1), + .action = { { { .sa_handler = SIG_DFL, } }, }, + .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock), + .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), +}; + +/* + * Set up the first task table, touch at your own risk!. Base=0, + * limit=0x1fffff (=2MB) + */ +struct task_struct init_task +#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK + __init_task_data +#endif += { +#ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = ATOMIC_INIT(1), +#endif + .state = 0, + .stack = init_stack, + .usage = ATOMIC_INIT(2), + .flags = PF_KTHREAD, + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, + .policy = SCHED_NORMAL, + .cpus_allowed = CPU_MASK_ALL, + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, + .restart_block = { + .fn = do_no_restart_syscall, + }, + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, + .rt = { + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, + .tasks = LIST_HEAD_INIT(init_task.tasks), +#ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +#endif +#ifdef CONFIG_CGROUP_SCHED + .sched_task_group = &root_task_group, +#endif + .ptraced = LIST_HEAD_INIT(init_task.ptraced), + .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), + .real_parent = &init_task, + .parent = &init_task, + .children = LIST_HEAD_INIT(init_task.children), + .sibling = LIST_HEAD_INIT(init_task.sibling), + .group_leader = &init_task, + RCU_POINTER_INITIALIZER(real_cred, &init_cred), + RCU_POINTER_INITIALIZER(cred, &init_cred), + .comm = INIT_TASK_COMM, + .thread = INIT_THREAD, + .fs = &init_fs, + .files = &init_files, + .signal = &init_signals, + .sighand = &init_sighand, + .nsproxy = &init_nsproxy, + .pending = { + .list = LIST_HEAD_INIT(init_task.pending.list), + .signal = {{0}} + }, + .blocked = {{0}}, + .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), + .journal_info = NULL, + INIT_CPU_TIMERS(init_task) + .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), + .timer_slack_ns = 50000, /* 50 usec default slack */ + .thread_pid = &init_struct_pid, + .thread_group = LIST_HEAD_INIT(init_task.thread_group), + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), +#ifdef CONFIG_AUDITSYSCALL + .loginuid = INVALID_UID, + .sessionid = AUDIT_SID_UNSET, +#endif +#ifdef CONFIG_PERF_EVENTS + .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), + .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), +#endif +#ifdef CONFIG_PREEMPT_RCU + .rcu_read_lock_nesting = 0, + .rcu_read_unlock_special.s = 0, + .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), + .rcu_blocked_node = NULL, +#endif +#ifdef CONFIG_TASKS_RCU + .rcu_tasks_holdout = false, + .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), + .rcu_tasks_idle_cpu = -1, +#endif +#ifdef CONFIG_CPUSETS + .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), +#endif +#ifdef CONFIG_RT_MUTEXES + .pi_waiters = RB_ROOT_CACHED, + .pi_top_task = NULL, +#endif + INIT_PREV_CPUTIME(init_task) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), + .vtime.starttime = 0, + .vtime.state = VTIME_SYS, +#endif +#ifdef CONFIG_NUMA_BALANCING + .numa_preferred_nid = -1, + .numa_group = NULL, + .numa_faults = NULL, +#endif +#ifdef CONFIG_KASAN + .kasan_depth = 1, +#endif +#ifdef CONFIG_TRACE_IRQFLAGS + .softirqs_enabled = 1, +#endif +#ifdef CONFIG_LOCKDEP + .lockdep_recursion = 0, +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + .ret_stack = NULL, + .tracing_graph_pause = ATOMIC_INIT(0), +#endif +#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) + .trace_recursion = 0, +#endif +#ifdef CONFIG_LIVEPATCH + .patch_state = KLP_UNDEFINED, +#endif +#ifdef CONFIG_SECURITY + .security = NULL, +#endif +}; +EXPORT_SYMBOL(init_task); + +/* + * Initial thread structure. Alignment of this is handled by a special + * linker map entry. + */ +#ifndef CONFIG_THREAD_INFO_IN_TASK +struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task); +#endif diff --git a/init/initramfs.c b/init/initramfs.c new file mode 100644 index 000000000..dab8d6345 --- /dev/null +++ b/init/initramfs.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/dirent.h> +#include <linux/syscalls.h> +#include <linux/utime.h> +#include <linux/file.h> + +static ssize_t __init xwrite(int fd, const char *p, size_t count) +{ + ssize_t out = 0; + + /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */ + while (count) { + ssize_t rv = ksys_write(fd, p, count); + + if (rv < 0) { + if (rv == -EINTR || rv == -EAGAIN) + continue; + return out ? out : rv; + } else if (rv == 0) + break; + + p += rv; + out += rv; + count -= rv; + } + + return out; +} + +static __initdata char *message; +static void __init error(char *x) +{ + if (!message) + message = x; +} + +/* link hash */ + +#define N_ALIGN(len) ((((len) + 1) & ~3) + 2) + +static __initdata struct hash { + int ino, minor, major; + umode_t mode; + struct hash *next; + char name[N_ALIGN(PATH_MAX)]; +} *head[32]; + +static inline int hash(int major, int minor, int ino) +{ + unsigned long tmp = ino + minor + (major << 3); + tmp += tmp >> 5; + return tmp & 31; +} + +static char __init *find_link(int major, int minor, int ino, + umode_t mode, char *name) +{ + struct hash **p, *q; + for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { + if ((*p)->ino != ino) + continue; + if ((*p)->minor != minor) + continue; + if ((*p)->major != major) + continue; + if (((*p)->mode ^ mode) & S_IFMT) + continue; + return (*p)->name; + } + q = kmalloc(sizeof(struct hash), GFP_KERNEL); + if (!q) + panic("can't allocate link hash entry"); + q->major = major; + q->minor = minor; + q->ino = ino; + q->mode = mode; + strcpy(q->name, name); + q->next = NULL; + *p = q; + return NULL; +} + +static void __init free_hash(void) +{ + struct hash **p, *q; + for (p = head; p < head + 32; p++) { + while (*p) { + q = *p; + *p = q->next; + kfree(q); + } + } +} + +static long __init do_utime(char *filename, time64_t mtime) +{ + struct timespec64 t[2]; + + t[0].tv_sec = mtime; + t[0].tv_nsec = 0; + t[1].tv_sec = mtime; + t[1].tv_nsec = 0; + + return do_utimes(AT_FDCWD, filename, t, AT_SYMLINK_NOFOLLOW); +} + +static __initdata LIST_HEAD(dir_list); +struct dir_entry { + struct list_head list; + char *name; + time64_t mtime; +}; + +static void __init dir_add(const char *name, time64_t mtime) +{ + struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + if (!de) + panic("can't allocate dir_entry buffer"); + INIT_LIST_HEAD(&de->list); + de->name = kstrdup(name, GFP_KERNEL); + de->mtime = mtime; + list_add(&de->list, &dir_list); +} + +static void __init dir_utime(void) +{ + struct dir_entry *de, *tmp; + list_for_each_entry_safe(de, tmp, &dir_list, list) { + list_del(&de->list); + do_utime(de->name, de->mtime); + kfree(de->name); + kfree(de); + } +} + +static __initdata time64_t mtime; + +/* cpio header parsing */ + +static __initdata unsigned long ino, major, minor, nlink; +static __initdata umode_t mode; +static __initdata unsigned long body_len, name_len; +static __initdata uid_t uid; +static __initdata gid_t gid; +static __initdata unsigned rdev; + +static void __init parse_header(char *s) +{ + unsigned long parsed[12]; + char buf[9]; + int i; + + buf[8] = '\0'; + for (i = 0, s += 6; i < 12; i++, s += 8) { + memcpy(buf, s, 8); + parsed[i] = simple_strtoul(buf, NULL, 16); + } + ino = parsed[0]; + mode = parsed[1]; + uid = parsed[2]; + gid = parsed[3]; + nlink = parsed[4]; + mtime = parsed[5]; /* breaks in y2106 */ + body_len = parsed[6]; + major = parsed[7]; + minor = parsed[8]; + rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); + name_len = parsed[11]; +} + +/* FSM */ + +static __initdata enum state { + Start, + Collect, + GotHeader, + SkipIt, + GotName, + CopyFile, + GotSymlink, + Reset +} state, next_state; + +static __initdata char *victim; +static unsigned long byte_count __initdata; +static __initdata loff_t this_header, next_header; + +static inline void __init eat(unsigned n) +{ + victim += n; + this_header += n; + byte_count -= n; +} + +static __initdata char *vcollected; +static __initdata char *collected; +static long remains __initdata; +static __initdata char *collect; + +static void __init read_into(char *buf, unsigned size, enum state next) +{ + if (byte_count >= size) { + collected = victim; + eat(size); + state = next; + } else { + collect = collected = buf; + remains = size; + next_state = next; + state = Collect; + } +} + +static __initdata char *header_buf, *symlink_buf, *name_buf; + +static int __init do_start(void) +{ + read_into(header_buf, 110, GotHeader); + return 0; +} + +static int __init do_collect(void) +{ + unsigned long n = remains; + if (byte_count < n) + n = byte_count; + memcpy(collect, victim, n); + eat(n); + collect += n; + if ((remains -= n) != 0) + return 1; + state = next_state; + return 0; +} + +static int __init do_header(void) +{ + if (memcmp(collected, "070707", 6)==0) { + error("incorrect cpio method used: use -H newc option"); + return 1; + } + if (memcmp(collected, "070701", 6)) { + error("no cpio magic"); + return 1; + } + parse_header(collected); + next_header = this_header + N_ALIGN(name_len) + body_len; + next_header = (next_header + 3) & ~3; + state = SkipIt; + if (name_len <= 0 || name_len > PATH_MAX) + return 0; + if (S_ISLNK(mode)) { + if (body_len > PATH_MAX) + return 0; + collect = collected = symlink_buf; + remains = N_ALIGN(name_len) + body_len; + next_state = GotSymlink; + state = Collect; + return 0; + } + if (S_ISREG(mode) || !body_len) + read_into(name_buf, N_ALIGN(name_len), GotName); + return 0; +} + +static int __init do_skip(void) +{ + if (this_header + byte_count < next_header) { + eat(byte_count); + return 1; + } else { + eat(next_header - this_header); + state = next_state; + return 0; + } +} + +static int __init do_reset(void) +{ + while (byte_count && *victim == '\0') + eat(1); + if (byte_count && (this_header & 3)) + error("broken padding"); + return 1; +} + +static void __init clean_path(char *path, umode_t fmode) +{ + struct kstat st; + + if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) { + if (S_ISDIR(st.mode)) + ksys_rmdir(path); + else + ksys_unlink(path); + } +} + +static int __init maybe_link(void) +{ + if (nlink >= 2) { + char *old = find_link(major, minor, ino, mode, collected); + if (old) { + clean_path(collected, 0); + return (ksys_link(old, collected) < 0) ? -1 : 1; + } + } + return 0; +} + +static __initdata int wfd; + +static int __init do_name(void) +{ + state = SkipIt; + next_state = Reset; + if (strcmp(collected, "TRAILER!!!") == 0) { + free_hash(); + return 0; + } + clean_path(collected, mode); + if (S_ISREG(mode)) { + int ml = maybe_link(); + if (ml >= 0) { + int openflags = O_WRONLY|O_CREAT; + if (ml != 1) + openflags |= O_TRUNC; + wfd = ksys_open(collected, openflags, mode); + + if (wfd >= 0) { + ksys_fchown(wfd, uid, gid); + ksys_fchmod(wfd, mode); + if (body_len) + ksys_ftruncate(wfd, body_len); + vcollected = kstrdup(collected, GFP_KERNEL); + state = CopyFile; + } + } + } else if (S_ISDIR(mode)) { + ksys_mkdir(collected, mode); + ksys_chown(collected, uid, gid); + ksys_chmod(collected, mode); + dir_add(collected, mtime); + } else if (S_ISBLK(mode) || S_ISCHR(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + if (maybe_link() == 0) { + ksys_mknod(collected, mode, rdev); + ksys_chown(collected, uid, gid); + ksys_chmod(collected, mode); + do_utime(collected, mtime); + } + } + return 0; +} + +static int __init do_copy(void) +{ + if (byte_count >= body_len) { + if (xwrite(wfd, victim, body_len) != body_len) + error("write error"); + ksys_close(wfd); + do_utime(vcollected, mtime); + kfree(vcollected); + eat(body_len); + state = SkipIt; + return 0; + } else { + if (xwrite(wfd, victim, byte_count) != byte_count) + error("write error"); + body_len -= byte_count; + eat(byte_count); + return 1; + } +} + +static int __init do_symlink(void) +{ + collected[N_ALIGN(name_len) + body_len] = '\0'; + clean_path(collected, 0); + ksys_symlink(collected + N_ALIGN(name_len), collected); + ksys_lchown(collected, uid, gid); + do_utime(collected, mtime); + state = SkipIt; + next_state = Reset; + return 0; +} + +static __initdata int (*actions[])(void) = { + [Start] = do_start, + [Collect] = do_collect, + [GotHeader] = do_header, + [SkipIt] = do_skip, + [GotName] = do_name, + [CopyFile] = do_copy, + [GotSymlink] = do_symlink, + [Reset] = do_reset, +}; + +static long __init write_buffer(char *buf, unsigned long len) +{ + byte_count = len; + victim = buf; + + while (!actions[state]()) + ; + return len - byte_count; +} + +static long __init flush_buffer(void *bufv, unsigned long len) +{ + char *buf = (char *) bufv; + long written; + long origLen = len; + if (message) + return -1; + while ((written = write_buffer(buf, len)) < len && !message) { + char c = buf[written]; + if (c == '0') { + buf += written; + len -= written; + state = Start; + } else if (c == 0) { + buf += written; + len -= written; + state = Reset; + } else + error("junk in compressed archive"); + } + return origLen; +} + +static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ + +#include <linux/decompress/generic.h> + +static char * __init unpack_to_rootfs(char *buf, unsigned long len) +{ + long written; + decompress_fn decompress; + const char *compress_name; + static __initdata char msg_buf[64]; + + header_buf = kmalloc(110, GFP_KERNEL); + symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); + name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); + + if (!header_buf || !symlink_buf || !name_buf) + panic("can't allocate buffers"); + + state = Start; + this_header = 0; + message = NULL; + while (!message && len) { + loff_t saved_offset = this_header; + if (*buf == '0' && !(this_header & 3)) { + state = Start; + written = write_buffer(buf, len); + buf += written; + len -= written; + continue; + } + if (!*buf) { + buf++; + len--; + this_header++; + continue; + } + this_header = 0; + decompress = decompress_method(buf, len, &compress_name); + pr_debug("Detected %s compressed data\n", compress_name); + if (decompress) { + int res = decompress(buf, len, NULL, flush_buffer, NULL, + &my_inptr, error); + if (res) + error("decompressor failed"); + } else if (compress_name) { + if (!message) { + snprintf(msg_buf, sizeof msg_buf, + "compression method %s not configured", + compress_name); + message = msg_buf; + } + } else + error("junk in compressed archive"); + if (state != Reset) + error("junk in compressed archive"); + this_header = saved_offset + my_inptr; + buf += my_inptr; + len -= my_inptr; + } + dir_utime(); + kfree(name_buf); + kfree(symlink_buf); + kfree(header_buf); + return message; +} + +static int __initdata do_retain_initrd; + +static int __init retain_initrd_param(char *str) +{ + if (*str) + return 0; + do_retain_initrd = 1; + return 1; +} +__setup("retain_initrd", retain_initrd_param); + +extern char __initramfs_start[]; +extern unsigned long __initramfs_size; +#include <linux/initrd.h> +#include <linux/kexec.h> + +static void __init free_initrd(void) +{ +#ifdef CONFIG_KEXEC_CORE + unsigned long crashk_start = (unsigned long)__va(crashk_res.start); + unsigned long crashk_end = (unsigned long)__va(crashk_res.end); +#endif + if (do_retain_initrd || !initrd_start) + goto skip; + +#ifdef CONFIG_KEXEC_CORE + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (initrd_start < crashk_end && initrd_end > crashk_start) { + /* + * Initialize initrd memory region since the kexec boot does + * not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + } else +#endif + free_initrd_mem(initrd_start, initrd_end); +skip: + initrd_start = 0; + initrd_end = 0; +} + +#ifdef CONFIG_BLK_DEV_RAM +#define BUF_SIZE 1024 +static void __init clean_rootfs(void) +{ + int fd; + void *buf; + struct linux_dirent64 *dirp; + int num; + + fd = ksys_open("/", O_RDONLY, 0); + WARN_ON(fd < 0); + if (fd < 0) + return; + buf = kzalloc(BUF_SIZE, GFP_KERNEL); + WARN_ON(!buf); + if (!buf) { + ksys_close(fd); + return; + } + + dirp = buf; + num = ksys_getdents64(fd, dirp, BUF_SIZE); + while (num > 0) { + while (num > 0) { + struct kstat st; + int ret; + + ret = vfs_lstat(dirp->d_name, &st); + WARN_ON_ONCE(ret); + if (!ret) { + if (S_ISDIR(st.mode)) + ksys_rmdir(dirp->d_name); + else + ksys_unlink(dirp->d_name); + } + + num -= dirp->d_reclen; + dirp = (void *)dirp + dirp->d_reclen; + } + dirp = buf; + memset(buf, 0, BUF_SIZE); + num = ksys_getdents64(fd, dirp, BUF_SIZE); + } + + ksys_close(fd); + kfree(buf); +} +#endif + +static int __init populate_rootfs(void) +{ + /* Load the built in initramfs */ + char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); + if (err) + panic("%s", err); /* Failed to decompress INTERNAL initramfs */ + /* If available load the bootloader supplied initrd */ + if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { +#ifdef CONFIG_BLK_DEV_RAM + int fd; + printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); + err = unpack_to_rootfs((char *)initrd_start, + initrd_end - initrd_start); + if (!err) + goto done; + + clean_rootfs(); + unpack_to_rootfs(__initramfs_start, __initramfs_size); + + printk(KERN_INFO "rootfs image is not initramfs (%s)" + "; looks like an initrd\n", err); + fd = ksys_open("/initrd.image", + O_WRONLY|O_CREAT, 0700); + if (fd >= 0) { + ssize_t written = xwrite(fd, (char *)initrd_start, + initrd_end - initrd_start); + + if (written != initrd_end - initrd_start) + pr_err("/initrd.image: incomplete write (%zd != %ld)\n", + written, initrd_end - initrd_start); + + ksys_close(fd); + } + done: + /* empty statement */; +#else + printk(KERN_INFO "Unpacking initramfs...\n"); + err = unpack_to_rootfs((char *)initrd_start, + initrd_end - initrd_start); + if (err) + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); +#endif + } + free_initrd(); + flush_delayed_fput(); + /* + * Try loading default modules from initramfs. This gives + * us a chance to load before device_initcalls. + */ + load_default_modules(); + + return 0; +} +rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c new file mode 100644 index 000000000..489a5aa7b --- /dev/null +++ b/init/main.c @@ -0,0 +1,1184 @@ +/* + * linux/init/main.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * GK 2/5/95 - Changed to support mounting root fs via NFS + * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 + * Simplified starting of init: Michael A. Griffith <grif@acm.org> + */ + +#define DEBUG /* Enable initcall_debug */ + +#include <linux/types.h> +#include <linux/extable.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/binfmts.h> +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/stackprotector.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <linux/console.h> +#include <linux/nmi.h> +#include <linux/percpu.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/kernel_stat.h> +#include <linux/start_kernel.h> +#include <linux/security.h> +#include <linux/smp.h> +#include <linux/profile.h> +#include <linux/rcupdate.h> +#include <linux/moduleparam.h> +#include <linux/kallsyms.h> +#include <linux/writeback.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/cgroup.h> +#include <linux/efi.h> +#include <linux/tick.h> +#include <linux/sched/isolation.h> +#include <linux/interrupt.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> +#include <linux/unistd.h> +#include <linux/utsname.h> +#include <linux/rmap.h> +#include <linux/mempolicy.h> +#include <linux/key.h> +#include <linux/buffer_head.h> +#include <linux/page_ext.h> +#include <linux/debug_locks.h> +#include <linux/debugobjects.h> +#include <linux/lockdep.h> +#include <linux/kmemleak.h> +#include <linux/pid_namespace.h> +#include <linux/device.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/sched/init.h> +#include <linux/signal.h> +#include <linux/idr.h> +#include <linux/kgdb.h> +#include <linux/ftrace.h> +#include <linux/async.h> +#include <linux/sfi.h> +#include <linux/shmem_fs.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> +#include <linux/pti.h> +#include <linux/blkdev.h> +#include <linux/elevator.h> +#include <linux/sched/clock.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/context_tracking.h> +#include <linux/random.h> +#include <linux/list.h> +#include <linux/integrity.h> +#include <linux/proc_ns.h> +#include <linux/io.h> +#include <linux/cache.h> +#include <linux/rodata_test.h> +#include <linux/jump_label.h> +#include <linux/mem_encrypt.h> + +#include <asm/io.h> +#include <asm/bugs.h> +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/cacheflush.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/initcall.h> + +static int kernel_init(void *); + +extern void init_IRQ(void); +extern void radix_tree_init(void); + +/* + * Debug helper: via this flag we know that we are in 'early bootup code' + * where only the boot processor is running with IRQ disabled. This means + * two things - IRQ must not be enabled before the flag is cleared and some + * operations which are not allowed with IRQ disabled are allowed while the + * flag is set. + */ +bool early_boot_irqs_disabled __read_mostly; + +enum system_states system_state __read_mostly; +EXPORT_SYMBOL(system_state); + +/* + * Boot command-line arguments + */ +#define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT +#define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT + +extern void time_init(void); +/* Default late time init is NULL. archs can override this later. */ +void (*__initdata late_time_init)(void); + +/* Untouched command line saved by arch-specific code. */ +char __initdata boot_command_line[COMMAND_LINE_SIZE]; +/* Untouched saved command line (eg. for /proc) */ +char *saved_command_line; +/* Command line for parameter parsing */ +static char *static_command_line; +/* Command line for per-initcall parameter parsing */ +static char *initcall_command_line; + +static char *execute_command; +static char *ramdisk_execute_command; + +/* + * Used to generate warnings if static_key manipulation functions are used + * before jump_label_init is called. + */ +bool static_key_initialized __read_mostly; +EXPORT_SYMBOL_GPL(static_key_initialized); + +/* + * If set, this is an indication to the drivers that reset the underlying + * device before going ahead with the initialization otherwise driver might + * rely on the BIOS and skip the reset operation. + * + * This is useful if kernel is booting in an unreliable environment. + * For ex. kdump situation where previous kernel has crashed, BIOS has been + * skipped and devices will be in unknown state. + */ +unsigned int reset_devices; +EXPORT_SYMBOL(reset_devices); + +static int __init set_reset_devices(char *str) +{ + reset_devices = 1; + return 1; +} + +__setup("reset_devices", set_reset_devices); + +static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char *panic_later, *panic_param; + +extern const struct obs_kernel_param __setup_start[], __setup_end[]; + +static bool __init obsolete_checksetup(char *line) +{ + const struct obs_kernel_param *p; + bool had_early_param = false; + + p = __setup_start; + do { + int n = strlen(p->str); + if (parameqn(line, p->str, n)) { + if (p->early) { + /* Already done in parse_early_param? + * (Needs exact match on param part). + * Keep iterating, as we can have early + * params and __setups of same names 8( */ + if (line[n] == '\0' || line[n] == '=') + had_early_param = true; + } else if (!p->setup_func) { + pr_warn("Parameter %s is obsolete, ignored\n", + p->str); + return true; + } else if (p->setup_func(line + n)) + return true; + } + p++; + } while (p < __setup_end); + + return had_early_param; +} + +/* + * This should be approx 2 Bo*oMips to start (note initial shift), and will + * still work even if initially too large, it will just take slightly longer + */ +unsigned long loops_per_jiffy = (1<<12); +EXPORT_SYMBOL(loops_per_jiffy); + +static int __init debug_kernel(char *str) +{ + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; + return 0; +} + +static int __init quiet_kernel(char *str) +{ + console_loglevel = CONSOLE_LOGLEVEL_QUIET; + return 0; +} + +early_param("debug", debug_kernel); +early_param("quiet", quiet_kernel); + +static int __init loglevel(char *str) +{ + int newlevel; + + /* + * Only update loglevel value when a correct setting was passed, + * to prevent blind crashes (when loglevel being set to 0) that + * are quite hard to debug + */ + if (get_option(&str, &newlevel)) { + console_loglevel = newlevel; + return 0; + } + + return -EINVAL; +} + +early_param("loglevel", loglevel); + +/* Change NUL term back to "=", to make "param" the whole string. */ +static int __init repair_env_string(char *param, char *val, + const char *unused, void *arg) +{ + if (val) { + /* param=val or param="val"? */ + if (val == param+strlen(param)+1) + val[-1] = '='; + else if (val == param+strlen(param)+2) { + val[-2] = '='; + memmove(val-1, val, strlen(val)+1); + val--; + } else + BUG(); + } + return 0; +} + +/* Anything after -- gets handed straight to init. */ +static int __init set_init_arg(char *param, char *val, + const char *unused, void *arg) +{ + unsigned int i; + + if (panic_later) + return 0; + + repair_env_string(param, val, unused, NULL); + + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "init"; + panic_param = param; + return 0; + } + } + argv_init[i] = param; + return 0; +} + +/* + * Unknown boot options get handed to init, unless they look like + * unused parameters (modprobe will find them in /proc/cmdline). + */ +static int __init unknown_bootoption(char *param, char *val, + const char *unused, void *arg) +{ + repair_env_string(param, val, unused, NULL); + + /* Handle obsolete-style parameters */ + if (obsolete_checksetup(param)) + return 0; + + /* Unused module parameter. */ + if (strchr(param, '.') && (!val || strchr(param, '.') < val)) + return 0; + + if (panic_later) + return 0; + + if (val) { + /* Environment option */ + unsigned int i; + for (i = 0; envp_init[i]; i++) { + if (i == MAX_INIT_ENVS) { + panic_later = "env"; + panic_param = param; + } + if (!strncmp(param, envp_init[i], val - param)) + break; + } + envp_init[i] = param; + } else { + /* Command line option */ + unsigned int i; + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "init"; + panic_param = param; + } + } + argv_init[i] = param; + } + return 0; +} + +static int __init init_setup(char *str) +{ + unsigned int i; + + execute_command = str; + /* + * In case LILO is going to boot us with default command line, + * it prepends "auto" before the whole cmdline which makes + * the shell think it should execute a script with such name. + * So we ignore all arguments entered _before_ init=... [MJ] + */ + for (i = 1; i < MAX_INIT_ARGS; i++) + argv_init[i] = NULL; + return 1; +} +__setup("init=", init_setup); + +static int __init rdinit_setup(char *str) +{ + unsigned int i; + + ramdisk_execute_command = str; + /* See "auto" comment in init_setup */ + for (i = 1; i < MAX_INIT_ARGS; i++) + argv_init[i] = NULL; + return 1; +} +__setup("rdinit=", rdinit_setup); + +#ifndef CONFIG_SMP +static const unsigned int setup_max_cpus = NR_CPUS; +static inline void setup_nr_cpu_ids(void) { } +static inline void smp_prepare_cpus(unsigned int maxcpus) { } +#endif + +/* + * We need to store the untouched command line for future reference. + * We also need to store the touched command line since the parameter + * parsing is performed in place, and we should allow a component to + * store reference of name/value for future reference. + */ +static void __init setup_command_line(char *command_line) +{ + saved_command_line = + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); + initcall_command_line = + memblock_virt_alloc(strlen(boot_command_line) + 1, 0); + static_command_line = memblock_virt_alloc(strlen(command_line) + 1, 0); + strcpy(saved_command_line, boot_command_line); + strcpy(static_command_line, command_line); +} + +/* + * We need to finalize in a non-__init function or else race conditions + * between the root thread and the init thread may cause start_kernel to + * be reaped by free_initmem before the root thread has proceeded to + * cpu_idle. + * + * gcc-3.4 accidentally inlines this function, so use noinline. + */ + +static __initdata DECLARE_COMPLETION(kthreadd_done); + +static noinline void __ref rest_init(void) +{ + struct task_struct *tsk; + int pid; + + rcu_scheduler_starting(); + /* + * We need to spawn init first so that it obtains pid 1, however + * the init task will end up wanting to create kthreads, which, if + * we schedule it before we create kthreadd, will OOPS. + */ + pid = kernel_thread(kernel_init, NULL, CLONE_FS); + /* + * Pin init on the boot CPU. Task migration is not properly working + * until sched_init_smp() has been run. It will set the allowed + * CPUs for init to the non isolated CPUs. + */ + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); + rcu_read_unlock(); + + numa_default_policy(); + pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); + rcu_read_lock(); + kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); + rcu_read_unlock(); + + /* + * Enable might_sleep() and smp_processor_id() checks. + * They cannot be enabled earlier because with CONFIG_PREEMPT=y + * kernel_thread() would trigger might_sleep() splats. With + * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled + * already, but it's stuck on the kthreadd_done completion. + */ + system_state = SYSTEM_SCHEDULING; + + complete(&kthreadd_done); + + /* + * The boot idle thread must execute schedule() + * at least once to get things moving: + */ + schedule_preempt_disabled(); + /* Call into cpu_idle with preempt disabled */ + cpu_startup_entry(CPUHP_ONLINE); +} + +/* Check for early params. */ +static int __init do_early_param(char *param, char *val, + const char *unused, void *arg) +{ + const struct obs_kernel_param *p; + + for (p = __setup_start; p < __setup_end; p++) { + if ((p->early && parameq(param, p->str)) || + (strcmp(param, "console") == 0 && + strcmp(p->str, "earlycon") == 0) + ) { + if (p->setup_func(val) != 0) + pr_warn("Malformed early option '%s'\n", param); + } + } + /* We accept everything at this stage. */ + return 0; +} + +void __init parse_early_options(char *cmdline) +{ + parse_args("early options", cmdline, NULL, 0, 0, 0, NULL, + do_early_param); +} + +/* Arch code calls this early on, or if not, just before other parsing. */ +void __init parse_early_param(void) +{ + static int done __initdata; + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; + + if (done) + return; + + /* All fall through to do_early_param. */ + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_early_options(tmp_cmdline); + done = 1; +} + +void __init __weak arch_post_acpi_subsys_init(void) { } + +void __init __weak smp_setup_processor_id(void) +{ +} + +# if THREAD_SIZE >= PAGE_SIZE +void __init __weak thread_stack_cache_init(void) +{ +} +#endif + +void __init __weak mem_encrypt_init(void) { } + +bool initcall_debug; +core_param(initcall_debug, initcall_debug, bool, 0644); + +#ifdef TRACEPOINTS_ENABLED +static void __init initcall_debug_enable(void); +#else +static inline void initcall_debug_enable(void) +{ +} +#endif + +/* + * Set up kernel memory allocators + */ +static void __init mm_init(void) +{ + /* + * page_ext requires contiguous pages, + * bigger than MAX_ORDER unless SPARSEMEM. + */ + page_ext_init_flatmem(); + mem_init(); + kmem_cache_init(); + pgtable_init(); + vmalloc_init(); + ioremap_huge_init(); + /* Should be run before the first non-init thread is created */ + init_espfix_bsp(); + /* Should be run after espfix64 is set up. */ + pti_init(); +} + +asmlinkage __visible void __init start_kernel(void) +{ + char *command_line; + char *after_dashes; + + set_task_stack_end_magic(&init_task); + smp_setup_processor_id(); + debug_objects_early_init(); + + cgroup_init_early(); + + local_irq_disable(); + early_boot_irqs_disabled = true; + + /* + * Interrupts are still disabled. Do necessary setups, then + * enable them. + */ + boot_cpu_init(); + page_address_init(); + pr_notice("%s", linux_banner); + setup_arch(&command_line); + mm_init_cpumask(&init_mm); + setup_command_line(command_line); + setup_nr_cpu_ids(); + setup_per_cpu_areas(); + smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ + boot_cpu_hotplug_init(); + + build_all_zonelists(NULL); + page_alloc_init(); + + pr_notice("Kernel command line: %s\n", boot_command_line); + /* parameters may set static keys */ + jump_label_init(); + parse_early_param(); + after_dashes = parse_args("Booting kernel", + static_command_line, __start___param, + __stop___param - __start___param, + -1, -1, NULL, &unknown_bootoption); + if (!IS_ERR_OR_NULL(after_dashes)) + parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, + NULL, set_init_arg); + + /* + * These use large bootmem allocations and must precede + * kmem_cache_init() + */ + setup_log_buf(0); + vfs_caches_init_early(); + sort_main_extable(); + trap_init(); + mm_init(); + + ftrace_init(); + + /* trace_printk can be enabled here */ + early_trace_init(); + + /* + * Set up the scheduler prior starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() + * time - but meanwhile we still have a functioning scheduler. + */ + sched_init(); + /* + * Disable preemption - early bootup scheduling is extremely + * fragile until we cpu_idle() for the first time. + */ + preempt_disable(); + if (WARN(!irqs_disabled(), + "Interrupts were enabled *very* early, fixing it\n")) + local_irq_disable(); + radix_tree_init(); + + /* + * Set up housekeeping before setting up workqueues to allow the unbound + * workqueue to take non-housekeeping into account. + */ + housekeeping_init(); + + /* + * Allow workqueue creation and work item queueing/cancelling + * early. Work item execution depends on kthreads and starts after + * workqueue_init(). + */ + workqueue_init_early(); + + rcu_init(); + + /* Trace events are available after this */ + trace_init(); + + if (initcall_debug) + initcall_debug_enable(); + + context_tracking_init(); + /* init some links before init_ISA_irqs() */ + early_irq_init(); + init_IRQ(); + tick_init(); + rcu_init_nohz(); + init_timers(); + hrtimers_init(); + softirq_init(); + timekeeping_init(); + time_init(); + + /* + * For best initial stack canary entropy, prepare it after: + * - setup_arch() for any UEFI RNG entropy and boot cmdline access + * - timekeeping_init() for ktime entropy used in random_init() + * - time_init() for making random_get_entropy() work on some platforms + * - random_init() to initialize the RNG from from early entropy sources + */ + random_init(command_line); + boot_init_stack_canary(); + + perf_event_init(); + profile_init(); + call_function_init(); + WARN(!irqs_disabled(), "Interrupts were enabled early\n"); + + early_boot_irqs_disabled = false; + local_irq_enable(); + + kmem_cache_init_late(); + + /* + * HACK ALERT! This is early. We're enabling the console before + * we've done PCI setups etc, and console_init() must be aware of + * this. But we do want output early, in case something goes wrong. + */ + console_init(); + if (panic_later) + panic("Too many boot %s vars at `%s'", panic_later, + panic_param); + + lockdep_init(); + + /* + * Need to run this when irqs are enabled, because it wants + * to self-test [hard/soft]-irqs on/off lock inversion bugs + * too: + */ + locking_selftest(); + + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. + */ + mem_encrypt_init(); + +#ifdef CONFIG_BLK_DEV_INITRD + if (initrd_start && !initrd_below_start_ok && + page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { + pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", + page_to_pfn(virt_to_page((void *)initrd_start)), + min_low_pfn); + initrd_start = 0; + } +#endif + kmemleak_init(); + debug_objects_mem_init(); + setup_per_cpu_pageset(); + numa_policy_init(); + acpi_early_init(); + if (late_time_init) + late_time_init(); + sched_clock_init(); + calibrate_delay(); + pid_idr_init(); + anon_vma_init(); +#ifdef CONFIG_X86 + if (efi_enabled(EFI_RUNTIME_SERVICES)) + efi_enter_virtual_mode(); +#endif + thread_stack_cache_init(); + cred_init(); + fork_init(); + proc_caches_init(); + uts_ns_init(); + buffer_init(); + key_init(); + security_init(); + dbg_late_init(); + vfs_caches_init(); + pagecache_init(); + signals_init(); + seq_file_init(); + proc_root_init(); + nsfs_init(); + cpuset_init(); + cgroup_init(); + taskstats_init_early(); + delayacct_init(); + + check_bugs(); + + acpi_subsystem_init(); + arch_post_acpi_subsys_init(); + sfi_init_late(); + + if (efi_enabled(EFI_RUNTIME_SERVICES)) { + efi_free_boot_services(); + } + + /* Do the rest non-__init'ed, we're now alive */ + rest_init(); + + prevent_tail_call_optimization(); +} + +/* Call all constructor functions linked into the kernel. */ +static void __init do_ctors(void) +{ +#ifdef CONFIG_CONSTRUCTORS + ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; + + for (; fn < (ctor_fn_t *) __ctors_end; fn++) + (*fn)(); +#endif +} + +#ifdef CONFIG_KALLSYMS +struct blacklist_entry { + struct list_head next; + char *buf; +}; + +static __initdata_or_module LIST_HEAD(blacklisted_initcalls); + +static int __init initcall_blacklist(char *str) +{ + char *str_entry; + struct blacklist_entry *entry; + + /* str argument is a comma-separated list of functions */ + do { + str_entry = strsep(&str, ","); + if (str_entry) { + pr_debug("blacklisting initcall %s\n", str_entry); + entry = alloc_bootmem(sizeof(*entry)); + entry->buf = alloc_bootmem(strlen(str_entry) + 1); + strcpy(entry->buf, str_entry); + list_add(&entry->next, &blacklisted_initcalls); + } + } while (str_entry); + + return 1; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + struct blacklist_entry *entry; + char fn_name[KSYM_SYMBOL_LEN]; + unsigned long addr; + + if (list_empty(&blacklisted_initcalls)) + return false; + + addr = (unsigned long) dereference_function_descriptor(fn); + sprint_symbol_no_offset(fn_name, addr); + + /* + * fn will be "function_name [module_name]" where [module_name] is not + * displayed for built-in init functions. Strip off the [module_name]. + */ + strreplace(fn_name, ' ', '\0'); + + list_for_each_entry(entry, &blacklisted_initcalls, next) { + if (!strcmp(fn_name, entry->buf)) { + pr_debug("initcall %s blacklisted\n", fn_name); + return true; + } + } + + return false; +} +#else +static int __init initcall_blacklist(char *str) +{ + pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); + return 0; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + return false; +} +#endif +__setup("initcall_blacklist=", initcall_blacklist); + +static __init_or_module void +trace_initcall_start_cb(void *data, initcall_t fn) +{ + ktime_t *calltime = (ktime_t *)data; + + printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); + *calltime = ktime_get(); +} + +static __init_or_module void +trace_initcall_finish_cb(void *data, initcall_t fn, int ret) +{ + ktime_t *calltime = (ktime_t *)data; + ktime_t delta, rettime; + unsigned long long duration; + + rettime = ktime_get(); + delta = ktime_sub(rettime, *calltime); + duration = (unsigned long long) ktime_to_ns(delta) >> 10; + printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", + fn, ret, duration); +} + +static ktime_t initcall_calltime; + +#ifdef TRACEPOINTS_ENABLED +static void __init initcall_debug_enable(void) +{ + int ret; + + ret = register_trace_initcall_start(trace_initcall_start_cb, + &initcall_calltime); + ret |= register_trace_initcall_finish(trace_initcall_finish_cb, + &initcall_calltime); + WARN(ret, "Failed to register initcall tracepoints\n"); +} +# define do_trace_initcall_start trace_initcall_start +# define do_trace_initcall_finish trace_initcall_finish +#else +static inline void do_trace_initcall_start(initcall_t fn) +{ + if (!initcall_debug) + return; + trace_initcall_start_cb(&initcall_calltime, fn); +} +static inline void do_trace_initcall_finish(initcall_t fn, int ret) +{ + if (!initcall_debug) + return; + trace_initcall_finish_cb(&initcall_calltime, fn, ret); +} +#endif /* !TRACEPOINTS_ENABLED */ + +int __init_or_module do_one_initcall(initcall_t fn) +{ + int count = preempt_count(); + char msgbuf[64]; + int ret; + + if (initcall_blacklisted(fn)) + return -EPERM; + + do_trace_initcall_start(fn); + ret = fn(); + do_trace_initcall_finish(fn, ret); + + msgbuf[0] = 0; + + if (preempt_count() != count) { + sprintf(msgbuf, "preemption imbalance "); + preempt_count_set(count); + } + if (irqs_disabled()) { + strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); + local_irq_enable(); + } + WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + + add_latent_entropy(); + return ret; +} + + +extern initcall_entry_t __initcall_start[]; +extern initcall_entry_t __initcall0_start[]; +extern initcall_entry_t __initcall1_start[]; +extern initcall_entry_t __initcall2_start[]; +extern initcall_entry_t __initcall3_start[]; +extern initcall_entry_t __initcall4_start[]; +extern initcall_entry_t __initcall5_start[]; +extern initcall_entry_t __initcall6_start[]; +extern initcall_entry_t __initcall7_start[]; +extern initcall_entry_t __initcall_end[]; + +static initcall_entry_t *initcall_levels[] __initdata = { + __initcall0_start, + __initcall1_start, + __initcall2_start, + __initcall3_start, + __initcall4_start, + __initcall5_start, + __initcall6_start, + __initcall7_start, + __initcall_end, +}; + +/* Keep these in sync with initcalls in include/linux/init.h */ +static char *initcall_level_names[] __initdata = { + "pure", + "core", + "postcore", + "arch", + "subsys", + "fs", + "device", + "late", +}; + +static void __init do_initcall_level(int level) +{ + initcall_entry_t *fn; + + strcpy(initcall_command_line, saved_command_line); + parse_args(initcall_level_names[level], + initcall_command_line, __start___param, + __stop___param - __start___param, + level, level, + NULL, &repair_env_string); + + trace_initcall_level(initcall_level_names[level]); + for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) + do_one_initcall(initcall_from_entry(fn)); +} + +static void __init do_initcalls(void) +{ + int level; + + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) + do_initcall_level(level); +} + +/* + * Ok, the machine is now initialized. None of the devices + * have been touched yet, but the CPU subsystem is up and + * running, and memory and process management works. + * + * Now we can finally start doing some real work.. + */ +static void __init do_basic_setup(void) +{ + cpuset_init_smp(); + shmem_init(); + driver_init(); + init_irq_proc(); + do_ctors(); + usermodehelper_enable(); + do_initcalls(); +} + +static void __init do_pre_smp_initcalls(void) +{ + initcall_entry_t *fn; + + trace_initcall_level("early"); + for (fn = __initcall_start; fn < __initcall0_start; fn++) + do_one_initcall(initcall_from_entry(fn)); +} + +/* + * This function requests modules which should be loaded by default and is + * called twice right after initrd is mounted and right before init is + * exec'd. If such modules are on either initrd or rootfs, they will be + * loaded before control is passed to userland. + */ +void __init load_default_modules(void) +{ + load_default_elevator_module(); +} + +static int run_init_process(const char *init_filename) +{ + argv_init[0] = init_filename; + pr_info("Run %s as init process\n", init_filename); + return do_execve(getname_kernel(init_filename), + (const char __user *const __user *)argv_init, + (const char __user *const __user *)envp_init); +} + +static int try_to_run_init_process(const char *init_filename) +{ + int ret; + + ret = run_init_process(init_filename); + + if (ret && ret != -ENOENT) { + pr_err("Starting init: %s exists but couldn't execute it (error %d)\n", + init_filename, ret); + } + + return ret; +} + +static noinline void __init kernel_init_freeable(void); + +#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) +bool rodata_enabled __ro_after_init = true; +static int __init set_debug_rodata(char *str) +{ + if (strtobool(str, &rodata_enabled)) + pr_warn("Invalid option string for rodata: '%s'\n", str); + return 1; +} +__setup("rodata=", set_debug_rodata); +#endif + +#ifdef CONFIG_STRICT_KERNEL_RWX +static void mark_readonly(void) +{ + if (rodata_enabled) { + /* + * load_module() results in W+X mappings, which are cleaned up + * with call_rcu_sched(). Let's make sure that queued work is + * flushed so that we don't hit false positives looking for + * insecure pages which are W+X. + */ + rcu_barrier_sched(); + mark_rodata_ro(); + rodata_test(); + } else + pr_info("Kernel memory protection disabled.\n"); +} +#else +static inline void mark_readonly(void) +{ + pr_warn("This architecture does not have kernel memory protection.\n"); +} +#endif + +static int __ref kernel_init(void *unused) +{ + int ret; + + kernel_init_freeable(); + /* need to finish all async __init code before freeing the memory */ + async_synchronize_full(); + ftrace_free_init_mem(); + jump_label_invalidate_initmem(); + free_initmem(); + mark_readonly(); + + /* + * Kernel mappings are now finalized - update the userspace page-table + * to finalize PTI. + */ + pti_finalize(); + + system_state = SYSTEM_RUNNING; + numa_default_policy(); + + rcu_end_inkernel_boot(); + + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) + return 0; + pr_err("Failed to execute %s (error %d)\n", + ramdisk_execute_command, ret); + } + + /* + * We try each of these until one succeeds. + * + * The Bourne shell can be used instead of init if we are + * trying to recover a really broken machine. + */ + if (execute_command) { + ret = run_init_process(execute_command); + if (!ret) + return 0; + panic("Requested init %s failed (error %d).", + execute_command, ret); + } + if (!try_to_run_init_process("/sbin/init") || + !try_to_run_init_process("/etc/init") || + !try_to_run_init_process("/bin/init") || + !try_to_run_init_process("/bin/sh")) + return 0; + + panic("No working init found. Try passing init= option to kernel. " + "See Linux Documentation/admin-guide/init.rst for guidance."); +} + +static noinline void __init kernel_init_freeable(void) +{ + /* + * Wait until kthreadd is all set-up. + */ + wait_for_completion(&kthreadd_done); + + /* Now the scheduler is fully set up and can do blocking allocations */ + gfp_allowed_mask = __GFP_BITS_MASK; + + /* + * init can allocate pages on any node + */ + set_mems_allowed(node_states[N_MEMORY]); + + cad_pid = get_pid(task_pid(current)); + + smp_prepare_cpus(setup_max_cpus); + + workqueue_init(); + + init_mm_internals(); + + do_pre_smp_initcalls(); + lockup_detector_init(); + + smp_init(); + sched_init_smp(); + + page_alloc_init_late(); + /* Initialize page ext after all struct pages are initialized. */ + page_ext_init(); + + do_basic_setup(); + + /* Open the /dev/console on the rootfs, this should never fail */ + if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + pr_err("Warning: unable to open an initial console.\n"); + + (void) ksys_dup(0); + (void) ksys_dup(0); + /* + * check if there is an early userspace init. If yes, let it do all + * the work + */ + + if (!ramdisk_execute_command) + ramdisk_execute_command = "/init"; + + if (ksys_access((const char __user *) + ramdisk_execute_command, 0) != 0) { + ramdisk_execute_command = NULL; + prepare_namespace(); + } + + /* + * Ok, we have completed the initial bootup, and + * we're essentially up and running. Get rid of the + * initmem segments and start the user-mode stuff.. + * + * rootfs is available now, try loading the public keys + * and default modules + */ + + integrity_load_keys(); + load_default_modules(); +} diff --git a/init/noinitramfs.c b/init/noinitramfs.c new file mode 100644 index 000000000..f4bad8436 --- /dev/null +++ b/init/noinitramfs.c @@ -0,0 +1,52 @@ +/* + * init/noinitramfs.c + * + * Copyright (C) 2006, NXP Semiconductors, All Rights Reserved + * Author: Jean-Paul Saman <jean-paul.saman@nxp.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/kdev_t.h> +#include <linux/syscalls.h> + +/* + * Create a simple rootfs that is similar to the default initramfs + */ +static int __init default_rootfs(void) +{ + int err; + + err = ksys_mkdir((const char __user __force *) "/dev", 0755); + if (err < 0) + goto out; + + err = ksys_mknod((const char __user __force *) "/dev/console", + S_IFCHR | S_IRUSR | S_IWUSR, + new_encode_dev(MKDEV(5, 1))); + if (err < 0) + goto out; + + err = ksys_mkdir((const char __user __force *) "/root", 0700); + if (err < 0) + goto out; + + return 0; + +out: + printk(KERN_WARNING "Failed to create a rootfs\n"); + return err; +} +rootfs_initcall(default_rootfs); diff --git a/init/version.c b/init/version.c new file mode 100644 index 000000000..ef4012ec4 --- /dev/null +++ b/init/version.c @@ -0,0 +1,54 @@ +/* + * linux/init/version.c + * + * Copyright (C) 1992 Theodore Ts'o + * + * May be freely distributed as part of Linux. + */ + +#include <generated/compile.h> +#include <linux/build-salt.h> +#include <linux/export.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <generated/utsrelease.h> +#include <linux/version.h> +#include <linux/proc_ns.h> + +#ifndef CONFIG_KALLSYMS +#define version(a) Version_ ## a +#define version_string(a) version(a) + +extern int version_string(LINUX_VERSION_CODE); +int version_string(LINUX_VERSION_CODE); +#endif + +struct uts_namespace init_uts_ns = { + .kref = KREF_INIT(2), + .name = { + .sysname = UTS_SYSNAME, + .nodename = UTS_NODENAME, + .release = UTS_RELEASE, + .version = UTS_VERSION, + .machine = UTS_MACHINE, + .domainname = UTS_DOMAINNAME, + }, + .user_ns = &init_user_ns, + .ns.inum = PROC_UTS_INIT_INO, +#ifdef CONFIG_UTS_NS + .ns.ops = &utsns_operations, +#endif +}; +EXPORT_SYMBOL_GPL(init_uts_ns); + +/* FIXED STRINGS! Don't touch! */ +const char linux_banner[] = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; + +const char linux_proc_banner[] = + "%s version %s" + " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")" + " (" LINUX_COMPILER ") %s\n"; + +BUILD_SALT; |