diff options
Diffstat (limited to 'init')
-rw-r--r-- | init/Kconfig | 2395 | ||||
-rw-r--r-- | init/Makefile | 37 | ||||
-rw-r--r-- | init/calibrate.c | 316 | ||||
-rw-r--r-- | init/do_mounts.c | 658 | ||||
-rw-r--r-- | init/do_mounts.h | 43 | ||||
-rw-r--r-- | init/do_mounts_initrd.c | 142 | ||||
-rw-r--r-- | init/do_mounts_rd.c | 334 | ||||
-rw-r--r-- | init/init_task.c | 225 | ||||
-rw-r--r-- | init/initramfs.c | 642 | ||||
-rw-r--r-- | init/main.c | 1538 | ||||
-rw-r--r-- | init/noinitramfs.c | 40 | ||||
-rw-r--r-- | init/version.c | 55 |
12 files changed, 6425 insertions, 0 deletions
diff --git a/init/Kconfig b/init/Kconfig new file mode 100644 index 000000000..9807c66b2 --- /dev/null +++ b/init/Kconfig @@ -0,0 +1,2395 @@ +# SPDX-License-Identifier: GPL-2.0-only +config DEFCONFIG_LIST + string + depends on !UML + option defconfig_list + default "/lib/modules/$(shell,uname -r)/.config" + default "/etc/kernel-config" + default "/boot/config-$(shell,uname -r)" + default "arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG)" + +config CC_VERSION_TEXT + string + default "$(CC_VERSION_TEXT)" + help + This is used in unclear ways: + + - Re-run Kconfig when the compiler is updated + The 'default' property references the environment variable, + CC_VERSION_TEXT so it is recorded in include/config/auto.conf.cmd. + When the compiler is updated, Kconfig will be invoked. + + - Ensure full rebuild when the compier is updated + include/linux/kconfig.h contains this option in the comment line so + fixdep adds include/config/cc/version/text.h into the auto-generated + dependency. When the compiler is updated, syncconfig will touch it + and then every file will be rebuilt. + +config CC_IS_GCC + def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q gcc) + +config GCC_VERSION + int + default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC + default 0 + +config LD_VERSION + int + default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh) + +config CC_IS_CLANG + def_bool $(success,echo "$(CC_VERSION_TEXT)" | grep -q clang) + +config LD_IS_LLD + def_bool $(success,$(LD) -v | head -n 1 | grep -q LLD) + +config CLANG_VERSION + int + default $(shell,$(srctree)/scripts/clang-version.sh $(CC)) + +config AS_IS_GNU + def_bool $(success,test "$(as-name)" = GNU) + +config AS_IS_LLVM + def_bool $(success,test "$(as-name)" = LLVM) + +config AS_VERSION + int + # Use clang version if this is the integrated assembler + default CLANG_VERSION if AS_IS_LLVM + default $(as-version) + +config LLD_VERSION + int + default $(shell,$(srctree)/scripts/lld-version.sh $(LD)) + +config CC_CAN_LINK + bool + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m64-flag)) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m32-flag)) + +config CC_CAN_LINK_STATIC + bool + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m64-flag) -static) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(m32-flag) -static) + +config CC_HAS_ASM_GOTO + def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) + +config CC_HAS_ASM_GOTO_OUTPUT + depends on CC_HAS_ASM_GOTO + def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) + +config CC_HAS_ASM_GOTO_TIED_OUTPUT + depends on CC_HAS_ASM_GOTO_OUTPUT + # Detect buggy gcc and clang, fixed in gcc-11 clang-14. + def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $CC -x c - -c -o /dev/null) + +config TOOLS_SUPPORT_RELR + def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh) + +config CC_HAS_ASM_INLINE + def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) + +config CONSTRUCTORS + bool + depends on !UML + +config IRQ_WORK + bool + +config BUILDTIME_TABLE_SORT + bool + +config THREAD_INFO_IN_TASK + bool + help + Select this to move thread_info off the stack into task_struct. To + make this work, an arch will need to remove all thread_info fields + except flags and fix any runtime bugs. + + One subtle change that will be needed is to use try_get_task_stack() + and put_task_stack() in save_thread_stack_tsk() and get_wchan(). + +menu "General setup" + +config BROKEN + bool + +config BROKEN_ON_SMP + bool + depends on BROKEN || !SMP + default y + +config INIT_ENV_ARG_LIMIT + int + default 32 if !UML + default 128 if UML + help + Maximum of each of the number of arguments and environment + variables passed to init from the kernel command line. + +config COMPILE_TEST + bool "Compile also drivers which will not load" + depends on HAS_IOMEM + help + Some drivers can be compiled on a different platform than they are + intended to be run on. Despite they cannot be loaded there (or even + when they load they cannot be used due to missing HW support), + developers still, opposing to distributors, might want to build such + drivers to compile-test them. + + If you are a developer and want to build everything available, say Y + here. If you are a user/distributor, say N here to exclude useless + drivers to be distributed. + +config UAPI_HEADER_TEST + bool "Compile test UAPI headers" + depends on HEADERS_INSTALL && CC_CAN_LINK + help + Compile test headers exported to user-space to ensure they are + self-contained, i.e. compilable as standalone units. + + If you are a developer or tester and want to ensure the exported + headers are self-contained, say Y here. Otherwise, choose N. + +config LOCALVERSION + string "Local version - append to kernel release" + help + Append an extra string to the end of your kernel version. + This will show up when you type uname, for example. + The string you set here will be appended after the contents of + any files with a filename matching localversion* in your + object and source tree, in that order. Your total string can + be a maximum of 64 characters. + +config LOCALVERSION_AUTO + bool "Automatically append version information to the version string" + default y + depends on !COMPILE_TEST + help + This will try to automatically determine if the current tree is a + release tree by looking for git tags that belong to the current + top of tree revision. + + A string of the format -gxxxxxxxx will be added to the localversion + if a git-based tree is found. The string generated by this will be + appended after any matching localversion* files, and after the value + set in CONFIG_LOCALVERSION. + + (The actual string used here is the first eight characters produced + by running the command: + + $ git rev-parse --verify HEAD + + which is done within the script "scripts/setlocalversion".) + +config BUILD_SALT + string "Build ID Salt" + default "" + help + The build ID is used to link binaries and their debug info. Setting + this option will use the value in the calculation of the build id. + This is mostly useful for distributions which want to ensure the + build is unique between builds. It's safe to leave the default. + +config HAVE_KERNEL_GZIP + bool + +config HAVE_KERNEL_BZIP2 + bool + +config HAVE_KERNEL_LZMA + bool + +config HAVE_KERNEL_XZ + bool + +config HAVE_KERNEL_LZO + bool + +config HAVE_KERNEL_LZ4 + bool + +config HAVE_KERNEL_ZSTD + bool + +config HAVE_KERNEL_UNCOMPRESSED + bool + +choice + prompt "Kernel compression mode" + default KERNEL_GZIP + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 || HAVE_KERNEL_ZSTD || HAVE_KERNEL_UNCOMPRESSED + help + The linux kernel is a kind of self-extracting executable. + Several compression algorithms are available, which differ + in efficiency, compression and decompression speed. + Compression speed is only relevant when building a kernel. + Decompression speed is relevant at each boot. + + If you have any problems with bzip2 or lzma compressed + kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older + version of this functionality (bzip2 only), for 2.4, was + supplied by Christian Ludwig) + + High compression options are mostly useful for users, who + are low on disk space (embedded systems), but for whom ram + size matters less. + + If in doubt, select 'gzip' + +config KERNEL_GZIP + bool "Gzip" + depends on HAVE_KERNEL_GZIP + help + The old and tried gzip compression. It provides a good balance + between compression ratio and decompression speed. + +config KERNEL_BZIP2 + bool "Bzip2" + depends on HAVE_KERNEL_BZIP2 + help + Its compression ratio and speed is intermediate. + Decompression speed is slowest among the choices. The kernel + size is about 10% smaller with bzip2, in comparison to gzip. + Bzip2 uses a large amount of memory. For modern kernels you + will need at least 8MB RAM or more for booting. + +config KERNEL_LZMA + bool "LZMA" + depends on HAVE_KERNEL_LZMA + help + This compression algorithm's ratio is best. Decompression speed + is between gzip and bzip2. Compression is slowest. + The kernel size is about 33% smaller with LZMA in comparison to gzip. + +config KERNEL_XZ + bool "XZ" + depends on HAVE_KERNEL_XZ + help + XZ uses the LZMA2 algorithm and instruction set specific + BCJ filters which can improve compression ratio of executable + code. The size of the kernel is about 30% smaller with XZ in + comparison to gzip. On architectures for which there is a BCJ + filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ + will create a few percent smaller kernel than plain LZMA. + + The speed is about the same as with LZMA: The decompression + speed of XZ is better than that of bzip2 but worse than gzip + and LZO. Compression is slow. + +config KERNEL_LZO + bool "LZO" + depends on HAVE_KERNEL_LZO + help + Its compression ratio is the poorest among the choices. The kernel + size is about 10% bigger than gzip; however its speed + (both compression and decompression) is the fastest. + +config KERNEL_LZ4 + bool "LZ4" + depends on HAVE_KERNEL_LZ4 + help + LZ4 is an LZ77-type compressor with a fixed, byte-oriented encoding. + A preliminary version of LZ4 de/compression tool is available at + <https://code.google.com/p/lz4/>. + + Its compression ratio is worse than LZO. The size of the kernel + is about 8% bigger than LZO. But the decompression speed is + faster than LZO. + +config KERNEL_ZSTD + bool "ZSTD" + depends on HAVE_KERNEL_ZSTD + help + ZSTD is a compression algorithm targeting intermediate compression + with fast decompression speed. It will compress better than GZIP and + decompress around the same speed as LZO, but slower than LZ4. You + will need at least 192 KB RAM or more for booting. The zstd command + line tool is required for compression. + +config KERNEL_UNCOMPRESSED + bool "None" + depends on HAVE_KERNEL_UNCOMPRESSED + help + Produce uncompressed kernel image. This option is usually not what + you want. It is useful for debugging the kernel in slow simulation + environments, where decompressing and moving the kernel is awfully + slow. This option allows early boot code to skip the decompressor + and jump right at uncompressed kernel image. + +endchoice + +config DEFAULT_INIT + string "Default init path" + default "" + help + This option determines the default init for the system if no init= + option is passed on the kernel command line. If the requested path is + not present, we will still then move on to attempting further + locations (e.g. /sbin/init, etc). If this is empty, we will just use + the fallback list when init= is not passed. + +config DEFAULT_HOSTNAME + string "Default hostname" + default "(none)" + help + This option determines the default system hostname before userspace + calls sethostname(2). The kernel traditionally uses "(none)" here, + but you may wish to use a different default here to make a minimal + system more usable with less configuration. + +# +# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can +# add proper SWAP support to them, in which case this can be remove. +# +config ARCH_NO_SWAP + bool + +config SWAP + bool "Support for paging of anonymous memory (swap)" + depends on MMU && BLOCK && !ARCH_NO_SWAP + default y + help + This option allows you to choose whether you want to have support + for so called swap devices or swap files in your kernel that are + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + +config SYSVIPC + bool "System V IPC" + help + Inter Process Communication is a suite of library functions and + system calls which let processes (running programs) synchronize and + exchange information. It is generally considered to be a good thing, + and some programs won't run unless you say Y here. In particular, if + you want to run the DOS emulator dosemu under Linux (read the + DOSEMU-HOWTO, available from <http://www.tldp.org/docs.html#howto>), + you'll need to say Y here. + + You can find documentation about IPC with "info ipc" and also in + section 6.4 of the Linux Programmer's Guide, available from + <http://www.tldp.org/guides.html>. + +config SYSVIPC_SYSCTL + bool + depends on SYSVIPC + depends on SYSCTL + default y + +config POSIX_MQUEUE + bool "POSIX Message Queues" + depends on NET + help + POSIX variant of message queues is a part of IPC. In POSIX message + queues every message has a priority which decides about succession + of receiving it by a process. If you want to compile and run + programs written e.g. for Solaris with use of its POSIX message + queues (functions mq_*) say Y here. + + POSIX message queues are visible as a filesystem called 'mqueue' + and can be mounted somewhere if you want to do filesystem + operations on message queues. + + If unsure, say Y. + +config POSIX_MQUEUE_SYSCTL + bool + depends on POSIX_MQUEUE + depends on SYSCTL + default y + +config WATCH_QUEUE + bool "General notification queue" + default n + help + + This is a general notification queue for the kernel to pass events to + userspace by splicing them into pipes. It can be used in conjunction + with watches for key/keyring change notifications and device + notifications. + + See Documentation/watch_queue.rst + +config CROSS_MEMORY_ATTACH + bool "Enable process_vm_readv/writev syscalls" + depends on MMU + default y + help + Enabling this option adds the system calls process_vm_readv and + process_vm_writev which allow a process with the correct privileges + to directly read from or write to another process' address space. + See the man page for more details. + +config USELIB + bool "uselib syscall" + def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION + help + This option enables the uselib syscall, a system call used in the + dynamic linker from libc5 and earlier. glibc does not use this + system call. If you intend to run programs built on libc5 or + earlier, you may need to enable this syscall. Current systems + running glibc can safely disable this. + +config AUDIT + bool "Auditing support" + depends on NET + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for + logging of avc messages output). System call auditing is included + on architectures which support it. + +config HAVE_ARCH_AUDITSYSCALL + bool + +config AUDITSYSCALL + def_bool y + depends on AUDIT && HAVE_ARCH_AUDITSYSCALL + select FSNOTIFY + +source "kernel/irq/Kconfig" +source "kernel/time/Kconfig" +source "kernel/Kconfig.preempt" + +menu "CPU/Task time and stats accounting" + +config VIRT_CPU_ACCOUNTING + bool + +choice + prompt "Cputime accounting" + default TICK_CPU_ACCOUNTING if !PPC64 + default VIRT_CPU_ACCOUNTING_NATIVE if PPC64 + +# Kind of a stub config for the pure tick based cputime accounting +config TICK_CPU_ACCOUNTING + bool "Simple tick based cputime accounting" + depends on !S390 && !NO_HZ_FULL + help + This is the basic tick based cputime accounting that maintains + statistics about user, system and idle time spent on per jiffies + granularity. + + If unsure, say Y. + +config VIRT_CPU_ACCOUNTING_NATIVE + bool "Deterministic task and CPU time accounting" + depends on HAVE_VIRT_CPU_ACCOUNTING && !NO_HZ_FULL + select VIRT_CPU_ACCOUNTING + help + Select this option to enable more accurate task and CPU time + accounting. This is done by reading a CPU counter on each + kernel entry and exit and on transitions within the kernel + between system, softirq and hardirq state, so there is a + small performance impact. In the case of s390 or IBM POWER > 5, + this also enables accounting of stolen time on logically-partitioned + systems. + +config VIRT_CPU_ACCOUNTING_GEN + bool "Full dynticks CPU time accounting" + depends on HAVE_CONTEXT_TRACKING + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + depends on GENERIC_CLOCKEVENTS + select VIRT_CPU_ACCOUNTING + select CONTEXT_TRACKING + help + Select this option to enable task and CPU time accounting on full + dynticks systems. This accounting is implemented by watching every + kernel-user boundaries using the context tracking subsystem. + The accounting is thus performed at the expense of some significant + overhead. + + For now this is only useful if you are working on the full + dynticks subsystem development. + + If unsure, say N. + +endchoice + +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE + help + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + +config HAVE_SCHED_AVG_IRQ + def_bool y + depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING + depends on SMP + +config SCHED_THERMAL_PRESSURE + bool + default y if ARM && ARM_CPU_TOPOLOGY + default y if ARM64 + depends on SMP + depends on CPU_FREQ_THERMAL + help + Select this option to enable thermal pressure accounting in the + scheduler. Thermal pressure is the value conveyed to the scheduler + that reflects the reduction in CPU compute capacity resulted from + thermal throttling. Thermal throttling occurs when the performance of + a CPU is capped due to high operating temperatures. + + If selected, the scheduler will be able to balance tasks accordingly, + i.e. put less load on throttled CPUs than on non/less throttled ones. + + This requires the architecture to implement + arch_set_thermal_pressure() and arch_get_thermal_pressure(). + +config BSD_PROCESS_ACCT + bool "BSD Process Accounting" + depends on MULTIUSER + help + If you say Y here, a user level program will be able to instruct the + kernel (via a special system call) to write process accounting + information to a file: whenever a process exits, information about + that process will be appended to the file by the kernel. The + information includes things such as creation time, owning user, + command name, memory usage, controlling terminal etc. (the complete + list is in the struct acct in <file:include/linux/acct.h>). It is + up to the user level program to do useful things with this + information. This is generally a good idea, so say Y. + +config BSD_PROCESS_ACCT_V3 + bool "BSD Process Accounting version 3 file format" + depends on BSD_PROCESS_ACCT + default n + help + If you say Y here, the process accounting information is written + in a new file format that also logs the process IDs of each + process and its parent. Note that this file format is incompatible + with previous v0/v1/v2 file formats, so you will need updated tools + for processing it. A preliminary version of these tools is available + at <http://www.gnu.org/software/acct/>. + +config TASKSTATS + bool "Export task/process statistics through netlink" + depends on NET + depends on MULTIUSER + default n + help + Export selected statistics for tasks/processes through the + generic netlink interface. Unlike BSD process accounting, the + statistics are available during the lifetime of tasks/processes as + responses to commands. Like BSD accounting, they are sent to user + space on task exit. + + Say N if unsure. + +config TASK_DELAY_ACCT + bool "Enable per-task delay accounting" + depends on TASKSTATS + select SCHED_INFO + help + Collect information on time spent by a task waiting for system + resources like cpu, synchronous block I/O completion and swapping + in pages. Such statistics can help in setting a task's priorities + relative to other tasks for cpu, io, rss limits etc. + + Say N if unsure. + +config TASK_XACCT + bool "Enable extended accounting over taskstats" + depends on TASKSTATS + help + Collect extended task accounting data and send the data + to userland for processing over the taskstats interface. + + Say N if unsure. + +config TASK_IO_ACCOUNTING + bool "Enable per-task storage I/O accounting" + depends on TASK_XACCT + help + Collect information on the number of bytes of storage I/O which this + task has caused. + + Say N if unsure. + +config PSI + bool "Pressure stall information tracking" + help + Collect metrics that indicate how overcommitted the CPU, memory, + and IO capacity are in the system. + + If you say Y here, the kernel will create /proc/pressure/ with the + pressure statistics files cpu, memory, and io. These will indicate + the share of walltime in which some or all tasks in the system are + delayed due to contention of the respective resource. + + In kernels with cgroup support, cgroups (cgroup2 only) will + have cpu.pressure, memory.pressure, and io.pressure files, + which aggregate pressure stalls for the grouped tasks only. + + For more details see Documentation/accounting/psi.rst. + + Say N if unsure. + +config PSI_DEFAULT_DISABLED + bool "Require boot parameter to enable pressure stall information tracking" + default n + depends on PSI + help + If set, pressure stall information tracking will be disabled + per default but can be enabled through passing psi=1 on the + kernel commandline during boot. + + This feature adds some code to the task wakeup and sleep + paths of the scheduler. The overhead is too low to affect + common scheduling-intense workloads in practice (such as + webservers, memcache), but it does show up in artificial + scheduler stress tests, such as hackbench. + + If you are paranoid and not sure what the kernel will be + used for, say Y. + + Say N if unsure. + +endmenu # "CPU/Task time and stats accounting" + +config CPU_ISOLATION + bool "CPU isolation" + depends on SMP || COMPILE_TEST + default y + help + Make sure that CPUs running critical tasks are not disturbed by + any source of "noise" such as unbound workqueues, timers, kthreads... + Unbound jobs get offloaded to housekeeping CPUs. This is driven by + the "isolcpus=" boot parameter. + + Say Y if unsure. + +source "kernel/rcu/Kconfig" + +config BUILD_BIN2C + bool + default n + +config IKCONFIG + tristate "Kernel .config support" + help + This option enables the complete Linux kernel ".config" file + contents to be saved in the kernel. It provides documentation + of which kernel options are used in a running kernel or in an + on-disk kernel. This information can be extracted from the kernel + image file with the script scripts/extract-ikconfig and used as + input to rebuild the current kernel or to build another kernel. + It can also be extracted from a running kernel by reading + /proc/config.gz if enabled (below). + +config IKCONFIG_PROC + bool "Enable access to .config through /proc/config.gz" + depends on IKCONFIG && PROC_FS + help + This option enables access to the kernel configuration file + through /proc/config.gz. + +config IKHEADERS + tristate "Enable kernel headers through /sys/kernel/kheaders.tar.xz" + depends on SYSFS + help + This option enables access to the in-kernel headers that are generated during + the build process. These can be used to build eBPF tracing programs, + or similar programs. If you build the headers as a module, a module called + kheaders.ko is built which can be loaded on-demand to get access to headers. + +config LOG_BUF_SHIFT + int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + range 12 25 if !H8300 + range 12 19 if H8300 + default 17 + depends on PRINTK + help + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + + Examples: + 17 => 128 KB + 16 => 64 KB + 15 => 32 KB + 14 => 16 KB + 13 => 8 KB + 12 => 4 KB + +config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + depends on SMP + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + depends on PRINTK + help + This option allows to increase the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only few + lines however it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than the half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 16 CPUs are needed to trigger the allocation. + + Also this option is ignored when "log_buf_len" kernel parameter is + used as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation ignoring + hotplugging making the computation optimal for the worst case + scenario while allowing a simple algorithm to be used from bootup. + + Examples shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + +config PRINTK_SAFE_LOG_BUF_SHIFT + int "Temporary per-CPU printk log buffer size (12 => 4KB, 13 => 8KB)" + range 10 21 + default 13 + depends on PRINTK + help + Select the size of an alternate printk per-CPU buffer where messages + printed from usafe contexts are temporary stored. One example would + be NMI messages, another one - printk recursion. The messages are + copied to the main log buffer in a safe context to avoid a deadlock. + The value defines the size as a power of 2. + + Those messages are rare and limited. The largest one is when + a backtrace is printed. It usually fits into 4KB. Select + 8KB if you want to be on the safe side. + + Examples: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + +# +# Architectures with an unreliable sched_clock() should select this: +# +config HAVE_UNSTABLE_SCHED_CLOCK + bool + +config GENERIC_SCHED_CLOCK + bool + +menu "Scheduler features" + +config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. + + With this option, the user can specify the min and max CPU + utilization allowed for RUNNABLE tasks. The max utilization defines + the maximum frequency a task should use while the min utilization + defines the minimum frequency it should use. + + Both min and max utilization clamp values are hints to the scheduler, + aiming at improving its frequency selection policy, but they do not + enforce or grant any specific bandwidth for tasks. + + If in doubt, say N. + +config UCLAMP_BUCKETS_COUNT + int "Number of supported utilization clamp buckets" + range 5 20 + default 5 + depends on UCLAMP_TASK + help + Defines the number of clamp buckets to use. The range of each bucket + will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the + number of clamp buckets the finer their granularity and the higher + the precision of clamping aggregation and tracking at run-time. + + For example, with the minimum configuration value we will have 5 + clamp buckets tracking 20% utilization each. A 25% boosted tasks will + be refcounted in the [20..39]% bucket and will set the bucket clamp + effective value to 25%. + If a second 30% boosted task should be co-scheduled on the same CPU, + that task will be refcounted in the same bucket of the first task and + it will boost the bucket clamp effective value to 30%. + The clamp effective value of a bucket is reset to its nominal value + (20% in the example above) when there are no more tasks refcounted in + that bucket. + + An additional boost/capping margin can be added to some tasks. In the + example above the 25% task will be boosted to 30% until it exits the + CPU. If that should be considered not acceptable on certain systems, + it's always possible to reduce the margin by increasing the number of + clamp buckets to trade off used memory for run-time tracking + precision. + + If in doubt, use the default value. + +endmenu + +# +# For architectures that want to enable the support for NUMA-affine scheduler +# balancing logic: +# +config ARCH_SUPPORTS_NUMA_BALANCING + bool + +# +# For architectures that prefer to flush all TLBs after a number of pages +# are unmapped instead of sending one IPI per page to flush. The architecture +# must provide guarantees on what happens if a clean TLB cache entry is +# written after the unmap. Details are in mm/rmap.c near the check for +# should_defer_flush. The architecture should also consider if the full flush +# and the refill costs are offset by the savings of sending fewer IPIs. +config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + bool + +config CC_HAS_INT128 + def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT + +# +# For architectures that know their GCC __int128 support is sound +# +config ARCH_SUPPORTS_INT128 + bool + +# For architectures that (ab)use NUMA to represent different memory regions +# all cpu-local but of different latencies, such as SuperH. +# +config ARCH_WANT_NUMA_VARIABLE_LOCALITY + bool + +config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when + it has references to the node the task is running on. + + This system will be inactive on UMA systems. + +config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y + depends on NUMA_BALANCING + help + If set, automatic NUMA balancing will be enabled if running on a NUMA + machine. + +menuconfig CGROUPS + bool "Control Group support" + select KERNFS + help + This option adds support for grouping sets of processes together, for + use with process control subsystems such as Cpusets, CFS, memory + controls or device isolation. + See + - Documentation/scheduler/sched-design-CFS.rst (CFS) + - Documentation/admin-guide/cgroup-v1/ (features for grouping, isolation + and resource control) + + Say N if unsure. + +if CGROUPS + +config PAGE_COUNTER + bool + +config MEMCG + bool "Memory controller" + select PAGE_COUNTER + select EVENTFD + help + Provides control over the memory footprint of tasks in a cgroup. + +config MEMCG_SWAP + bool + depends on MEMCG && SWAP + default y + +config MEMCG_KMEM + bool + depends on MEMCG && !SLOB + default y + +config BLK_CGROUP + bool "IO controller" + depends on BLOCK + default n + help + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. It is also used by bio throttling logic in + block layer to implement upper limit in IO rates on a device. + + This option only enables generic Block IO controller infrastructure. + One needs to also enable actual IO controlling logic/policy. For + enabling proportional weight division of disk bandwidth in CFQ, set + CONFIG_BFQ_GROUP_IOSCHED=y; for enabling throttling policy, set + CONFIG_BLK_DEV_THROTTLING=y. + + See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. + +config CGROUP_WRITEBACK + bool + depends on MEMCG && BLK_CGROUP + default y + +menuconfig CGROUP_SCHED + bool "CPU controller" + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED + +config CFS_BANDWIDTH + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" + depends on FAIR_GROUP_SCHED + default n + help + This option allows users to define CPU bandwidth rates (limits) for + tasks running within the fair group scheduler. Groups with no limit + set are considered to be unconstrained and will run with no + restriction. + See Documentation/scheduler/sched-bwc.rst for more information. + +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on CGROUP_SCHED + default n + help + This feature lets you explicitly allocate real CPU bandwidth + to task groups. If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.rst for more information. + +endif #CGROUP_SCHED + +config UCLAMP_TASK_GROUP + bool "Utilization clamping per group of tasks" + depends on CGROUP_SCHED + depends on UCLAMP_TASK + default n + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks currently scheduled on that CPU. + + When this option is enabled, the user can specify a min and max + CPU bandwidth which is allowed for each single task in a group. + The max bandwidth allows to clamp the maximum frequency a task + can use, while the min bandwidth allows to define a minimum + frequency a task will always use. + + When task group based utilization clamping is enabled, an eventually + specified task-specific clamp value is constrained by the cgroup + specified clamp value. Both minimum and maximum task clamping cannot + be bigger than the corresponding clamping defined at task group level. + + If in doubt, say N. + +config CGROUP_PIDS + bool "PIDs controller" + help + Provides enforcement of process number limits in the scope of a + cgroup. Any attempt to fork more processes than is allowed in the + cgroup will fail. PIDs are fundamentally a global resource because it + is fairly trivial to reach PID exhaustion before you reach even a + conservative kmemcg limit. As a result, it is possible to grind a + system to halt without being limited by other cgroup policies. The + PIDs controller is designed to stop this from happening. + + It should be noted that organisational operations (such as attaching + to a cgroup hierarchy) will *not* be blocked by the PIDs controller, + since the PIDs limit only affects a process's ability to fork, not to + attach to a cgroup. + +config CGROUP_RDMA + bool "RDMA controller" + help + Provides enforcement of RDMA resources defined by IB stack. + It is fairly easy for consumers to exhaust RDMA resources, which + can result into resource unavailability to other consumers. + RDMA controller is designed to stop this from happening. + Attaching processes with active RDMA resources to the cgroup + hierarchy is allowed even if can cross the hierarchy's limit. + +config CGROUP_FREEZER + bool "Freezer controller" + help + Provides a way to freeze and unfreeze all tasks in a + cgroup. + + This option affects the ORIGINAL cgroup interface. The cgroup2 memory + controller includes important in-kernel memory consumers per default. + + If you're using cgroup2, say N. + +config CGROUP_HUGETLB + bool "HugeTLB controller" + depends on HUGETLB_PAGE + select PAGE_COUNTER + default n + help + Provides a cgroup controller for HugeTLB pages. + When you enable this, you can put a per cgroup limit on HugeTLB usage. + The limit is enforced during page fault. Since HugeTLB doesn't + support page reclaim, enforcing the limit at page fault time implies + that, the application will get SIGBUS signal if it tries to access + HugeTLB pages beyond its limit. This requires the application to know + beforehand how much HugeTLB pages it would require for its use. The + control group is tracked in the third page lru pointer. This means + that we cannot use the controller with huge page less than 3 pages. + +config CPUSETS + bool "Cpuset controller" + depends on SMP + help + This option will let you create and manage CPUSETs which + allow dynamically partitioning a system into sets of CPUs and + Memory Nodes and assigning tasks to run only within those sets. + This is primarily useful on large SMP or NUMA systems. + + Say N if unsure. + +config PROC_PID_CPUSET + bool "Include legacy /proc/<pid>/cpuset file" + depends on CPUSETS + default y + +config CGROUP_DEVICE + bool "Device controller" + help + Provides a cgroup controller implementing whitelists for + devices which a process in the cgroup can mknod or open. + +config CGROUP_CPUACCT + bool "Simple CPU accounting controller" + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. + +config CGROUP_PERF + bool "Perf controller" + depends on PERF_EVENTS + help + This option extends the perf per-cpu mode to restrict monitoring + to threads which belong to the cgroup specified and run on the + designated cpu. Or this can be used to have cgroup ID in samples + so that it can monitor performance events among cgroups. + + Say N if unsure. + +config CGROUP_BPF + bool "Support for eBPF programs attached to cgroups" + depends on BPF_SYSCALL + select SOCK_CGROUP_DATA + help + Allow attaching eBPF programs to a cgroup using the bpf(2) + syscall command BPF_PROG_ATTACH. + + In which context these programs are accessed depends on the type + of attachment. For instance, programs that are attached using + BPF_CGROUP_INET_INGRESS will be executed on the ingress path of + inet sockets. + +config CGROUP_DEBUG + bool "Debug controller" + default n + depends on DEBUG_KERNEL + help + This option enables a simple controller that exports + debugging information about the cgroups framework. This + controller is for control cgroup debugging only. Its + interfaces are not stable. + + Say N. + +config SOCK_CGROUP_DATA + bool + default n + +endif # CGROUPS + +menuconfig NAMESPACES + bool "Namespaces support" if EXPERT + depends on MULTIUSER + default !EXPERT + help + Provides the way to make tasks work with different objects using + the same id. For example same IPC id may refer to different objects + or same user id or pid may refer to different tasks when used in + different namespaces. + +if NAMESPACES + +config UTS_NS + bool "UTS namespace" + default y + help + In this namespace tasks see different info provided with the + uname() system call + +config TIME_NS + bool "TIME namespace" + depends on GENERIC_VDSO_TIME_NS + default y + help + In this namespace boottime and monotonic clocks can be set. + The time will keep going with the same pace. + +config IPC_NS + bool "IPC namespace" + depends on (SYSVIPC || POSIX_MQUEUE) + default y + help + In this namespace tasks work with IPC ids which correspond to + different IPC objects in different namespaces. + +config USER_NS + bool "User namespace" + default n + help + This allows containers, i.e. vservers, to use user namespaces + to provide different user info for different servers. + + When user namespaces are enabled in the kernel it is + recommended that the MEMCG option also be enabled and that + user-space use the memory control groups to limit the amount + of memory a memory unprivileged users can use. + + If unsure, say N. + +config PID_NS + bool "PID Namespaces" + default y + help + Support process id namespaces. This allows having multiple + processes with the same pid as long as they are in different + pid namespaces. This is a building block of containers. + +config NET_NS + bool "Network namespace" + depends on NET + default y + help + Allow user space to create what appear to be multiple instances + of the network stack. + +endif # NAMESPACES + +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" + select PROC_CHILDREN + select KCMP + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. + +config SYSFS_DEPRECATED + bool "Enable deprecated sysfs features to support old userspace tools" + depends on SYSFS + default n + help + This option adds code that switches the layout of the "block" class + devices, to not show up in /sys/class/block/, but only in + /sys/block/. + + This switch is only active when the sysfs.deprecated=1 boot option is + passed or the SYSFS_DEPRECATED_V2 option is set. + + This option allows new kernels to run on old distributions and tools, + which might get confused by /sys/class/block/. Since 2007/2008 all + major distributions and tools handle this just fine. + + Recent distributions and userspace tools after 2009/2010 depend on + the existence of /sys/class/block/, and will not work with this + option enabled. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. + +config SYSFS_DEPRECATED_V2 + bool "Enable deprecated sysfs features by default" + default n + depends on SYSFS + depends on SYSFS_DEPRECATED + help + Enable deprecated sysfs by default. + + See the CONFIG_SYSFS_DEPRECATED option for more details about this + option. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. Even then, odds are you would not need it + enabled, you can always pass the boot option if absolutely necessary. + +config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + select IRQ_WORK + help + This option enables support for relay interface support in + certain file systems (such as debugfs). + It is designed to provide an efficient mechanism for tools and + facilities to relay large amounts of data from kernel space to + user space. + + If unsure, say N. + +config BLK_DEV_INITRD + bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" + help + The initial RAM filesystem is a ramfs which is loaded by the + boot loader (loadlin or lilo) and that is mounted as root + before the normal boot procedure. It is typically used to + load modules needed to mount the "real" root file system, + etc. See <file:Documentation/admin-guide/initrd.rst> for details. + + If RAM disk support (BLK_DEV_RAM) is also included, this + also enables initial RAM disk (initrd) support and adds + 15 Kbytes (more on some other architectures) to the kernel size. + + If unsure say Y. + +if BLK_DEV_INITRD + +source "usr/Kconfig" + +endif + +config BOOT_CONFIG + bool "Boot config support" + select BLK_DEV_INITRD + help + Extra boot config allows system admin to pass a config file as + complemental extension of kernel cmdline when booting. + The boot config file must be attached at the end of initramfs + with checksum, size and magic word. + See <file:Documentation/admin-guide/bootconfig.rst> for details. + + If unsure, say Y. + +choice + prompt "Compiler optimization level" + default CC_OPTIMIZE_FOR_PERFORMANCE + +config CC_OPTIMIZE_FOR_PERFORMANCE + bool "Optimize for performance (-O2)" + help + This is the default optimization level for the kernel, building + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + +config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" + depends on ARC + help + Choosing this option will pass "-O3" to your compiler to optimize + the kernel yet more for performance. + +config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size (-Os)" + help + Choosing this option will pass "-Os" to your compiler resulting + in a smaller kernel. + +endchoice + +config HAVE_LD_DEAD_CODE_DATA_ELIMINATION + bool + help + This requires that the arch annotates or otherwise protects + its external entry points from being discarded. Linker scripts + must also merge .text.*, .data.*, and .bss.* correctly into + output sections. Care must be taken not to pull in unrelated + sections (e.g., '.text.init'). Typically '.' in section names + is used to distinguish them from label names / C identifiers. + +config LD_DEAD_CODE_DATA_ELIMINATION + bool "Dead code and data elimination (EXPERIMENTAL)" + depends on HAVE_LD_DEAD_CODE_DATA_ELIMINATION + depends on EXPERT + depends on $(cc-option,-ffunction-sections -fdata-sections) + depends on $(ld-option,--gc-sections) + help + Enable this if you want to do dead code and data elimination with + the linker by compiling with -ffunction-sections -fdata-sections, + and linking with --gc-sections. + + This can reduce on disk and in-memory size of the kernel + code and static data, particularly for small configs and + on small systems. This has the possibility of introducing + silently broken kernel if the required annotations are not + present. This option is not well tested yet, so use at your + own risk. + +config LD_ORPHAN_WARN + def_bool y + depends on ARCH_WANT_LD_ORPHAN_WARN + depends on !LD_IS_LLD || LLD_VERSION >= 110000 + depends on $(ld-option,--orphan-handling=warn) + +config SYSCTL + bool + +config HAVE_UID16 + bool + +config SYSCTL_EXCEPTION_TRACE + bool + help + Enable support for /proc/sys/debug/exception-trace. + +config SYSCTL_ARCH_UNALIGN_NO_WARN + bool + help + Enable support for /proc/sys/kernel/ignore-unaligned-usertrap + Allows arch to define/use @no_unaligned_warning to possibly warn + about unaligned access emulation going on under the hood. + +config SYSCTL_ARCH_UNALIGN_ALLOW + bool + help + Enable support for /proc/sys/kernel/unaligned-trap + Allows arches to define/use @unaligned_enabled to runtime toggle + the unaligned access emulation. + see arch/parisc/kernel/unaligned.c for reference + +config HAVE_PCSPKR_PLATFORM + bool + +# interpreter that classic socket filters depend on +config BPF + bool + +menuconfig EXPERT + bool "Configure standard kernel features (expert users)" + # Unhide debug options, to make the on-by-default options visible + select DEBUG_KERNEL + help + This option allows certain base kernel options and settings + to be disabled or tweaked. This is for specialized + environments which can tolerate a "non-standard" kernel. + Only use this if you really know what you are doing. + +config UID16 + bool "Enable 16-bit UID system calls" if EXPERT + depends on HAVE_UID16 && MULTIUSER + default y + help + This enables the legacy 16-bit UID syscall wrappers. + +config MULTIUSER + bool "Multiple users, groups and capabilities support" if EXPERT + default y + help + This option enables support for non-root users, groups and + capabilities. + + If you say N here, all processes will run with UID 0, GID 0, and all + possible capabilities. Saying N here also compiles out support for + system calls related to UIDs, GIDs, and capabilities, such as setuid, + setgid, and capset. + + If unsure, say Y here. + +config SGETMASK_SYSCALL + bool "sgetmask/ssetmask syscalls support" if EXPERT + def_bool PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH + help + sys_sgetmask and sys_ssetmask are obsolete system calls + no longer supported in libc but still enabled by default in some + architectures. + + If unsure, leave the default option here. + +config SYSFS_SYSCALL + bool "Sysfs syscall support" if EXPERT + default y + help + sys_sysfs is an obsolete system call no longer supported in libc. + Note that disabling this option is more secure but might break + compatibility with some systems. + + If unsure say Y here. + +config FHANDLE + bool "open by fhandle syscalls" if EXPERT + select EXPORTFS + default y + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + +config POSIX_TIMERS + bool "Posix Clocks & timers" if EXPERT + default y + help + This includes native support for POSIX timers to the kernel. + Some embedded systems have no use for them and therefore they + can be configured out to reduce the size of the kernel image. + + When this option is disabled, the following syscalls won't be + available: timer_create, timer_gettime: timer_getoverrun, + timer_settime, timer_delete, clock_adjtime, getitimer, + setitimer, alarm. Furthermore, the clock_settime, clock_gettime, + clock_getres and clock_nanosleep syscalls will be limited to + CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME only. + + If unsure say y. + +config PRINTK + default y + bool "Enable support for printk" if EXPERT + select IRQ_WORK + help + This option enables normal printk support. Removing it + eliminates most of the message strings from the kernel image + and makes the kernel more or less silent. As this makes it + very difficult to diagnose system problems, saying N here is + strongly discouraged. + +config PRINTK_NMI + def_bool y + depends on PRINTK + depends on HAVE_NMI + +config BUG + bool "BUG() support" if EXPERT + default y + help + Disabling this option eliminates support for BUG and WARN, reducing + the size of your kernel image and potentially quietly ignoring + numerous fatal conditions. You should only consider disabling this + option for embedded systems with no facilities for reporting errors. + Just say Y. + +config ELF_CORE + depends on COREDUMP + default y + bool "Enable ELF core dumps" if EXPERT + help + Enable support for generating core dumps. Disabling saves about 4k. + + +config PCSPKR_PLATFORM + bool "Enable PC-Speaker support" if EXPERT + depends on HAVE_PCSPKR_PLATFORM + select I8253_LOCK + default y + help + This option allows to disable the internal PC-Speaker + support, saving some memory. + +config BASE_FULL + default y + bool "Enable full-sized data structures for core" if EXPERT + help + Disabling this option reduces the size of miscellaneous core + kernel data structures. This saves memory on small machines, + but may reduce performance. + +config FUTEX + bool "Enable futex support" if EXPERT + default y + imply RT_MUTEXES + help + Disabling this option will cause the kernel to be built without + support for "fast userspace mutexes". The resulting kernel may not + run glibc-based applications correctly. + +config FUTEX_PI + bool + depends on FUTEX && RT_MUTEXES + default y + +config HAVE_FUTEX_CMPXCHG + bool + depends on FUTEX + help + Architectures should select this if futex_atomic_cmpxchg_inatomic() + is implemented and always working. This removes a couple of runtime + checks. + +config EPOLL + bool "Enable eventpoll support" if EXPERT + default y + help + Disabling this option will cause the kernel to be built without + support for epoll family of system calls. + +config SIGNALFD + bool "Enable signalfd() system call" if EXPERT + default y + help + Enable the signalfd() system call that allows to receive signals + on a file descriptor. + + If unsure, say Y. + +config TIMERFD + bool "Enable timerfd() system call" if EXPERT + default y + help + Enable the timerfd() system call that allows to receive timer + events on a file descriptor. + + If unsure, say Y. + +config EVENTFD + bool "Enable eventfd() system call" if EXPERT + default y + help + Enable the eventfd() system call that allows to receive both + kernel notification (ie. KAIO) or userspace notifications. + + If unsure, say Y. + +config SHMEM + bool "Use full shmem filesystem" if EXPERT + default y + depends on MMU + help + The shmem is an internal filesystem used to manage shared memory. + It is backed by swap and manages resource limits. It is also exported + to userspace as tmpfs if TMPFS is enabled. Disabling this + option replaces shmem and tmpfs with the much simpler ramfs code, + which may be appropriate on small systems without swap. + +config AIO + bool "Enable AIO support" if EXPERT + default y + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling + this option saves about 7k. + +config IO_URING + bool "Enable IO uring support" if EXPERT + select IO_WQ + default y + help + This option enables support for the io_uring interface, enabling + applications to submit and complete IO through submission and + completion rings that are shared between the kernel and application. + +config ADVISE_SYSCALLS + bool "Enable madvise/fadvise syscalls" if EXPERT + default y + help + This option enables the madvise and fadvise syscalls, used by + applications to advise the kernel about their future memory or file + usage, improving performance. If building an embedded system where no + applications use these syscalls, you can disable this option to save + space. + +config HAVE_ARCH_USERFAULTFD_WP + bool + help + Arch has userfaultfd write protection support + +config MEMBARRIER + bool "Enable membarrier() system call" if EXPERT + default y + help + Enable the membarrier() system call that allows issuing memory + barriers across all running threads, which can be used to distribute + the cost of user-space memory barriers asymmetrically by transforming + pairs of memory barriers into pairs consisting of membarrier() and a + compiler barrier. + + If unsure, say Y. + +config KALLSYMS + bool "Load all symbols for debugging/ksymoops" if EXPERT + default y + help + Say Y here to let the kernel print out symbolic crash information and + symbolic stack backtraces. This increases the size of the kernel + somewhat, as all symbols have to be loaded into the kernel image. + +config KALLSYMS_ALL + bool "Include all symbols in kallsyms" + depends on DEBUG_KERNEL && KALLSYMS + help + Normally kallsyms only contains the symbols of functions for nicer + OOPS messages and backtraces (i.e., symbols from the text and inittext + sections). This is sufficient for most cases. And only in very rare + cases (e.g., when a debugger is used) all symbols are required (e.g., + names of variables from the data sections, etc). + + This option makes sure that all symbols are loaded into the kernel + image (i.e., symbols from all sections) in cost of increased kernel + size (depending on the kernel configuration, it may be 300KiB or + something like this). + + Say N unless you really need all symbols. + +config KALLSYMS_ABSOLUTE_PERCPU + bool + depends on KALLSYMS + default X86_64 && SMP + +config KALLSYMS_BASE_RELATIVE + bool + depends on KALLSYMS + default !IA64 + help + Instead of emitting them as absolute values in the native word size, + emit the symbol references in the kallsyms table as 32-bit entries, + each containing a relative value in the range [base, base + U32_MAX] + or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either + an absolute value in the range [0, S32_MAX] or a relative value in the + range [base, base + S32_MAX], where base is the lowest relative symbol + address encountered in the image. + + On 64-bit builds, this reduces the size of the address table by 50%, + but more importantly, it results in entries whose values are build + time constants, and no relocation pass is required at runtime to fix + up the entries based on the runtime load address of the kernel. + +# end of the "standard kernel features (expert users)" menu + +# syscall, maps, verifier + +config BPF_LSM + bool "LSM Instrumentation with BPF" + depends on BPF_EVENTS + depends on BPF_SYSCALL + depends on SECURITY + depends on BPF_JIT + help + Enables instrumentation of the security hooks with eBPF programs for + implementing dynamic MAC and Audit Policies. + + If you are unsure how to answer this question, answer N. + +config BPF_SYSCALL + bool "Enable bpf() system call" + select BPF + select IRQ_WORK + select TASKS_TRACE_RCU + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + +config ARCH_WANT_DEFAULT_BPF_JIT + bool + +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + +config BPF_JIT_DEFAULT_ON + def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON + depends on HAVE_EBPF_JIT && BPF_JIT + +config BPF_UNPRIV_DEFAULT_OFF + bool "Disable unprivileged BPF by default" + depends on BPF_SYSCALL + help + Disables unprivileged BPF by default by setting the corresponding + /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can + still reenable it by setting it to 0 later on, or permanently + disable it by setting it to 1 (from which no other transition to + 0 is possible anymore). + +source "kernel/bpf/preload/Kconfig" + +config USERFAULTFD + bool "Enable userfaultfd() system call" + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + +config ARCH_HAS_MEMBARRIER_CALLBACKS + bool + +config ARCH_HAS_MEMBARRIER_SYNC_CORE + bool + +config KCMP + bool "Enable kcmp() system call" if EXPERT + help + Enable the kernel resource comparison system call. It provides + user-space with the ability to compare two processes to see if they + share a common resource, such as a file descriptor or even virtual + memory space. + + If unsure, say N. + +config RSEQ + bool "Enable rseq() system call" if EXPERT + default y + depends on HAVE_RSEQ + select MEMBARRIER + help + Enable the restartable sequences system call. It provides a + user-space cache for the current CPU number value, which + speeds up getting the current CPU number from user-space, + as well as an ABI to speed up user-space operations on + per-CPU data. + + If unsure, say Y. + +config DEBUG_RSEQ + default n + bool "Enabled debugging of rseq() system call" if EXPERT + depends on RSEQ && DEBUG_KERNEL + help + Enable extra debugging checks for the rseq system call. + + If unsure, say N. + +config EMBEDDED + bool "Embedded system" + option allnoconfig_y + select EXPERT + help + This option should be enabled if compiling the kernel for + an embedded system so certain expert options are available + for configuration. + +config HAVE_PERF_EVENTS + bool + help + See tools/perf/design.txt for details. + +config PERF_USE_VMALLOC + bool + help + See tools/perf/design.txt for details + +config PC104 + bool "PC/104 support" if EXPERT + help + Expose PC/104 form factor device drivers and options available for + selection and configuration. Enable this option if your target + machine has a PC/104 bus. + +menu "Kernel Performance Events And Counters" + +config PERF_EVENTS + bool "Kernel performance events and counters" + default y if PROFILING + depends on HAVE_PERF_EVENTS + select IRQ_WORK + select SRCU + help + Enable kernel support for various performance events provided + by software and hardware. + + Software events are supported either built-in or via the + use of generic tracepoints. + + Most modern CPUs support performance events via performance + counter registers. These registers count the number of certain + types of hw events: such as instructions executed, cachemisses + suffered, or branches mis-predicted - without slowing down the + kernel or applications. These registers can also trigger interrupts + when a threshold number of events have passed - and can thus be + used to profile the code that runs on that CPU. + + The Linux Performance Event subsystem provides an abstraction of + these software and hardware event capabilities, available via a + system call and used by the "perf" utility in tools/perf/. It + provides per task and per CPU counters, and it provides event + capabilities on top of those. + + Say Y if unsure. + +config DEBUG_PERF_USE_VMALLOC + default n + bool "Debug: use vmalloc to back perf mmap() buffers" + depends on PERF_EVENTS && DEBUG_KERNEL && !PPC + select PERF_USE_VMALLOC + help + Use vmalloc memory to back perf mmap() buffers. + + Mostly useful for debugging the vmalloc code on platforms + that don't require it. + + Say N if unsure. + +endmenu + +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. /proc/vmstat will only show page counts + if VM event counters are disabled. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. /sys/slab will not exist and there will be + no support for cache validation etc. + +config SLUB_MEMCG_SYSFS_ON + default n + bool "Enable memcg SLUB sysfs support by default" if EXPERT + depends on SLUB && SYSFS && MEMCG + help + SLUB creates a directory under /sys/kernel/slab for each + allocation cache to host info and debug files. If memory + cgroup is enabled, each cache can have per memory cgroup + caches. SLUB can create the same sysfs directories for these + caches under /sys/kernel/slab/CACHE/cgroup but it can lead + to a very high number of debug files being created. This is + controlled by slub_memcg_sysfs boot parameter and this + config option determines the parameter's default value. + +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +choice + prompt "Choose SLAB allocator" + default SLUB + help + This option allows to select a slab allocator. + +config SLAB + bool "SLAB" + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work + well in all environments. It organizes cache hot objects in + per cpu and per node queues. + +config SLUB + bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + SLUB is a slab allocator that minimizes cache line usage + instead of managing queues of cached objects (SLAB approach). + Per cpu caching is realized using slabs of objects instead + of queues of objects. SLUB can use memory efficiently + and has enhanced diagnostics. SLUB is the default choice for + a slab allocator. + +config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" + help + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but + does not perform as well on large systems. + +endchoice + +config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" + default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. + This carries a risk of kernel heap overflows being able to + overwrite objects from merged caches (and more easily control + cache layout), which makes such heap attacks easier to exploit + by attackers. By keeping caches unmerged, these kinds of exploits + can usually only damage objects in the same cache. To disable + merging at runtime, "slab_nomerge" can be passed on the kernel + command line. + +config SLAB_FREELIST_RANDOM + bool "Randomize slab freelist" + depends on SLAB || SLUB + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. + +config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLAB || SLUB + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. Some slab implementations have more + sanity-checking than others. This option is most effective with + CONFIG_SLUB. + +config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA + help + Randomization of the page allocator improves the average + utilization of a direct-mapped memory-side-cache. See section + 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI + 6.2a specification for an example of how a platform advertises + the presence of a memory-side-cache. There are also incidental + security benefits as it reduces the predictability of page + allocations to compliment SLAB_FREELIST_RANDOM, but the + default granularity of shuffling on the "MAX_ORDER - 1" i.e, + 10th order of pages is selected based on cache utilization + benefits on x86. + + While the randomization improves cache utilization it may + negatively impact workloads on platforms without a cache. For + this reason, by default, the randomization is enabled only + after runtime detection of a direct-mapped memory-side-cache. + Otherwise, the randomization may be force enabled with the + 'page_alloc.shuffle' kernel command line parameter. + + Say Y if unsure. + +config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP + bool "SLUB per cpu partial cache" + help + Per cpu partial caches accelerate objects allocation and freeing + that is local to a processor at the price of more indeterminism + in the latency of the free. On overflow these caches will be cleared + which requires the taking of locks that may cause latency spikes. + Typically one would choose no for a realtime system. + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has its contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. + + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. + +config SYSTEM_DATA_VERIFICATION + def_bool n + select SYSTEM_TRUSTED_KEYRING + select KEYS + select CRYPTO + select CRYPTO_RSA + select ASYMMETRIC_KEY_TYPE + select ASYMMETRIC_PUBLIC_KEY_SUBTYPE + select ASN1 + select OID_REGISTRY + select X509_CERTIFICATE_PARSER + select PKCS7_MESSAGE_PARSER + help + Provide PKCS#7 message verification using the contents of the system + trusted keyring to provide public keys. This then can be used for + module verification, kexec image verification and firmware blob + verification. + +config PROFILING + bool "Profiling support" + help + Say Y here to enable the extended profiling support mechanisms used + by profilers such as OProfile. + +# +# Place an empty function call at each tracepoint site. Can be +# dynamically changed for a probe function. +# +config TRACEPOINTS + bool + +endmenu # General setup + +source "arch/Kconfig" + +config RT_MUTEXES + bool + +config BASE_SMALL + int + default 0 if BASE_FULL + default 1 if !BASE_FULL + +config MODULE_SIG_FORMAT + def_bool n + select SYSTEM_DATA_VERIFICATION + +menuconfig MODULES + bool "Enable loadable module support" + option modules + help + Kernel modules are small pieces of compiled code which can + be inserted in the running kernel, rather than being + permanently built into the kernel. You use the "modprobe" + tool to add (and sometimes remove) them. If you say Y here, + many parts of the kernel can be built as modules (by + answering M instead of Y where indicated): this is most + useful for infrequently used options which are not required + for booting. For more information, see the man pages for + modprobe, lsmod, modinfo, insmod and rmmod. + + If you say Y here, you will need to run "make + modules_install" to put the modules under /lib/modules/ + where modprobe can find them (you may need to be root to do + this). + + If unsure, say Y. + +if MODULES + +config MODULE_FORCE_LOAD + bool "Forced module loading" + default n + help + Allow loading of modules without version information (ie. modprobe + --force). Forced module loading sets the 'F' (forced) taint flag and + is usually a really bad idea. + +config MODULE_UNLOAD + bool "Module unloading" + help + Without this option you will not be able to unload any + modules (note that some modules may not be unloadable + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. + +config MODULE_FORCE_UNLOAD + bool "Forced module unloading" + depends on MODULE_UNLOAD + help + This option allows you to force a module to unload, even if the + kernel believes it is unsafe: the kernel will remove the module + without waiting for anyone to stop using it (using the -f option to + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + +config MODVERSIONS + bool "Module versioning support" + help + Usually, you have to use modules compiled with your kernel. + Saying Y here makes it sometimes possible to use modules + compiled for different kernels, by adding enough information + to the modules to (hopefully) spot any changes which would + make them incompatible with the kernel you are running. If + unsure, say N. + +config ASM_MODVERSIONS + bool + default HAVE_ASM_MODVERSIONS && MODVERSIONS + help + This enables module versioning for exported symbols also from + assembly. This can be enabled only when the target architecture + supports it. + +config MODULE_REL_CRCS + bool + depends on MODVERSIONS + +config MODULE_SRCVERSION_ALL + bool "Source checksum for all modules" + help + Modules which contain a MODULE_VERSION get an extra "srcversion" + field inserted into their modinfo section, which contains a + sum of the source files which made it. This helps maintainers + see exactly which source was used to build a module (since + others sometimes change the module source without updating + the version). With this option, such a "srcversion" field + will be created for all modules. If unsure, say N. + +config MODULE_SIG + bool "Module signature verification" + select MODULE_SIG_FORMAT + help + Check modules for valid signatures upon load: the signature + is simply appended to the module. For more information see + <file:Documentation/admin-guide/module-signing.rst>. + + Note that this option adds the OpenSSL development packages as a + kernel build dependency so that the signing tool can use its crypto + library. + + You should enable this option if you wish to use either + CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via + another LSM - otherwise unsigned modules will be loadable regardless + of the lockdown policy. + + !!!WARNING!!! If you enable this option, you MUST make sure that the + module DOES NOT get stripped after being signed. This includes the + debuginfo strip done by some packagers (such as rpmbuild) and + inclusion into an initramfs that wants the module size reduced. + +config MODULE_SIG_FORCE + bool "Require modules to be validly signed" + depends on MODULE_SIG + help + Reject unsigned modules or signed modules for which we don't have a + key. Without this, such modules will simply taint the kernel. + +config MODULE_SIG_ALL + bool "Automatically sign all modules" + default y + depends on MODULE_SIG + help + Sign all modules during make modules_install. Without this option, + modules must be signed manually, using the scripts/sign-file tool. + +comment "Do not forget to sign required modules with scripts/sign-file" + depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL + +choice + prompt "Which hash algorithm should modules be signed with?" + depends on MODULE_SIG + help + This determines which sort of hashing algorithm will be used during + signature generation. This algorithm _must_ be built into the kernel + directly so that signature verification can take place. It is not + possible to load a signed module containing the algorithm to check + the signature on that module. + +config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + +config MODULE_SIG_SHA224 + bool "Sign modules with SHA-224" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA256 + bool "Sign modules with SHA-256" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA384 + bool "Sign modules with SHA-384" + select CRYPTO_SHA512 + +config MODULE_SIG_SHA512 + bool "Sign modules with SHA-512" + select CRYPTO_SHA512 + +endchoice + +config MODULE_SIG_HASH + string + depends on MODULE_SIG + default "sha1" if MODULE_SIG_SHA1 + default "sha224" if MODULE_SIG_SHA224 + default "sha256" if MODULE_SIG_SHA256 + default "sha384" if MODULE_SIG_SHA384 + default "sha512" if MODULE_SIG_SHA512 + +config MODULE_COMPRESS + bool "Compress modules on installation" + help + + Compresses kernel modules when 'make modules_install' is run; gzip or + xz depending on "Compression algorithm" below. + + module-init-tools MAY support gzip, and kmod MAY support gzip and xz. + + Out-of-tree kernel modules installed using Kbuild will also be + compressed upon installation. + + Note: for modules inside an initrd or initramfs, it's more efficient + to compress the whole initrd or initramfs instead. + + Note: This is fully compatible with signed modules. + + If in doubt, say N. + +choice + prompt "Compression algorithm" + depends on MODULE_COMPRESS + default MODULE_COMPRESS_GZIP + help + This determines which sort of compression will be used during + 'make modules_install'. + + GZIP (default) and XZ are supported. + +config MODULE_COMPRESS_GZIP + bool "GZIP" + +config MODULE_COMPRESS_XZ + bool "XZ" + +endchoice + +config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + bool "Allow loading of modules with missing namespace imports" + help + Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in + a namespace. A module that makes use of a symbol exported with such a + namespace is required to import the namespace via MODULE_IMPORT_NS(). + There is no technical reason to enforce correct namespace imports, + but it creates consistency between symbols defining namespaces and + users importing namespaces they make use of. This option relaxes this + requirement and lifts the enforcement when loading a module. + + If unsure, say N. + +config UNUSED_SYMBOLS + bool "Enable unused/obsolete exported symbols" + default y if X86 + help + Unused but exported symbols make the kernel needlessly bigger. For + that reason most of these unused exports will soon be removed. This + option is provided temporarily to provide a transition period in case + some external kernel module needs one of these symbols anyway. If you + encounter such a case in your module, consider if you are actually + using the right API. (rationale: since nobody in the kernel is using + this in a module, there is a pretty good chance it's actually the + wrong interface to use). If you really need the symbol, please send a + mail to the linux kernel mailing list mentioning the symbol and why + you really need it, and what the merge plan to the mainline kernel for + your module is. + +config TRIM_UNUSED_KSYMS + bool "Trim unused exported kernel symbols" + depends on !UNUSED_SYMBOLS + help + The kernel and some modules make many symbols available for + other modules to use via EXPORT_SYMBOL() and variants. Depending + on the set of modules being selected in your kernel configuration, + many of those exported symbols might never be used. + + This option allows for unused exported symbols to be dropped from + the build. In turn, this provides the compiler more opportunities + (especially when using LTO) for optimizing the code and reducing + binary size. This might have some security advantages as well. + + If unsure, or if you need to build out-of-tree modules, say N. + +config UNUSED_KSYMS_WHITELIST + string "Whitelist of symbols to keep in ksymtab" + depends on TRIM_UNUSED_KSYMS + help + By default, all unused exported symbols will be un-exported from the + build when TRIM_UNUSED_KSYMS is selected. + + UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept + exported at all times, even in absence of in-tree users. The value to + set here is the path to a text file containing the list of symbols, + one per line. The path can be absolute, or relative to the kernel + source tree. + +endif # MODULES + +config MODULES_TREE_LOOKUP + def_bool y + depends on PERF_EVENTS || TRACING + +config INIT_ALL_POSSIBLE + bool + help + Back when each arch used to define their own cpu_online_mask and + cpu_possible_mask, some of them chose to initialize cpu_possible_mask + with all 1s, and others with all 0s. When they were centralised, + it was better to provide this option than to break all the archs + and have several arch maintainers pursuing me down dark alleys. + +source "block/Kconfig" + +config PREEMPT_NOTIFIERS + bool + +config PADATA + depends on SMP + bool + +config ASN1 + tristate + help + Build a simple ASN.1 grammar compiler that produces a bytecode output + that can be interpreted by the ASN.1 stream decoder and used to + inform it as to what tags are to be expected in a stream and what + functions to call on what tags. + +source "kernel/Kconfig.locks" + +config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + bool + +config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE + bool + +# It may be useful for an architecture to override the definitions of the +# SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h> +# and the COMPAT_ variants in <linux/compat.h>, in particular to use a +# different calling convention for syscalls. They can also override the +# macros for not-implemented syscalls in kernel/sys_ni.c and +# kernel/time/posix-stubs.c. All these overrides need to be available in +# <asm/syscall_wrapper.h>. +config ARCH_HAS_SYSCALL_WRAPPER + def_bool n diff --git a/init/Makefile b/init/Makefile new file mode 100644 index 000000000..6bc37f64b --- /dev/null +++ b/init/Makefile @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +ccflags-y := -fno-function-sections -fno-data-sections + +obj-y := main.o version.o mounts.o +ifneq ($(CONFIG_BLK_DEV_INITRD),y) +obj-y += noinitramfs.o +else +obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o +endif +obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o + +obj-y += init_task.o + +mounts-y := do_mounts.o +mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o +mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o + +# dependencies on generated files need to be listed explicitly +$(obj)/version.o: include/generated/compile.h + +# compile.h changes depending on hostname, generation number, etc, +# so we regenerate it always. +# mkcompile_h will make sure to only update the +# actual file if its content has changed. + + chk_compile.h = : + quiet_chk_compile.h = echo ' CHK $@' +silent_chk_compile.h = : +include/generated/compile.h: FORCE + @$($(quiet)chk_compile.h) + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \ + "$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)" diff --git a/init/calibrate.c b/init/calibrate.c new file mode 100644 index 000000000..f3831272f --- /dev/null +++ b/init/calibrate.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 +/* calibrate.c: default delay calibration + * + * Excised from init/main.c + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/jiffies.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/timex.h> +#include <linux/smp.h> +#include <linux/percpu.h> + +unsigned long lpj_fine; +unsigned long preset_lpj; +static int __init lpj_setup(char *str) +{ + preset_lpj = simple_strtoul(str,NULL,0); + return 1; +} + +__setup("lpj=", lpj_setup); + +#ifdef ARCH_HAS_READ_CURRENT_TIMER + +/* This routine uses the read_current_timer() routine and gets the + * loops per jiffy directly, instead of guessing it using delay(). + * Also, this code tries to handle non-maskable asynchronous events + * (like SMIs) + */ +#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100)) +#define MAX_DIRECT_CALIBRATION_RETRIES 5 + +static unsigned long calibrate_delay_direct(void) +{ + unsigned long pre_start, start, post_start; + unsigned long pre_end, end, post_end; + unsigned long start_jiffies; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; + unsigned long measured_times[MAX_DIRECT_CALIBRATION_RETRIES]; + int max = -1; /* index of measured_times with max/min values or not set */ + int min = -1; + int i; + + if (read_current_timer(&pre_start) < 0 ) + return 0; + + /* + * A simple loop like + * while ( jiffies < start_jiffies+1) + * start = read_current_timer(); + * will not do. As we don't really know whether jiffy switch + * happened first or timer_value was read first. And some asynchronous + * event can happen between these two events introducing errors in lpj. + * + * So, we do + * 1. pre_start <- When we are sure that jiffy switch hasn't happened + * 2. check jiffy switch + * 3. start <- timer value before or after jiffy switch + * 4. post_start <- When we are sure that jiffy switch has happened + * + * Note, we don't know anything about order of 2 and 3. + * Now, by looking at post_start and pre_start difference, we can + * check whether any asynchronous event happened or not + */ + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + pre_start = 0; + read_current_timer(&start); + start_jiffies = jiffies; + while (time_before_eq(jiffies, start_jiffies + 1)) { + pre_start = start; + read_current_timer(&start); + } + read_current_timer(&post_start); + + pre_end = 0; + end = post_start; + while (time_before_eq(jiffies, start_jiffies + 1 + + DELAY_CALIBRATION_TICKS)) { + pre_end = end; + read_current_timer(&end); + } + read_current_timer(&post_end); + + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; + + /* + * If the upper limit and lower limit of the timer_rate is + * >= 12.5% apart, redo calibration. + */ + if (start >= post_end) + printk(KERN_NOTICE "calibrate_delay_direct() ignoring " + "timer_rate as we had a TSC wrap around" + " start=%lu >=post_end=%lu\n", + start, post_end); + if (start < post_end && pre_start != 0 && pre_end != 0 && + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; + measured_times[i] = timer_rate_max; + if (max < 0 || timer_rate_max > measured_times[max]) + max = i; + if (min < 0 || timer_rate_max < measured_times[min]) + min = i; + } else + measured_times[i] = 0; + + } + + /* + * Find the maximum & minimum - if they differ too much throw out the + * one with the largest difference from the mean and try again... + */ + while (good_timer_count > 1) { + unsigned long estimate; + unsigned long maxdiff; + + /* compute the estimate */ + estimate = (good_timer_sum/good_timer_count); + maxdiff = estimate >> 3; + + /* if range is within 12% let's take it */ + if ((measured_times[max] - measured_times[min]) < maxdiff) + return estimate; + + /* ok - drop the worse value and try again... */ + good_timer_sum = 0; + good_timer_count = 0; + if ((measured_times[max] - estimate) < + (estimate - measured_times[min])) { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "min bogoMips estimate %d = %lu\n", + min, measured_times[min]); + measured_times[min] = 0; + min = max; + } else { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "max bogoMips estimate %d = %lu\n", + max, measured_times[max]); + measured_times[max] = 0; + max = min; + } + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + if (measured_times[i] == 0) + continue; + good_timer_count++; + good_timer_sum += measured_times[i]; + if (measured_times[i] < measured_times[min]) + min = i; + if (measured_times[i] > measured_times[max]) + max = i; + } + + } + + printk(KERN_NOTICE "calibrate_delay_direct() failed to get a good " + "estimate for loops_per_jiffy.\nProbably due to long platform " + "interrupts. Consider using \"lpj=\" boot option.\n"); + return 0; +} +#else +static unsigned long calibrate_delay_direct(void) +{ + return 0; +} +#endif + +/* + * This is the number of bits of precision for the loops_per_jiffy. Each + * time we refine our estimate after the first takes 1.5/HZ seconds, so try + * to start with a good estimate. + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as + * the cpu frequency, hence do the calibration for those. + */ +#define LPS_PREC 8 + +static unsigned long calibrate_delay_converge(void) +{ + /* First stage - slowly accelerate to find initial bounds */ + unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit; + int trials = 0, band = 0, trial_in_band = 0; + + lpj = (1<<12); + + /* wait for "start of" clock tick */ + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + /* Go .. */ + ticks = jiffies; + do { + if (++trial_in_band == (1<<band)) { + ++band; + trial_in_band = 0; + } + __delay(lpj * band); + trials += band; + } while (ticks == jiffies); + /* + * We overshot, so retreat to a clear underestimate. Then estimate + * the largest likely undershoot. This defines our chop bounds. + */ + trials -= band; + loopadd_base = lpj * band; + lpj_base = lpj * trials; + +recalibrate: + lpj = lpj_base; + loopadd = loopadd_base; + + /* + * Do a binary approximation to get lpj set to + * equal one clock (up to LPS_PREC bits) + */ + chop_limit = lpj >> LPS_PREC; + while (loopadd > chop_limit) { + lpj += loopadd; + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + ticks = jiffies; + __delay(lpj); + if (jiffies != ticks) /* longer than 1 tick */ + lpj -= loopadd; + loopadd >>= 1; + } + /* + * If we incremented every single time possible, presume we've + * massively underestimated initially, and retry with a higher + * start, and larger range. (Only seen on x86_64, due to SMIs) + */ + if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) { + lpj_base = lpj; + loopadd_base <<= 2; + goto recalibrate; + } + + return lpj; +} + +static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 }; + +/* + * Check if cpu calibration delay is already known. For example, + * some processors with multi-core sockets may have all cores + * with the same calibration delay. + * + * Architectures should override this function if a faster calibration + * method is available. + */ +unsigned long __attribute__((weak)) calibrate_delay_is_known(void) +{ + return 0; +} + +/* + * Indicate the cpu delay calibration is done. This can be used by + * architectures to stop accepting delay timer registrations after this point. + */ + +void __attribute__((weak)) calibration_delay_done(void) +{ +} + +void calibrate_delay(void) +{ + unsigned long lpj; + static bool printed; + int this_cpu = smp_processor_id(); + + if (per_cpu(cpu_loops_per_jiffy, this_cpu)) { + lpj = per_cpu(cpu_loops_per_jiffy, this_cpu); + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "already calibrated this CPU"); + } else if (preset_lpj) { + lpj = preset_lpj; + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "preset value.. "); + } else if ((!printed) && lpj_fine) { + lpj = lpj_fine; + pr_info("Calibrating delay loop (skipped), " + "value calculated using timer frequency.. "); + } else if ((lpj = calibrate_delay_is_known())) { + ; + } else if ((lpj = calibrate_delay_direct()) != 0) { + if (!printed) + pr_info("Calibrating delay using timer " + "specific routine.. "); + } else { + if (!printed) + pr_info("Calibrating delay loop... "); + lpj = calibrate_delay_converge(); + } + per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj; + if (!printed) + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", + lpj/(500000/HZ), + (lpj/(5000/HZ)) % 100, lpj); + + loops_per_jiffy = lpj; + printed = true; + + calibration_delay_done(); +} diff --git a/init/do_mounts.c b/init/do_mounts.c new file mode 100644 index 000000000..8ef154fb4 --- /dev/null +++ b/init/do_mounts.c @@ -0,0 +1,658 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/fd.h> +#include <linux/tty.h> +#include <linux/suspend.h> +#include <linux/root_dev.h> +#include <linux/security.h> +#include <linux/delay.h> +#include <linux/genhd.h> +#include <linux/mount.h> +#include <linux/device.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/initrd.h> +#include <linux/async.h> +#include <linux/fs_struct.h> +#include <linux/slab.h> +#include <linux/ramfs.h> +#include <linux/shmem_fs.h> + +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/nfs_mount.h> +#include <linux/raid/detect.h> +#include <uapi/linux/mount.h> + +#include "do_mounts.h" + +int root_mountflags = MS_RDONLY | MS_SILENT; +static char * __initdata root_device_name; +static char __initdata saved_root_name[64]; +static int root_wait; + +dev_t ROOT_DEV; + +static int __init load_ramdisk(char *str) +{ + pr_warn("ignoring the deprecated load_ramdisk= option\n"); + return 1; +} +__setup("load_ramdisk=", load_ramdisk); + +static int __init readonly(char *str) +{ + if (*str) + return 0; + root_mountflags |= MS_RDONLY; + return 1; +} + +static int __init readwrite(char *str) +{ + if (*str) + return 0; + root_mountflags &= ~MS_RDONLY; + return 1; +} + +__setup("ro", readonly); +__setup("rw", readwrite); + +#ifdef CONFIG_BLOCK +struct uuidcmp { + const char *uuid; + int len; +}; + +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to the desired struct uuidcmp to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_uuid(struct device *dev, const void *data) +{ + const struct uuidcmp *cmp = data; + struct hd_struct *part = dev_to_part(dev); + + if (!part->info) + goto no_match; + + if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + goto no_match; + + return 1; +no_match: + return 0; +} + + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid_str: char array containing ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static dev_t devt_from_partuuid(const char *uuid_str) +{ + dev_t res = 0; + struct uuidcmp cmp; + struct device *dev = NULL; + struct gendisk *disk; + struct hd_struct *part; + int offset = 0; + bool clear_root_wait = false; + char *slash; + + cmp.uuid = uuid_str; + + slash = strchr(uuid_str, '/'); + /* Check for optional partition number offset attributes. */ + if (slash) { + char c = 0; + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(slash + 1, + "PARTNROFF=%d%c", &offset, &c) != 1) { + clear_root_wait = true; + goto done; + } + cmp.len = slash - uuid_str; + } else { + cmp.len = strlen(uuid_str); + } + + if (!cmp.len) { + clear_root_wait = true; + goto done; + } + + dev = class_find_device(&block_class, NULL, &cmp, + &match_dev_by_uuid); + if (!dev) + goto done; + + res = dev->devt; + + /* Attempt to find the partition by offset. */ + if (!offset) + goto no_offset; + + res = 0; + disk = part_to_disk(dev_to_part(dev)); + part = disk_get_part(disk, dev_to_part(dev)->partno + offset); + if (part) { + res = part_devt(part); + put_device(part_to_dev(part)); + } + +no_offset: + put_device(dev); +done: + if (clear_root_wait) { + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); + if (root_wait) + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + } + return res; +} + +/** + * match_dev_by_label - callback for finding a partition using its label + * @dev: device passed in by the caller + * @data: opaque pointer to the label to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_label(struct device *dev, const void *data) +{ + const char *label = data; + struct hd_struct *part = dev_to_part(dev); + + if (part->info && !strcmp(label, part->info->volname)) + return 1; + + return 0; +} +#endif + +/* + * Convert a name into device number. We accept the following variants: + * + * 1) <hex_major><hex_minor> device number in hexadecimal represents itself + * no leading 0x, for example b302. + * 2) /dev/nfs represents Root_NFS (0xff) + * 3) /dev/<disk_name> represents the device number of disk + * 4) /dev/<disk_name><decimal> represents the device number + * of partition - device number of disk plus the partition number + * 5) /dev/<disk_name>p<decimal> - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 8) <major>:<minor> major and minor number of the device separated by + * a colon. + * 9) PARTLABEL=<name> with name being the GPT partition label. + * MSDOS partitions do not support labels! + * 10) /dev/cifs represents Root_CIFS (0xfe) + * + * If name doesn't have fall into the categories above, we return (0,0). + * block_class is used to check if something is a disk name. If the disk + * name contains slashes, the device name has them replaced with + * bangs. + */ + +dev_t name_to_dev_t(const char *name) +{ + char s[32]; + char *p; + dev_t res = 0; + int part; + +#ifdef CONFIG_BLOCK + if (strncmp(name, "PARTUUID=", 9) == 0) { + name += 9; + res = devt_from_partuuid(name); + if (!res) + goto fail; + goto done; + } else if (strncmp(name, "PARTLABEL=", 10) == 0) { + struct device *dev; + + dev = class_find_device(&block_class, NULL, name + 10, + &match_dev_by_label); + if (!dev) + goto fail; + + res = dev->devt; + put_device(dev); + goto done; + } +#endif + + if (strncmp(name, "/dev/", 5) != 0) { + unsigned maj, min, offset; + char dummy; + + if ((sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) || + (sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3)) { + res = MKDEV(maj, min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto fail; + } else { + res = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + goto fail; + } + goto done; + } + + name += 5; + res = Root_NFS; + if (strcmp(name, "nfs") == 0) + goto done; + res = Root_CIFS; + if (strcmp(name, "cifs") == 0) + goto done; + res = Root_RAM0; + if (strcmp(name, "ram") == 0) + goto done; + + if (strlen(name) > 31) + goto fail; + strcpy(s, name); + for (p = s; *p; p++) + if (*p == '/') + *p = '!'; + res = blk_lookup_devt(s, 0); + if (res) + goto done; + + /* + * try non-existent, but valid partition, which may only exist + * after revalidating the disk, like partitioned md devices + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + goto fail; + + /* try disk name without <part number> */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + res = blk_lookup_devt(s, part); + if (res) + goto done; + + /* try disk name without p<part number> */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + goto fail; + p[-1] = '\0'; + res = blk_lookup_devt(s, part); + if (res) + goto done; + +fail: + return 0; +done: + return res; +} +EXPORT_SYMBOL_GPL(name_to_dev_t); + +static int __init root_dev_setup(char *line) +{ + strlcpy(saved_root_name, line, sizeof(saved_root_name)); + return 1; +} + +__setup("root=", root_dev_setup); + +static int __init rootwait_setup(char *str) +{ + if (*str) + return 0; + root_wait = 1; + return 1; +} + +__setup("rootwait", rootwait_setup); + +static char * __initdata root_mount_data; +static int __init root_data_setup(char *str) +{ + root_mount_data = str; + return 1; +} + +static char * __initdata root_fs_names; +static int __init fs_names_setup(char *str) +{ + root_fs_names = str; + return 1; +} + +static unsigned int __initdata root_delay; +static int __init root_delay_setup(char *str) +{ + root_delay = simple_strtoul(str, NULL, 0); + return 1; +} + +__setup("rootflags=", root_data_setup); +__setup("rootfstype=", fs_names_setup); +__setup("rootdelay=", root_delay_setup); + +static void __init get_fs_names(char *page) +{ + char *s = page; + + if (root_fs_names) { + strcpy(page, root_fs_names); + while (*s++) { + if (s[-1] == ',') + s[-1] = '\0'; + } + } else { + int len = get_filesystem_list(page); + char *p, *next; + + page[len] = '\0'; + for (p = page-1; p; p = next) { + next = strchr(++p, '\n'); + if (*p++ != '\t') + continue; + while ((*s++ = *p++) != '\n') + ; + s[-1] = '\0'; + } + } + *s = '\0'; +} + +static int __init do_mount_root(const char *name, const char *fs, + const int flags, const void *data) +{ + struct super_block *s; + struct page *p = NULL; + char *data_page = NULL; + int ret; + + if (data) { + /* init_mount() requires a full page as fifth argument */ + p = alloc_page(GFP_KERNEL); + if (!p) + return -ENOMEM; + data_page = page_address(p); + /* zero-pad. init_mount() will make sure it's terminated */ + strncpy(data_page, data, PAGE_SIZE); + } + + ret = init_mount(name, "/root", fs, flags, data_page); + if (ret) + goto out; + + init_chdir("/root"); + s = current->fs->pwd.dentry->d_sb; + ROOT_DEV = s->s_dev; + printk(KERN_INFO + "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", + s->s_type->name, + sb_rdonly(s) ? " readonly" : "", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); + +out: + if (p) + put_page(p); + return ret; +} + +void __init mount_block_root(char *name, int flags) +{ + struct page *page = alloc_page(GFP_KERNEL); + char *fs_names = page_address(page); + char *p; + char b[BDEVNAME_SIZE]; + + scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); + get_fs_names(fs_names); +retry: + for (p = fs_names; *p; p += strlen(p)+1) { + int err = do_mount_root(name, p, flags, root_mount_data); + switch (err) { + case 0: + goto out; + case -EACCES: + case -EINVAL: + continue; + } + /* + * Allow the user to distinguish between failed sys_open + * and bad superblock on root device. + * and give them a list of the available devices + */ + printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", + root_device_name, b, err); + printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); + + printk_all_partitions(); +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " + "explicit textual name for \"root=\" boot option.\n"); +#endif + panic("VFS: Unable to mount root fs on %s", b); + } + if (!(flags & SB_RDONLY)) { + flags |= SB_RDONLY; + goto retry; + } + + printk("List of all partitions:\n"); + printk_all_partitions(); + printk("No filesystem could mount root, tried: "); + for (p = fs_names; *p; p += strlen(p)+1) + printk(" %s", p); + printk("\n"); + panic("VFS: Unable to mount root fs on %s", b); +out: + put_page(page); +} + +#ifdef CONFIG_ROOT_NFS + +#define NFSROOT_TIMEOUT_MIN 5 +#define NFSROOT_TIMEOUT_MAX 30 +#define NFSROOT_RETRY_MAX 5 + +static int __init mount_nfs_root(void) +{ + char *root_dev, *root_data; + unsigned int timeout; + int try, err; + + err = nfs_root_data(&root_dev, &root_data); + if (err != 0) + return 0; + + /* + * The server or network may not be ready, so try several + * times. Stop after a few tries in case the client wants + * to fall back to other boot methods. + */ + timeout = NFSROOT_TIMEOUT_MIN; + for (try = 1; ; try++) { + err = do_mount_root(root_dev, "nfs", + root_mountflags, root_data); + if (err == 0) + return 1; + if (try > NFSROOT_RETRY_MAX) + break; + + /* Wait, in case the server refused us immediately */ + ssleep(timeout); + timeout <<= 1; + if (timeout > NFSROOT_TIMEOUT_MAX) + timeout = NFSROOT_TIMEOUT_MAX; + } + return 0; +} +#endif + +#ifdef CONFIG_CIFS_ROOT + +extern int cifs_root_data(char **dev, char **opts); + +#define CIFSROOT_TIMEOUT_MIN 5 +#define CIFSROOT_TIMEOUT_MAX 30 +#define CIFSROOT_RETRY_MAX 5 + +static int __init mount_cifs_root(void) +{ + char *root_dev, *root_data; + unsigned int timeout; + int try, err; + + err = cifs_root_data(&root_dev, &root_data); + if (err != 0) + return 0; + + timeout = CIFSROOT_TIMEOUT_MIN; + for (try = 1; ; try++) { + err = do_mount_root(root_dev, "cifs", root_mountflags, + root_data); + if (err == 0) + return 1; + if (try > CIFSROOT_RETRY_MAX) + break; + + ssleep(timeout); + timeout <<= 1; + if (timeout > CIFSROOT_TIMEOUT_MAX) + timeout = CIFSROOT_TIMEOUT_MAX; + } + return 0; +} +#endif + +void __init mount_root(void) +{ +#ifdef CONFIG_ROOT_NFS + if (ROOT_DEV == Root_NFS) { + if (!mount_nfs_root()) + printk(KERN_ERR "VFS: Unable to mount root fs via NFS.\n"); + return; + } +#endif +#ifdef CONFIG_CIFS_ROOT + if (ROOT_DEV == Root_CIFS) { + if (!mount_cifs_root()) + printk(KERN_ERR "VFS: Unable to mount root fs via SMB.\n"); + return; + } +#endif +#ifdef CONFIG_BLOCK + { + int err = create_dev("/dev/root", ROOT_DEV); + + if (err < 0) + pr_emerg("Failed to create /dev/root: %d\n", err); + mount_block_root("/dev/root", root_mountflags); + } +#endif +} + +/* + * Prepare the namespace - decide what/where to mount, load ramdisks, etc. + */ +void __init prepare_namespace(void) +{ + if (root_delay) { + printk(KERN_INFO "Waiting %d sec before mounting root device...\n", + root_delay); + ssleep(root_delay); + } + + /* + * wait for the known devices to complete their probing + * + * Note: this is a potential source of long boot delays. + * For example, it is not atypical to wait 5 seconds here + * for the touchpad of a laptop to initialize. + */ + wait_for_device_probe(); + + md_run_setup(); + + if (saved_root_name[0]) { + root_device_name = saved_root_name; + if (!strncmp(root_device_name, "mtd", 3) || + !strncmp(root_device_name, "ubi", 3)) { + mount_block_root(root_device_name, root_mountflags); + goto out; + } + ROOT_DEV = name_to_dev_t(root_device_name); + if (strncmp(root_device_name, "/dev/", 5) == 0) + root_device_name += 5; + } + + if (initrd_load()) + goto out; + + /* wait for any asynchronous scanning to complete */ + if ((ROOT_DEV == 0) && root_wait) { + printk(KERN_INFO "Waiting for root device %s...\n", + saved_root_name); + while (driver_probe_done() != 0 || + (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) + msleep(5); + async_synchronize_full(); + } + + mount_root(); +out: + devtmpfs_mount(); + init_mount(".", "/", NULL, MS_MOVE, NULL); + init_chroot("."); +} + +static bool is_tmpfs; +static int rootfs_init_fs_context(struct fs_context *fc) +{ + if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs) + return shmem_init_fs_context(fc); + + return ramfs_init_fs_context(fc); +} + +struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .init_fs_context = rootfs_init_fs_context, + .kill_sb = kill_litter_super, +}; + +void __init init_rootfs(void) +{ + if (IS_ENABLED(CONFIG_TMPFS)) { + if (!saved_root_name[0] && !root_fs_names) + is_tmpfs = true; + else if (root_fs_names && !!strstr(root_fs_names, "tmpfs")) + is_tmpfs = true; + } +} diff --git a/init/do_mounts.h b/init/do_mounts.h new file mode 100644 index 000000000..7a29ac3e4 --- /dev/null +++ b/init/do_mounts.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/major.h> +#include <linux/root_dev.h> +#include <linux/init_syscalls.h> + +void mount_block_root(char *name, int flags); +void mount_root(void); +extern int root_mountflags; + +static inline __init int create_dev(char *name, dev_t dev) +{ + init_unlink(name); + return init_mknod(name, S_IFBLK | 0600, new_encode_dev(dev)); +} + +#ifdef CONFIG_BLK_DEV_RAM + +int __init rd_load_disk(int n); +int __init rd_load_image(char *from); + +#else + +static inline int rd_load_disk(int n) { return 0; } +static inline int rd_load_image(char *from) { return 0; } + +#endif + +#ifdef CONFIG_BLK_DEV_INITRD + +bool __init initrd_load(void); + +#else + +static inline bool initrd_load(void) { return false; } + +#endif diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c new file mode 100644 index 000000000..533d81ed7 --- /dev/null +++ b/init/do_mounts_initrd.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/unistd.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/minix_fs.h> +#include <linux/romfs_fs.h> +#include <linux/initrd.h> +#include <linux/sched.h> +#include <linux/freezer.h> +#include <linux/kmod.h> +#include <uapi/linux/mount.h> + +#include "do_mounts.h" + +unsigned long initrd_start, initrd_end; +int initrd_below_start_ok; +unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ +static int __initdata mount_initrd = 1; + +phys_addr_t phys_initrd_start __initdata; +unsigned long phys_initrd_size __initdata; + +static int __init no_initrd(char *str) +{ + mount_initrd = 0; + return 1; +} + +__setup("noinitrd", no_initrd); + +static int __init early_initrdmem(char *p) +{ + phys_addr_t start; + unsigned long size; + char *endp; + + start = memparse(p, &endp); + if (*endp == ',') { + size = memparse(endp + 1, NULL); + + phys_initrd_start = start; + phys_initrd_size = size; + } + return 0; +} +early_param("initrdmem", early_initrdmem); + +static int __init early_initrd(char *p) +{ + return early_initrdmem(p); +} +early_param("initrd", early_initrd); + +static int __init init_linuxrc(struct subprocess_info *info, struct cred *new) +{ + ksys_unshare(CLONE_FS | CLONE_FILES); + console_on_rootfs(); + /* move initrd over / and chdir/chroot in initrd root */ + init_chdir("/root"); + init_mount(".", "/", NULL, MS_MOVE, NULL); + init_chroot("."); + ksys_setsid(); + return 0; +} + +static void __init handle_initrd(void) +{ + struct subprocess_info *info; + static char *argv[] = { "linuxrc", NULL, }; + extern char *envp_init[]; + int error; + + pr_warn("using deprecated initrd support, will be removed in 2021.\n"); + + real_root_dev = new_encode_dev(ROOT_DEV); + create_dev("/dev/root.old", Root_RAM0); + /* mount initrd on rootfs' /root */ + mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); + init_mkdir("/old", 0700); + init_chdir("/old"); + + /* + * In case that a resume from disk is carried out by linuxrc or one of + * its children, we need to tell the freezer not to wait for us. + */ + current->flags |= PF_FREEZER_SKIP; + + info = call_usermodehelper_setup("/linuxrc", argv, envp_init, + GFP_KERNEL, init_linuxrc, NULL, NULL); + if (!info) + return; + call_usermodehelper_exec(info, UMH_WAIT_PROC); + + current->flags &= ~PF_FREEZER_SKIP; + + /* move initrd to rootfs' /old */ + init_mount("..", ".", NULL, MS_MOVE, NULL); + /* switch root and cwd back to / of rootfs */ + init_chroot(".."); + + if (new_decode_dev(real_root_dev) == Root_RAM0) { + init_chdir("/old"); + return; + } + + init_chdir("/"); + ROOT_DEV = new_decode_dev(real_root_dev); + mount_root(); + + printk(KERN_NOTICE "Trying to move old root to /initrd ... "); + error = init_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); + if (!error) + printk("okay\n"); + else { + if (error == -ENOENT) + printk("/initrd does not exist. Ignored.\n"); + else + printk("failed\n"); + printk(KERN_NOTICE "Unmounting old root\n"); + init_umount("/old", MNT_DETACH); + } +} + +bool __init initrd_load(void) +{ + if (mount_initrd) { + create_dev("/dev/ram", Root_RAM0); + /* + * Load the initrd data into /dev/ram0. Execute it as initrd + * unless /dev/ram0 is supposed to be our actual root device, + * in that case the ram disk is just set up here, and gets + * mounted in the normal path. + */ + if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { + init_unlink("/initrd.image"); + handle_initrd(); + return true; + } + } + init_unlink("/initrd.image"); + return false; +} diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c new file mode 100644 index 000000000..ac021ae6e --- /dev/null +++ b/init/do_mounts_rd.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/minix_fs.h> +#include <linux/ext2_fs.h> +#include <linux/romfs_fs.h> +#include <uapi/linux/cramfs_fs.h> +#include <linux/initrd.h> +#include <linux/string.h> +#include <linux/slab.h> + +#include "do_mounts.h" +#include "../fs/squashfs/squashfs_fs.h" + +#include <linux/decompress/generic.h> + +static struct file *in_file, *out_file; +static loff_t in_pos, out_pos; + +static int __init prompt_ramdisk(char *str) +{ + pr_warn("ignoring the deprecated prompt_ramdisk= option\n"); + return 1; +} +__setup("prompt_ramdisk=", prompt_ramdisk); + +int __initdata rd_image_start; /* starting block # of image */ + +static int __init ramdisk_start_setup(char *str) +{ + rd_image_start = simple_strtol(str,NULL,0); + return 1; +} +__setup("ramdisk_start=", ramdisk_start_setup); + +static int __init crd_load(decompress_fn deco); + +/* + * This routine tries to find a RAM disk image to load, and returns the + * number of blocks to read for a non-compressed image, 0 if the image + * is a compressed image, and -1 if an image with the right magic + * numbers could not be found. + * + * We currently check for the following magic numbers: + * minix + * ext2 + * romfs + * cramfs + * squashfs + * gzip + * bzip2 + * lzma + * xz + * lzo + * lz4 + */ +static int __init +identify_ramdisk_image(struct file *file, loff_t pos, + decompress_fn *decompressor) +{ + const int size = 512; + struct minix_super_block *minixsb; + struct romfs_super_block *romfsb; + struct cramfs_super *cramfsb; + struct squashfs_super_block *squashfsb; + int nblocks = -1; + unsigned char *buf; + const char *compress_name; + unsigned long n; + int start_block = rd_image_start; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + minixsb = (struct minix_super_block *) buf; + romfsb = (struct romfs_super_block *) buf; + cramfsb = (struct cramfs_super *) buf; + squashfsb = (struct squashfs_super_block *) buf; + memset(buf, 0xe5, size); + + /* + * Read block 0 to test for compressed kernel + */ + pos = start_block * BLOCK_SIZE; + kernel_read(file, buf, size, &pos); + + *decompressor = decompress_method(buf, size, &compress_name); + if (compress_name) { + printk(KERN_NOTICE "RAMDISK: %s image found at block %d\n", + compress_name, start_block); + if (!*decompressor) + printk(KERN_EMERG + "RAMDISK: %s decompressor not configured!\n", + compress_name); + nblocks = 0; + goto done; + } + + /* romfs is at block zero too */ + if (romfsb->word0 == ROMSB_WORD0 && + romfsb->word1 == ROMSB_WORD1) { + printk(KERN_NOTICE + "RAMDISK: romfs filesystem found at block %d\n", + start_block); + nblocks = (ntohl(romfsb->size)+BLOCK_SIZE-1)>>BLOCK_SIZE_BITS; + goto done; + } + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + + /* squashfs is at block zero too */ + if (le32_to_cpu(squashfsb->s_magic) == SQUASHFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: squashfs filesystem found at block %d\n", + start_block); + nblocks = (le64_to_cpu(squashfsb->bytes_used) + BLOCK_SIZE - 1) + >> BLOCK_SIZE_BITS; + goto done; + } + + /* + * Read 512 bytes further to check if cramfs is padded + */ + pos = start_block * BLOCK_SIZE + 0x200; + kernel_read(file, buf, size, &pos); + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + + /* + * Read block 1 to test for minix and ext2 superblock + */ + pos = (start_block + 1) * BLOCK_SIZE; + kernel_read(file, buf, size, &pos); + + /* Try minix */ + if (minixsb->s_magic == MINIX_SUPER_MAGIC || + minixsb->s_magic == MINIX_SUPER_MAGIC2) { + printk(KERN_NOTICE + "RAMDISK: Minix filesystem found at block %d\n", + start_block); + nblocks = minixsb->s_nzones << minixsb->s_log_zone_size; + goto done; + } + + /* Try ext2 */ + n = ext2_image_size(buf); + if (n) { + printk(KERN_NOTICE + "RAMDISK: ext2 filesystem found at block %d\n", + start_block); + nblocks = n; + goto done; + } + + printk(KERN_NOTICE + "RAMDISK: Couldn't find valid RAM disk image starting at %d.\n", + start_block); + +done: + kfree(buf); + return nblocks; +} + +static unsigned long nr_blocks(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + + if (!S_ISBLK(inode->i_mode)) + return 0; + return i_size_read(inode) >> 10; +} + +int __init rd_load_image(char *from) +{ + int res = 0; + unsigned long rd_blocks, devblocks; + int nblocks, i; + char *buf = NULL; + unsigned short rotate = 0; + decompress_fn decompressor = NULL; +#if !defined(CONFIG_S390) + char rotator[4] = { '|' , '/' , '-' , '\\' }; +#endif + + out_file = filp_open("/dev/ram", O_RDWR, 0); + if (IS_ERR(out_file)) + goto out; + + in_file = filp_open(from, O_RDONLY, 0); + if (IS_ERR(in_file)) + goto noclose_input; + + in_pos = rd_image_start * BLOCK_SIZE; + nblocks = identify_ramdisk_image(in_file, in_pos, &decompressor); + if (nblocks < 0) + goto done; + + if (nblocks == 0) { + if (crd_load(decompressor) == 0) + goto successful_load; + goto done; + } + + /* + * NOTE NOTE: nblocks is not actually blocks but + * the number of kibibytes of data to load into a ramdisk. + */ + rd_blocks = nr_blocks(out_file); + if (nblocks > rd_blocks) { + printk("RAMDISK: image too big! (%dKiB/%ldKiB)\n", + nblocks, rd_blocks); + goto done; + } + + /* + * OK, time to copy in the data + */ + if (strcmp(from, "/initrd.image") == 0) + devblocks = nblocks; + else + devblocks = nr_blocks(in_file); + + if (devblocks == 0) { + printk(KERN_ERR "RAMDISK: could not determine device size\n"); + goto done; + } + + buf = kmalloc(BLOCK_SIZE, GFP_KERNEL); + if (!buf) { + printk(KERN_ERR "RAMDISK: could not allocate buffer\n"); + goto done; + } + + printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", + nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); + for (i = 0; i < nblocks; i++) { + if (i && (i % devblocks == 0)) { + pr_cont("done disk #1.\n"); + rotate = 0; + fput(in_file); + break; + } + kernel_read(in_file, buf, BLOCK_SIZE, &in_pos); + kernel_write(out_file, buf, BLOCK_SIZE, &out_pos); +#if !defined(CONFIG_S390) + if (!(i % 16)) { + pr_cont("%c\b", rotator[rotate & 0x3]); + rotate++; + } +#endif + } + pr_cont("done.\n"); + +successful_load: + res = 1; +done: + fput(in_file); +noclose_input: + fput(out_file); +out: + kfree(buf); + init_unlink("/dev/ram"); + return res; +} + +int __init rd_load_disk(int n) +{ + create_dev("/dev/root", ROOT_DEV); + create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, n)); + return rd_load_image("/dev/root"); +} + +static int exit_code; +static int decompress_error; + +static long __init compr_fill(void *buf, unsigned long len) +{ + long r = kernel_read(in_file, buf, len, &in_pos); + if (r < 0) + printk(KERN_ERR "RAMDISK: error while reading compressed data"); + else if (r == 0) + printk(KERN_ERR "RAMDISK: EOF while reading compressed data"); + return r; +} + +static long __init compr_flush(void *window, unsigned long outcnt) +{ + long written = kernel_write(out_file, window, outcnt, &out_pos); + if (written != outcnt) { + if (decompress_error == 0) + printk(KERN_ERR + "RAMDISK: incomplete write (%ld != %ld)\n", + written, outcnt); + decompress_error = 1; + return -1; + } + return outcnt; +} + +static void __init error(char *x) +{ + printk(KERN_ERR "%s\n", x); + exit_code = 1; + decompress_error = 1; +} + +static int __init crd_load(decompress_fn deco) +{ + int result; + + if (!deco) { + pr_emerg("Invalid ramdisk decompression routine. " + "Select appropriate config option.\n"); + panic("Could not decompress initial ramdisk image."); + } + + result = deco(NULL, 0, compr_fill, compr_flush, NULL, NULL, error); + if (decompress_error) + result = 1; + return result; +} diff --git a/init/init_task.c b/init/init_task.c new file mode 100644 index 000000000..5fa18ed59 --- /dev/null +++ b/init/init_task.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init_task.h> +#include <linux/export.h> +#include <linux/mqueue.h> +#include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/sched/task.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/audit.h> +#include <linux/numa.h> +#include <linux/scs.h> + +#include <linux/uaccess.h> + +static struct signal_struct init_signals = { + .nr_threads = 1, + .thread_head = LIST_HEAD_INIT(init_task.thread_node), + .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit), + .shared_pending = { + .list = LIST_HEAD_INIT(init_signals.shared_pending.list), + .signal = {{0}} + }, + .multiprocess = HLIST_HEAD_INIT, + .rlim = INIT_RLIMITS, + .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), + .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), +#ifdef CONFIG_POSIX_TIMERS + .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), + .cputimer = { + .cputime_atomic = INIT_CPUTIME_ATOMIC, + }, +#endif + INIT_CPU_TIMERS(init_signals) + .pids = { + [PIDTYPE_PID] = &init_struct_pid, + [PIDTYPE_TGID] = &init_struct_pid, + [PIDTYPE_PGID] = &init_struct_pid, + [PIDTYPE_SID] = &init_struct_pid, + }, + INIT_PREV_CPUTIME(init_signals) +}; + +static struct sighand_struct init_sighand = { + .count = REFCOUNT_INIT(1), + .action = { { { .sa_handler = SIG_DFL, } }, }, + .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock), + .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), +}; + +#ifdef CONFIG_SHADOW_CALL_STACK +unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] + __init_task_data = { + [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC +}; +#endif + +/* + * Set up the first task table, touch at your own risk!. Base=0, + * limit=0x1fffff (=2MB) + */ +struct task_struct init_task +#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK + __init_task_data +#endif + __aligned(L1_CACHE_BYTES) += { +#ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +#endif + .state = 0, + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, + .restart_block = { + .fn = do_no_restart_syscall, + }, + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, + .rt = { + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, + .tasks = LIST_HEAD_INIT(init_task.tasks), +#ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +#endif +#ifdef CONFIG_CGROUP_SCHED + .sched_task_group = &root_task_group, +#endif + .ptraced = LIST_HEAD_INIT(init_task.ptraced), + .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), + .real_parent = &init_task, + .parent = &init_task, + .children = LIST_HEAD_INIT(init_task.children), + .sibling = LIST_HEAD_INIT(init_task.sibling), + .group_leader = &init_task, + RCU_POINTER_INITIALIZER(real_cred, &init_cred), + RCU_POINTER_INITIALIZER(cred, &init_cred), + .comm = INIT_TASK_COMM, + .thread = INIT_THREAD, + .fs = &init_fs, + .files = &init_files, +#ifdef CONFIG_IO_URING + .io_uring = NULL, +#endif + .signal = &init_signals, + .sighand = &init_sighand, + .nsproxy = &init_nsproxy, + .pending = { + .list = LIST_HEAD_INIT(init_task.pending.list), + .signal = {{0}} + }, + .blocked = {{0}}, + .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), + .journal_info = NULL, + INIT_CPU_TIMERS(init_task) + .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), + .timer_slack_ns = 50000, /* 50 usec default slack */ + .thread_pid = &init_struct_pid, + .thread_group = LIST_HEAD_INIT(init_task.thread_group), + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), +#ifdef CONFIG_AUDIT + .loginuid = INVALID_UID, + .sessionid = AUDIT_SID_UNSET, +#endif +#ifdef CONFIG_PERF_EVENTS + .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), + .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), +#endif +#ifdef CONFIG_PREEMPT_RCU + .rcu_read_lock_nesting = 0, + .rcu_read_unlock_special.s = 0, + .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), + .rcu_blocked_node = NULL, +#endif +#ifdef CONFIG_TASKS_RCU + .rcu_tasks_holdout = false, + .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), + .rcu_tasks_idle_cpu = -1, +#endif +#ifdef CONFIG_TASKS_TRACE_RCU + .trc_reader_nesting = 0, + .trc_reader_special.s = 0, + .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), +#endif +#ifdef CONFIG_CPUSETS + .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, + &init_task.alloc_lock), +#endif +#ifdef CONFIG_RT_MUTEXES + .pi_waiters = RB_ROOT_CACHED, + .pi_top_task = NULL, +#endif + INIT_PREV_CPUTIME(init_task) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), + .vtime.starttime = 0, + .vtime.state = VTIME_SYS, +#endif +#ifdef CONFIG_NUMA_BALANCING + .numa_preferred_nid = NUMA_NO_NODE, + .numa_group = NULL, + .numa_faults = NULL, +#endif +#ifdef CONFIG_KASAN + .kasan_depth = 1, +#endif +#ifdef CONFIG_KCSAN + .kcsan_ctx = { + .disable_count = 0, + .atomic_next = 0, + .atomic_nest_count = 0, + .in_flat_atomic = false, + .access_mask = 0, + .scoped_accesses = {LIST_POISON1, NULL}, + }, +#endif +#ifdef CONFIG_TRACE_IRQFLAGS + .softirqs_enabled = 1, +#endif +#ifdef CONFIG_LOCKDEP + .lockdep_depth = 0, /* no locks held yet */ + .curr_chain_key = INITIAL_CHAIN_KEY, + .lockdep_recursion = 0, +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + .ret_stack = NULL, + .tracing_graph_pause = ATOMIC_INIT(0), +#endif +#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION) + .trace_recursion = 0, +#endif +#ifdef CONFIG_LIVEPATCH + .patch_state = KLP_UNDEFINED, +#endif +#ifdef CONFIG_SECURITY + .security = NULL, +#endif +#ifdef CONFIG_SECCOMP_FILTER + .seccomp = { .filter_count = ATOMIC_INIT(0) }, +#endif +}; +EXPORT_SYMBOL(init_task); + +/* + * Initial thread structure. Alignment of this is handled by a special + * linker map entry. + */ +#ifndef CONFIG_THREAD_INFO_IN_TASK +struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task); +#endif diff --git a/init/initramfs.c b/init/initramfs.c new file mode 100644 index 000000000..55b74d7e5 --- /dev/null +++ b/init/initramfs.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/dirent.h> +#include <linux/syscalls.h> +#include <linux/utime.h> +#include <linux/file.h> +#include <linux/memblock.h> +#include <linux/namei.h> +#include <linux/init_syscalls.h> + +static ssize_t __init xwrite(struct file *file, const char *p, size_t count, + loff_t *pos) +{ + ssize_t out = 0; + + /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */ + while (count) { + ssize_t rv = kernel_write(file, p, count, pos); + + if (rv < 0) { + if (rv == -EINTR || rv == -EAGAIN) + continue; + return out ? out : rv; + } else if (rv == 0) + break; + + p += rv; + out += rv; + count -= rv; + } + + return out; +} + +static __initdata char *message; +static void __init error(char *x) +{ + if (!message) + message = x; +} + +/* link hash */ + +#define N_ALIGN(len) ((((len) + 1) & ~3) + 2) + +static __initdata struct hash { + int ino, minor, major; + umode_t mode; + struct hash *next; + char name[N_ALIGN(PATH_MAX)]; +} *head[32]; + +static inline int hash(int major, int minor, int ino) +{ + unsigned long tmp = ino + minor + (major << 3); + tmp += tmp >> 5; + return tmp & 31; +} + +static char __init *find_link(int major, int minor, int ino, + umode_t mode, char *name) +{ + struct hash **p, *q; + for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { + if ((*p)->ino != ino) + continue; + if ((*p)->minor != minor) + continue; + if ((*p)->major != major) + continue; + if (((*p)->mode ^ mode) & S_IFMT) + continue; + return (*p)->name; + } + q = kmalloc(sizeof(struct hash), GFP_KERNEL); + if (!q) + panic("can't allocate link hash entry"); + q->major = major; + q->minor = minor; + q->ino = ino; + q->mode = mode; + strcpy(q->name, name); + q->next = NULL; + *p = q; + return NULL; +} + +static void __init free_hash(void) +{ + struct hash **p, *q; + for (p = head; p < head + 32; p++) { + while (*p) { + q = *p; + *p = q->next; + kfree(q); + } + } +} + +static long __init do_utime(char *filename, time64_t mtime) +{ + struct timespec64 t[2]; + + t[0].tv_sec = mtime; + t[0].tv_nsec = 0; + t[1].tv_sec = mtime; + t[1].tv_nsec = 0; + return init_utimes(filename, t); +} + +static __initdata LIST_HEAD(dir_list); +struct dir_entry { + struct list_head list; + char *name; + time64_t mtime; +}; + +static void __init dir_add(const char *name, time64_t mtime) +{ + struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + if (!de) + panic("can't allocate dir_entry buffer"); + INIT_LIST_HEAD(&de->list); + de->name = kstrdup(name, GFP_KERNEL); + de->mtime = mtime; + list_add(&de->list, &dir_list); +} + +static void __init dir_utime(void) +{ + struct dir_entry *de, *tmp; + list_for_each_entry_safe(de, tmp, &dir_list, list) { + list_del(&de->list); + do_utime(de->name, de->mtime); + kfree(de->name); + kfree(de); + } +} + +static __initdata time64_t mtime; + +/* cpio header parsing */ + +static __initdata unsigned long ino, major, minor, nlink; +static __initdata umode_t mode; +static __initdata unsigned long body_len, name_len; +static __initdata uid_t uid; +static __initdata gid_t gid; +static __initdata unsigned rdev; + +static void __init parse_header(char *s) +{ + unsigned long parsed[12]; + char buf[9]; + int i; + + buf[8] = '\0'; + for (i = 0, s += 6; i < 12; i++, s += 8) { + memcpy(buf, s, 8); + parsed[i] = simple_strtoul(buf, NULL, 16); + } + ino = parsed[0]; + mode = parsed[1]; + uid = parsed[2]; + gid = parsed[3]; + nlink = parsed[4]; + mtime = parsed[5]; /* breaks in y2106 */ + body_len = parsed[6]; + major = parsed[7]; + minor = parsed[8]; + rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); + name_len = parsed[11]; +} + +/* FSM */ + +static __initdata enum state { + Start, + Collect, + GotHeader, + SkipIt, + GotName, + CopyFile, + GotSymlink, + Reset +} state, next_state; + +static __initdata char *victim; +static unsigned long byte_count __initdata; +static __initdata loff_t this_header, next_header; + +static inline void __init eat(unsigned n) +{ + victim += n; + this_header += n; + byte_count -= n; +} + +static __initdata char *collected; +static long remains __initdata; +static __initdata char *collect; + +static void __init read_into(char *buf, unsigned size, enum state next) +{ + if (byte_count >= size) { + collected = victim; + eat(size); + state = next; + } else { + collect = collected = buf; + remains = size; + next_state = next; + state = Collect; + } +} + +static __initdata char *header_buf, *symlink_buf, *name_buf; + +static int __init do_start(void) +{ + read_into(header_buf, 110, GotHeader); + return 0; +} + +static int __init do_collect(void) +{ + unsigned long n = remains; + if (byte_count < n) + n = byte_count; + memcpy(collect, victim, n); + eat(n); + collect += n; + if ((remains -= n) != 0) + return 1; + state = next_state; + return 0; +} + +static int __init do_header(void) +{ + if (memcmp(collected, "070707", 6)==0) { + error("incorrect cpio method used: use -H newc option"); + return 1; + } + if (memcmp(collected, "070701", 6)) { + error("no cpio magic"); + return 1; + } + parse_header(collected); + next_header = this_header + N_ALIGN(name_len) + body_len; + next_header = (next_header + 3) & ~3; + state = SkipIt; + if (name_len <= 0 || name_len > PATH_MAX) + return 0; + if (S_ISLNK(mode)) { + if (body_len > PATH_MAX) + return 0; + collect = collected = symlink_buf; + remains = N_ALIGN(name_len) + body_len; + next_state = GotSymlink; + state = Collect; + return 0; + } + if (S_ISREG(mode) || !body_len) + read_into(name_buf, N_ALIGN(name_len), GotName); + return 0; +} + +static int __init do_skip(void) +{ + if (this_header + byte_count < next_header) { + eat(byte_count); + return 1; + } else { + eat(next_header - this_header); + state = next_state; + return 0; + } +} + +static int __init do_reset(void) +{ + while (byte_count && *victim == '\0') + eat(1); + if (byte_count && (this_header & 3)) + error("broken padding"); + return 1; +} + +static void __init clean_path(char *path, umode_t fmode) +{ + struct kstat st; + + if (!init_stat(path, &st, AT_SYMLINK_NOFOLLOW) && + (st.mode ^ fmode) & S_IFMT) { + if (S_ISDIR(st.mode)) + init_rmdir(path); + else + init_unlink(path); + } +} + +static int __init maybe_link(void) +{ + if (nlink >= 2) { + char *old = find_link(major, minor, ino, mode, collected); + if (old) { + clean_path(collected, 0); + return (init_link(old, collected) < 0) ? -1 : 1; + } + } + return 0; +} + +static __initdata struct file *wfile; +static __initdata loff_t wfile_pos; + +static int __init do_name(void) +{ + state = SkipIt; + next_state = Reset; + if (strcmp(collected, "TRAILER!!!") == 0) { + free_hash(); + return 0; + } + clean_path(collected, mode); + if (S_ISREG(mode)) { + int ml = maybe_link(); + if (ml >= 0) { + int openflags = O_WRONLY|O_CREAT; + if (ml != 1) + openflags |= O_TRUNC; + wfile = filp_open(collected, openflags, mode); + if (IS_ERR(wfile)) + return 0; + wfile_pos = 0; + + vfs_fchown(wfile, uid, gid); + vfs_fchmod(wfile, mode); + if (body_len) + vfs_truncate(&wfile->f_path, body_len); + state = CopyFile; + } + } else if (S_ISDIR(mode)) { + init_mkdir(collected, mode); + init_chown(collected, uid, gid, 0); + init_chmod(collected, mode); + dir_add(collected, mtime); + } else if (S_ISBLK(mode) || S_ISCHR(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + if (maybe_link() == 0) { + init_mknod(collected, mode, rdev); + init_chown(collected, uid, gid, 0); + init_chmod(collected, mode); + do_utime(collected, mtime); + } + } + return 0; +} + +static int __init do_copy(void) +{ + if (byte_count >= body_len) { + struct timespec64 t[2] = { }; + if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) + error("write error"); + + t[0].tv_sec = mtime; + t[1].tv_sec = mtime; + vfs_utimes(&wfile->f_path, t); + + fput(wfile); + eat(body_len); + state = SkipIt; + return 0; + } else { + if (xwrite(wfile, victim, byte_count, &wfile_pos) != byte_count) + error("write error"); + body_len -= byte_count; + eat(byte_count); + return 1; + } +} + +static int __init do_symlink(void) +{ + collected[N_ALIGN(name_len) + body_len] = '\0'; + clean_path(collected, 0); + init_symlink(collected + N_ALIGN(name_len), collected); + init_chown(collected, uid, gid, AT_SYMLINK_NOFOLLOW); + do_utime(collected, mtime); + state = SkipIt; + next_state = Reset; + return 0; +} + +static __initdata int (*actions[])(void) = { + [Start] = do_start, + [Collect] = do_collect, + [GotHeader] = do_header, + [SkipIt] = do_skip, + [GotName] = do_name, + [CopyFile] = do_copy, + [GotSymlink] = do_symlink, + [Reset] = do_reset, +}; + +static long __init write_buffer(char *buf, unsigned long len) +{ + byte_count = len; + victim = buf; + + while (!actions[state]()) + ; + return len - byte_count; +} + +static long __init flush_buffer(void *bufv, unsigned long len) +{ + char *buf = (char *) bufv; + long written; + long origLen = len; + if (message) + return -1; + while ((written = write_buffer(buf, len)) < len && !message) { + char c = buf[written]; + if (c == '0') { + buf += written; + len -= written; + state = Start; + } else if (c == 0) { + buf += written; + len -= written; + state = Reset; + } else + error("junk within compressed archive"); + } + return origLen; +} + +static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ + +#include <linux/decompress/generic.h> + +static char * __init unpack_to_rootfs(char *buf, unsigned long len) +{ + long written; + decompress_fn decompress; + const char *compress_name; + static __initdata char msg_buf[64]; + + header_buf = kmalloc(110, GFP_KERNEL); + symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); + name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); + + if (!header_buf || !symlink_buf || !name_buf) + panic("can't allocate buffers"); + + state = Start; + this_header = 0; + message = NULL; + while (!message && len) { + loff_t saved_offset = this_header; + if (*buf == '0' && !(this_header & 3)) { + state = Start; + written = write_buffer(buf, len); + buf += written; + len -= written; + continue; + } + if (!*buf) { + buf++; + len--; + this_header++; + continue; + } + this_header = 0; + decompress = decompress_method(buf, len, &compress_name); + pr_debug("Detected %s compressed data\n", compress_name); + if (decompress) { + int res = decompress(buf, len, NULL, flush_buffer, NULL, + &my_inptr, error); + if (res) + error("decompressor failed"); + } else if (compress_name) { + if (!message) { + snprintf(msg_buf, sizeof msg_buf, + "compression method %s not configured", + compress_name); + message = msg_buf; + } + } else + error("invalid magic at start of compressed archive"); + if (state != Reset) + error("junk at the end of compressed archive"); + this_header = saved_offset + my_inptr; + buf += my_inptr; + len -= my_inptr; + } + dir_utime(); + kfree(name_buf); + kfree(symlink_buf); + kfree(header_buf); + return message; +} + +static int __initdata do_retain_initrd; + +static int __init retain_initrd_param(char *str) +{ + if (*str) + return 0; + do_retain_initrd = 1; + return 1; +} +__setup("retain_initrd", retain_initrd_param); + +#ifdef CONFIG_ARCH_HAS_KEEPINITRD +static int __init keepinitrd_setup(char *__unused) +{ + do_retain_initrd = 1; + return 1; +} +__setup("keepinitrd", keepinitrd_setup); +#endif + +extern char __initramfs_start[]; +extern unsigned long __initramfs_size; +#include <linux/initrd.h> +#include <linux/kexec.h> + +void __weak __init free_initrd_mem(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK + unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); + unsigned long aligned_end = ALIGN(end, PAGE_SIZE); + + memblock_free(__pa(aligned_start), aligned_end - aligned_start); +#endif + + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); +} + +#ifdef CONFIG_KEXEC_CORE +static bool __init kexec_free_initrd(void) +{ + unsigned long crashk_start = (unsigned long)__va(crashk_res.start); + unsigned long crashk_end = (unsigned long)__va(crashk_res.end); + + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (initrd_start >= crashk_end || initrd_end <= crashk_start) + return false; + + /* + * Initialize initrd memory region since the kexec boot does not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + return true; +} +#else +static inline bool kexec_free_initrd(void) +{ + return false; +} +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_BLK_DEV_RAM +static void __init populate_initrd_image(char *err) +{ + ssize_t written; + struct file *file; + loff_t pos = 0; + + unpack_to_rootfs(__initramfs_start, __initramfs_size); + + printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", + err); + file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); + if (IS_ERR(file)) + return; + + written = xwrite(file, (char *)initrd_start, initrd_end - initrd_start, + &pos); + if (written != initrd_end - initrd_start) + pr_err("/initrd.image: incomplete write (%zd != %ld)\n", + written, initrd_end - initrd_start); + fput(file); +} +#endif /* CONFIG_BLK_DEV_RAM */ + +static int __init populate_rootfs(void) +{ + /* Load the built in initramfs */ + char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); + if (err) + panic("%s", err); /* Failed to decompress INTERNAL initramfs */ + + if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) + goto done; + + if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) + printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); + else + printk(KERN_INFO "Unpacking initramfs...\n"); + + err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); + if (err) { +#ifdef CONFIG_BLK_DEV_RAM + populate_initrd_image(err); +#else + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); +#endif + } + +done: + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) + free_initrd_mem(initrd_start, initrd_end); + initrd_start = 0; + initrd_end = 0; + + flush_delayed_fput(); + return 0; +} +rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c new file mode 100644 index 000000000..298989b0d --- /dev/null +++ b/init/main.c @@ -0,0 +1,1538 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/init/main.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * GK 2/5/95 - Changed to support mounting root fs via NFS + * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 + * Simplified starting of init: Michael A. Griffith <grif@acm.org> + */ + +#define DEBUG /* Enable initcall_debug */ + +#include <linux/types.h> +#include <linux/extable.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/binfmts.h> +#include <linux/kernel.h> +#include <linux/syscalls.h> +#include <linux/stackprotector.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/memblock.h> +#include <linux/acpi.h> +#include <linux/bootconfig.h> +#include <linux/console.h> +#include <linux/nmi.h> +#include <linux/percpu.h> +#include <linux/kmod.h> +#include <linux/kprobes.h> +#include <linux/vmalloc.h> +#include <linux/kernel_stat.h> +#include <linux/start_kernel.h> +#include <linux/security.h> +#include <linux/smp.h> +#include <linux/profile.h> +#include <linux/rcupdate.h> +#include <linux/moduleparam.h> +#include <linux/kallsyms.h> +#include <linux/writeback.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/cgroup.h> +#include <linux/efi.h> +#include <linux/tick.h> +#include <linux/sched/isolation.h> +#include <linux/interrupt.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> +#include <linux/unistd.h> +#include <linux/utsname.h> +#include <linux/rmap.h> +#include <linux/mempolicy.h> +#include <linux/key.h> +#include <linux/buffer_head.h> +#include <linux/page_ext.h> +#include <linux/debug_locks.h> +#include <linux/debugobjects.h> +#include <linux/lockdep.h> +#include <linux/kmemleak.h> +#include <linux/padata.h> +#include <linux/pid_namespace.h> +#include <linux/device/driver.h> +#include <linux/kthread.h> +#include <linux/sched.h> +#include <linux/sched/init.h> +#include <linux/signal.h> +#include <linux/idr.h> +#include <linux/kgdb.h> +#include <linux/ftrace.h> +#include <linux/async.h> +#include <linux/sfi.h> +#include <linux/shmem_fs.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> +#include <linux/pti.h> +#include <linux/blkdev.h> +#include <linux/elevator.h> +#include <linux/sched/clock.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/context_tracking.h> +#include <linux/random.h> +#include <linux/list.h> +#include <linux/integrity.h> +#include <linux/proc_ns.h> +#include <linux/io.h> +#include <linux/cache.h> +#include <linux/rodata_test.h> +#include <linux/jump_label.h> +#include <linux/kcsan.h> +#include <linux/init_syscalls.h> + +#include <asm/io.h> +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/cacheflush.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/initcall.h> + +#include <kunit/test.h> + +static int kernel_init(void *); + +extern void init_IRQ(void); +extern void radix_tree_init(void); + +/* + * Debug helper: via this flag we know that we are in 'early bootup code' + * where only the boot processor is running with IRQ disabled. This means + * two things - IRQ must not be enabled before the flag is cleared and some + * operations which are not allowed with IRQ disabled are allowed while the + * flag is set. + */ +bool early_boot_irqs_disabled __read_mostly; + +enum system_states system_state __read_mostly; +EXPORT_SYMBOL(system_state); + +/* + * Boot command-line arguments + */ +#define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT +#define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT + +extern void time_init(void); +/* Default late time init is NULL. archs can override this later. */ +void (*__initdata late_time_init)(void); + +/* Untouched command line saved by arch-specific code. */ +char __initdata boot_command_line[COMMAND_LINE_SIZE]; +/* Untouched saved command line (eg. for /proc) */ +char *saved_command_line; +/* Command line for parameter parsing */ +static char *static_command_line; +/* Untouched extra command line */ +static char *extra_command_line; +/* Extra init arguments */ +static char *extra_init_args; + +#ifdef CONFIG_BOOT_CONFIG +/* Is bootconfig on command line? */ +static bool bootconfig_found; +static bool initargs_found; +#else +# define bootconfig_found false +# define initargs_found false +#endif + +static char *execute_command; +static char *ramdisk_execute_command = "/init"; + +/* + * Used to generate warnings if static_key manipulation functions are used + * before jump_label_init is called. + */ +bool static_key_initialized __read_mostly; +EXPORT_SYMBOL_GPL(static_key_initialized); + +/* + * If set, this is an indication to the drivers that reset the underlying + * device before going ahead with the initialization otherwise driver might + * rely on the BIOS and skip the reset operation. + * + * This is useful if kernel is booting in an unreliable environment. + * For ex. kdump situation where previous kernel has crashed, BIOS has been + * skipped and devices will be in unknown state. + */ +unsigned int reset_devices; +EXPORT_SYMBOL(reset_devices); + +static int __init set_reset_devices(char *str) +{ + reset_devices = 1; + return 1; +} + +__setup("reset_devices", set_reset_devices); + +static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char *panic_later, *panic_param; + +extern const struct obs_kernel_param __setup_start[], __setup_end[]; + +static bool __init obsolete_checksetup(char *line) +{ + const struct obs_kernel_param *p; + bool had_early_param = false; + + p = __setup_start; + do { + int n = strlen(p->str); + if (parameqn(line, p->str, n)) { + if (p->early) { + /* Already done in parse_early_param? + * (Needs exact match on param part). + * Keep iterating, as we can have early + * params and __setups of same names 8( */ + if (line[n] == '\0' || line[n] == '=') + had_early_param = true; + } else if (!p->setup_func) { + pr_warn("Parameter %s is obsolete, ignored\n", + p->str); + return true; + } else if (p->setup_func(line + n)) + return true; + } + p++; + } while (p < __setup_end); + + return had_early_param; +} + +/* + * This should be approx 2 Bo*oMips to start (note initial shift), and will + * still work even if initially too large, it will just take slightly longer + */ +unsigned long loops_per_jiffy = (1<<12); +EXPORT_SYMBOL(loops_per_jiffy); + +static int __init debug_kernel(char *str) +{ + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; + return 0; +} + +static int __init quiet_kernel(char *str) +{ + console_loglevel = CONSOLE_LOGLEVEL_QUIET; + return 0; +} + +early_param("debug", debug_kernel); +early_param("quiet", quiet_kernel); + +static int __init loglevel(char *str) +{ + int newlevel; + + /* + * Only update loglevel value when a correct setting was passed, + * to prevent blind crashes (when loglevel being set to 0) that + * are quite hard to debug + */ + if (get_option(&str, &newlevel)) { + console_loglevel = newlevel; + return 0; + } + + return -EINVAL; +} + +early_param("loglevel", loglevel); + +#ifdef CONFIG_BLK_DEV_INITRD +static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +{ + u32 size, csum; + char *data; + u32 *hdr; + int i; + + if (!initrd_end) + return NULL; + + data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; + /* + * Since Grub may align the size of initrd to 4, we must + * check the preceding 3 bytes as well. + */ + for (i = 0; i < 4; i++) { + if (!memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) + goto found; + data--; + } + return NULL; + +found: + hdr = (u32 *)(data - 8); + size = le32_to_cpu(hdr[0]); + csum = le32_to_cpu(hdr[1]); + + data = ((void *)hdr) - size; + if ((unsigned long)data < initrd_start) { + pr_err("bootconfig size %d is greater than initrd size %ld\n", + size, initrd_end - initrd_start); + return NULL; + } + + /* Remove bootconfig from initramfs/initrd */ + initrd_end = (unsigned long)data; + if (_size) + *_size = size; + if (_csum) + *_csum = csum; + + return data; +} +#else +static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +{ + return NULL; +} +#endif + +#ifdef CONFIG_BOOT_CONFIG + +static char xbc_namebuf[XBC_KEYLEN_MAX] __initdata; + +#define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0) + +static int __init xbc_snprint_cmdline(char *buf, size_t size, + struct xbc_node *root) +{ + struct xbc_node *knode, *vnode; + char *end = buf + size; + const char *val; + int ret; + + xbc_node_for_each_key_value(root, knode, val) { + ret = xbc_node_compose_key_after(root, knode, + xbc_namebuf, XBC_KEYLEN_MAX); + if (ret < 0) + return ret; + + vnode = xbc_node_get_child(knode); + if (!vnode) { + ret = snprintf(buf, rest(buf, end), "%s ", xbc_namebuf); + if (ret < 0) + return ret; + buf += ret; + continue; + } + xbc_array_for_each_value(vnode, val) { + ret = snprintf(buf, rest(buf, end), "%s=\"%s\" ", + xbc_namebuf, val); + if (ret < 0) + return ret; + buf += ret; + } + } + + return buf - (end - size); +} +#undef rest + +/* Make an extra command line under given key word */ +static char * __init xbc_make_cmdline(const char *key) +{ + struct xbc_node *root; + char *new_cmdline; + int ret, len = 0; + + root = xbc_find_node(key); + if (!root) + return NULL; + + /* Count required buffer size */ + len = xbc_snprint_cmdline(NULL, 0, root); + if (len <= 0) + return NULL; + + new_cmdline = memblock_alloc(len + 1, SMP_CACHE_BYTES); + if (!new_cmdline) { + pr_err("Failed to allocate memory for extra kernel cmdline.\n"); + return NULL; + } + + ret = xbc_snprint_cmdline(new_cmdline, len + 1, root); + if (ret < 0 || ret > len) { + pr_err("Failed to print extra kernel cmdline.\n"); + memblock_free(__pa(new_cmdline), len + 1); + return NULL; + } + + return new_cmdline; +} + +static u32 boot_config_checksum(unsigned char *p, u32 size) +{ + u32 ret = 0; + + while (size--) + ret += *p++; + + return ret; +} + +static int __init bootconfig_params(char *param, char *val, + const char *unused, void *arg) +{ + if (strcmp(param, "bootconfig") == 0) { + bootconfig_found = true; + } + return 0; +} + +static void __init setup_boot_config(const char *cmdline) +{ + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; + const char *msg; + int pos; + u32 size, csum; + char *data, *copy, *err; + int ret; + + /* Cut out the bootconfig data even if we have no bootconfig option */ + data = get_boot_config_from_initrd(&size, &csum); + + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, + bootconfig_params); + + if (IS_ERR(err) || !bootconfig_found) + return; + + /* parse_args() stops at '--' and returns an address */ + if (err) + initargs_found = true; + + if (!data) { + pr_err("'bootconfig' found on command line, but no bootconfig found\n"); + return; + } + + if (size >= XBC_DATA_MAX) { + pr_err("bootconfig size %d greater than max size %d\n", + size, XBC_DATA_MAX); + return; + } + + if (boot_config_checksum((unsigned char *)data, size) != csum) { + pr_err("bootconfig checksum failed\n"); + return; + } + + copy = memblock_alloc(size + 1, SMP_CACHE_BYTES); + if (!copy) { + pr_err("Failed to allocate memory for bootconfig\n"); + return; + } + + memcpy(copy, data, size); + copy[size] = '\0'; + + ret = xbc_init(copy, &msg, &pos); + if (ret < 0) { + if (pos < 0) + pr_err("Failed to init bootconfig: %s.\n", msg); + else + pr_err("Failed to parse bootconfig: %s at %d.\n", + msg, pos); + } else { + pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret); + /* keys starting with "kernel." are passed via cmdline */ + extra_command_line = xbc_make_cmdline("kernel"); + /* Also, "init." keys are init arguments */ + extra_init_args = xbc_make_cmdline("init"); + } + return; +} + +#else + +static void __init setup_boot_config(const char *cmdline) +{ + /* Remove bootconfig data from initrd */ + get_boot_config_from_initrd(NULL, NULL); +} + +static int __init warn_bootconfig(char *str) +{ + pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); + return 0; +} +early_param("bootconfig", warn_bootconfig); + +#endif + +/* Change NUL term back to "=", to make "param" the whole string. */ +static void __init repair_env_string(char *param, char *val) +{ + if (val) { + /* param=val or param="val"? */ + if (val == param+strlen(param)+1) + val[-1] = '='; + else if (val == param+strlen(param)+2) { + val[-2] = '='; + memmove(val-1, val, strlen(val)+1); + } else + BUG(); + } +} + +/* Anything after -- gets handed straight to init. */ +static int __init set_init_arg(char *param, char *val, + const char *unused, void *arg) +{ + unsigned int i; + + if (panic_later) + return 0; + + repair_env_string(param, val); + + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "init"; + panic_param = param; + return 0; + } + } + argv_init[i] = param; + return 0; +} + +/* + * Unknown boot options get handed to init, unless they look like + * unused parameters (modprobe will find them in /proc/cmdline). + */ +static int __init unknown_bootoption(char *param, char *val, + const char *unused, void *arg) +{ + size_t len = strlen(param); + + repair_env_string(param, val); + + /* Handle obsolete-style parameters */ + if (obsolete_checksetup(param)) + return 0; + + /* Unused module parameter. */ + if (strnchr(param, len, '.')) + return 0; + + if (panic_later) + return 0; + + if (val) { + /* Environment option */ + unsigned int i; + for (i = 0; envp_init[i]; i++) { + if (i == MAX_INIT_ENVS) { + panic_later = "env"; + panic_param = param; + } + if (!strncmp(param, envp_init[i], len+1)) + break; + } + envp_init[i] = param; + } else { + /* Command line option */ + unsigned int i; + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "init"; + panic_param = param; + } + } + argv_init[i] = param; + } + return 0; +} + +static int __init init_setup(char *str) +{ + unsigned int i; + + execute_command = str; + /* + * In case LILO is going to boot us with default command line, + * it prepends "auto" before the whole cmdline which makes + * the shell think it should execute a script with such name. + * So we ignore all arguments entered _before_ init=... [MJ] + */ + for (i = 1; i < MAX_INIT_ARGS; i++) + argv_init[i] = NULL; + return 1; +} +__setup("init=", init_setup); + +static int __init rdinit_setup(char *str) +{ + unsigned int i; + + ramdisk_execute_command = str; + /* See "auto" comment in init_setup */ + for (i = 1; i < MAX_INIT_ARGS; i++) + argv_init[i] = NULL; + return 1; +} +__setup("rdinit=", rdinit_setup); + +#ifndef CONFIG_SMP +static const unsigned int setup_max_cpus = NR_CPUS; +static inline void setup_nr_cpu_ids(void) { } +static inline void smp_prepare_cpus(unsigned int maxcpus) { } +#endif + +/* + * We need to store the untouched command line for future reference. + * We also need to store the touched command line since the parameter + * parsing is performed in place, and we should allow a component to + * store reference of name/value for future reference. + */ +static void __init setup_command_line(char *command_line) +{ + size_t len, xlen = 0, ilen = 0; + + if (extra_command_line) + xlen = strlen(extra_command_line); + if (extra_init_args) + ilen = strlen(extra_init_args) + 4; /* for " -- " */ + + len = xlen + strlen(boot_command_line) + 1; + + saved_command_line = memblock_alloc(len + ilen, SMP_CACHE_BYTES); + if (!saved_command_line) + panic("%s: Failed to allocate %zu bytes\n", __func__, len + ilen); + + static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); + if (!static_command_line) + panic("%s: Failed to allocate %zu bytes\n", __func__, len); + + if (xlen) { + /* + * We have to put extra_command_line before boot command + * lines because there could be dashes (separator of init + * command line) in the command lines. + */ + strcpy(saved_command_line, extra_command_line); + strcpy(static_command_line, extra_command_line); + } + strcpy(saved_command_line + xlen, boot_command_line); + strcpy(static_command_line + xlen, command_line); + + if (ilen) { + /* + * Append supplemental init boot args to saved_command_line + * so that user can check what command line options passed + * to init. + */ + len = strlen(saved_command_line); + if (initargs_found) { + saved_command_line[len++] = ' '; + } else { + strcpy(saved_command_line + len, " -- "); + len += 4; + } + + strcpy(saved_command_line + len, extra_init_args); + } +} + +/* + * We need to finalize in a non-__init function or else race conditions + * between the root thread and the init thread may cause start_kernel to + * be reaped by free_initmem before the root thread has proceeded to + * cpu_idle. + * + * gcc-3.4 accidentally inlines this function, so use noinline. + */ + +static __initdata DECLARE_COMPLETION(kthreadd_done); + +noinline void __ref rest_init(void) +{ + struct task_struct *tsk; + int pid; + + rcu_scheduler_starting(); + /* + * We need to spawn init first so that it obtains pid 1, however + * the init task will end up wanting to create kthreads, which, if + * we schedule it before we create kthreadd, will OOPS. + */ + pid = kernel_thread(kernel_init, NULL, CLONE_FS); + /* + * Pin init on the boot CPU. Task migration is not properly working + * until sched_init_smp() has been run. It will set the allowed + * CPUs for init to the non isolated CPUs. + */ + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); + rcu_read_unlock(); + + numa_default_policy(); + pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); + rcu_read_lock(); + kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); + rcu_read_unlock(); + + /* + * Enable might_sleep() and smp_processor_id() checks. + * They cannot be enabled earlier because with CONFIG_PREEMPTION=y + * kernel_thread() would trigger might_sleep() splats. With + * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled + * already, but it's stuck on the kthreadd_done completion. + */ + system_state = SYSTEM_SCHEDULING; + + complete(&kthreadd_done); + + /* + * The boot idle thread must execute schedule() + * at least once to get things moving: + */ + schedule_preempt_disabled(); + /* Call into cpu_idle with preempt disabled */ + cpu_startup_entry(CPUHP_ONLINE); +} + +/* Check for early params. */ +static int __init do_early_param(char *param, char *val, + const char *unused, void *arg) +{ + const struct obs_kernel_param *p; + + for (p = __setup_start; p < __setup_end; p++) { + if ((p->early && parameq(param, p->str)) || + (strcmp(param, "console") == 0 && + strcmp(p->str, "earlycon") == 0) + ) { + if (p->setup_func(val) != 0) + pr_warn("Malformed early option '%s'\n", param); + } + } + /* We accept everything at this stage. */ + return 0; +} + +void __init parse_early_options(char *cmdline) +{ + parse_args("early options", cmdline, NULL, 0, 0, 0, NULL, + do_early_param); +} + +/* Arch code calls this early on, or if not, just before other parsing. */ +void __init parse_early_param(void) +{ + static int done __initdata; + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; + + if (done) + return; + + /* All fall through to do_early_param. */ + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_early_options(tmp_cmdline); + done = 1; +} + +void __init __weak arch_post_acpi_subsys_init(void) { } + +void __init __weak smp_setup_processor_id(void) +{ +} + +# if THREAD_SIZE >= PAGE_SIZE +void __init __weak thread_stack_cache_init(void) +{ +} +#endif + +void __init __weak poking_init(void) { } + +void __init __weak pgtable_cache_init(void) { } + +bool initcall_debug; +core_param(initcall_debug, initcall_debug, bool, 0644); + +#ifdef TRACEPOINTS_ENABLED +static void __init initcall_debug_enable(void); +#else +static inline void initcall_debug_enable(void) +{ +} +#endif + +/* Report memory auto-initialization states for this boot. */ +static void __init report_meminit(void) +{ + const char *stack; + + if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN)) + stack = "all(pattern)"; + else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO)) + stack = "all(zero)"; + else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL)) + stack = "byref_all(zero)"; + else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF)) + stack = "byref(zero)"; + else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER)) + stack = "__user(zero)"; + else + stack = "off"; + + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", + stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", + want_init_on_free() ? "on" : "off"); + if (want_init_on_free()) + pr_info("mem auto-init: clearing system memory may take some time...\n"); +} + +/* + * Set up kernel memory allocators + */ +static void __init mm_init(void) +{ + /* + * page_ext requires contiguous pages, + * bigger than MAX_ORDER unless SPARSEMEM. + */ + page_ext_init_flatmem(); + init_debug_pagealloc(); + report_meminit(); + mem_init(); + kmem_cache_init(); + kmemleak_init(); + pgtable_init(); + debug_objects_mem_init(); + vmalloc_init(); + ioremap_huge_init(); + /* Should be run before the first non-init thread is created */ + init_espfix_bsp(); + /* Should be run after espfix64 is set up. */ + pti_init(); + mm_cache_init(); +} + +void __init __weak arch_call_rest_init(void) +{ + rest_init(); +} + +asmlinkage __visible void __init __no_sanitize_address start_kernel(void) +{ + char *command_line; + char *after_dashes; + + set_task_stack_end_magic(&init_task); + smp_setup_processor_id(); + debug_objects_early_init(); + + cgroup_init_early(); + + local_irq_disable(); + early_boot_irqs_disabled = true; + + /* + * Interrupts are still disabled. Do necessary setups, then + * enable them. + */ + boot_cpu_init(); + page_address_init(); + pr_notice("%s", linux_banner); + early_security_init(); + setup_arch(&command_line); + setup_boot_config(command_line); + setup_command_line(command_line); + setup_nr_cpu_ids(); + setup_per_cpu_areas(); + smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ + boot_cpu_hotplug_init(); + + build_all_zonelists(NULL); + page_alloc_init(); + + pr_notice("Kernel command line: %s\n", saved_command_line); + /* parameters may set static keys */ + jump_label_init(); + parse_early_param(); + after_dashes = parse_args("Booting kernel", + static_command_line, __start___param, + __stop___param - __start___param, + -1, -1, NULL, &unknown_bootoption); + if (!IS_ERR_OR_NULL(after_dashes)) + parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, + NULL, set_init_arg); + if (extra_init_args) + parse_args("Setting extra init args", extra_init_args, + NULL, 0, -1, -1, NULL, set_init_arg); + + /* + * These use large bootmem allocations and must precede + * kmem_cache_init() + */ + setup_log_buf(0); + vfs_caches_init_early(); + sort_main_extable(); + trap_init(); + mm_init(); + poking_init(); + ftrace_init(); + + /* trace_printk can be enabled here */ + early_trace_init(); + + /* + * Set up the scheduler prior starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() + * time - but meanwhile we still have a functioning scheduler. + */ + sched_init(); + + if (WARN(!irqs_disabled(), + "Interrupts were enabled *very* early, fixing it\n")) + local_irq_disable(); + radix_tree_init(); + + /* + * Set up housekeeping before setting up workqueues to allow the unbound + * workqueue to take non-housekeeping into account. + */ + housekeeping_init(); + + /* + * Allow workqueue creation and work item queueing/cancelling + * early. Work item execution depends on kthreads and starts after + * workqueue_init(). + */ + workqueue_init_early(); + + rcu_init(); + + /* Trace events are available after this */ + trace_init(); + + if (initcall_debug) + initcall_debug_enable(); + + context_tracking_init(); + /* init some links before init_ISA_irqs() */ + early_irq_init(); + init_IRQ(); + tick_init(); + rcu_init_nohz(); + init_timers(); + hrtimers_init(); + softirq_init(); + timekeeping_init(); + time_init(); + + /* + * For best initial stack canary entropy, prepare it after: + * - setup_arch() for any UEFI RNG entropy and boot cmdline access + * - timekeeping_init() for ktime entropy used in random_init() + * - time_init() for making random_get_entropy() work on some platforms + * - random_init() to initialize the RNG from from early entropy sources + */ + random_init(command_line); + boot_init_stack_canary(); + + perf_event_init(); + profile_init(); + call_function_init(); + WARN(!irqs_disabled(), "Interrupts were enabled early\n"); + + early_boot_irqs_disabled = false; + local_irq_enable(); + + kmem_cache_init_late(); + + /* + * HACK ALERT! This is early. We're enabling the console before + * we've done PCI setups etc, and console_init() must be aware of + * this. But we do want output early, in case something goes wrong. + */ + console_init(); + if (panic_later) + panic("Too many boot %s vars at `%s'", panic_later, + panic_param); + + lockdep_init(); + + /* + * Need to run this when irqs are enabled, because it wants + * to self-test [hard/soft]-irqs on/off lock inversion bugs + * too: + */ + locking_selftest(); + +#ifdef CONFIG_BLK_DEV_INITRD + if (initrd_start && !initrd_below_start_ok && + page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { + pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n", + page_to_pfn(virt_to_page((void *)initrd_start)), + min_low_pfn); + initrd_start = 0; + } +#endif + setup_per_cpu_pageset(); + numa_policy_init(); + acpi_early_init(); + if (late_time_init) + late_time_init(); + sched_clock_init(); + calibrate_delay(); + + arch_cpu_finalize_init(); + + pid_idr_init(); + anon_vma_init(); +#ifdef CONFIG_X86 + if (efi_enabled(EFI_RUNTIME_SERVICES)) + efi_enter_virtual_mode(); +#endif + thread_stack_cache_init(); + cred_init(); + fork_init(); + proc_caches_init(); + uts_ns_init(); + buffer_init(); + key_init(); + security_init(); + dbg_late_init(); + vfs_caches_init(); + pagecache_init(); + signals_init(); + seq_file_init(); + proc_root_init(); + nsfs_init(); + cpuset_init(); + cgroup_init(); + taskstats_init_early(); + delayacct_init(); + + acpi_subsystem_init(); + arch_post_acpi_subsys_init(); + sfi_init_late(); + kcsan_init(); + + /* Do the rest non-__init'ed, we're now alive */ + arch_call_rest_init(); + + prevent_tail_call_optimization(); +} + +/* Call all constructor functions linked into the kernel. */ +static void __init do_ctors(void) +{ +#ifdef CONFIG_CONSTRUCTORS + ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; + + for (; fn < (ctor_fn_t *) __ctors_end; fn++) + (*fn)(); +#endif +} + +#ifdef CONFIG_KALLSYMS +struct blacklist_entry { + struct list_head next; + char *buf; +}; + +static __initdata_or_module LIST_HEAD(blacklisted_initcalls); + +static int __init initcall_blacklist(char *str) +{ + char *str_entry; + struct blacklist_entry *entry; + + /* str argument is a comma-separated list of functions */ + do { + str_entry = strsep(&str, ","); + if (str_entry) { + pr_debug("blacklisting initcall %s\n", str_entry); + entry = memblock_alloc(sizeof(*entry), + SMP_CACHE_BYTES); + if (!entry) + panic("%s: Failed to allocate %zu bytes\n", + __func__, sizeof(*entry)); + entry->buf = memblock_alloc(strlen(str_entry) + 1, + SMP_CACHE_BYTES); + if (!entry->buf) + panic("%s: Failed to allocate %zu bytes\n", + __func__, strlen(str_entry) + 1); + strcpy(entry->buf, str_entry); + list_add(&entry->next, &blacklisted_initcalls); + } + } while (str_entry); + + return 1; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + struct blacklist_entry *entry; + char fn_name[KSYM_SYMBOL_LEN]; + unsigned long addr; + + if (list_empty(&blacklisted_initcalls)) + return false; + + addr = (unsigned long) dereference_function_descriptor(fn); + sprint_symbol_no_offset(fn_name, addr); + + /* + * fn will be "function_name [module_name]" where [module_name] is not + * displayed for built-in init functions. Strip off the [module_name]. + */ + strreplace(fn_name, ' ', '\0'); + + list_for_each_entry(entry, &blacklisted_initcalls, next) { + if (!strcmp(fn_name, entry->buf)) { + pr_debug("initcall %s blacklisted\n", fn_name); + return true; + } + } + + return false; +} +#else +static int __init initcall_blacklist(char *str) +{ + pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); + return 0; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + return false; +} +#endif +__setup("initcall_blacklist=", initcall_blacklist); + +static __init_or_module void +trace_initcall_start_cb(void *data, initcall_t fn) +{ + ktime_t *calltime = (ktime_t *)data; + + printk(KERN_DEBUG "calling %pS @ %i\n", fn, task_pid_nr(current)); + *calltime = ktime_get(); +} + +static __init_or_module void +trace_initcall_finish_cb(void *data, initcall_t fn, int ret) +{ + ktime_t *calltime = (ktime_t *)data; + ktime_t delta, rettime; + unsigned long long duration; + + rettime = ktime_get(); + delta = ktime_sub(rettime, *calltime); + duration = (unsigned long long) ktime_to_ns(delta) >> 10; + printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n", + fn, ret, duration); +} + +static ktime_t initcall_calltime; + +#ifdef TRACEPOINTS_ENABLED +static void __init initcall_debug_enable(void) +{ + int ret; + + ret = register_trace_initcall_start(trace_initcall_start_cb, + &initcall_calltime); + ret |= register_trace_initcall_finish(trace_initcall_finish_cb, + &initcall_calltime); + WARN(ret, "Failed to register initcall tracepoints\n"); +} +# define do_trace_initcall_start trace_initcall_start +# define do_trace_initcall_finish trace_initcall_finish +#else +static inline void do_trace_initcall_start(initcall_t fn) +{ + if (!initcall_debug) + return; + trace_initcall_start_cb(&initcall_calltime, fn); +} +static inline void do_trace_initcall_finish(initcall_t fn, int ret) +{ + if (!initcall_debug) + return; + trace_initcall_finish_cb(&initcall_calltime, fn, ret); +} +#endif /* !TRACEPOINTS_ENABLED */ + +int __init_or_module do_one_initcall(initcall_t fn) +{ + int count = preempt_count(); + char msgbuf[64]; + int ret; + + if (initcall_blacklisted(fn)) + return -EPERM; + + do_trace_initcall_start(fn); + ret = fn(); + do_trace_initcall_finish(fn, ret); + + msgbuf[0] = 0; + + if (preempt_count() != count) { + sprintf(msgbuf, "preemption imbalance "); + preempt_count_set(count); + } + if (irqs_disabled()) { + strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); + local_irq_enable(); + } + WARN(msgbuf[0], "initcall %pS returned with %s\n", fn, msgbuf); + + add_latent_entropy(); + return ret; +} + + +extern initcall_entry_t __initcall_start[]; +extern initcall_entry_t __initcall0_start[]; +extern initcall_entry_t __initcall1_start[]; +extern initcall_entry_t __initcall2_start[]; +extern initcall_entry_t __initcall3_start[]; +extern initcall_entry_t __initcall4_start[]; +extern initcall_entry_t __initcall5_start[]; +extern initcall_entry_t __initcall6_start[]; +extern initcall_entry_t __initcall7_start[]; +extern initcall_entry_t __initcall_end[]; + +static initcall_entry_t *initcall_levels[] __initdata = { + __initcall0_start, + __initcall1_start, + __initcall2_start, + __initcall3_start, + __initcall4_start, + __initcall5_start, + __initcall6_start, + __initcall7_start, + __initcall_end, +}; + +/* Keep these in sync with initcalls in include/linux/init.h */ +static const char *initcall_level_names[] __initdata = { + "pure", + "core", + "postcore", + "arch", + "subsys", + "fs", + "device", + "late", +}; + +static int __init ignore_unknown_bootoption(char *param, char *val, + const char *unused, void *arg) +{ + return 0; +} + +static void __init do_initcall_level(int level, char *command_line) +{ + initcall_entry_t *fn; + + parse_args(initcall_level_names[level], + command_line, __start___param, + __stop___param - __start___param, + level, level, + NULL, ignore_unknown_bootoption); + + trace_initcall_level(initcall_level_names[level]); + for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) + do_one_initcall(initcall_from_entry(fn)); +} + +static void __init do_initcalls(void) +{ + int level; + size_t len = strlen(saved_command_line) + 1; + char *command_line; + + command_line = kzalloc(len, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate %zu bytes\n", __func__, len); + + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) { + /* Parser modifies command_line, restore it each time */ + strcpy(command_line, saved_command_line); + do_initcall_level(level, command_line); + } + + kfree(command_line); +} + +/* + * Ok, the machine is now initialized. None of the devices + * have been touched yet, but the CPU subsystem is up and + * running, and memory and process management works. + * + * Now we can finally start doing some real work.. + */ +static void __init do_basic_setup(void) +{ + cpuset_init_smp(); + driver_init(); + init_irq_proc(); + do_ctors(); + usermodehelper_enable(); + do_initcalls(); +} + +static void __init do_pre_smp_initcalls(void) +{ + initcall_entry_t *fn; + + trace_initcall_level("early"); + for (fn = __initcall_start; fn < __initcall0_start; fn++) + do_one_initcall(initcall_from_entry(fn)); +} + +static int run_init_process(const char *init_filename) +{ + const char *const *p; + + argv_init[0] = init_filename; + pr_info("Run %s as init process\n", init_filename); + pr_debug(" with arguments:\n"); + for (p = argv_init; *p; p++) + pr_debug(" %s\n", *p); + pr_debug(" with environment:\n"); + for (p = envp_init; *p; p++) + pr_debug(" %s\n", *p); + return kernel_execve(init_filename, argv_init, envp_init); +} + +static int try_to_run_init_process(const char *init_filename) +{ + int ret; + + ret = run_init_process(init_filename); + + if (ret && ret != -ENOENT) { + pr_err("Starting init: %s exists but couldn't execute it (error %d)\n", + init_filename, ret); + } + + return ret; +} + +static noinline void __init kernel_init_freeable(void); + +#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) +bool rodata_enabled __ro_after_init = true; +static int __init set_debug_rodata(char *str) +{ + if (strtobool(str, &rodata_enabled)) + pr_warn("Invalid option string for rodata: '%s'\n", str); + return 1; +} +__setup("rodata=", set_debug_rodata); +#endif + +#ifdef CONFIG_STRICT_KERNEL_RWX +static void mark_readonly(void) +{ + if (rodata_enabled) { + /* + * load_module() results in W+X mappings, which are cleaned + * up with call_rcu(). Let's make sure that queued work is + * flushed so that we don't hit false positives looking for + * insecure pages which are W+X. + */ + rcu_barrier(); + mark_rodata_ro(); + rodata_test(); + } else + pr_info("Kernel memory protection disabled.\n"); +} +#elif defined(CONFIG_ARCH_HAS_STRICT_KERNEL_RWX) +static inline void mark_readonly(void) +{ + pr_warn("Kernel memory protection not selected by kernel config.\n"); +} +#else +static inline void mark_readonly(void) +{ + pr_warn("This architecture does not have kernel memory protection.\n"); +} +#endif + +void __weak free_initmem(void) +{ + free_initmem_default(POISON_FREE_INITMEM); +} + +static int __ref kernel_init(void *unused) +{ + int ret; + + kernel_init_freeable(); + /* need to finish all async __init code before freeing the memory */ + async_synchronize_full(); + kprobe_free_init_mem(); + ftrace_free_init_mem(); + kgdb_free_init_mem(); + free_initmem(); + mark_readonly(); + + /* + * Kernel mappings are now finalized - update the userspace page-table + * to finalize PTI. + */ + pti_finalize(); + + system_state = SYSTEM_RUNNING; + numa_default_policy(); + + rcu_end_inkernel_boot(); + + do_sysctl_args(); + + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) + return 0; + pr_err("Failed to execute %s (error %d)\n", + ramdisk_execute_command, ret); + } + + /* + * We try each of these until one succeeds. + * + * The Bourne shell can be used instead of init if we are + * trying to recover a really broken machine. + */ + if (execute_command) { + ret = run_init_process(execute_command); + if (!ret) + return 0; + panic("Requested init %s failed (error %d).", + execute_command, ret); + } + + if (CONFIG_DEFAULT_INIT[0] != '\0') { + ret = run_init_process(CONFIG_DEFAULT_INIT); + if (ret) + pr_err("Default init %s failed (error %d)\n", + CONFIG_DEFAULT_INIT, ret); + else + return 0; + } + + if (!try_to_run_init_process("/sbin/init") || + !try_to_run_init_process("/etc/init") || + !try_to_run_init_process("/bin/init") || + !try_to_run_init_process("/bin/sh")) + return 0; + + panic("No working init found. Try passing init= option to kernel. " + "See Linux Documentation/admin-guide/init.rst for guidance."); +} + +/* Open /dev/console, for stdin/stdout/stderr, this should never fail */ +void __init console_on_rootfs(void) +{ + struct file *file = filp_open("/dev/console", O_RDWR, 0); + + if (IS_ERR(file)) { + pr_err("Warning: unable to open an initial console.\n"); + return; + } + init_dup(file); + init_dup(file); + init_dup(file); + fput(file); +} + +static noinline void __init kernel_init_freeable(void) +{ + /* + * Wait until kthreadd is all set-up. + */ + wait_for_completion(&kthreadd_done); + + /* Now the scheduler is fully set up and can do blocking allocations */ + gfp_allowed_mask = __GFP_BITS_MASK; + + /* + * init can allocate pages on any node + */ + set_mems_allowed(node_states[N_MEMORY]); + + cad_pid = get_pid(task_pid(current)); + + smp_prepare_cpus(setup_max_cpus); + + workqueue_init(); + + init_mm_internals(); + + rcu_init_tasks_generic(); + do_pre_smp_initcalls(); + lockup_detector_init(); + + smp_init(); + sched_init_smp(); + + padata_init(); + page_alloc_init_late(); + /* Initialize page ext after all struct pages are initialized. */ + page_ext_init(); + + do_basic_setup(); + + kunit_run_all_tests(); + + console_on_rootfs(); + + /* + * check if there is an early userspace init. If yes, let it do all + * the work + */ + if (init_eaccess(ramdisk_execute_command) != 0) { + ramdisk_execute_command = NULL; + prepare_namespace(); + } + + /* + * Ok, we have completed the initial bootup, and + * we're essentially up and running. Get rid of the + * initmem segments and start the user-mode stuff.. + * + * rootfs is available now, try loading the public keys + * and default modules + */ + + integrity_load_keys(); +} diff --git a/init/noinitramfs.c b/init/noinitramfs.c new file mode 100644 index 000000000..3d62b07f3 --- /dev/null +++ b/init/noinitramfs.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * init/noinitramfs.c + * + * Copyright (C) 2006, NXP Semiconductors, All Rights Reserved + * Author: Jean-Paul Saman <jean-paul.saman@nxp.com> + */ +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/kdev_t.h> +#include <linux/syscalls.h> +#include <linux/init_syscalls.h> + +/* + * Create a simple rootfs that is similar to the default initramfs + */ +static int __init default_rootfs(void) +{ + int err; + + err = init_mkdir("/dev", 0755); + if (err < 0) + goto out; + + err = init_mknod("/dev/console", S_IFCHR | S_IRUSR | S_IWUSR, + new_encode_dev(MKDEV(5, 1))); + if (err < 0) + goto out; + + err = init_mkdir("/root", 0700); + if (err < 0) + goto out; + + return 0; + +out: + printk(KERN_WARNING "Failed to create a rootfs\n"); + return err; +} +rootfs_initcall(default_rootfs); diff --git a/init/version.c b/init/version.c new file mode 100644 index 000000000..cba341161 --- /dev/null +++ b/init/version.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/init/version.c + * + * Copyright (C) 1992 Theodore Ts'o + * + * May be freely distributed as part of Linux. + */ + +#include <generated/compile.h> +#include <linux/build-salt.h> +#include <linux/export.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <generated/utsrelease.h> +#include <linux/version.h> +#include <linux/proc_ns.h> + +#ifndef CONFIG_KALLSYMS +#define version(a) Version_ ## a +#define version_string(a) version(a) + +extern int version_string(LINUX_VERSION_CODE); +int version_string(LINUX_VERSION_CODE); +#endif + +struct uts_namespace init_uts_ns = { + .kref = KREF_INIT(2), + .name = { + .sysname = UTS_SYSNAME, + .nodename = UTS_NODENAME, + .release = UTS_RELEASE, + .version = UTS_VERSION, + .machine = UTS_MACHINE, + .domainname = UTS_DOMAINNAME, + }, + .user_ns = &init_user_ns, + .ns.inum = PROC_UTS_INIT_INO, +#ifdef CONFIG_UTS_NS + .ns.ops = &utsns_operations, +#endif +}; +EXPORT_SYMBOL_GPL(init_uts_ns); + +/* FIXED STRINGS! Don't touch! */ +const char linux_banner[] = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; + +const char linux_proc_banner[] = + "%s version %s" + " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")" + " (" LINUX_COMPILER ") %s\n"; + +BUILD_SALT; |